diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index c2d564dfda..5a883b64ed 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-pylibraft: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-raft-dask: needs: wheel-publish-pylibraft secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -99,7 +99,7 @@ jobs: wheel-publish-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 8c99e3de6a..c8bd28d4bb 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -24,41 +24,41 @@ jobs: - wheel-tests-raft-dask - devcontainer secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-23.12 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-23.12 with: 
enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 with: build_type: pull-request node_type: cpu16 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -68,34 +68,34 @@ jobs: wheel-build-pylibraft: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: pull-request script: ci/build_wheel_pylibraft.sh wheel-tests-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: pull-request script: ci/test_wheel_pylibraft.sh wheel-build-raft-dask: needs: wheel-tests-pylibraft secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: pull-request script: "ci/build_wheel_raft_dask.sh" wheel-tests-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: pull-request script: ci/test_wheel_raft_dask.sh devcontainer: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/build-in-devcontainer.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-23.12 with: build_command: | sccache -z; diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4e45ae29f6..1c2395cb68 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: 
rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -32,7 +32,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibraft: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -41,7 +41,7 @@ jobs: sha: ${{ inputs.sha }} script: ci/test_wheel_pylibraft.sh wheel-tests-raft-dask: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/README.md b/README.md index 56d422b489..5b1297b63c 100755 --- a/README.md +++ b/README.md @@ -255,106 +255,54 @@ pairwise_distance(in1, in2, out=output, metric="euclidean") ## Installing -RAFT itself can be installed through conda, [CMake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake), pip, or by building the repository from source. Please refer to the [build instructions](docs/source/build.md) for more a comprehensive guide on installing and building RAFT and using it in downstream projects. +RAFT's C++ and Python libraries can both be installed through Conda, and the Python libraries can also be installed through Pip. -### Conda + +### Installing C++ and Python through Conda The easiest way to install RAFT is through conda and several packages are provided. -- `libraft-headers` RAFT headers -- `libraft` (optional) shared library of pre-compiled template instantiations and runtime APIs. -- `pylibraft` (optional) Python wrappers around RAFT algorithms and primitives. -- `raft-dask` (optional) enables deployment of multi-node multi-GPU algorithms that use RAFT `raft::comms` in Dask clusters. +- `libraft-headers` C++ headers +- `libraft` (optional) C++ shared library containing pre-compiled template instantiations and runtime API. +- `pylibraft` (optional) Python library +- `raft-dask` (optional) Python library for deployment of multi-node multi-GPU algorithms that use the RAFT `raft::comms` abstraction layer in Dask clusters. +- `raft-ann-bench` (optional) Benchmarking tool for easily producing benchmarks that compare RAFT's vector search algorithms against other state-of-the-art implementations. +- `raft-ann-bench-cpu` (optional) Reproducible benchmarking tool similar to the above, but doesn't require CUDA to be installed on the machine. Can be used to test in environments with competitive CPUs. + +Use the following command, depending on your CUDA version, to install all of the RAFT packages with conda (replace `rapidsai` with `rapidsai-nightly` to install more up-to-date but less stable nightly packages). `mamba` is preferred over the `conda` command. +```bash +# for CUDA 11.8 +mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-version=11.8 +``` -Use the following command to install all of the RAFT packages with conda (replace `rapidsai` with `rapidsai-nightly` to install more up-to-date but less stable nightly packages). `mamba` is preferred over the `conda` command.
```bash -mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft +# for CUDA 12.0 +mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-version=12.0 ``` -You can also install the conda packages individually using the `mamba` command above. +Note that the above commands will also install `libraft-headers` and `libraft`. + +You can also install the conda packages individually using the `mamba` command above. For example, if you'd like to install RAFT's headers and pre-compiled shared library to use in your project: +```bash +# for CUDA 12.0 +mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.0 +``` -After installing RAFT, `find_package(raft COMPONENTS compiled distributed)` can be used in your CUDA/C++ cmake build to compile and/or link against needed dependencies in your raft target. `COMPONENTS` are optional and will depend on the packages installed. +If installing the C++ APIs, please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-23.12/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against the RAFT development artifacts installed above. -### Pip +### Installing Python through Pip -pylibraft and raft-dask both have experimental packages that can be [installed through pip](https://rapids.ai/pip.html#install): +`pylibraft` and `raft-dask` both have experimental packages that can be [installed through pip](https://rapids.ai/pip.html#install): ```bash pip install pylibraft-cu11 --extra-index-url=https://pypi.nvidia.com pip install raft-dask-cu11 --extra-index-url=https://pypi.nvidia.com ``` -### CMake & CPM - -RAFT uses the [RAPIDS-CMake](https://github.com/rapidsai/rapids-cmake) library, which makes it easy to include in downstream cmake projects. RAPIDS-CMake provides a convenience layer around CPM. Please refer to [these instructions](https://github.com/rapidsai/rapids-cmake#installation) to install and use rapids-cmake in your project. - -#### Example Template Project +These packages statically build RAFT's pre-compiled instantiations and so the C++ headers and pre-compiled shared library won't be readily available to use in your code. -You can find an [example RAFT](cpp/template/README.md) project template in the `cpp/template` directory, which demonstrates how to build a new application with RAFT or incorporate RAFT into an existing cmake project. +The [build instructions](https://docs.rapids.ai/api/raft/nightly/build/) contain more details on building RAFT from source and including it in downstream projects. You can also find a comprehensive CPM code snippet in the [Building RAFT C++ and Python from source](https://docs.rapids.ai/api/raft/nightly/build/#building-c-and-python-from-source) section of the build instructions. -#### CMake Targets - -Additional CMake targets can be made available by adding components in the table below to the `RAFT_COMPONENTS` list above, separated by spaces. The `raft::raft` target will always be available. RAFT headers require, at a minimum, the CUDA toolkit libraries and RMM dependencies.
- -| Component | Target | Description | Base Dependencies | -|-------------|---------------------|----------------------------------------------------------|----------------------------------------| -| n/a | `raft::raft` | Full RAFT header library | CUDA toolkit, RMM, NVTX, CCCL, CUTLASS | -| compiled | `raft::compiled` | Pre-compiled template instantiations and runtime library | raft::raft | -| distributed | `raft::distributed` | Dependencies for `raft::comms` APIs | raft::raft, UCX, NCCL | - -### Source - -The easiest way to build RAFT from source is to use the `build.sh` script at the root of the repository: -1. Create an environment with the needed dependencies: -``` -mamba env create --name raft_dev_env -f conda/environments/all_cuda-118_arch-x86_64.yaml -mamba activate raft_dev_env -``` -``` -./build.sh raft-dask pylibraft libraft tests bench --compile-lib -``` +You can find an example [RAFT project template](cpp/template/README.md) in the `cpp/template` directory, which demonstrates how to build a new application with RAFT or incorporate RAFT into an existing CMake project. -The [build](docs/source/build.md) instructions contain more details on building RAFT from source and including it in downstream projects. You can also find a more comprehensive version of the above CPM code snippet the [Building RAFT C++ from source](docs/source/build.md#building-raft-c-from-source-in-cmake) section of the build instructions. - -## Folder Structure and Contents - -The folder structure mirrors other RAPIDS repos, with the following folders: - -- `bench/ann`: Python scripts for running ANN benchmarks -- `ci`: Scripts for running CI in PRs -- `conda`: Conda recipes and development conda environments -- `cpp`: Source code for C++ libraries. - - `bench`: Benchmarks source code - - `cmake`: CMake modules and templates - - `doxygen`: Doxygen configuration - - `include`: The C++ API headers are fully-contained here (deprecated directories are excluded from the listing below) - - `cluster`: Basic clustering primitives and algorithms. - - `comms`: A multi-node multi-GPU communications abstraction layer for NCCL+UCX and MPI+NCCL, which can be deployed in Dask clusters using the `raft-dask` Python package. - - `core`: Core API headers which require minimal dependencies aside from RMM and Cudatoolkit. These are safe to expose on public APIs and do not require `nvcc` to build. This is the same for any headers in RAFT which have the suffix `*_types.hpp`. - - `distance`: Distance primitives - - `linalg`: Dense linear algebra - - `matrix`: Dense matrix operations - - `neighbors`: Nearest neighbors and knn graph construction - - `random`: Random number generation, sampling, and data generation primitives - - `solver`: Iterative and combinatorial solvers for optimization and approximation - - `sparse`: Sparse matrix operations - - `convert`: Sparse conversion functions - - `distance`: Sparse distance computations - - `linalg`: Sparse linear algebra - - `neighbors`: Sparse nearest neighbors and knn graph construction - - `op`: Various sparse operations such as slicing and filtering (Note: this will soon be renamed to `sparse/matrix`) - - `solver`: Sparse solvers for optimization and approximation - - `stats`: Moments, summary statistics, model performance measures - - `util`: Various reusable tools and utilities for accelerated algorithm development - - `internal`: A private header-only component that hosts the code shared between benchmarks and tests. 
- - `scripts`: Helpful scripts for development - - `src`: Compiled APIs and template instantiations for the shared libraries - - `template`: A skeleton template containing the bare-bones file structure and cmake configuration for writing applications with RAFT. - - `test`: Googletests source code -- `docs`: Source code and scripts for building library documentation (Uses breath, doxygen, & pydocs) -- `notebooks`: IPython notebooks with usage examples and tutorials -- `python`: Source code for Python libraries. - - `pylibraft`: Python build and source code for pylibraft library - - `raft-dask`: Python build and source code for raft-dask library -- `thirdparty`: Third-party licenses ## Contributing diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index a867a71f68..63e0fd5ba9 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -82,7 +82,7 @@ for FILE in .github/workflows/*.yaml; do done for FILE in .github/workflows/*.yaml; do - sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" + sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" done sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh diff --git a/cpp/bench/prims/distance/masked_nn.cu b/cpp/bench/prims/distance/masked_nn.cu index c804ecb3a1..19d78f4cd9 100644 --- a/cpp/bench/prims/distance/masked_nn.cu +++ b/cpp/bench/prims/distance/masked_nn.cu @@ -46,10 +46,10 @@ struct Params { AdjacencyPattern pattern; }; // struct Params -__global__ void init_adj(AdjacencyPattern pattern, - int n, - raft::device_matrix_view adj, - raft::device_vector_view group_idxs) +RAFT_KERNEL init_adj(AdjacencyPattern pattern, + int n, + raft::device_matrix_view adj, + raft::device_vector_view group_idxs) { int m = adj.extent(0); int num_groups = adj.extent(1); diff --git a/cpp/bench/prims/sparse/convert_csr.cu b/cpp/bench/prims/sparse/convert_csr.cu index c9dcae6985..634c749a54 100644 --- a/cpp/bench/prims/sparse/convert_csr.cu +++ b/cpp/bench/prims/sparse/convert_csr.cu @@ -30,7 +30,7 @@ struct bench_param { }; template -__global__ void init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor) +RAFT_KERNEL init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor) { index_t r = blockDim.y * blockIdx.y + threadIdx.y; index_t c = blockDim.x * blockIdx.x + threadIdx.x; diff --git a/cpp/include/raft/cluster/detail/agglomerative.cuh b/cpp/include/raft/cluster/detail/agglomerative.cuh index 624e67b7fa..f2c83abdd3 100644 --- a/cpp/include/raft/cluster/detail/agglomerative.cuh +++ b/cpp/include/raft/cluster/detail/agglomerative.cuh @@ -155,9 +155,7 @@ void build_dendrogram_host(raft::resources const& handle, } template -__global__ void write_levels_kernel(const value_idx* children, - value_idx* parents, - value_idx n_vertices) +RAFT_KERNEL write_levels_kernel(const value_idx* children, value_idx* parents, value_idx n_vertices) { value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid < n_vertices) { @@ -179,12 +177,12 @@ __global__ void write_levels_kernel(const value_idx* children, * @param labels */ template -__global__ void inherit_labels(const value_idx* children, - const value_idx* levels, - std::size_t n_leaves, - value_idx* labels, - int cut_level, - value_idx n_vertices) +RAFT_KERNEL inherit_labels(const value_idx* children, + const value_idx* levels, + std::size_t n_leaves, + value_idx* labels, + int cut_level, + value_idx n_vertices) { value_idx tid = 
blockDim.x * blockIdx.x + threadIdx.x; diff --git a/cpp/include/raft/cluster/detail/connectivities.cuh b/cpp/include/raft/cluster/detail/connectivities.cuh index ef046ab4ff..49ac6ae704 100644 --- a/cpp/include/raft/cluster/detail/connectivities.cuh +++ b/cpp/include/raft/cluster/detail/connectivities.cuh @@ -107,7 +107,7 @@ struct distance_graph_impl -__global__ void fill_indices2(value_idx* indices, size_t m, size_t nnz) +RAFT_KERNEL fill_indices2(value_idx* indices, size_t m, size_t nnz) { value_idx tid = (blockIdx.x * blockDim.x) + threadIdx.x; if (tid >= nnz) return; diff --git a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh index ade3a6e348..593d7d8fa9 100644 --- a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh @@ -434,7 +434,7 @@ template -__global__ void __launch_bounds__((WarpSize * BlockDimY)) +__launch_bounds__((WarpSize * BlockDimY)) RAFT_KERNEL adjust_centers_kernel(MathT* centers, // [n_clusters, dim] IdxT n_clusters, IdxT dim, diff --git a/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh b/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh index 5a1479a81f..0b5dec4e19 100644 --- a/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh @@ -92,12 +92,12 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); * initialized to zero. */ template -static __global__ void computeDistances(index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, - value_type_t* __restrict__ dists) +RAFT_KERNEL computeDistances(index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists) { // Loop index index_type_t i; @@ -173,11 +173,11 @@ static __global__ void computeDistances(index_type_t n, * cluster. Entries must be initialized to zero. */ template -static __global__ void minDistances(index_type_t n, - index_type_t k, - value_type_t* __restrict__ dists, - index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) +RAFT_KERNEL minDistances(index_type_t n, + index_type_t k, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) { // Loop index index_type_t i, j; @@ -233,11 +233,11 @@ static __global__ void minDistances(index_type_t n, * @param code_new Index associated with new centroid. */ template -static __global__ void minDistances2(index_type_t n, - value_type_t* __restrict__ dists_old, - const value_type_t* __restrict__ dists_new, - index_type_t* __restrict__ codes_old, - index_type_t code_new) +RAFT_KERNEL minDistances2(index_type_t n, + value_type_t* __restrict__ dists_old, + const value_type_t* __restrict__ dists_new, + index_type_t* __restrict__ codes_old, + index_type_t code_new) { // Loop index index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; @@ -275,9 +275,9 @@ static __global__ void minDistances2(index_type_t n, * cluster. Entries must be initialized to zero. 
*/ template -static __global__ void computeClusterSizes(index_type_t n, - const index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) +RAFT_KERNEL computeClusterSizes(index_type_t n, + const index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) { index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; while (i < n) { @@ -308,10 +308,10 @@ static __global__ void computeClusterSizes(index_type_t n, * column is the mean position of a cluster). */ template -static __global__ void divideCentroids(index_type_t d, - index_type_t k, - const index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ centroids) +RAFT_KERNEL divideCentroids(index_type_t d, + index_type_t k, + const index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids) { // Global indices index_type_t gidx, gidy; diff --git a/cpp/include/raft/common/detail/scatter.cuh b/cpp/include/raft/common/detail/scatter.cuh index 87a8826aa6..6e7522853e 100644 --- a/cpp/include/raft/common/detail/scatter.cuh +++ b/cpp/include/raft/common/detail/scatter.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,7 +22,7 @@ namespace raft::detail { template -__global__ void scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op) +RAFT_KERNEL scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op) { typedef TxN_t DataVec; typedef TxN_t IdxVec; diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp index b23660fefe..dd50f47786 100644 --- a/cpp/include/raft/core/detail/copy.hpp +++ b/cpp/include/raft/core/detail/copy.hpp @@ -329,8 +329,8 @@ __device__ auto increment_indices(IdxType* indices, * parameters. */ template -__global__ mdspan_copyable_with_kernel_t mdspan_copy_kernel(DstType dst, - SrcType src) + +RAFT_KERNEL mdspan_copy_kernel(DstType dst, SrcType src) { using config = mdspan_copyable; diff --git a/cpp/include/raft/core/detail/macros.hpp b/cpp/include/raft/core/detail/macros.hpp index bb4207938b..364914043e 100644 --- a/cpp/include/raft/core/detail/macros.hpp +++ b/cpp/include/raft/core/detail/macros.hpp @@ -86,6 +86,38 @@ // as a weak symbol rather than a global." #define RAFT_WEAK_FUNCTION __attribute__((weak)) +// The RAFT_HIDDEN_FUNCTION macro specifies that the function will be hidden +// and therefore not callable by consumers of raft when compiled as +// a shared library. +// +// Hidden visibility also ensures that the linker doesn't de-duplicate the +// symbol across multiple `.so`. This allows multiple libraries to embed raft +// without issue. +#define RAFT_HIDDEN_FUNCTION __attribute__((visibility("hidden"))) + +// The RAFT_KERNEL macro specifies that a kernel has hidden visibility. +// +// RAFT needs to ensure that its __global__ function templates have +// hidden visibility (the default is weak visibility). +// +// When kernels have weak visibility, if two dynamic libraries +// both contain identical instantiations of a RAFT template, then the linker +// will discard one of the two instantiations and use only one of them.
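+// +// For example (`fill_kernel`, `libfoo.so`, and `libbar.so` below are +// illustrative placeholders, not RAFT code): if libfoo.so and libbar.so +// each compile +// +//   template <typename T> +//   __global__ void fill_kernel(T* out, int n) +//   { +//     int i = blockIdx.x * blockDim.x + threadIdx.x; +//     if (i < n) { out[i] = T{1}; } +//   } +// +// for T = float, both libraries emit the same weak symbol, and only one +// copy is kept when both are loaded into the same process.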
+// +// Due to unique requirements of how CUDA works, this de-duplication +// can lead to the wrong kernels being called (the SM version being wrong), +// silently calling no kernel at all, or CUDA runtime errors being +// thrown. +// +// https://github.com/rapidsai/raft/issues/1722 +#if defined(__CUDACC_RDC__) +#define RAFT_KERNEL RAFT_HIDDEN_FUNCTION __global__ void +#elif defined(_RAFT_HAS_CUDA) +#define RAFT_KERNEL static __global__ void +#else +#define RAFT_KERNEL static void +#endif + /** * Some macro magic to remove optional parentheses of a macro argument. * See https://stackoverflow.com/a/62984543 diff --git a/cpp/include/raft/distance/detail/compress_to_bits.cuh b/cpp/include/raft/distance/detail/compress_to_bits.cuh index fa0df25461..5ffb717c42 100644 --- a/cpp/include/raft/distance/detail/compress_to_bits.cuh +++ b/cpp/include/raft/distance/detail/compress_to_bits.cuh @@ -35,7 +35,7 @@ namespace raft::distance::detail { * Note: the division (`/`) is a ceilDiv. */ template ::value>> -__global__ void compress_to_bits_kernel( +RAFT_KERNEL compress_to_bits_kernel( raft::device_matrix_view in, raft::device_matrix_view out) { diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh index f0f12acdb1..2468dcd740 100644 --- a/cpp/include/raft/distance/detail/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/detail/fused_l2_nn.cuh @@ -87,7 +87,7 @@ struct MinReduceOpImpl { }; template -__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) +RAFT_KERNEL initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) { auto tid = IdxT(blockIdx.x) * blockDim.x + threadIdx.x; if (tid < m) { redOp.init(min + tid, maxVal); } @@ -139,20 +139,20 @@ template -__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - DataT maxVal, - int* mutex, - ReduceOpT redOp, - KVPReduceOpT pairRedOp, - OpT distance_op, - FinalLambda fin_op) +__launch_bounds__(P::Nthreads, 2) RAFT_KERNEL fusedL2NNkernel(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + DataT maxVal, + int* mutex, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + OpT distance_op, + FinalLambda fin_op) { // compile only if below non-ampere arch.
#if __CUDA_ARCH__ < 800 diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh index f02e29c797..8d5b2c766e 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -36,7 +36,7 @@ namespace raft::distance::kernels::detail { * @param offset */ template -__global__ void polynomial_kernel_nopad( +RAFT_KERNEL polynomial_kernel_nopad( math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset) { for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; @@ -56,7 +56,7 @@ __global__ void polynomial_kernel_nopad( * @param offset */ template -__global__ void polynomial_kernel( +RAFT_KERNEL polynomial_kernel( math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset) { for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; @@ -75,7 +75,7 @@ __global__ void polynomial_kernel( * @param offset */ template -__global__ void tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset) +RAFT_KERNEL tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset) { for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; tid += blockDim.x * gridDim.x) { @@ -93,7 +93,7 @@ __global__ void tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t * @param offset */ template -__global__ void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset) +RAFT_KERNEL tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset) { for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; tidy += blockDim.y * gridDim.y) @@ -121,7 +121,7 @@ __global__ void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t ga * @param gain */ template -__global__ void rbf_kernel_expanded( +RAFT_KERNEL rbf_kernel_expanded( math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain) { for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; diff --git a/cpp/include/raft/distance/detail/masked_nn.cuh b/cpp/include/raft/distance/detail/masked_nn.cuh index 0e13783c19..4de9f4764a 100644 --- a/cpp/include/raft/distance/detail/masked_nn.cuh +++ b/cpp/include/raft/distance/detail/masked_nn.cuh @@ -40,24 +40,24 @@ template -__global__ __launch_bounds__(P::Nthreads, 2) void masked_l2_nn_kernel(OutT* min, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - const uint64_t* adj, - const IdxT* group_idxs, - IdxT num_groups, - IdxT m, - IdxT n, - IdxT k, - bool sqrt, - DataT maxVal, - int* mutex, - ReduceOpT redOp, - KVPReduceOpT pairRedOp, - CoreLambda core_op, - FinalLambda fin_op) +__launch_bounds__(P::Nthreads, 2) RAFT_KERNEL masked_l2_nn_kernel(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + const uint64_t* adj, + const IdxT* group_idxs, + IdxT num_groups, + IdxT m, + IdxT n, + IdxT k, + bool sqrt, + DataT maxVal, + int* mutex, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + CoreLambda core_op, + FinalLambda fin_op) { extern __shared__ char smem[]; diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh index 2d0a98862e..5393bf7389 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh @@ -31,8 +31,8 @@ template -__global__ 
__launch_bounds__(Policy::Nthreads, 2) void pairwise_matrix_kernel( - OpT distance_op, pairwise_matrix_params params) +__launch_bounds__(Policy::Nthreads, 2) RAFT_KERNEL + pairwise_matrix_kernel(OpT distance_op, pairwise_matrix_params params) { // Early exit to minimize the size of the kernel when it is not supposed to be compiled. constexpr SM_compat_t sm_compat_range{}; diff --git a/cpp/include/raft/label/detail/classlabels.cuh b/cpp/include/raft/label/detail/classlabels.cuh index 64d8b4bfae..6e432e050c 100644 --- a/cpp/include/raft/label/detail/classlabels.cuh +++ b/cpp/include/raft/label/detail/classlabels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -119,13 +119,13 @@ void getOvrlabels( // +/-1, return array with the new class labels and corresponding indices. template -__global__ void map_label_kernel(Type* map_ids, - size_t N_labels, - Type* in, - Type* out, - size_t N, - Lambda filter_op, - bool zero_based = false) +RAFT_KERNEL map_label_kernel(Type* map_ids, + size_t N_labels, + Type* in, + Type* out, + size_t N, + Lambda filter_op, + bool zero_based = false) { int tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { diff --git a/cpp/include/raft/label/detail/merge_labels.cuh b/cpp/include/raft/label/detail/merge_labels.cuh index f93a97d52b..166bb2122a 100644 --- a/cpp/include/raft/label/detail/merge_labels.cuh +++ b/cpp/include/raft/label/detail/merge_labels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,13 +32,12 @@ namespace detail { * For an additional cost we can build the graph with edges * E={(A[i], B[i]) | M[i]=1} and make this step faster */ template -__global__ void __launch_bounds__(TPB_X) - propagate_label_kernel(const value_idx* __restrict__ labels_a, - const value_idx* __restrict__ labels_b, - value_idx* __restrict__ R, - const bool* __restrict__ mask, - bool* __restrict__ m, - value_idx N) +RAFT_KERNEL __launch_bounds__(TPB_X) propagate_label_kernel(const value_idx* __restrict__ labels_a, + const value_idx* __restrict__ labels_b, + value_idx* __restrict__ R, + const bool* __restrict__ mask, + bool* __restrict__ m, + value_idx N) { value_idx tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { @@ -65,12 +64,11 @@ __global__ void __launch_bounds__(TPB_X) } template -__global__ void __launch_bounds__(TPB_X) - reassign_label_kernel(value_idx* __restrict__ labels_a, - const value_idx* __restrict__ labels_b, - const value_idx* __restrict__ R, - value_idx N, - value_idx MAX_LABEL) +RAFT_KERNEL __launch_bounds__(TPB_X) reassign_label_kernel(value_idx* __restrict__ labels_a, + const value_idx* __restrict__ labels_b, + const value_idx* __restrict__ R, + value_idx N, + value_idx MAX_LABEL) { value_idx tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { diff --git a/cpp/include/raft/linalg/detail/add.cuh b/cpp/include/raft/linalg/detail/add.cuh index bf9b2bd1d8..121ac10e24 100644 --- a/cpp/include/raft/linalg/detail/add.cuh +++ b/cpp/include/raft/linalg/detail/add.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,10 +38,10 @@ void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t st } template -__global__ void add_dev_scalar_kernel(OutT* outDev, - const InT* inDev, - const InT* singleScalarDev, - IdxType len) +RAFT_KERNEL add_dev_scalar_kernel(OutT* outDev, + const InT* inDev, + const InT* singleScalarDev, + IdxType len) { IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; } diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh index 5b01196cf4..f3c150cbee 100644 --- a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh @@ -40,7 +40,7 @@ template -__global__ void __launch_bounds__(Policy::ThreadsPerBlock) +RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock) coalescedReductionThinKernel(OutType* dots, const InType* data, IdxType D, @@ -137,15 +137,15 @@ template -__global__ void __launch_bounds__(TPB) coalescedReductionMediumKernel(OutType* dots, - const InType* data, - IdxType D, - IdxType N, - OutType init, - MainLambda main_op, - ReduceLambda reduce_op, - FinalLambda final_op, - bool inplace = false) +RAFT_KERNEL __launch_bounds__(TPB) coalescedReductionMediumKernel(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda final_op, + bool inplace = false) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; @@ -225,7 +225,7 @@ template -__global__ void __launch_bounds__(Policy::ThreadsPerBlock) +RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock) coalescedReductionThickKernel(OutType* buffer, const InType* data, IdxType D, diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh index 0c79dec248..4ff3aa9754 100644 --- a/cpp/include/raft/linalg/detail/map.cuh +++ b/cpp/include/raft/linalg/detail/map.cuh @@ -65,7 +65,7 @@ __device__ __forceinline__ void map_kernel_mainloop( } template -__global__ void map_kernel(OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) +RAFT_KERNEL map_kernel(OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) { const IdxT tid = blockIdx.x * blockDim.x + threadIdx.x; if constexpr (R <= 1) { diff --git a/cpp/include/raft/linalg/detail/map_then_reduce.cuh b/cpp/include/raft/linalg/detail/map_then_reduce.cuh index 6fae16117f..d1e211f8d2 100644 --- a/cpp/include/raft/linalg/detail/map_then_reduce.cuh +++ b/cpp/include/raft/linalg/detail/map_then_reduce.cuh @@ -52,13 +52,13 @@ template -__global__ void mapThenReduceKernel(OutType* out, - IdxType len, - OutType neutral, - MapOp map, - ReduceLambda op, - const InType* in, - Args... args) +RAFT_KERNEL mapThenReduceKernel(OutType* out, + IdxType len, + OutType neutral, + MapOp map, + ReduceLambda op, + const InType* in, + Args... args) { OutType acc = neutral; auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); diff --git a/cpp/include/raft/linalg/detail/normalize.cuh b/cpp/include/raft/linalg/detail/normalize.cuh index 78c773ab35..d1ca4816e5 100644 --- a/cpp/include/raft/linalg/detail/normalize.cuh +++ b/cpp/include/raft/linalg/detail/normalize.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,7 +35,7 @@ template -__global__ void __launch_bounds__(Policy::ThreadsPerBlock) +RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock) coalesced_normalize_thin_kernel(Type* out, const Type* in, IdxType D, @@ -92,15 +92,15 @@ template -__global__ void __launch_bounds__(TPB) coalesced_normalize_medium_kernel(Type* out, - const Type* in, - IdxType D, - IdxType N, - Type init, - MainLambda main_op, - ReduceLambda reduce_op, - FinalLambda fin_op, - Type eps) +RAFT_KERNEL __launch_bounds__(TPB) coalesced_normalize_medium_kernel(Type* out, + const Type* in, + IdxType D, + IdxType N, + Type init, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda fin_op, + Type eps) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; diff --git a/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh index a85e04acca..b726e3ea5a 100644 --- a/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh +++ b/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,7 +29,7 @@ namespace detail { ///@todo: specialize this to support shared-mem based atomics template -__global__ void reduce_cols_by_key_direct_kernel( +RAFT_KERNEL reduce_cols_by_key_direct_kernel( const T* data, const KeyIteratorT keys, T* out, IdxType nrows, IdxType ncols, IdxType nkeys) { typedef typename std::iterator_traits::value_type KeyType; @@ -44,7 +44,7 @@ __global__ void reduce_cols_by_key_direct_kernel( } template -__global__ void reduce_cols_by_key_cached_kernel( +RAFT_KERNEL reduce_cols_by_key_cached_kernel( const T* data, const KeyIteratorT keys, T* out, IdxType nrows, IdxType ncols, IdxType nkeys) { typedef typename std::iterator_traits::value_type KeyType; diff --git a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh index 572d6b738c..ce11825e12 100644 --- a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh +++ b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -33,7 +33,7 @@ namespace detail { // template -void __global__ convert_array_kernel(IteratorT1 dst, IteratorT2 src, int n) +RAFT_KERNEL convert_array_kernel(IteratorT1 dst, IteratorT2 src, int n) { for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < n; idx += gridDim.x * blockDim.x) { dst[idx] = src[idx]; @@ -95,14 +95,14 @@ struct quadSum { template __launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4) - __global__ void sum_rows_by_key_small_nkeys_kernel(const DataIteratorT d_A, - IdxT lda, - const char* d_keys, - const WeightT* d_weights, - IdxT nrows, - IdxT ncols, - IdxT nkeys, - SumsT* d_sums) + RAFT_KERNEL sum_rows_by_key_small_nkeys_kernel(const DataIteratorT d_A, + IdxT lda, + const char* d_keys, + const WeightT* d_weights, + IdxT nrows, + IdxT ncols, + IdxT nkeys, + SumsT* d_sums) { typedef typename std::iterator_traits::value_type DataType; typedef cub::BlockReduce, SUM_ROWS_SMALL_K_DIMX> BlockReduce; @@ -193,15 +193,15 @@ template -__global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT d_A, - IdxT lda, - KeysIteratorT d_keys, - const WeightT* d_weights, - IdxT nrows, - IdxT ncols, - int key_offset, - IdxT nkeys, - SumsT* d_sums) +RAFT_KERNEL sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT d_A, + IdxT lda, + KeysIteratorT d_keys, + const WeightT* d_weights, + IdxT nrows, + IdxT ncols, + int key_offset, + IdxT nkeys, + SumsT* d_sums) { typedef typename std::iterator_traits::value_type KeyType; typedef typename std::iterator_traits::value_type DataType; @@ -269,13 +269,13 @@ template -__global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT d_A, - IdxT lda, - const WeightT* d_weights, - KeysIteratorT d_keys, - IdxT nrows, - IdxT ncols, - SumsT* d_sums) +RAFT_KERNEL sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT d_A, + IdxT lda, + const WeightT* d_weights, + KeysIteratorT d_keys, + IdxT nrows, + IdxT ncols, + SumsT* d_sums) { IdxT gid = threadIdx.x + (blockDim.x * static_cast(blockIdx.x)); IdxT j = gid % ncols; diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh index 42e79a9285..aef346bd4b 100644 --- a/cpp/include/raft/linalg/detail/strided_reduction.cuh +++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh @@ -30,7 +30,7 @@ namespace detail { // of the matrix, i.e. reduce along columns for row major or reduce along rows // for column major layout template -__global__ void stridedSummationKernel( +RAFT_KERNEL stridedSummationKernel( Type* dots, const Type* data, int D, int N, Type init, MainLambda main_op) { // Thread reduction @@ -68,13 +68,13 @@ template -__global__ void stridedReductionKernel(OutType* dots, - const InType* data, - int D, - int N, - OutType init, - MainLambda main_op, - ReduceLambda reduce_op) +RAFT_KERNEL stridedReductionKernel(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + MainLambda main_op, + ReduceLambda reduce_op) { // Thread reduction OutType thread_data = init; diff --git a/cpp/include/raft/linalg/detail/subtract.cuh b/cpp/include/raft/linalg/detail/subtract.cuh index 6df09df8ed..6519d58fa1 100644 --- a/cpp/include/raft/linalg/detail/subtract.cuh +++ b/cpp/include/raft/linalg/detail/subtract.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -38,10 +38,10 @@ void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream } template -__global__ void subtract_dev_scalar_kernel(math_t* outDev, - const math_t* inDev, - const math_t* singleScalarDev, - IdxType len) +RAFT_KERNEL subtract_dev_scalar_kernel(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len) { // TODO: kernel do not use shared memory in current implementation int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; diff --git a/cpp/include/raft/matrix/detail/columnWiseSort.cuh b/cpp/include/raft/matrix/detail/columnWiseSort.cuh index 5df7ba3cdc..652c4fda0f 100644 --- a/cpp/include/raft/matrix/detail/columnWiseSort.cuh +++ b/cpp/include/raft/matrix/detail/columnWiseSort.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ struct SmemPerBlock { }; template -__global__ void devLayoutIdx(InType* in, int n_cols, int totalElements) +RAFT_KERNEL devLayoutIdx(InType* in, int n_cols, int totalElements) { int idx = threadIdx.x + blockDim.x * blockIdx.x; int n = n_cols; @@ -63,7 +63,7 @@ __global__ void devLayoutIdx(InType* in, int n_cols, int totalElements) } template -__global__ void devOffsetKernel(T* in, T value, int n_times) +RAFT_KERNEL devOffsetKernel(T* in, T value, int n_times) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < n_times) in[idx] = idx * value; @@ -76,12 +76,12 @@ template < int BLOCK_SIZE, int ITEMS_PER_THREAD, typename std::enable_if::IsValid, InType>::type* = nullptr> -__global__ void __launch_bounds__(1024, 1) devKeyValSortColumnPerRow(const InType* inputKeys, - InType* outputKeys, - OutType* inputVals, - int n_rows, - int n_cols, - InType MAX_VALUE) +RAFT_KERNEL __launch_bounds__(1024, 1) devKeyValSortColumnPerRow(const InType* inputKeys, + InType* outputKeys, + OutType* inputVals, + int n_rows, + int n_cols, + InType MAX_VALUE) { typedef cub::BlockLoad BlockLoadTypeKey; @@ -124,12 +124,12 @@ template < int BLOCK_SIZE, int ITEMS_PER_THREAD, typename std::enable_if::IsValid), InType>::type* = nullptr> -__global__ void devKeyValSortColumnPerRow(const InType* inputKeys, - InType* outputKeys, - OutType* inputVals, - int n_rows, - int n_cols, - InType MAX_VALUE) +RAFT_KERNEL devKeyValSortColumnPerRow(const InType* inputKeys, + InType* outputKeys, + OutType* inputVals, + int n_rows, + int n_cols, + InType MAX_VALUE) { // place holder function // so that compiler unrolls for all template types successfully diff --git a/cpp/include/raft/matrix/detail/gather.cuh b/cpp/include/raft/matrix/detail/gather.cuh index 59fcf606c8..73072ec841 100644 --- a/cpp/include/raft/matrix/detail/gather.cuh +++ b/cpp/include/raft/matrix/detail/gather.cuh @@ -47,14 +47,14 @@ template -__global__ void gather_kernel(const InputIteratorT in, - IndexT D, - IndexT len, - const MapIteratorT map, - StencilIteratorT stencil, - OutputIteratorT out, - PredicateOp pred_op, - MapTransformOp transform_op) +RAFT_KERNEL gather_kernel(const InputIteratorT in, + IndexT D, + IndexT len, + const MapIteratorT map, + StencilIteratorT stencil, + OutputIteratorT out, + PredicateOp pred_op, + MapTransformOp transform_op) { typedef typename std::iterator_traits::value_type MapValueT; typedef typename std::iterator_traits::value_type StencilValueT; diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh 
b/cpp/include/raft/matrix/detail/linewise_op.cuh index 514d0dc51b..6061fe6aee 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -260,7 +260,7 @@ template -__global__ void __launch_bounds__(BlockSize) +RAFT_KERNEL __launch_bounds__(BlockSize) matrixLinewiseVecColsMainKernel(Type* out, const Type* in, const IdxType arrOffset, @@ -304,15 +304,14 @@ __global__ void __launch_bounds__(BlockSize) * @param [in] vecs pointers to the argument vectors */ template -__global__ void __launch_bounds__(MaxOffset, 2) - matrixLinewiseVecColsTailKernel(Type* out, - const Type* in, - const IdxType arrOffset, - const IdxType arrTail, - const IdxType rowLen, - const IdxType len, - Lambda op, - const Vecs*... vecs) +RAFT_KERNEL __launch_bounds__(MaxOffset, 2) matrixLinewiseVecColsTailKernel(Type* out, + const Type* in, + const IdxType arrOffset, + const IdxType arrTail, + const IdxType rowLen, + const IdxType len, + Lambda op, + const Vecs*... vecs) { // Note, L::VecElems == 1 typedef Linewise L; @@ -370,14 +369,13 @@ template -__global__ void __launch_bounds__(BlockSize) - matrixLinewiseVecRowsMainKernel(Type* out, - const Type* in, - const IdxType arrOffset, - const IdxType rowLen, - const IdxType len, - Lambda op, - const Vecs*... vecs) +RAFT_KERNEL __launch_bounds__(BlockSize) matrixLinewiseVecRowsMainKernel(Type* out, + const Type* in, + const IdxType arrOffset, + const IdxType rowLen, + const IdxType len, + Lambda op, + const Vecs*... vecs) { typedef Linewise L; constexpr uint workSize = L::VecElems * BlockSize; @@ -413,14 +411,13 @@ template -__global__ void __launch_bounds__(BlockSize) - matrixLinewiseVecRowsSpanKernel(Type* out, - const Type* in, - const IdxType rowLen, - const IdxType rowLenPadded, - const IdxType lenPadded, - Lambda op, - const Vecs*... vecs) +RAFT_KERNEL __launch_bounds__(BlockSize) matrixLinewiseVecRowsSpanKernel(Type* out, + const Type* in, + const IdxType rowLen, + const IdxType rowLenPadded, + const IdxType lenPadded, + Lambda op, + const Vecs*... vecs) { typedef Linewise L; constexpr uint workSize = L::VecElems * BlockSize; @@ -457,15 +454,14 @@ __global__ void __launch_bounds__(BlockSize) * @param [in] vecs pointers to the argument vectors */ template -__global__ void __launch_bounds__(MaxOffset, 2) - matrixLinewiseVecRowsTailKernel(Type* out, - const Type* in, - const IdxType arrOffset, - const IdxType arrTail, - const IdxType rowLen, - const IdxType len, - Lambda op, - const Vecs*... vecs) +RAFT_KERNEL __launch_bounds__(MaxOffset, 2) matrixLinewiseVecRowsTailKernel(Type* out, + const Type* in, + const IdxType arrOffset, + const IdxType arrTail, + const IdxType rowLen, + const IdxType len, + Lambda op, + const Vecs*... 
vecs) { // Note, L::VecElems == 1 constexpr uint workSize = MaxOffset; diff --git a/cpp/include/raft/matrix/detail/math.cuh b/cpp/include/raft/matrix/detail/math.cuh index d2707e1254..9e9d7f8b3b 100644 --- a/cpp/include/raft/matrix/detail/math.cuh +++ b/cpp/include/raft/matrix/detail/math.cuh @@ -331,7 +331,7 @@ void matrixVectorBinarySub(Type* data, // Computes an argmin/argmax column-wise in a DxN matrix template -__global__ void argReduceKernel(const T* d_in, IdxT D, IdxT N, OutT* out) +RAFT_KERNEL argReduceKernel(const T* d_in, IdxT D, IdxT N, OutT* out) { typedef cub:: BlockReduce, TPB, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> @@ -396,7 +396,7 @@ void argmax(const math_t* in, idx_t D, idx_t N, out_t* out, cudaStream_t stream) // Computes the argmax(abs(d_in)) column-wise in a DxN matrix followed by // flipping the sign if the |max| value for each column is negative. template -__global__ void signFlipKernel(T* d_in, int D, int N) +RAFT_KERNEL signFlipKernel(T* d_in, int D, int N) { typedef cub::BlockReduce, TPB> BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh index 48821df5b2..2fa741fd96 100644 --- a/cpp/include/raft/matrix/detail/matrix.cuh +++ b/cpp/include/raft/matrix/detail/matrix.cuh @@ -169,8 +169,7 @@ void printHost(const m_t* in, idx_t n_rows, idx_t n_cols) * (1-based) */ template -__global__ void slice( - const m_t* src_d, idx_t lda, m_t* dst_d, idx_t x1, idx_t y1, idx_t x2, idx_t y2) +RAFT_KERNEL slice(const m_t* src_d, idx_t lda, m_t* dst_d, idx_t x1, idx_t y1, idx_t x2, idx_t y2) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; idx_t dm = x2 - x1, dn = y2 - y1; @@ -211,7 +210,7 @@ void sliceMatrix(const m_t* in, * @param k: min(n_rows, n_cols) */ template -__global__ void getUpperTriangular(const m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, idx_t k) +RAFT_KERNEL getUpperTriangular(const m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, idx_t k) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; idx_t m = n_rows, n = n_cols; @@ -239,7 +238,7 @@ void copyUpperTriangular(const m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, c * @param k: dimensionality */ template -__global__ void copyVectorToMatrixDiagonal(const m_t* vec, m_t* matrix, idx_t lda, idx_t k) +RAFT_KERNEL copyVectorToMatrixDiagonal(const m_t* vec, m_t* matrix, idx_t lda, idx_t k) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; @@ -254,7 +253,7 @@ __global__ void copyVectorToMatrixDiagonal(const m_t* vec, m_t* matrix, idx_t ld * @param k: dimensionality */ template -__global__ void copyVectorFromMatrixDiagonal(m_t* vec, const m_t* matrix, idx_t lda, idx_t k) +RAFT_KERNEL copyVectorFromMatrixDiagonal(m_t* vec, const m_t* matrix, idx_t lda, idx_t k) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; @@ -290,7 +289,7 @@ void getDiagonalMatrix( * @param len: size of one side of the matrix */ template -__global__ void matrixDiagonalInverse(m_t* in, idx_t len) +RAFT_KERNEL matrixDiagonalInverse(m_t* in, idx_t len) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx < len) { in[idx + idx * len] = 1.0 / in[idx + idx * len]; } diff --git a/cpp/include/raft/matrix/detail/select_radix.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh index edde924892..b3c07b9d3a 100644 --- a/cpp/include/raft/matrix/detail/select_radix.cuh +++ b/cpp/include/raft/matrix/detail/select_radix.cuh @@ -422,16 +422,16 @@ _RAFT_DEVICE void last_filter(const T* in_buf, } template -__global__ void 
last_filter_kernel(const T* in, - const IdxT* in_idx, - const T* in_buf, - const IdxT* in_idx_buf, - T* out, - IdxT* out_idx, - IdxT len, - IdxT k, - Counter* counters, - const bool select_min) +RAFT_KERNEL last_filter_kernel(const T* in, + const IdxT* in_idx, + const T* in_buf, + const IdxT* in_idx_buf, + T* out, + IdxT* out_idx, + IdxT len, + IdxT k, + Counter* counters, + const bool select_min) { const size_t batch_id = blockIdx.y; // size_t to avoid multiplication overflow @@ -525,20 +525,20 @@ __global__ void last_filter_kernel(const T* in, * their indices. */ template -__global__ void radix_kernel(const T* in, - const IdxT* in_idx, - const T* in_buf, - const IdxT* in_idx_buf, - T* out_buf, - IdxT* out_idx_buf, - T* out, - IdxT* out_idx, - Counter* counters, - IdxT* histograms, - const IdxT len, - const IdxT k, - const bool select_min, - const int pass) +RAFT_KERNEL radix_kernel(const T* in, + const IdxT* in_idx, + const T* in_buf, + const IdxT* in_idx_buf, + T* out_buf, + IdxT* out_idx_buf, + T* out, + IdxT* out_idx, + Counter* counters, + IdxT* histograms, + const IdxT len, + const IdxT k, + const bool select_min, + const int pass) { const size_t batch_id = blockIdx.y; auto counter = counters + batch_id; @@ -920,17 +920,17 @@ _RAFT_DEVICE void filter_and_histogram_for_one_block(const T* in_buf, } template -__global__ void radix_topk_one_block_kernel(const T* in, - const IdxT* in_idx, - const IdxT len, - const IdxT k, - T* out, - IdxT* out_idx, - const bool select_min, - T* buf1, - IdxT* idx_buf1, - T* buf2, - IdxT* idx_buf2) +RAFT_KERNEL radix_topk_one_block_kernel(const T* in, + const IdxT* in_idx, + const IdxT len, + const IdxT k, + T* out, + IdxT* out_idx, + const bool select_min, + T* buf1, + IdxT* idx_buf1, + T* buf2, + IdxT* idx_buf2) { constexpr int num_buckets = calc_num_buckets(); __shared__ Counter counter; diff --git a/cpp/include/raft/matrix/detail/select_warpsort.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh index 2927604e7d..0ee87de4f7 100644 --- a/cpp/include/raft/matrix/detail/select_warpsort.cuh +++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh @@ -56,7 +56,7 @@ the top-k result. Example: - __global__ void kernel() { + RAFT_KERNEL kernel() { block_sort queue(...); for (IdxT i = threadIdx.x; i < len, i += blockDim.x) { @@ -80,7 +80,7 @@ (see the usage of LaunchThreshold::len_factor_for_choosing). Example: - __global__ void kernel() { + RAFT_KERNEL kernel() { warp_sort_immediate<...> queue(...); int warp_id = threadIdx.x / WarpSize; int lane_id = threadIdx.x % WarpSize; @@ -750,8 +750,8 @@ template