From a7730b15ad0a95800fd49c47118be6514c03c714 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Fri, 19 Jul 2024 17:26:24 -0400 Subject: [PATCH 01/31] DOC v24.10 Updates [skip ci] --- .../cuda11.8-conda/devcontainer.json | 6 ++-- .devcontainer/cuda11.8-pip/devcontainer.json | 10 +++---- .../cuda12.5-conda/devcontainer.json | 6 ++-- .devcontainer/cuda12.5-pip/devcontainer.json | 10 +++---- .github/workflows/build.yaml | 16 +++++----- .github/workflows/pr.yaml | 26 ++++++++-------- .github/workflows/test.yaml | 10 +++---- README.md | 2 +- VERSION | 2 +- .../all_cuda-118_arch-aarch64.yaml | 14 ++++----- .../all_cuda-118_arch-x86_64.yaml | 14 ++++----- .../all_cuda-125_arch-aarch64.yaml | 14 ++++----- .../all_cuda-125_arch-x86_64.yaml | 14 ++++----- .../bench_ann_cuda-118_arch-aarch64.yaml | 4 +-- .../bench_ann_cuda-118_arch-x86_64.yaml | 4 +-- .../bench_ann_cuda-120_arch-aarch64.yaml | 4 +-- .../bench_ann_cuda-120_arch-x86_64.yaml | 4 +-- .../recipes/raft-dask/conda_build_config.yaml | 4 +-- .../cmake/thirdparty/fetch_rapids.cmake | 2 +- dependencies.yaml | 30 +++++++++---------- docs/source/build.md | 2 +- docs/source/developer_guide.md | 6 ++-- docs/source/raft_ann_benchmarks.md | 12 ++++---- python/pylibraft/pyproject.toml | 4 +-- .../raft-dask/cmake/thirdparty/get_ucxx.cmake | 4 +-- python/raft-dask/pyproject.toml | 10 +++---- 26 files changed, 117 insertions(+), 117 deletions(-) diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 27f619f391..a2b12764c5 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index db2a5dbcc6..0fdbac9d91 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,27 +5,27 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/ucx:24.8": { + "ghcr.io/rapidsai/devcontainers/features/ucx:24.10": { "version": "1.15.0" }, - "ghcr.io/rapidsai/devcontainers/features/cuda:24.8": { + "ghcr.io/rapidsai/devcontainers/features/cuda:24.10": { "version": "11.8", "installcuBLAS": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} + 
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/ucx", diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json index 836a5feacd..adc79408a3 100644 --- a/.devcontainer/cuda12.5-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.08-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json index 28798cbbf5..39d1c0c969 100644 --- a/.devcontainer/cuda12.5-pip/devcontainer.json +++ b/.devcontainer/cuda12.5-pip/devcontainer.json @@ -5,27 +5,27 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.5-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/ucx:24.8": { + "ghcr.io/rapidsai/devcontainers/features/ucx:24.10": { "version": "1.15.0" }, - "ghcr.io/rapidsai/devcontainers/features/cuda:24.8": { + "ghcr.io/rapidsai/devcontainers/features/cuda:24.10": { "version": "12.5", "installcuBLAS": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/ucx", diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index e6f7043f82..2b0ae5099c 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 with: 
build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-pylibraft: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -88,7 +88,7 @@ jobs: package-name: pylibraft wheel-build-raft-dask: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -98,7 +98,7 @@ jobs: wheel-publish-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index e6c9604221..98cb724623 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -25,29 +25,29 @@ jobs: - wheel-tests-raft-dask - devcontainer secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: pull-request node_type: cpu16 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: build_type: pull-request enable_check_symbols: true @@ -55,19 +55,19 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: pull-request conda-python-tests: needs: 
conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -77,34 +77,34 @@ jobs: wheel-build-pylibraft: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: ci/build_wheel_pylibraft.sh wheel-tests-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: pull-request script: ci/test_wheel_pylibraft.sh wheel-build-raft-dask: needs: wheel-tests-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: "ci/build_wheel_raft_dask.sh" wheel-tests-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: pull-request script: ci/test_wheel_raft_dask.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 with: arch: '["amd64"]' cuda: '["12.5"]' diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 0eba0f27d1..f6ca417dcf 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -26,7 +26,7 @@ jobs: symbol_exclusions: _ZN\d+raft_cutlass conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -42,7 +42,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibraft: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: script: ci/test_wheel_pylibraft.sh wheel-tests-raft-dask: secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/README.md b/README.md index d6f3ef9320..25ce059630 100755 --- a/README.md +++ b/README.md @@ -293,7 +293,7 @@ You can also install the conda packages individually using the `mamba` command a mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.5 ``` -If installing the C++ APIs please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.08/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above. +If installing the C++ APIs please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.10/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above. ### Installing Python through Pip diff --git a/VERSION b/VERSION index ec8489fda9..7c7ba04436 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.08.00 +24.10.00 diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index f2ae41822c..1a28edb974 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -20,8 +20,8 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.8.*,>=0.0.0a0 -- distributed-ucxx==0.39.*,>=0.0.0a0 +- dask-cuda==24.10.*,>=0.0.0a0 +- distributed-ucxx==0.40.*,>=0.0.0a0 - doxygen>=1.8.20 - gcc_linux-aarch64=11.* - graphviz @@ -35,7 +35,7 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- libucxx==0.39.*,>=0.0.0a0 +- libucxx==0.40.*,>=0.0.0a0 - nccl>=2.9.9 - ninja - numba>=0.57 @@ -44,18 +44,18 @@ dependencies: - nvcc_linux-aarch64=11.8 - pre-commit - pydata-sphinx-theme -- pylibraft==24.8.*,>=0.0.0a0 +- pylibraft==24.10.*,>=0.0.0a0 - pytest-cov - pytest==7.* - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==24.8.*,>=0.0.0a0 +- rapids-dask-dependency==24.10.*,>=0.0.0a0 - recommonmark -- rmm==24.8.*,>=0.0.0a0 +- rmm==24.10.*,>=0.0.0a0 - scikit-build-core>=0.7.0 - scikit-learn - scipy - sphinx-copybutton - sphinx-markdown-tables - sysroot_linux-aarch64==2.17 -- ucx-py==0.39.*,>=0.0.0a0 +- ucx-py==0.40.*,>=0.0.0a0 name: all_cuda-118_arch-aarch64 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 8f5f4d3088..71a7fc4b02 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -20,8 +20,8 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.8.*,>=0.0.0a0 -- distributed-ucxx==0.39.*,>=0.0.0a0 +- dask-cuda==24.10.*,>=0.0.0a0 +- distributed-ucxx==0.40.*,>=0.0.0a0 - doxygen>=1.8.20 - gcc_linux-64=11.* - graphviz @@ -35,7 +35,7 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- libucxx==0.39.*,>=0.0.0a0 +- 
libucxx==0.40.*,>=0.0.0a0 - nccl>=2.9.9 - ninja - numba>=0.57 @@ -44,18 +44,18 @@ dependencies: - nvcc_linux-64=11.8 - pre-commit - pydata-sphinx-theme -- pylibraft==24.8.*,>=0.0.0a0 +- pylibraft==24.10.*,>=0.0.0a0 - pytest-cov - pytest==7.* - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==24.8.*,>=0.0.0a0 +- rapids-dask-dependency==24.10.*,>=0.0.0a0 - recommonmark -- rmm==24.8.*,>=0.0.0a0 +- rmm==24.10.*,>=0.0.0a0 - scikit-build-core>=0.7.0 - scikit-learn - scipy - sphinx-copybutton - sphinx-markdown-tables - sysroot_linux-64==2.17 -- ucx-py==0.39.*,>=0.0.0a0 +- ucx-py==0.40.*,>=0.0.0a0 name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index 2042156224..7bb4a40c05 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -21,8 +21,8 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.8.*,>=0.0.0a0 -- distributed-ucxx==0.39.*,>=0.0.0a0 +- dask-cuda==24.10.*,>=0.0.0a0 +- distributed-ucxx==0.40.*,>=0.0.0a0 - doxygen>=1.8.20 - gcc_linux-aarch64=11.* - graphviz @@ -32,7 +32,7 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libucxx==0.39.*,>=0.0.0a0 +- libucxx==0.40.*,>=0.0.0a0 - nccl>=2.9.9 - ninja - numba>=0.57 @@ -40,18 +40,18 @@ dependencies: - numpydoc - pre-commit - pydata-sphinx-theme -- pylibraft==24.8.*,>=0.0.0a0 +- pylibraft==24.10.*,>=0.0.0a0 - pytest-cov - pytest==7.* - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==24.8.*,>=0.0.0a0 +- rapids-dask-dependency==24.10.*,>=0.0.0a0 - recommonmark -- rmm==24.8.*,>=0.0.0a0 +- rmm==24.10.*,>=0.0.0a0 - scikit-build-core>=0.7.0 - scikit-learn - scipy - sphinx-copybutton - sphinx-markdown-tables - sysroot_linux-aarch64==2.17 -- ucx-py==0.39.*,>=0.0.0a0 +- ucx-py==0.40.*,>=0.0.0a0 name: all_cuda-125_arch-aarch64 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index a2586cc211..58153398f7 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -21,8 +21,8 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.8.*,>=0.0.0a0 -- distributed-ucxx==0.39.*,>=0.0.0a0 +- dask-cuda==24.10.*,>=0.0.0a0 +- distributed-ucxx==0.40.*,>=0.0.0a0 - doxygen>=1.8.20 - gcc_linux-64=11.* - graphviz @@ -32,7 +32,7 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libucxx==0.39.*,>=0.0.0a0 +- libucxx==0.40.*,>=0.0.0a0 - nccl>=2.9.9 - ninja - numba>=0.57 @@ -40,18 +40,18 @@ dependencies: - numpydoc - pre-commit - pydata-sphinx-theme -- pylibraft==24.8.*,>=0.0.0a0 +- pylibraft==24.10.*,>=0.0.0a0 - pytest-cov - pytest==7.* - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==24.8.*,>=0.0.0a0 +- rapids-dask-dependency==24.10.*,>=0.0.0a0 - recommonmark -- rmm==24.8.*,>=0.0.0a0 +- rmm==24.10.*,>=0.0.0a0 - scikit-build-core>=0.7.0 - scikit-learn - scipy - sphinx-copybutton - sphinx-markdown-tables - sysroot_linux-64==2.17 -- ucx-py==0.39.*,>=0.0.0a0 +- ucx-py==0.40.*,>=0.0.0a0 name: all_cuda-125_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index 000a8f4a1c..7a961d1ca9 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -30,7 +30,7 @@ dependencies: - 
libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- libucxx==0.39.*,>=0.0.0a0 +- libucxx==0.40.*,>=0.0.0a0 - matplotlib - nccl>=2.9.9 - ninja @@ -40,7 +40,7 @@ dependencies: - pandas - pyyaml - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rmm==24.8.*,>=0.0.0a0 +- rmm==24.10.*,>=0.0.0a0 - scikit-build-core>=0.7.0 - sysroot_linux-aarch64==2.17 name: bench_ann_cuda-118_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 52b3a8dc69..3e3f2511d8 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -30,7 +30,7 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- libucxx==0.39.*,>=0.0.0a0 +- libucxx==0.40.*,>=0.0.0a0 - matplotlib - nccl>=2.9.9 - ninja @@ -40,7 +40,7 @@ dependencies: - pandas - pyyaml - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rmm==24.8.*,>=0.0.0a0 +- rmm==24.10.*,>=0.0.0a0 - scikit-build-core>=0.7.0 - sysroot_linux-64==2.17 name: bench_ann_cuda-118_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml index 27baeda4b8..972cce6f88 100644 --- a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml @@ -27,7 +27,7 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libucxx==0.39.*,>=0.0.0a0 +- libucxx==0.40.*,>=0.0.0a0 - matplotlib - nccl>=2.9.9 - ninja @@ -36,7 +36,7 @@ dependencies: - pandas - pyyaml - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rmm==24.8.*,>=0.0.0a0 +- rmm==24.10.*,>=0.0.0a0 - scikit-build-core>=0.7.0 - sysroot_linux-aarch64==2.17 name: bench_ann_cuda-120_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml index 5274d56bf6..1ecae4658b 100644 --- a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml @@ -27,7 +27,7 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libucxx==0.39.*,>=0.0.0a0 +- libucxx==0.40.*,>=0.0.0a0 - matplotlib - nccl>=2.9.9 - ninja @@ -36,7 +36,7 @@ dependencies: - pandas - pyyaml - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rmm==24.8.*,>=0.0.0a0 +- rmm==24.10.*,>=0.0.0a0 - scikit-build-core>=0.7.0 - sysroot_linux-64==2.17 name: bench_ann_cuda-120_arch-x86_64 diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml index decd1fad18..e6afed2890 100644 --- a/conda/recipes/raft-dask/conda_build_config.yaml +++ b/conda/recipes/raft-dask/conda_build_config.yaml @@ -17,10 +17,10 @@ c_stdlib_version: - "2.17" ucx_py_version: - - "0.39.*" + - "0.40.*" ucxx_version: - - "0.39.*" + - "0.40.*" cmake_version: - ">=3.26.4,!=3.30.0" diff --git a/cpp/template/cmake/thirdparty/fetch_rapids.cmake b/cpp/template/cmake/thirdparty/fetch_rapids.cmake index 0f1d5ff020..f64a924cf5 100644 --- a/cpp/template/cmake/thirdparty/fetch_rapids.cmake +++ b/cpp/template/cmake/thirdparty/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License. 
# Use this variable to update RAPIDS and RAFT versions -set(RAPIDS_VERSION "24.08") +set(RAPIDS_VERSION "24.10") if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake diff --git a/dependencies.yaml b/dependencies.yaml index b16d37f4bb..c04d585e3c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -170,7 +170,7 @@ dependencies: - c-compiler - cxx-compiler - nccl>=2.9.9 - - libucxx==0.39.*,>=0.0.0a0 + - libucxx==0.40.*,>=0.0.0a0 specific: - output_types: conda matrices: @@ -209,7 +209,7 @@ dependencies: common: - output_types: [conda] packages: - - &rmm_conda rmm==24.8.*,>=0.0.0a0 + - &rmm_conda rmm==24.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -234,10 +234,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - &rmm_cu12 rmm-cu12==24.8.*,>=0.0.0a0 + - &rmm_cu12 rmm-cu12==24.10.*,>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - - &rmm_cu11 rmm-cu11==24.8.*,>=0.0.0a0 + - &rmm_cu11 rmm-cu11==24.10.*,>=0.0.0a0 - {matrix: null, packages: [*rmm_conda] } checks: common: @@ -475,15 +475,15 @@ dependencies: common: - output_types: [conda, pyproject] packages: - - dask-cuda==24.8.*,>=0.0.0a0 + - dask-cuda==24.10.*,>=0.0.0a0 - joblib>=0.11 - numba>=0.57 - *numpy - - rapids-dask-dependency==24.8.*,>=0.0.0a0 + - rapids-dask-dependency==24.10.*,>=0.0.0a0 - output_types: conda packages: - - &pylibraft_conda pylibraft==24.8.*,>=0.0.0a0 - - &ucx_py_conda ucx-py==0.39.*,>=0.0.0a0 + - &pylibraft_conda pylibraft==24.10.*,>=0.0.0a0 + - &ucx_py_conda ucx-py==0.40.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -495,12 +495,12 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - &pylibraft_cu12 pylibraft-cu12==24.8.*,>=0.0.0a0 - - &ucx_py_cu12 ucx-py-cu12==0.39.*,>=0.0.0a0 + - &pylibraft_cu12 pylibraft-cu12==24.10.*,>=0.0.0a0 + - &ucx_py_cu12 ucx-py-cu12==0.40.*,>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - - &pylibraft_cu11 pylibraft-cu11==24.8.*,>=0.0.0a0 - - &ucx_py_cu11 ucx-py-cu11==0.39.*,>=0.0.0a0 + - &pylibraft_cu11 pylibraft-cu11==24.10.*,>=0.0.0a0 + - &ucx_py_cu11 ucx-py-cu11==0.40.*,>=0.0.0a0 - {matrix: null, packages: [*pylibraft_conda, *ucx_py_conda]} test_python_common: common: @@ -520,7 +520,7 @@ dependencies: packages: # UCXX is not currently a hard-dependency thus only installed during tests, # this will change in the future. 
- - &distributed_ucxx_conda distributed-ucxx==0.39.*,>=0.0.0a0 + - &distributed_ucxx_conda distributed-ucxx==0.40.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -531,10 +531,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - distributed-ucxx-cu12==0.39.*,>=0.0.0a0 + - distributed-ucxx-cu12==0.40.*,>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - - distributed-ucxx-cu11==0.39.*,>=0.0.0a0 + - distributed-ucxx-cu11==0.40.*,>=0.0.0a0 - {matrix: null, packages: [*distributed_ucxx_conda]} depends_on_ucx_build: common: diff --git a/docs/source/build.md b/docs/source/build.md index 64f3bd01a2..4ba087e68d 100644 --- a/docs/source/build.md +++ b/docs/source/build.md @@ -56,7 +56,7 @@ You can also install the conda packages individually using the `mamba` command a mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.0 ``` -If installing the C++ APIs Please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.08/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above. +If installing the C++ APIs Please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.10/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above. ## Installing Python through Pip diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md index 36588f3450..516819b1c1 100644 --- a/docs/source/developer_guide.md +++ b/docs/source/developer_guide.md @@ -187,7 +187,7 @@ RAFT relies on `clang-format` to enforce code style across all C++ and CUDA sour 1. Do not split empty functions/records/namespaces. 2. Two-space indentation everywhere, including the line continuations. 3. Disable reflowing of comments. - The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-24.08/cpp/.clang-format). + The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-24.10/cpp/.clang-format). [`doxygen`](https://doxygen.nl/) is used as documentation generator and also as a documentation linter. In order to run doxygen as a linter on C++/CUDA code, run @@ -205,7 +205,7 @@ you can run `codespell -i 3 -w .` from the repository root directory. This will bring up an interactive prompt to select which spelling fixes to apply. ### #include style -[include_checker.py](https://github.com/rapidsai/raft/blob/branch-24.08/cpp/scripts/include_checker.py) is used to enforce the include style as follows: +[include_checker.py](https://github.com/rapidsai/raft/blob/branch-24.10/cpp/scripts/include_checker.py) is used to enforce the include style as follows: 1. `#include "..."` should be used for referencing local files only. 
It is acceptable to be used for referencing files in a sub-folder/parent-folder of the same algorithm, but should never be used to include files in other algorithms or between algorithms and the primitives or other dependencies. 2. `#include <...>` should be used for referencing everything else @@ -230,7 +230,7 @@ Call CUDA APIs via the provided helper macros `RAFT_CUDA_TRY`, `RAFT_CUBLAS_TRY` ## Logging ### Introduction -Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-24.08/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all. +Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-24.10/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all. ### Usage ```cpp diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index 21a8404212..fc11a56ac8 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -62,7 +62,7 @@ Nightly images are located in [dockerhub](https://hub.docker.com/r/rapidsai/raft - The following command pulls the nightly container for python version 10, cuda version 12, and RAFT version 23.10: ```bash -docker pull rapidsai/raft-ann-bench:24.08a-cuda12.0-py3.10 #substitute raft-ann-bench for the exact desired container. +docker pull rapidsai/raft-ann-bench:24.10a-cuda12.0-py3.10 #substitute raft-ann-bench for the exact desired container. ``` The CUDA and python versions can be changed for the supported values: @@ -83,7 +83,7 @@ You can see the exact versions as well in the dockerhub site: [//]: # () [//]: # (```bash) -[//]: # (docker pull nvcr.io/nvidia/rapidsai/raft-ann-bench:24.08-cuda11.8-py3.10 #substitute raft-ann-bench for the exact desired container.) +[//]: # (docker pull nvcr.io/nvidia/rapidsai/raft-ann-bench:24.10-cuda11.8-py3.10 #substitute raft-ann-bench for the exact desired container.) [//]: # (```) @@ -344,7 +344,7 @@ For GPU-enabled systems, the `DATA_FOLDER` variable should be a local folder whe export DATA_FOLDER=path/to/store/datasets/and/results docker run --gpus all --rm -it -u $(id -u) \ -v $DATA_FOLDER:/data/benchmarks \ - rapidsai/raft-ann-bench:24.08a-cuda11.8-py3.10 \ + rapidsai/raft-ann-bench:24.10a-cuda11.8-py3.10 \ "--dataset deep-image-96-angular" \ "--normalize" \ "--algorithms raft_cagra,raft_ivf_pq --batch-size 10 -k 10" \ @@ -355,7 +355,7 @@ Usage of the above command is as follows: | Argument | Description | |-----------------------------------------------------------|----------------------------------------------------------------------------------------------------| -| `rapidsai/raft-ann-bench:24.08a-cuda11.8-py3.10` | Image to use. Can be either `raft-ann-bench` or `raft-ann-bench-datasets` | +| `rapidsai/raft-ann-bench:24.10a-cuda11.8-py3.10` | Image to use. 
Can be either `raft-ann-bench` or `raft-ann-bench-datasets` | | `"--dataset deep-image-96-angular"` | Dataset name | | `"--normalize"` | Whether to normalize the dataset | | `"--algorithms raft_cagra,hnswlib --batch-size 10 -k 10"` | Arguments passed to the `run` script, such as the algorithms to benchmark, the batch size, and `k` | @@ -372,7 +372,7 @@ The container arguments in the above section also be used for the CPU-only conta export DATA_FOLDER=path/to/store/datasets/and/results docker run --rm -it -u $(id -u) \ -v $DATA_FOLDER:/data/benchmarks \ - rapidsai/raft-ann-bench-cpu:24.08a-py3.10 \ + rapidsai/raft-ann-bench-cpu:24.10a-py3.10 \ "--dataset deep-image-96-angular" \ "--normalize" \ "--algorithms hnswlib --batch-size 10 -k 10" \ @@ -389,7 +389,7 @@ docker run --gpus all --rm -it -u $(id -u) \ --entrypoint /bin/bash \ --workdir /data/benchmarks \ -v $DATA_FOLDER:/data/benchmarks \ - rapidsai/raft-ann-bench:24.08a-cuda11.8-py3.10 + rapidsai/raft-ann-bench:24.10a-cuda11.8-py3.10 ``` This will drop you into a command line in the container, with the `raft-ann-bench` python package ready to use, as described in the [Running the benchmarks](#running-the-benchmarks) section above: diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml index 1329a3990b..436fdf9500 100644 --- a/python/pylibraft/pyproject.toml +++ b/python/pylibraft/pyproject.toml @@ -33,7 +33,7 @@ requires-python = ">=3.9" dependencies = [ "cuda-python", "numpy>=1.23,<2.0a0", - "rmm==24.8.*,>=0.0.0a0", + "rmm==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -120,7 +120,7 @@ requires = [ "cuda-python", "cython>=3.0.0", "ninja", - "rmm==24.8.*,>=0.0.0a0", + "rmm==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. dependencies-file = "../../dependencies.yaml" diff --git a/python/raft-dask/cmake/thirdparty/get_ucxx.cmake b/python/raft-dask/cmake/thirdparty/get_ucxx.cmake index de6a4b109c..db6039393a 100644 --- a/python/raft-dask/cmake/thirdparty/get_ucxx.cmake +++ b/python/raft-dask/cmake/thirdparty/get_ucxx.cmake @@ -47,9 +47,9 @@ endfunction() # Change pinned tag here to test a commit in CI # To use a different RAFT locally, set the CMake variable # CPM_raft_SOURCE=/path/to/local/raft -find_and_configure_ucxx(VERSION 0.39 +find_and_configure_ucxx(VERSION 0.40 FORK rapidsai - PINNED_TAG branch-0.39 + PINNED_TAG branch-0.40 EXCLUDE_FROM_ALL YES UCXX_STATIC ${RAFT_DASK_UCXX_STATIC} ) diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml index 24ee27ddea..4b0ae06316 100644 --- a/python/raft-dask/pyproject.toml +++ b/python/raft-dask/pyproject.toml @@ -31,14 +31,14 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "dask-cuda==24.8.*,>=0.0.0a0", - "distributed-ucxx==0.39.*,>=0.0.0a0", + "dask-cuda==24.10.*,>=0.0.0a0", + "distributed-ucxx==0.40.*,>=0.0.0a0", "joblib>=0.11", "numba>=0.57", "numpy>=1.23,<2.0a0", - "pylibraft==24.8.*,>=0.0.0a0", - "rapids-dask-dependency==24.8.*,>=0.0.0a0", - "ucx-py==0.39.*,>=0.0.0a0", + "pylibraft==24.10.*,>=0.0.0a0", + "rapids-dask-dependency==24.10.*,>=0.0.0a0", + "ucx-py==0.40.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. 
To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", From f5202353460f259d7a8e7ca6323468d037db6ce1 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 30 Jul 2024 15:45:13 -0500 Subject: [PATCH 02/31] Update pip devcontainers to UCX v1.17.0 (#2401) Contributes to https://github.com/rapidsai/build-planning/issues/77. Follow-up to https://github.com/rapidsai/devcontainers/pull/338 Proposes updating the pip devcontainers to use v1.17.0, as part of an effort to use that version across RAPIDS. Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Ray Douglass (https://github.com/raydouglass) - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/raft/pull/2401 --- .devcontainer/cuda11.8-pip/devcontainer.json | 5 +---- .devcontainer/cuda12.5-pip/devcontainer.json | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 0fdbac9d91..f319536b18 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,7 +5,7 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-ucx1.17.0-openmpi-ubuntu22.04" } }, "runArgs": [ @@ -15,9 +15,6 @@ ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/ucx:24.10": { - "version": "1.15.0" - }, "ghcr.io/rapidsai/devcontainers/features/cuda:24.10": { "version": "11.8", "installcuBLAS": true, diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json index 39d1c0c969..26b9a56e48 100644 --- a/.devcontainer/cuda12.5-pip/devcontainer.json +++ b/.devcontainer/cuda12.5-pip/devcontainer.json @@ -5,7 +5,7 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ucx1.17.0-openmpi-ubuntu22.04" } }, "runArgs": [ @@ -15,9 +15,6 @@ ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/ucx:24.10": { - "version": "1.15.0" - }, "ghcr.io/rapidsai/devcontainers/features/cuda:24.10": { "version": "12.5", "installcuBLAS": true, From dbdde82a00ac39d0a34279441f5efd7c73a4d7d8 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 8 Aug 2024 10:21:23 -0500 Subject: [PATCH 03/31] Improve update-version.sh (#2408) This PR improves `update-version.sh` by clarifying suffix handling and ucxx/ucx-py version handling. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/raft/pull/2408 --- ci/release/update-version.sh | 43 +++++++++++++++++------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 194ad9a07b..075eb896f2 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -18,16 +18,15 @@ CURRENT_MINOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}') CURRENT_PATCH=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}') CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR} -#Get . for next version +# Get . 
for next version NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} -NEXT_UCX_PY_SHORT_TAG="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})" -NEXT_UCX_PY_VERSION="${NEXT_UCX_PY_SHORT_TAG}.*" +NEXT_UCXX_SHORT_TAG="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})" # Need to distutils-normalize the original version NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") -NEXT_UCX_PY_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_UCX_PY_SHORT_TAG}'))") +NEXT_UCXX_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_UCXX_SHORT_TAG}'))") echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" @@ -37,8 +36,8 @@ function sed_runner() { } sed_runner "s/set(RAPIDS_VERSION .*)/set(RAPIDS_VERSION \"${NEXT_SHORT_TAG}\")/g" cpp/template/cmake/thirdparty/fetch_rapids.cmake -sed_runner 's/'"find_and_configure_ucxx(VERSION .*"'/'"find_and_configure_ucxx(VERSION ${NEXT_UCX_PY_SHORT_TAG_PEP440}"'/g' python/raft-dask/cmake/thirdparty/get_ucxx.cmake -sed_runner 's/'"branch-.*"'/'"branch-${NEXT_UCX_PY_SHORT_TAG_PEP440}"'/g' python/raft-dask/cmake/thirdparty/get_ucxx.cmake +sed_runner 's/'"find_and_configure_ucxx(VERSION .*"'/'"find_and_configure_ucxx(VERSION ${NEXT_UCXX_SHORT_TAG_PEP440}"'/g' python/raft-dask/cmake/thirdparty/get_ucxx.cmake +sed_runner 's/'"branch-.*"'/'"branch-${NEXT_UCXX_SHORT_TAG_PEP440}"'/g' python/raft-dask/cmake/thirdparty/get_ucxx.cmake # Centralized version file update echo "${NEXT_FULL_TAG}" > VERSION @@ -46,35 +45,33 @@ echo "${NEXT_FULL_TAG}" > VERSION DEPENDENCIES=( dask-cuda pylibraft - pylibraft-cu11 - pylibraft-cu12 rmm - rmm-cu11 - rmm-cu12 rapids-dask-dependency - # ucx-py and ucxx are handled separately below +) +UCXX_DEPENDENCIES=( + ucx-py + libucxx + distributed-ucxx ) for FILE in dependencies.yaml conda/environments/*.yaml; do for DEP in "${DEPENDENCIES[@]}"; do - sed_runner "/-.* ${DEP}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; + sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}" + done + for DEP in "${UCXX_DEPENDENCIES[@]}"; do + sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_UCXX_SHORT_TAG_PEP440}.*/,>=0.0.0a0/g" "${FILE}" done - sed_runner "/-.* ucx-py==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; - sed_runner "/-.* ucx-py-cu11==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; - sed_runner "/-.* ucx-py-cu12==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; - sed_runner "/-.* libucxx==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; - sed_runner "/-.* distributed-ucxx==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; - sed_runner "/-.* distributed-ucxx-cu11==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; - sed_runner "/-.* distributed-ucxx-cu12==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; done for FILE in python/*/pyproject.toml; do for DEP in "${DEPENDENCIES[@]}"; do - sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" ${FILE} + sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" "${FILE}" + done + for DEP in 
"${UCXX_DEPENDENCIES[@]}"; do + sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_UCXX_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" "${FILE}" done - sed_runner "/\"ucx-py==/ s/==.*\"/==${NEXT_UCX_PY_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" ${FILE} done -sed_runner "/^ucx_py_version:$/ {n;s/.*/ - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/raft-dask/conda_build_config.yaml -sed_runner "/^ucxx_version:$/ {n;s/.*/ - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/raft-dask/conda_build_config.yaml +sed_runner "/^ucx_py_version:$/ {n;s/.*/ - \"${NEXT_UCXX_SHORT_TAG_PEP440}.*\"/}" conda/recipes/raft-dask/conda_build_config.yaml +sed_runner "/^ucxx_version:$/ {n;s/.*/ - \"${NEXT_UCXX_SHORT_TAG_PEP440}.*\"/}" conda/recipes/raft-dask/conda_build_config.yaml for FILE in .github/workflows/*.yaml; do sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" From a1f4844df016cb7dfd9ceaed92c4a6700baab21b Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 8 Aug 2024 12:51:37 -0500 Subject: [PATCH 04/31] Use tool.scikit-build.cmake.version, set scikit-build-core minimum-version (#2406) Contributes to https://github.com/rapidsai/build-planning/issues/58. `scikit-build-core==0.10.0` was released today (https://github.com/scikit-build/scikit-build-core/releases/tag/v0.10.0), and wheel-building configurations across RAPIDS are incompatible with it. This proposes upgrading to that version and fixing configuration here in a way that: * is compatible with that new `scikit-build-core` version * takes advantage of the forward-compatibility mechanism (`minimum-version`) that `scikit-build-core` provides, to reduce the risk of needing to do this again in the future Authors: - James Lamb (https://github.com/jameslamb) Approvers: - https://github.com/jakirkham URL: https://github.com/rapidsai/raft/pull/2406 --- conda/environments/all_cuda-118_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/environments/bench_ann_cuda-118_arch-aarch64.yaml | 2 +- conda/environments/bench_ann_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/bench_ann_cuda-120_arch-aarch64.yaml | 2 +- conda/environments/bench_ann_cuda-120_arch-x86_64.yaml | 2 +- conda/recipes/pylibraft/meta.yaml | 2 +- conda/recipes/raft-dask/meta.yaml | 2 +- dependencies.yaml | 4 ++-- python/pylibraft/pyproject.toml | 5 +++-- python/raft-dask/pyproject.toml | 5 +++-- 13 files changed, 18 insertions(+), 16 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index 1a28edb974..ef8524dce3 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -51,7 +51,7 @@ dependencies: - rapids-dask-dependency==24.10.*,>=0.0.0a0 - recommonmark - rmm==24.10.*,>=0.0.0a0 -- scikit-build-core>=0.7.0 +- scikit-build-core>=0.10.0 - scikit-learn - scipy - sphinx-copybutton diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 71a7fc4b02..6ffb27bb29 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -51,7 +51,7 @@ dependencies: - rapids-dask-dependency==24.10.*,>=0.0.0a0 - recommonmark - rmm==24.10.*,>=0.0.0a0 -- scikit-build-core>=0.7.0 +- scikit-build-core>=0.10.0 - scikit-learn - scipy - sphinx-copybutton diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml 
b/conda/environments/all_cuda-125_arch-aarch64.yaml index 7bb4a40c05..fd0e380a1c 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -47,7 +47,7 @@ dependencies: - rapids-dask-dependency==24.10.*,>=0.0.0a0 - recommonmark - rmm==24.10.*,>=0.0.0a0 -- scikit-build-core>=0.7.0 +- scikit-build-core>=0.10.0 - scikit-learn - scipy - sphinx-copybutton diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 58153398f7..ad4ecb7ff2 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -47,7 +47,7 @@ dependencies: - rapids-dask-dependency==24.10.*,>=0.0.0a0 - recommonmark - rmm==24.10.*,>=0.0.0a0 -- scikit-build-core>=0.7.0 +- scikit-build-core>=0.10.0 - scikit-learn - scipy - sphinx-copybutton diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index 7a961d1ca9..eff1c56840 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -41,6 +41,6 @@ dependencies: - pyyaml - rapids-build-backend>=0.3.0,<0.4.0.dev0 - rmm==24.10.*,>=0.0.0a0 -- scikit-build-core>=0.7.0 +- scikit-build-core>=0.10.0 - sysroot_linux-aarch64==2.17 name: bench_ann_cuda-118_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 3e3f2511d8..87b19d2952 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -41,6 +41,6 @@ dependencies: - pyyaml - rapids-build-backend>=0.3.0,<0.4.0.dev0 - rmm==24.10.*,>=0.0.0a0 -- scikit-build-core>=0.7.0 +- scikit-build-core>=0.10.0 - sysroot_linux-64==2.17 name: bench_ann_cuda-118_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml index 972cce6f88..ff3451c15c 100644 --- a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml @@ -37,6 +37,6 @@ dependencies: - pyyaml - rapids-build-backend>=0.3.0,<0.4.0.dev0 - rmm==24.10.*,>=0.0.0a0 -- scikit-build-core>=0.7.0 +- scikit-build-core>=0.10.0 - sysroot_linux-aarch64==2.17 name: bench_ann_cuda-120_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml index 1ecae4658b..085e099ae8 100644 --- a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml @@ -37,6 +37,6 @@ dependencies: - pyyaml - rapids-build-backend>=0.3.0,<0.4.0.dev0 - rmm==24.10.*,>=0.0.0a0 -- scikit-build-core>=0.7.0 +- scikit-build-core>=0.10.0 - sysroot_linux-64==2.17 name: bench_ann_cuda-120_arch-x86_64 diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml index 31086e30aa..4ef85fc0e5 100644 --- a/conda/recipes/pylibraft/meta.yaml +++ b/conda/recipes/pylibraft/meta.yaml @@ -54,7 +54,7 @@ requirements: - libraft-headers {{ version }} - python x.x - rmm ={{ minor_version }} - - scikit-build-core >=0.7.0 + - scikit-build-core >=0.10.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 run: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml index 641a11a241..74b26b5935 100644 --- 
a/conda/recipes/raft-dask/meta.yaml +++ b/conda/recipes/raft-dask/meta.yaml @@ -54,7 +54,7 @@ requirements: - pylibraft {{ version }} - python x.x - rmm ={{ minor_version }} - - scikit-build-core >=0.7.0 + - scikit-build-core >=0.10.0 - ucx-py {{ ucx_py_version }} - ucxx {{ ucxx_version }} - rapids-build-backend>=0.3.0,<0.4.0.dev0 diff --git a/dependencies.yaml b/dependencies.yaml index 234cafa035..630b5a50c9 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -154,10 +154,10 @@ dependencies: - &rapids_build_backend rapids-build-backend>=0.3.0,<0.4.0.dev0 - output_types: [conda] packages: - - scikit-build-core>=0.7.0 + - scikit-build-core>=0.10.0 - output_types: [requirements, pyproject] packages: - - scikit-build-core[pyproject]>=0.7.0 + - scikit-build-core[pyproject]>=0.10.0 rapids_build: common: - output_types: [conda, requirements, pyproject] diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml index b09615cc00..4c1d8e8fa1 100644 --- a/python/pylibraft/pyproject.toml +++ b/python/pylibraft/pyproject.toml @@ -16,7 +16,7 @@ requires = [ "rapids-build-backend>=0.3.0,<0.4.0.dev0", - "scikit-build-core[pyproject]>=0.7.0", + "scikit-build-core[pyproject]>=0.10.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. build-backend = "rapids_build_backend.build" @@ -102,7 +102,8 @@ skip = [ [tool.scikit-build] build-dir = "build/{wheel_tag}" cmake.build-type = "Release" -cmake.minimum-version = "3.26.4" +cmake.version = "CMakeLists.txt" +minimum-version = "build-system.requires" ninja.make-fallback = true sdist.exclude = ["*tests*"] sdist.reproducible = true diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml index 1bb26e98e8..4fadfa5c9f 100644 --- a/python/raft-dask/pyproject.toml +++ b/python/raft-dask/pyproject.toml @@ -17,7 +17,7 @@ build-backend = "rapids_build_backend.build" requires = [ "rapids-build-backend>=0.3.0,<0.4.0.dev0", - "scikit-build-core[pyproject]>=0.7.0", + "scikit-build-core[pyproject]>=0.10.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project] @@ -105,7 +105,8 @@ skip = [ [tool.scikit-build] build-dir = "build/{wheel_tag}" cmake.build-type = "Release" -cmake.minimum-version = "3.26.4" +cmake.version = "CMakeLists.txt" +minimum-version = "build-system.requires" ninja.make-fallback = true sdist.exclude = ["*tests*"] sdist.reproducible = true From 32f3703b0d28e5e8934fc463cf3ec6bd6cc82a79 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Thu, 8 Aug 2024 13:56:14 -0400 Subject: [PATCH 05/31] Update pre-commit hooks (#2409) This PR updates pre-commit hooks to the latest versions that are supported without causing style check errors. 
Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/raft/pull/2409 --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a0a4b4be87..9aed6bf387 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -99,7 +99,7 @@ repos: hooks: - id: check-json - repo: https://github.com/rapidsai/pre-commit-hooks - rev: v0.2.0 + rev: v0.3.1 hooks: - id: verify-copyright files: | From db079987260d8462eaccfd5173346f6180bc70bd Mon Sep 17 00:00:00 2001 From: rhdong Date: Thu, 22 Aug 2024 06:13:44 -0700 Subject: [PATCH 06/31] [FEA] Support for half-float mixed precise in brute-force (#2382) - distance supports half-float - SDDMM support half-float - gemm supports multi-type compose - transpose & copy support half - random supports half Authors: - rhdong (https://github.com/rhdong) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/2382 --- cpp/include/raft/core/detail/copy.hpp | 76 +++--- cpp/include/raft/core/math.hpp | 14 +- cpp/include/raft/core/operators.hpp | 18 +- .../distance/detail/masked_distance_base.cuh | 4 +- .../detail/pairwise_distance_base.cuh | 4 +- cpp/include/raft/linalg/contractions.cuh | 14 +- .../raft/linalg/detail/contractions.cuh | 14 +- cpp/include/raft/linalg/detail/gemm.hpp | 111 ++++----- cpp/include/raft/linalg/detail/norm.cuh | 154 ++++++------ cpp/include/raft/linalg/detail/transpose.cuh | 138 +++++++++-- cpp/include/raft/linalg/gemm.cuh | 65 +++--- cpp/include/raft/linalg/norm.cuh | 22 +- cpp/include/raft/random/detail/rng_device.cuh | 21 ++ cpp/include/raft/random/detail/rng_impl.cuh | 3 +- .../sparse/convert/detail/bitmap_to_csr.cuh | 22 +- .../raft/sparse/detail/cusparse_wrappers.h | 68 +++++- .../raft/sparse/distance/detail/utils.cuh | 44 ++-- .../sparse/linalg/detail/masked_matmul.cuh | 20 +- .../raft/sparse/linalg/detail/sddmm.hpp | 16 +- .../raft/sparse/linalg/masked_matmul.hpp | 11 +- cpp/include/raft/sparse/linalg/sddmm.hpp | 17 +- .../knn/detail/epsilon_neighborhood.cuh | 2 +- cpp/include/raft/util/cuda_dev_essentials.cuh | 19 +- cpp/include/raft/util/cudart_utils.hpp | 6 +- cpp/test/core/mdspan_copy.cu | 53 +++++ cpp/test/linalg/norm.cu | 93 +++++--- cpp/test/linalg/transpose.cu | 43 ++++ cpp/test/sparse/masked_matmul.cu | 221 +++++++++++++----- cpp/test/sparse/sddmm.cu | 198 +++++++++++----- 29 files changed, 1050 insertions(+), 441 deletions(-) diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp index 04e74c4e58..4faded5041 100644 --- a/cpp/include/raft/core/detail/copy.hpp +++ b/cpp/include/raft/core/detail/copy.hpp @@ -32,6 +32,7 @@ #include #include #ifdef __CUDACC__ +#include #include #endif #endif @@ -449,38 +450,51 @@ mdspan_copyable_t copy(resources const& res, DstType&& dst, Sr #endif } else if constexpr (config::can_use_cublas) { #ifndef RAFT_DISABLE_CUDA - auto constexpr const alpha = typename std::remove_reference_t::value_type{1}; - auto constexpr const beta = typename std::remove_reference_t::value_type{0}; - if constexpr (std::is_same_v) { - CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res), - CUBLAS_OP_T, - CUBLAS_OP_N, - dst.extent(1), - dst.extent(0), - &alpha, - src.data_handle(), - src.extent(0), - &beta, - dst.data_handle(), - dst.extent(1), - dst.data_handle(), - dst.extent(1), - resource::get_cuda_stream(res))); + if constexpr 
(!((std::is_same_v::value_type, half>)&&( + std::is_same_v::value_type, half>))) { + auto constexpr const alpha = typename std::remove_reference_t::value_type{1}; + auto constexpr const beta = typename std::remove_reference_t::value_type{0}; + if constexpr (std::is_same_v) { + CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res), + CUBLAS_OP_T, + CUBLAS_OP_N, + dst.extent(1), + dst.extent(0), + &alpha, + src.data_handle(), + src.extent(0), + &beta, + dst.data_handle(), + dst.extent(1), + dst.data_handle(), + dst.extent(1), + resource::get_cuda_stream(res))); + } else { + CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res), + CUBLAS_OP_T, + CUBLAS_OP_N, + dst.extent(0), + dst.extent(1), + &alpha, + src.data_handle(), + src.extent(1), + &beta, + dst.data_handle(), + dst.extent(0), + dst.data_handle(), + dst.extent(0), + resource::get_cuda_stream(res))); + } } else { - CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res), - CUBLAS_OP_T, - CUBLAS_OP_N, - dst.extent(0), - dst.extent(1), - &alpha, - src.data_handle(), - src.extent(1), - &beta, - dst.data_handle(), - dst.extent(0), - dst.data_handle(), - dst.extent(0), - resource::get_cuda_stream(res))); +#ifdef __CUDACC__ + raft::linalg::transpose(res, dst, src); +#else + // Should never actually reach this because of enable_ifs. Included for + // safety. + RAFT_FAIL( + "raft::copy called in a way that requires custom kernel. Please use " + "raft/core/copy.cuh and include the header in a .cu file"); +#endif } #else // Not possible to reach this due to enable_ifs. Included for safety. diff --git a/cpp/include/raft/core/math.hpp b/cpp/include/raft/core/math.hpp index e082aaf41a..c5de345082 100644 --- a/cpp/include/raft/core/math.hpp +++ b/cpp/include/raft/core/math.hpp @@ -106,7 +106,13 @@ template RAFT_INLINE_FUNCTION auto asin(T x) { #ifdef __CUDA_ARCH__ - return ::asin(x); + if constexpr (std::is_same::value) { + float x_float = __half2float(x); + float result_float = ::asin(x_float); + return __float2half(result_float); + } else { + return ::asin(x); + } #else return std::asin(x); #endif @@ -337,6 +343,12 @@ RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y) ((std::is_same_v || std::is_same_v)&&( std::is_same_v || std::is_same_v))) { return ::max(x, y); + } else if constexpr (std::is_same_v && std::is_same_v) { + const float f_y = __half2float(y); + return (x < f_y) ? f_y : x; + } else if constexpr (std::is_same_v && std::is_same_v) { + const float f_x = __half2float(x); + return (f_x < y) ? y : f_x; } // Else, check that the types are the same and provide a generic implementation else { diff --git a/cpp/include/raft/core/operators.hpp b/cpp/include/raft/core/operators.hpp index e42801fe32..6b10baa332 100644 --- a/cpp/include/raft/core/operators.hpp +++ b/cpp/include/raft/core/operators.hpp @@ -19,6 +19,8 @@ #include #include +#include + #include #include #include @@ -104,13 +106,27 @@ struct sq_op { { return in * in; } + + template + constexpr RAFT_INLINE_FUNCTION auto operator()(const half& in, UnusedArgs...) 
const + { + return __half2float(in) * __half2float(in); + } }; struct add_op { template constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const { - return a + b; + if constexpr (std::is_same_v && std::is_same_v) { + return __half2float(a) + __half2float(b); + } else if constexpr (std::is_same_v) { + return __half2float(a) + b; + } else if constexpr (std::is_same_v) { + return a + __half2float(b); + } else { + return a + b; + } } }; diff --git a/cpp/include/raft/distance/detail/masked_distance_base.cuh b/cpp/include/raft/distance/detail/masked_distance_base.cuh index 55da634145..96b778f11f 100644 --- a/cpp/include/raft/distance/detail/masked_distance_base.cuh +++ b/cpp/include/raft/distance/detail/masked_distance_base.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -266,7 +266,7 @@ struct MaskedDistances : public BaseClass { for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < P::AccColsPerTh; ++j) { - acc[i][j] = BaseClass::Zero; + acc[i][j] = BaseClass::Zero(); } } } diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh index c6b09be31e..a8a541bf53 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -200,7 +200,7 @@ struct PairwiseDistances : public BaseClass { for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < P::AccColsPerTh; ++j) { - acc[i][j] = BaseClass::Zero; + acc[i][j] = BaseClass::Zero(); } } } diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh index cb6488bedf..b284bb3370 100644 --- a/cpp/include/raft/linalg/contractions.cuh +++ b/cpp/include/raft/linalg/contractions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -164,6 +164,12 @@ struct Policy4x4 { typedef KernelPolicy Policy; typedef ColKernelPolicy ColPolicy; }; + +template +struct Policy4x4 { + typedef KernelPolicy Policy; + typedef ColKernelPolicy ColPolicy; +}; /** @} */ /** @@ -204,6 +210,12 @@ struct Policy2x8 { // this is not used just for keeping compiler happy. typedef KernelPolicy Policy; }; + +template +struct Policy2x8 { + typedef KernelPolicy Policy; +}; + /** @} */ /** diff --git a/cpp/include/raft/linalg/detail/contractions.cuh b/cpp/include/raft/linalg/detail/contractions.cuh index b15cb222b4..3bdcc22c1f 100644 --- a/cpp/include/raft/linalg/detail/contractions.cuh +++ b/cpp/include/raft/linalg/detail/contractions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -72,7 +72,9 @@ struct Contractions_NT { /** block of Y data loaded from global mem after `ldgXY()` */ DataT ldgDataY[P::LdgPerThY][P::Veclen]; - static constexpr DataT Zero = (DataT)0; + // static constexpr DataT Zero = DataT{0}; + + static constexpr DataT Zero() { return DataT{0}; } public: /** @@ -197,7 +199,7 @@ struct Contractions_NT { } else { #pragma unroll for (int j = 0; j < P::Veclen; ++j) { - ldgDataX[i][j] = Zero; + ldgDataX[i][j] = Zero(); } } } @@ -211,7 +213,7 @@ struct Contractions_NT { } else { #pragma unroll for (int j = 0; j < P::Veclen; ++j) { - ldgDataX[i][j] = Zero; + ldgDataX[i][j] = Zero(); } } } @@ -235,7 +237,7 @@ struct Contractions_NT { } else { #pragma unroll for (int j = 0; j < P::Veclen; ++j) { - ldgDataY[i][j] = Zero; + ldgDataY[i][j] = Zero(); } } } @@ -249,7 +251,7 @@ struct Contractions_NT { } else { #pragma unroll for (int j = 0; j < P::Veclen; ++j) { - ldgDataY[i][j] = Zero; + ldgDataY[i][j] = Zero(); } } } diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index 236c840040..af6a78638c 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -27,82 +27,83 @@ namespace raft::linalg::detail { -template +template void legacy_gemm(raft::resources const& res, const bool trans_a, const bool trans_b, const int m, const int n, const int k, - const T* alpha, - const T* A, + const S_T* alpha, + const A_T* A, const int lda, - const T* B, + const B_T* B, const int ldb, - const T* beta, - T* C, + const S_T* beta, + C_T* C, const int ldc, cudaStream_t stream) { - return legacy_matmul(res, - trans_a, - trans_b, - static_cast(m), - static_cast(n), - static_cast(k), - alpha, - A, - static_cast(lda), - B, - static_cast(ldb), - beta, - C, - static_cast(ldc), - stream); + return legacy_matmul(res, + trans_a, + trans_b, + static_cast(m), + static_cast(n), + static_cast(k), + alpha, + A, + static_cast(lda), + B, + static_cast(ldb), + beta, + C, + static_cast(ldc), + stream); } -template +template void legacy_gemm(raft::resources const& res, - const T* a, + const A_T* a, int n_rows_a, int n_cols_a, - const T* b, - T* c, + const B_T* b, + C_T* c, int n_rows_c, int n_cols_c, cublasOperation_t trans_a, cublasOperation_t trans_b, - T alpha, - T beta, + S_T alpha, + S_T beta, cudaStream_t stream) { int m = n_rows_c; int n = n_cols_c; auto k = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a; - return legacy_matmul(res, - trans_a == CUBLAS_OP_T, - trans_b == CUBLAS_OP_T, - static_cast(n_rows_c), - static_cast(n_cols_c), - static_cast(k), - &alpha, - a, - static_cast(trans_a == CUBLAS_OP_T ? k : m), - b, - static_cast(trans_b == CUBLAS_OP_T ? n : k), - &beta, - c, - static_cast(m), - stream); + return legacy_matmul( + res, + trans_a == CUBLAS_OP_T, + trans_b == CUBLAS_OP_T, + static_cast(n_rows_c), + static_cast(n_cols_c), + static_cast(k), + &alpha, + a, + static_cast(trans_a == CUBLAS_OP_T ? k : m), + b, + static_cast(trans_b == CUBLAS_OP_T ? 
n : k), + &beta, + c, + static_cast(m), + stream); } -template +template void legacy_gemm(raft::resources const& res, - const T* a, + const A_T* a, int n_rows_a, int n_cols_a, - const T* b, - T* c, + const B_T* b, + C_T* c, int n_rows_c, int n_cols_c, cublasOperation_t trans_a, @@ -110,14 +111,14 @@ void legacy_gemm(raft::resources const& res, cudaStream_t stream) { return legacy_gemm( - res, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, T{1}, T{0}, stream); + res, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, C_T{1}, C_T{0}, stream); } -template +template void legacy_gemm(raft::resources const& res, - T* z, - T* x, - T* y, + z_T* z, + x_T* x, + y_T* y, int _M, int _N, int _K, @@ -125,11 +126,11 @@ void legacy_gemm(raft::resources const& res, bool isXColMajor, bool isYColMajor, cudaStream_t stream, - const T* alpha, - const T* beta) + const s_T* alpha, + const s_T* beta) { if (isZColMajor) { - return legacy_matmul( + return legacy_matmul( res, !isXColMajor, !isYColMajor, @@ -146,7 +147,7 @@ void legacy_gemm(raft::resources const& res, static_cast(_M), stream); } else { - return legacy_gemm( + return legacy_gemm( res, z, y, x, _N, _M, _K, true, !isYColMajor, !isXColMajor, stream, alpha, beta); } } diff --git a/cpp/include/raft/linalg/detail/norm.cuh b/cpp/include/raft/linalg/detail/norm.cuh index ed7e360848..24da634575 100644 --- a/cpp/include/raft/linalg/detail/norm.cuh +++ b/cpp/include/raft/linalg/detail/norm.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,8 +24,8 @@ namespace raft { namespace linalg { namespace detail { -template -void rowNormCaller(Type* dots, +template +void rowNormCaller(OutType* dots, const Type* data, IdxType D, IdxType N, @@ -36,53 +36,53 @@ void rowNormCaller(Type* dots, { switch (type) { case L1Norm: - raft::linalg::reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - true, - stream, - false, - raft::abs_op(), - raft::add_op(), - fin_op); + raft::linalg::reduce(dots, + data, + D, + N, + (OutType)0, + rowMajor, + true, + stream, + false, + raft::abs_op(), + raft::add_op(), + fin_op); break; case L2Norm: - raft::linalg::reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - true, - stream, - false, - raft::sq_op(), - raft::add_op(), - fin_op); + raft::linalg::reduce(dots, + data, + D, + N, + (OutType)0, + rowMajor, + true, + stream, + false, + raft::sq_op(), + raft::add_op(), + fin_op); break; case LinfNorm: - raft::linalg::reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - true, - stream, - false, - raft::abs_op(), - raft::max_op(), - fin_op); + raft::linalg::reduce(dots, + data, + D, + N, + (OutType)0, + rowMajor, + true, + stream, + false, + raft::abs_op(), + raft::max_op(), + fin_op); break; default: THROW("Unsupported norm type: %d", type); }; } -template -void colNormCaller(Type* dots, +template +void colNormCaller(OutType* dots, const Type* data, IdxType D, IdxType N, @@ -93,46 +93,46 @@ void colNormCaller(Type* dots, { switch (type) { case L1Norm: - raft::linalg::reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - false, - stream, - false, - raft::abs_op(), - raft::add_op(), - fin_op); + raft::linalg::reduce(dots, + data, + D, + N, + (OutType)0, + rowMajor, + false, + stream, + false, + raft::abs_op(), + raft::add_op(), + fin_op); break; case L2Norm: - raft::linalg::reduce(dots, - data, - D, - N, 
- (Type)0, - rowMajor, - false, - stream, - false, - raft::sq_op(), - raft::add_op(), - fin_op); + raft::linalg::reduce(dots, + data, + D, + N, + (OutType)0, + rowMajor, + false, + stream, + false, + raft::sq_op(), + raft::add_op(), + fin_op); break; case LinfNorm: - raft::linalg::reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - false, - stream, - false, - raft::abs_op(), - raft::max_op(), - fin_op); + raft::linalg::reduce(dots, + data, + D, + N, + (OutType)0, + rowMajor, + false, + stream, + false, + raft::abs_op(), + raft::max_op(), + fin_op); break; default: THROW("Unsupported norm type: %d", type); }; diff --git a/cpp/include/raft/linalg/detail/transpose.cuh b/cpp/include/raft/linalg/detail/transpose.cuh index 999e7f1974..ec60aacc9c 100644 --- a/cpp/include/raft/linalg/detail/transpose.cuh +++ b/cpp/include/raft/linalg/detail/transpose.cuh @@ -28,10 +28,84 @@ #include #include +#include + namespace raft { namespace linalg { namespace detail { +template +RAFT_KERNEL transpose_half_kernel(IndexType n_rows, + IndexType n_cols, + const half* __restrict__ in, + half* __restrict__ out) +{ + __shared__ half tile[TILE_DIM][TILE_DIM + 1]; + + for (int block_offset_y = 0; block_offset_y < n_rows; block_offset_y += gridDim.y * TILE_DIM) { + for (int block_offset_x = 0; block_offset_x < n_cols; block_offset_x += gridDim.x * TILE_DIM) { + auto x = block_offset_x + blockIdx.x * TILE_DIM + threadIdx.x; + auto y = block_offset_y + blockIdx.y * TILE_DIM + threadIdx.y; + + for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { + if (x < n_cols && (y + j) < n_rows) { + tile[threadIdx.y + j][threadIdx.x] = __ldg(&in[(y + j) * n_cols + x]); + } + } + __syncthreads(); + + x = block_offset_y + blockIdx.y * TILE_DIM + threadIdx.x; + y = block_offset_x + blockIdx.x * TILE_DIM + threadIdx.y; + + for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { + if (x < n_rows && (y + j) < n_cols) { + out[(y + j) * n_rows + x] = tile[threadIdx.x][threadIdx.y + j]; + } + } + __syncthreads(); + } + } +} + +template +void transpose_half( + raft::resources const& handle, IndexType n_rows, IndexType n_cols, const half* in, half* out) +{ + if (n_cols == 0 || n_rows == 0) return; + auto stream = resource::get_cuda_stream(handle); + + int dev_id, sm_count; + + cudaGetDevice(&dev_id); + cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id); + + constexpr int tpb = 256; + constexpr int block_dim_x = 128 / sizeof(half); + constexpr int block_dim_y = tpb / block_dim_x; + + dim3 blocks(block_dim_x, block_dim_y); + + int max_active_blocks = 0; + RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, transpose_half_kernel, tpb, 0)); + int num_blocks = max_active_blocks * sm_count; + + int grid_x = (n_cols + block_dim_x - 1) / block_dim_x; + int grid_y = (n_rows + block_dim_x - 1) / block_dim_x; + + float ratio = static_cast(grid_y) / static_cast(grid_x); + int adjusted_grid_y = + std::max(std::min(grid_y, static_cast(std::sqrt(num_blocks * ratio))), 1); + int adjusted_grid_x = std::max(std::min(grid_x, num_blocks / adjusted_grid_y), 1); + + dim3 grids(adjusted_grid_x, adjusted_grid_y); + + transpose_half_kernel + <<>>(n_rows, n_cols, in, out); + + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + template void transpose(raft::resources const& handle, math_t* in, @@ -40,28 +114,31 @@ void transpose(raft::resources const& handle, int n_cols, cudaStream_t stream) { - cublasHandle_t cublas_h = resource::get_cublas_handle(handle); - RAFT_CUBLAS_TRY(cublasSetStream(cublas_h, stream)); - int out_n_rows = 
n_cols; int out_n_cols = n_rows; - const math_t alpha = 1.0; - const math_t beta = 0.0; - RAFT_CUBLAS_TRY(cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - out_n_rows, - out_n_cols, - &alpha, - in, - n_rows, - &beta, - out, - out_n_rows, - out, - out_n_rows, - stream)); + if constexpr (std::is_same_v) { + transpose_half(handle, out_n_rows, out_n_cols, in, out); + } else { + cublasHandle_t cublas_h = resource::get_cublas_handle(handle); + RAFT_CUBLAS_TRY(cublasSetStream(cublas_h, stream)); + const math_t alpha = 1.0; + const math_t beta = 0.0; + RAFT_CUBLAS_TRY(cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + out_n_rows, + out_n_cols, + &alpha, + in, + n_rows, + &beta, + out, + out_n_rows, + out, + out_n_rows, + stream)); + } } template @@ -112,6 +189,17 @@ void transpose_row_major_impl( resource::get_cuda_stream(handle))); } +template +void transpose_row_major_impl( + raft::resources const& handle, + raft::mdspan, LayoutPolicy, AccessorPolicy> in, + raft::mdspan, LayoutPolicy, AccessorPolicy> out) +{ + auto out_n_rows = in.extent(1); + auto out_n_cols = in.extent(0); + transpose_half(handle, out_n_cols, out_n_rows, in.data_handle(), out.data_handle()); +} + template void transpose_col_major_impl( raft::resources const& handle, @@ -138,6 +226,18 @@ void transpose_col_major_impl( out.stride(1), resource::get_cuda_stream(handle))); } + +template +void transpose_col_major_impl( + raft::resources const& handle, + raft::mdspan, LayoutPolicy, AccessorPolicy> in, + raft::mdspan, LayoutPolicy, AccessorPolicy> out) +{ + auto out_n_rows = in.extent(1); + auto out_n_cols = in.extent(0); + transpose_half(handle, out_n_rows, out_n_cols, in.data_handle(), out.data_handle()); +} + }; // end namespace detail }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh index 7b8d35706b..5444d0c861 100644 --- a/cpp/include/raft/linalg/gemm.cuh +++ b/cpp/include/raft/linalg/gemm.cuh @@ -41,7 +41,10 @@ namespace raft::linalg { * @brief the wrapper of cublas gemm function * It computes the following equation: C = alpha .* opA(A) * opB(B) + beta .* C * - * @tparam math_t the element type + * @tparam A_t the element type of A + * @tparam B_t the element type of B + * @tparam C_t the element type of C + * @tparam S_t the element type of alpha and beta * @tparam DevicePointerMode whether pointers alpha, beta point to device memory * @param [in] handle raft handle * @param [in] trans_a cublas transpose op for A @@ -59,20 +62,20 @@ namespace raft::linalg { * @param [in] ldc leading dimension of C * @param [in] stream */ -template +template void gemm(raft::resources const& handle, const bool trans_a, const bool trans_b, const int m, const int n, const int k, - const math_t* alpha, - const math_t* A, + const S_t* alpha, + const A_t* A, const int lda, - const math_t* B, + const B_t* B, const int ldb, - const math_t* beta, - math_t* C, + const S_t* beta, + C_t* C, const int ldc, cudaStream_t stream) { @@ -83,7 +86,10 @@ void gemm(raft::resources const& handle, /** * @brief the wrapper of cublas gemm function * It computes the following equation: D = alpha . opA(A) * opB(B) + beta . 
C - * @tparam math_t the type of input/output matrices + * @tparam A_t the element type of A + * @tparam B_t the element type of B + * @tparam C_t the element type of C + * @tparam S_t the element type of alpha and beta * @param handle raft handle * @param a input matrix * @param n_rows_a number of rows of A @@ -98,19 +104,19 @@ void gemm(raft::resources const& handle, * @param beta scalar * @param stream cuda stream */ -template +template void gemm(raft::resources const& handle, - const math_t* a, + const A_t* a, int n_rows_a, int n_cols_a, - const math_t* b, - math_t* c, + const B_t* b, + C_t* c, int n_rows_c, int n_cols_c, cublasOperation_t trans_a, cublasOperation_t trans_b, - math_t alpha, - math_t beta, + S_t alpha, + S_t beta, cudaStream_t stream) { detail::legacy_gemm( @@ -120,7 +126,9 @@ void gemm(raft::resources const& handle, /** * @brief the wrapper of cublas gemm function * It computes the following equation: D = alpha . opA(A) * opB(B) + beta . C - * @tparam math_t the type of input/output matrices + * @tparam A_t the element type of A + * @tparam B_t the element type of B + * @tparam C_t the element type of C * @param handle raft handle * @param a input matrix * @param n_rows_a number of rows of A @@ -133,13 +141,13 @@ void gemm(raft::resources const& handle, * @param trans_b cublas transpose op for B * @param stream cuda stream */ -template +template void gemm(raft::resources const& handle, - const math_t* a, + const A_t* a, int n_rows_a, int n_cols_a, - const math_t* b, - math_t* c, + const B_t* b, + C_t* c, int n_rows_c, int n_cols_c, cublasOperation_t trans_a, @@ -154,7 +162,10 @@ void gemm(raft::resources const& handle, * @brief A wrapper for CUBLS GEMM function designed for handling all possible * combinations of operand layouts. * It computes the following equation: Z = alpha . X * Y + beta . 
Z - * @tparam T Data type of input/output matrices (float/double) + * @tparam z_T the element type of z + * @tparam x_T the element type of x + * @tparam y_T the element type of y + * @tparam s_T the element type of alpha and beta, equal to z_T by default * @param handle raft handle * @param z output matrix of size M rows x N columns * @param x input matrix of size M rows x K columns @@ -169,11 +180,11 @@ void gemm(raft::resources const& handle, * @param alpha scalar * @param beta scalar */ -template +template void gemm(raft::resources const& handle, - T* z, - T* x, - T* y, + z_T* z, + x_T* x, + y_T* y, int _M, int _N, int _K, @@ -181,10 +192,10 @@ void gemm(raft::resources const& handle, bool isXColMajor, bool isYColMajor, cudaStream_t stream, - T alpha = T(1.0), - T beta = T(0.0)) + s_T alpha = s_T(1.0), + s_T beta = s_T(0.0)) { - return detail::legacy_gemm( + return detail::legacy_gemm( handle, z, x, y, _M, _N, _K, isZColMajor, isXColMajor, isYColMajor, stream, &alpha, &beta); } diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.cuh index 97a5d6135d..4270149793 100644 --- a/cpp/include/raft/linalg/norm.cuh +++ b/cpp/include/raft/linalg/norm.cuh @@ -41,6 +41,7 @@ namespace linalg { * @tparam Type the data type * @tparam Lambda device final lambda * @tparam IdxType Integer type used to for addressing + * @tparam OutType output type, equal to Type by default * @param dots the output vector of row-wise dot products * @param data the input matrix * @param D number of columns of data @@ -50,8 +51,11 @@ namespace linalg { * @param stream cuda stream where to launch work * @param fin_op the final lambda op */ -template -void rowNorm(Type* dots, +template +void rowNorm(OutType* dots, const Type* data, IdxType D, IdxType N, @@ -68,6 +72,7 @@ void rowNorm(Type* dots, * @tparam Type the data type * @tparam Lambda device final lambda * @tparam IdxType Integer type used to for addressing + * @tparam OutType output type, equal to Type by default * @param dots the output vector of column-wise dot products * @param data the input matrix * @param D number of columns of data @@ -77,8 +82,11 @@ void rowNorm(Type* dots, * @param stream cuda stream where to launch work * @param fin_op the final lambda op */ -template -void colNorm(Type* dots, +template +void colNorm(OutType* dots, const Type* data, IdxType D, IdxType N, @@ -97,7 +105,8 @@ void colNorm(Type* dots, /** * @brief Compute norm of the input matrix and perform fin_op - * @tparam ElementType Input/Output data type + * @tparam ElementType Input data type + * @tparam OutType output data type * @tparam LayoutPolicy the layout of input (raft::row_major or raft::col_major) * @tparam IdxType Integer type used to for addressing * @tparam Lambda device final lambda @@ -110,12 +119,13 @@ void colNorm(Type* dots, * @param[in] fin_op the final lambda op */ template void norm(raft::resources const& handle, raft::device_matrix_view in, - raft::device_vector_view out, + raft::device_vector_view out, NormType type, Apply apply, Lambda fin_op = raft::identity_op()) diff --git a/cpp/include/raft/random/detail/rng_device.cuh b/cpp/include/raft/random/detail/rng_device.cuh index 12c67679ba..ffbb87bd0c 100644 --- a/cpp/include/raft/random/detail/rng_device.cuh +++ b/cpp/include/raft/random/detail/rng_device.cuh @@ -22,6 +22,8 @@ #include +#include + #include #include @@ -504,6 +506,12 @@ struct PhiloxGenerator { return ret; } + DI half next_half() + { + float ret = next_float(); + return __float2half(ret); + } + DI void next(float& ret) { // 
ret = curand_uniform(&(this->philox_state)); @@ -516,6 +524,12 @@ struct PhiloxGenerator { ret = next_double(); } + DI void next(half& ret) + { + // ret = curand_uniform_double(&(this->philox_state)); + ret = next_half(); + } + DI void next(uint32_t& ret) { ret = next_u32(); } DI void next(uint64_t& ret) { ret = next_u64(); } DI void next(int32_t& ret) { ret = next_i32(); } @@ -636,6 +650,12 @@ struct PCGenerator { return ret; } + HDI half next_half() + { + float ret = next_float(); + return __float2half(ret); + } + HDI void next(uint32_t& ret) { ret = next_u32(); } HDI void next(uint64_t& ret) { ret = next_u64(); } HDI void next(int32_t& ret) { ret = next_i32(); } @@ -643,6 +663,7 @@ struct PCGenerator { HDI void next(float& ret) { ret = next_float(); } HDI void next(double& ret) { ret = next_double(); } + HDI void next(half& ret) { ret = next_half(); } /** @} */ diff --git a/cpp/include/raft/random/detail/rng_impl.cuh b/cpp/include/raft/random/detail/rng_impl.cuh index 61a944e9b6..88654dbe5d 100644 --- a/cpp/include/raft/random/detail/rng_impl.cuh +++ b/cpp/include/raft/random/detail/rng_impl.cuh @@ -30,6 +30,7 @@ #include #include +#include namespace raft { namespace random { @@ -85,7 +86,7 @@ template void uniform( RngState& rng_state, OutType* ptr, LenType len, OutType start, OutType end, cudaStream_t stream) { - static_assert(std::is_floating_point::value, + static_assert(std::is_floating_point::value || std::is_same_v, "Type for 'uniform' can only be floating point!"); UniformDistParams params; params.start = start; diff --git a/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh b/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh index b1b0291a85..769d5de9be 100644 --- a/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh +++ b/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh @@ -66,24 +66,30 @@ RAFT_KERNEL __launch_bounds__(calc_nnz_by_rows_tpb) calc_nnz_by_rows_kernel(cons index_t e_bit = s_bit + num_cols; index_t l_sum = 0; + int s_gap = 0; + int e_gap = 0; + while (offset < num_cols) { index_t bitmap_idx = lane_id + (s_bit + offset) / BITS_PER_BITMAP; std::remove_const_t l_bitmap = 0; if (bitmap_idx * BITS_PER_BITMAP < e_bit) { l_bitmap = bitmap[bitmap_idx]; } - if (s_bit > bitmap_idx * BITS_PER_BITMAP) { - l_bitmap >>= (s_bit - bitmap_idx * BITS_PER_BITMAP); - l_bitmap <<= (s_bit - bitmap_idx * BITS_PER_BITMAP); - } + offset += BITS_PER_BITMAP * warpSize; - if ((bitmap_idx + 1) * BITS_PER_BITMAP > e_bit) { - l_bitmap <<= ((bitmap_idx + 1) * BITS_PER_BITMAP - e_bit); - l_bitmap >>= ((bitmap_idx + 1) * BITS_PER_BITMAP - e_bit); + s_gap = s_bit - bitmap_idx * BITS_PER_BITMAP; + if (s_gap > 0) { + l_bitmap >>= s_gap; + l_bitmap <<= s_gap; + offset -= s_gap; } + e_gap = (bitmap_idx + 1) * BITS_PER_BITMAP - e_bit; + if (e_gap > 0) { + l_bitmap <<= e_gap; + l_bitmap >>= e_gap; + } l_sum += static_cast(raft::detail::popc(l_bitmap)); - offset += BITS_PER_BITMAP * warpSize; } l_sum = cg::reduce(tile, l_sum, cg::plus()); diff --git a/cpp/include/raft/sparse/detail/cusparse_wrappers.h b/cpp/include/raft/sparse/detail/cusparse_wrappers.h index ae552cc687..53a78a8f56 100644 --- a/cpp/include/raft/sparse/detail/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/detail/cusparse_wrappers.h @@ -207,6 +207,27 @@ inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, CUDA_R_64F); } template <> +inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, + int64_t cols, + int64_t nnz, + int32_t* csrRowOffsets, + int32_t* 
csrColInd, + half* csrValues) +{ + return cusparseCreateCsr(spMatDescr, + rows, + cols, + nnz, + csrRowOffsets, + csrColInd, + csrValues, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_16F); +} +template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, int64_t rows, int64_t cols, @@ -302,6 +323,16 @@ inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, { return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_64F, order); } +template <> +inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, + int64_t rows, + int64_t cols, + int64_t ld, + half* values, + cusparseOrder_t order) +{ + return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_16F, order); +} /** @} */ /** @@ -658,7 +689,7 @@ inline cusparseStatus_t cusparsesddmm(cusparseHandle_t handle, const T* beta, cusparseSpMatDescr_t matC, cusparseSDDMMAlg_t alg, - T* externalBuffer, + void* externalBuffer, cudaStream_t stream); template <> inline cusparseStatus_t cusparsesddmm(cusparseHandle_t handle, @@ -670,7 +701,7 @@ inline cusparseStatus_t cusparsesddmm(cusparseHandle_t handle, const float* beta, cusparseSpMatDescr_t matC, cusparseSDDMMAlg_t alg, - float* externalBuffer, + void* externalBuffer, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); @@ -684,7 +715,7 @@ inline cusparseStatus_t cusparsesddmm(cusparseHandle_t handle, matC, CUDA_R_32F, alg, - static_cast(externalBuffer)); + externalBuffer); } template <> inline cusparseStatus_t cusparsesddmm(cusparseHandle_t handle, @@ -696,7 +727,7 @@ inline cusparseStatus_t cusparsesddmm(cusparseHandle_t handle, const double* beta, cusparseSpMatDescr_t matC, cusparseSDDMMAlg_t alg, - double* externalBuffer, + void* externalBuffer, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); @@ -710,7 +741,34 @@ inline cusparseStatus_t cusparsesddmm(cusparseHandle_t handle, matC, CUDA_R_64F, alg, - static_cast(externalBuffer)); + externalBuffer); +} + +template <> +inline cusparseStatus_t cusparsesddmm(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const half* alpha, + const cusparseDnMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const half* beta, + cusparseSpMatDescr_t matC, + cusparseSDDMMAlg_t alg, + void* externalBuffer, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSDDMM(handle, + opA, + opB, + static_cast(alpha), + matA, + matB, + static_cast(beta), + matC, + CUDA_R_16F, + alg, + externalBuffer); } /** @} */ diff --git a/cpp/include/raft/sparse/distance/detail/utils.cuh b/cpp/include/raft/sparse/distance/detail/utils.cuh index 42b545180b..864d61ba2f 100644 --- a/cpp/include/raft/sparse/distance/detail/utils.cuh +++ b/cpp/include/raft/sparse/distance/detail/utils.cuh @@ -20,6 +20,7 @@ #include #include +#include #include namespace raft { @@ -41,8 +42,8 @@ inline int max_cols_per_block() sizeof(value_t); } -template -RAFT_KERNEL faster_dot_on_csr_kernel(value_t* __restrict__ dot, +template +RAFT_KERNEL faster_dot_on_csr_kernel(dot_t* __restrict__ dot, const value_idx* __restrict__ indptr, const value_idx* __restrict__ cols, const value_t* __restrict__ A, @@ -74,25 +75,28 @@ RAFT_KERNEL faster_dot_on_csr_kernel(value_t* __restrict__ dot, cur_row = row; } - value_t l_dot_ = 0.0; + dot_t l_dot_ = 0.0; for (value_idx k = vec_id; k < dim; k += blockDim.x) { asm("prefetch.global.L2 [%0];" ::"l"(B_col + k + blockDim.x)); - l_dot_ += 
s_A[k] * __ldcg(B_col + k); + if constexpr ((std::is_same_v && std::is_same_v)) { + l_dot_ += __half2float(s_A[k]) * __half2float(__ldcg(B_col + k)); + } else { + l_dot_ += s_A[k] * __ldcg(B_col + k); + } } - l_dot_ += __shfl_down_sync(0xffffffff, l_dot_, 16); - l_dot_ += __shfl_down_sync(0xffff, l_dot_, 8); - l_dot_ += __shfl_down_sync(0xff, l_dot_, 4); - l_dot_ += __shfl_down_sync(0xf, l_dot_, 2); - l_dot_ += __shfl_down_sync(0x3, l_dot_, 1); - if (lane_id == 0) { atomicAdd_block(dot + dot_id, l_dot_); } + typedef cub::WarpReduce WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage; + dot_t warp_sum = WarpReduce(temp_storage).Sum(l_dot_); + + if (lane_id == 0) { atomicAdd_block(dot + dot_id, warp_sum); } } } } -template +template void faster_dot_on_csr(raft::resources const& handle, - value_t* dot, + dot_t* dot, const value_idx nnz, const value_idx* indptr, const value_idx* cols, @@ -115,47 +119,47 @@ void faster_dot_on_csr(raft::resources const& handle, if (dim < 128) { constexpr int tpb = 64; cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); + &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); auto block_x = std::min(n_rows, MAX_ROW_PER_ITER); auto block_y = (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x; dim3 blocks(block_x, block_y, 1); - faster_dot_on_csr_kernel + faster_dot_on_csr_kernel <<>>(dot, indptr, cols, A, B, nnz, n_rows, dim); } else if (dim < 256) { constexpr int tpb = 128; cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); + &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); auto block_x = std::min(n_rows, MAX_ROW_PER_ITER); auto block_y = (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x; dim3 blocks(block_x, block_y, 1); - faster_dot_on_csr_kernel + faster_dot_on_csr_kernel <<>>(dot, indptr, cols, A, B, nnz, n_rows, dim); } else if (dim < 512) { constexpr int tpb = 256; cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); + &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); auto block_x = std::min(n_rows, MAX_ROW_PER_ITER); auto block_y = (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x; dim3 blocks(block_x, block_y, 1); - faster_dot_on_csr_kernel + faster_dot_on_csr_kernel <<>>(dot, indptr, cols, A, B, nnz, n_rows, dim); } else { constexpr int tpb = 512; cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); + &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); auto block_x = std::min(n_rows, MAX_ROW_PER_ITER); auto block_y = (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x; dim3 blocks(block_x, block_y, 1); - faster_dot_on_csr_kernel + faster_dot_on_csr_kernel <<>>(dot, indptr, cols, A, B, nnz, n_rows, dim); } diff --git a/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh b/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh index ef74316d04..276960628d 100644 --- a/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh +++ b/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh @@ -37,14 +37,14 @@ namespace sparse { namespace linalg { namespace detail { -template +template void masked_matmul(raft::resources const& handle, raft::device_matrix_view& A, raft::device_matrix_view& B, raft::core::bitmap_view& mask, - raft::device_csr_matrix_view& C, - 
std::optional> alpha, - std::optional> beta) + raft::device_csr_matrix_view& C, + std::optional> alpha, + std::optional> beta) { index_t m = A.extent(0); index_t n = B.extent(0); @@ -60,24 +60,24 @@ void masked_matmul(raft::resources const& handle, auto stream = raft::resource::get_cuda_stream(handle); - auto C_matrix = raft::make_device_csr_matrix(handle, compressed_C_view); + auto C_matrix = raft::make_device_csr_matrix(handle, compressed_C_view); // fill C raft::sparse::convert::bitmap_to_csr(handle, mask, C_matrix); if (m > 10 || alpha.has_value() || beta.has_value()) { - auto C_view = raft::make_device_csr_matrix_view( + auto C_view = raft::make_device_csr_matrix_view( C.get_elements().data(), compressed_C_view); // create B col_major view auto B_col_major = raft::make_device_matrix_view( B.data_handle(), dim, n); - value_t default_alpha = static_cast(1.0f); - value_t default_beta = static_cast(0.0f); + output_t default_alpha = static_cast(1.0f); + output_t default_beta = static_cast(0.0f); - if (!alpha.has_value()) { alpha = raft::make_host_scalar_view(&default_alpha); } - if (!beta.has_value()) { beta = raft::make_host_scalar_view(&default_beta); } + if (!alpha.has_value()) { alpha = raft::make_host_scalar_view(&default_alpha); } + if (!beta.has_value()) { beta = raft::make_host_scalar_view(&default_beta); } raft::sparse::linalg::sddmm(handle, A, diff --git a/cpp/include/raft/sparse/linalg/detail/sddmm.hpp b/cpp/include/raft/sparse/linalg/detail/sddmm.hpp index 5088a20f46..f2e4aba644 100644 --- a/cpp/include/raft/sparse/linalg/detail/sddmm.hpp +++ b/cpp/include/raft/sparse/linalg/detail/sddmm.hpp @@ -35,11 +35,7 @@ namespace detail { * It computes the following equation: C = alpha · (op_a(A) * op_b(B) ∘ spy(C)) + beta · C * where A,B are device matrix views and C is a CSR device matrix view * - * @tparam ValueType Data type of input/output matrices (float/double) - * @tparam IndexType Type of C - * @tparam LayoutPolicyA layout of A - * @tparam LayoutPolicyB layout of B - * @tparam NZType Type of C + * @tparam OutputType Data type of input/output matrices (float/double) * * @param[in] handle raft resource handle * @param[in] descr_a input dense descriptor @@ -50,15 +46,15 @@ namespace detail { * @param[in] alpha scalar pointer * @param[in] beta scalar pointer */ -template +template void sddmm(raft::resources const& handle, cusparseDnMatDescr_t& descr_a, cusparseDnMatDescr_t& descr_b, cusparseSpMatDescr_t& descr_c, cusparseOperation_t op_a, cusparseOperation_t op_b, - const ValueType* alpha, - const ValueType* beta) + const OutputType* alpha, + const OutputType* beta) { auto alg = CUSPARSE_SDDMM_ALG_DEFAULT; size_t bufferSize; @@ -78,7 +74,7 @@ void sddmm(raft::resources const& handle, resource::sync_stream(handle); - rmm::device_uvector tmp(bufferSize, resource::get_cuda_stream(handle)); + rmm::device_uvector tmp(bufferSize, resource::get_cuda_stream(handle)); RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsesddmm(resource::get_cusparse_handle(handle), op_a, @@ -89,7 +85,7 @@ void sddmm(raft::resources const& handle, beta, descr_c, alg, - tmp.data(), + reinterpret_cast(tmp.data()), resource::get_cuda_stream(handle))); } diff --git a/cpp/include/raft/sparse/linalg/masked_matmul.hpp b/cpp/include/raft/sparse/linalg/masked_matmul.hpp index 560cd3f715..6cf6e834b9 100644 --- a/cpp/include/raft/sparse/linalg/masked_matmul.hpp +++ b/cpp/include/raft/sparse/linalg/masked_matmul.hpp @@ -35,7 +35,8 @@ namespace linalg { * multiplication using the sparsity pattern provided by the mask. 
The result is scaled by alpha * and added to beta times the original matrix C. * - * @tparam value_t Data type of elements in the input/output matrices (e.g., float, double) + * @tparam value_t Data type of elements in the input matrices (e.g., half, float, double) + * @tparam output_t Data type of elements in the output matrices (e.g., float, double) * @tparam index_t Type used for matrix indices * @tparam nnz_t Type used for the number of non-zero entries in CSR format * @tparam bitmap_t Type of the bitmap used for the mask @@ -52,14 +53,14 @@ namespace linalg { * std::nullopt) * @param[in] beta Optional scalar multiplier for the original matrix C (default: 0 if std::nullopt) */ -template +template void masked_matmul(raft::resources const& handle, raft::device_matrix_view A, raft::device_matrix_view B, raft::core::bitmap_view mask, - raft::device_csr_matrix_view C, - std::optional> alpha = std::nullopt, - std::optional> beta = std::nullopt) + raft::device_csr_matrix_view C, + std::optional> alpha = std::nullopt, + std::optional> beta = std::nullopt) { detail::masked_matmul(handle, A, B, mask, C, alpha, beta); } diff --git a/cpp/include/raft/sparse/linalg/sddmm.hpp b/cpp/include/raft/sparse/linalg/sddmm.hpp index c19f1d9081..96387e6c8b 100644 --- a/cpp/include/raft/sparse/linalg/sddmm.hpp +++ b/cpp/include/raft/sparse/linalg/sddmm.hpp @@ -29,11 +29,12 @@ namespace linalg { * followed by an element-wise multiplication with the sparsity pattern of C. * It computes the following equation: C = alpha · (opA(A) * opB(B) ∘ spy(C)) + beta · C * where A,B are device matrix views and C is a CSR device matrix view - * @tparam ValueType Data type of input/output matrices (float/double) + * @tparam ValueType Data type of input/output matrices (float/double/half) * @tparam IndexType Type of C * @tparam NZType Type of C * @tparam LayoutPolicyA layout of A * @tparam LayoutPolicyB layout of B + * @tparam OutputType output type, equal to ValueType by default * @param[in] handle raft handle * @param[in] A input raft::device_matrix_view * @param[in] B input raft::device_matrix_view @@ -47,21 +48,23 @@ template + typename LayoutPolicyB, + typename OutputType> void sddmm(raft::resources const& handle, raft::device_matrix_view A, raft::device_matrix_view B, - raft::device_csr_matrix_view C, + raft::device_csr_matrix_view C, const raft::linalg::Operation opA, const raft::linalg::Operation opB, - raft::host_scalar_view alpha, - raft::host_scalar_view beta) + raft::host_scalar_view alpha, + raft::host_scalar_view beta) { RAFT_EXPECTS(raft::is_row_or_column_major(A), "A is not contiguous"); RAFT_EXPECTS(raft::is_row_or_column_major(B), "B is not contiguous"); - static_assert(std::is_same_v || std::is_same_v, - "The `ValueType` of sddmm only supports float/double."); + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v, + "The `ValueType` of sddmm only supports float/double/half."); auto descrA = detail::create_descriptor(A); auto descrB = detail::create_descriptor(B); diff --git a/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh b/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh index 7a5a217959..0cc3b864fa 100644 --- a/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh +++ b/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh @@ -69,7 +69,7 @@ struct EpsUnexpL2SqNeighborhood : public BaseClass { for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < P::AccColsPerTh; ++j) { - acc[i][j] = BaseClass::Zero; + acc[i][j] = 
BaseClass::Zero(); } } this->stsXY(); diff --git a/cpp/include/raft/util/cuda_dev_essentials.cuh b/cpp/include/raft/util/cuda_dev_essentials.cuh index bb9ebbba59..26f48af68b 100644 --- a/cpp/include/raft/util/cuda_dev_essentials.cuh +++ b/cpp/include/raft/util/cuda_dev_essentials.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,8 @@ #pragma once +#include + // This file provides a few essential functions for use in __device__ code. The // scope is necessarily limited to ensure that compilation times are minimized. // Please make sure not to include large / expensive files from here. @@ -114,4 +116,19 @@ HDI void swapVals(T& a, T& b) b = tmp; } +/** + * @brief Convert half to float + * @tparam T the datatype of the value + * @param a need to convert + */ +template +HDI auto to_float(T& a) +{ + if constexpr (std::is_same_v::type, half>) { + return __half2float(a); + } else { + return a; + } +} + } // namespace raft diff --git a/cpp/include/raft/util/cudart_utils.hpp b/cpp/include/raft/util/cudart_utils.hpp index 2b334d1bbf..f9e7f521be 100644 --- a/cpp/include/raft/util/cudart_utils.hpp +++ b/cpp/include/raft/util/cudart_utils.hpp @@ -189,7 +189,11 @@ void print_host_vector(const char* variable_name, out << variable_name << "=["; for (size_t i = 0; i < componentsCount; ++i) { if (i != 0) out << ","; - out << host_mem[i]; + if constexpr (std::is_same_v) { + out << __half2float(host_mem[i]); + } else { + out << host_mem[i]; + } } out << "];" << std::endl; } diff --git a/cpp/test/core/mdspan_copy.cu b/cpp/test/core/mdspan_copy.cu index b68ba38914..419c1e0859 100644 --- a/cpp/test/core/mdspan_copy.cu +++ b/cpp/test/core/mdspan_copy.cu @@ -161,6 +161,59 @@ TEST(MDSpanCopy, Mdspan2DDeviceDeviceCuda) } } } + +TEST(MDSpanCopy, Mdspan2DDeviceDeviceCudaHalfWithTranspose) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + res.sync_stream(); + + // Test dtype conversion with transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match(__half2float(out_right(i, j)), + __half2float(gen_unique_entry(i, j)), + CompareApprox{0.0001})); + } + } + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match(__half2float(out_left(i, j)), + __half2float(gen_unique_entry(i, j)), + CompareApprox{0.0001})); + } + } +} + TEST(MDSpanCopy, 
Mdspan3DDeviceHostCuda) { auto res = device_resources{}; diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu index e4f064949c..f91350f222 100644 --- a/cpp/test/linalg/norm.cu +++ b/cpp/test/linalg/norm.cu @@ -24,6 +24,8 @@ #include #include +#include + #include namespace raft { @@ -48,39 +50,48 @@ template } ///// Row-wise norm test definitions -template +template RAFT_KERNEL naiveRowNormKernel( - Type* dots, const Type* data, IdxT D, IdxT N, NormType type, bool do_sqrt) + OutType* dots, const Type* data, IdxT D, IdxT N, NormType type, bool do_sqrt) { - Type acc = (Type)0; + OutType acc = (OutType)0; IdxT rowStart = threadIdx.x + static_cast(blockIdx.x) * blockDim.x; if (rowStart < N) { for (IdxT i = 0; i < D; ++i) { - if (type == L2Norm) { - acc += data[rowStart * D + i] * data[rowStart * D + i]; + if constexpr (std::is_same_v) { + if (type == L2Norm) { + acc += __half2float(data[rowStart * D + i]) * __half2float(data[rowStart * D + i]); + } else { + acc += raft::abs(__half2float(data[rowStart * D + i])); + } } else { - acc += raft::abs(data[rowStart * D + i]); + if (type == L2Norm) { + acc += data[rowStart * D + i] * data[rowStart * D + i]; + } else { + acc += raft::abs(data[rowStart * D + i]); + } } } dots[rowStart] = do_sqrt ? raft::sqrt(acc) : acc; } } -template +template void naiveRowNorm( - Type* dots, const Type* data, IdxT D, IdxT N, NormType type, bool do_sqrt, cudaStream_t stream) + OutType* dots, const Type* data, IdxT D, IdxT N, NormType type, bool do_sqrt, cudaStream_t stream) { static const IdxT TPB = 64; IdxT nblks = raft::ceildiv(N, TPB); - naiveRowNormKernel<<>>(dots, data, D, N, type, do_sqrt); + naiveRowNormKernel + <<>>(dots, data, D, N, type, do_sqrt); RAFT_CUDA_TRY(cudaPeekAtLastError()); } -template -class RowNormTest : public ::testing::TestWithParam> { +template +class RowNormTest : public ::testing::TestWithParam> { public: RowNormTest() - : params(::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(resource::get_cuda_stream(handle)), data(params.rows * params.cols, stream), dots_exp(params.rows, stream), @@ -94,7 +105,7 @@ class RowNormTest : public ::testing::TestWithParam> { IdxT rows = params.rows, cols = params.cols, len = rows * cols; uniform(handle, r, data.data(), len, T(-1.0), T(1.0)); naiveRowNorm(dots_exp.data(), data.data(), cols, rows, params.type, params.do_sqrt, stream); - auto output_view = raft::make_device_vector_view(dots_act.data(), params.rows); + auto output_view = raft::make_device_vector_view(dots_act.data(), params.rows); auto input_row_major = raft::make_device_matrix_view( data.data(), params.rows, params.cols); auto input_col_major = raft::make_device_matrix_view( @@ -119,42 +130,44 @@ class RowNormTest : public ::testing::TestWithParam> { raft::resources handle; cudaStream_t stream; - NormInputs params; - rmm::device_uvector data, dots_exp, dots_act; + NormInputs params; + rmm::device_uvector data; + rmm::device_uvector dots_exp, dots_act; }; ///// Column-wise norm test definitisons -template +template RAFT_KERNEL naiveColNormKernel( - Type* dots, const Type* data, IdxT D, IdxT N, NormType type, bool do_sqrt) + OutType* dots, const Type* data, IdxT D, IdxT N, NormType type, bool do_sqrt) { IdxT colID = threadIdx.x + static_cast(blockIdx.x) * blockDim.x; if (colID >= D) return; // avoid out-of-bounds thread - Type acc = 0; + OutType acc = 0; for (IdxT i = 0; i < N; i++) { - Type v = data[colID + i * D]; + OutType v = data[colID + i * D]; acc += type == L2Norm ? 
v * v : raft::abs(v); } dots[colID] = do_sqrt ? raft::sqrt(acc) : acc; } -template +template void naiveColNorm( - Type* dots, const Type* data, IdxT D, IdxT N, NormType type, bool do_sqrt, cudaStream_t stream) + OutType* dots, const Type* data, IdxT D, IdxT N, NormType type, bool do_sqrt, cudaStream_t stream) { static const IdxT TPB = 64; IdxT nblks = raft::ceildiv(D, TPB); - naiveColNormKernel<<>>(dots, data, D, N, type, do_sqrt); + naiveColNormKernel + <<>>(dots, data, D, N, type, do_sqrt); RAFT_CUDA_TRY(cudaPeekAtLastError()); } -template -class ColNormTest : public ::testing::TestWithParam> { +template +class ColNormTest : public ::testing::TestWithParam> { public: ColNormTest() - : params(::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(resource::get_cuda_stream(handle)), data(params.rows * params.cols, stream), dots_exp(params.cols, stream), @@ -169,7 +182,7 @@ class ColNormTest : public ::testing::TestWithParam> { uniform(handle, r, data.data(), len, T(-1.0), T(1.0)); naiveColNorm(dots_exp.data(), data.data(), cols, rows, params.type, params.do_sqrt, stream); - auto output_view = raft::make_device_vector_view(dots_act.data(), params.cols); + auto output_view = raft::make_device_vector_view(dots_act.data(), params.cols); auto input_row_major = raft::make_device_matrix_view( data.data(), params.rows, params.cols); auto input_col_major = raft::make_device_matrix_view( @@ -196,8 +209,9 @@ class ColNormTest : public ::testing::TestWithParam> { raft::resources handle; cudaStream_t stream; - NormInputs params; - rmm::device_uvector data, dots_exp, dots_act; + NormInputs params; + rmm::device_uvector data; + rmm::device_uvector dots_exp, dots_act; }; ///// Row- and column-wise tests @@ -246,6 +260,19 @@ const std::vector> inputscd_i64 = {true}, {1234ULL}); +const std::vector> inputsh_i32 = + raft::util::itertools::product>( + {0.00001f}, {11, 1234}, {7, 33, 128, 500}, {L1Norm, L2Norm}, {false, true}, {true}, {1234ULL}); +const std::vector> inputsh_i64 = + raft::util::itertools::product>( + {0.00001f}, {11, 1234}, {7, 33, 128, 500}, {L1Norm, L2Norm}, {false, true}, {true}, {1234ULL}); +const std::vector> inputsch_i32 = + raft::util::itertools::product>( + {0.00001f}, {7, 33, 128, 500}, {11, 1234}, {L1Norm, L2Norm}, {false, true}, {true}, {1234ULL}); +const std::vector> inputsch_i64 = + raft::util::itertools::product>( + {0.00001f}, {7, 33, 128, 500}, {11, 1234}, {L1Norm, L2Norm}, {false, true}, {true}, {1234ULL}); + typedef RowNormTest RowNormTestF_i32; typedef RowNormTest RowNormTestD_i32; typedef RowNormTest RowNormTestF_i64; @@ -255,6 +282,11 @@ typedef ColNormTest ColNormTestD_i32; typedef ColNormTest ColNormTestF_i64; typedef ColNormTest ColNormTestD_i64; +typedef RowNormTest RowNormTestH_i32; +typedef RowNormTest RowNormTestH_i64; +typedef ColNormTest ColNormTestH_i32; +typedef ColNormTest ColNormTestH_i64; + #define ROWNORM_TEST(test_type, test_inputs) \ TEST_P(test_type, Result) \ { \ @@ -272,5 +304,10 @@ ROWNORM_TEST(ColNormTestD_i32, inputscd_i32); ROWNORM_TEST(ColNormTestF_i64, inputscf_i64); ROWNORM_TEST(ColNormTestD_i64, inputscd_i64); +ROWNORM_TEST(RowNormTestH_i32, inputsh_i32); +ROWNORM_TEST(RowNormTestH_i64, inputsh_i64); +ROWNORM_TEST(ColNormTestH_i32, inputsch_i32); +ROWNORM_TEST(ColNormTestH_i64, inputsch_i64); + } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu index f6857d3ffa..cbe869a9a5 100644 --- a/cpp/test/linalg/transpose.cu +++ 
b/cpp/test/linalg/transpose.cu @@ -25,6 +25,8 @@ #include +#include + #include namespace raft { @@ -84,6 +86,8 @@ const std::vector> inputsf2 = {{0.1f, 3 * 3, 3, 3, 1234ULL const std::vector> inputsd2 = {{0.1, 3 * 3, 3, 3, 1234ULL}}; +const std::vector> inputsh2 = {{0.1, 3 * 3, 3, 3, 1234ULL}}; + typedef TransposeTest TransposeTestValF; TEST_P(TransposeTestValF, Result) { @@ -112,10 +116,49 @@ TEST_P(TransposeTestValD, Result) raft::CompareApproxAbs(params.tolerance))); } +bool validate_half(const half* h_ref, const half* h_result, half tolerance, int len) +{ + bool success = true; + for (int i = 0; i < len; ++i) { + if (raft::abs(__half2float(h_result[i]) - __half2float(h_ref[i])) >= __half2float(tolerance)) { + success = false; + break; + } + if (!success) break; + } + return success; +} + +typedef TransposeTest TransposeTestValH; +TEST_P(TransposeTestValH, Result) +{ + half data_trans_ref_h[params.len]; + half data_trans_h[params.len]; + half data_h[params.len]; + + RAFT_CUDA_TRY(cudaMemcpyAsync(data_trans_ref_h, + data_trans_ref.data(), + params.len * sizeof(half), + cudaMemcpyDeviceToHost, + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync( + data_trans_h, data_trans.data(), params.len * sizeof(half), cudaMemcpyDeviceToHost, stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync( + data_h, data.data(), params.len * sizeof(half), cudaMemcpyDeviceToHost, stream)); + + resource::sync_stream(handle, stream); + + ASSERT_TRUE(validate_half(data_trans_ref_h, data_trans_h, params.tolerance, params.len)); + ASSERT_TRUE(validate_half(data_trans_ref_h, data_h, params.tolerance, params.len)); +} + INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValF, ::testing::ValuesIn(inputsf2)); INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValD, ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValH, ::testing::ValuesIn(inputsh2)); + namespace { /** * We hide these functions in tests for now until we have a heterogeneous mdarray diff --git a/cpp/test/sparse/masked_matmul.cu b/cpp/test/sparse/masked_matmul.cu index 0ece716a1b..f883beae32 100644 --- a/cpp/test/sparse/masked_matmul.cu +++ b/cpp/test/sparse/masked_matmul.cu @@ -24,6 +24,7 @@ #include +#include #include #include @@ -32,15 +33,15 @@ namespace raft { namespace sparse { -template +template struct MaskedMatmulInputs { - value_t tolerance; + output_t tolerance; index_t m; index_t k; index_t n; - value_t sparsity; + float sparsity; unsigned long long int seed; }; @@ -53,8 +54,13 @@ struct sum_abs_op { } }; -template -::std::ostream& operator<<(::std::ostream& os, const MaskedMatmulInputs& params) +struct float_to_half { + __host__ __device__ __half operator()(const float x) const { return __float2half(x); } +}; + +template +::std::ostream& operator<<(::std::ostream& os, + const MaskedMatmulInputs& params) { os << " m: " << params.m << "\tk: " << params.k << "\tn: " << params.n << "\tsparsity: " << params.sparsity; @@ -62,15 +68,33 @@ template return os; } +bool isCuSparseVersionGreaterThan_12_0_1() +{ + int version; + cusparseHandle_t handle; + cusparseCreate(&handle); + cusparseGetVersion(handle, &version); + + int major = version / 1000; + int minor = (version % 1000) / 100; + int patch = version % 100; + + cusparseDestroy(handle); + + return (major > 12) || (major == 12 && minor > 0) || (major == 12 && minor == 0 && patch >= 2); +} + template -class MaskedMatmulTest : public ::testing::TestWithParam> { +class MaskedMatmulTest + : public ::testing::TestWithParam> { public: MaskedMatmulTest() - : 
params(::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(resource::get_cuda_stream(handle)), a_data_d(0, resource::get_cuda_stream(handle)), b_data_d(0, resource::get_cuda_stream(handle)), @@ -142,7 +166,7 @@ class MaskedMatmulTest : public ::testing::TestWithParam& A, const std::vector& B, - std::vector& vals, + std::vector& vals, const std::vector& cols, const std::vector& row_ptrs, bool is_row_major_A, @@ -156,11 +180,15 @@ class MaskedMatmulTest : public ::testing::TestWithParam && std::is_same_v)) { + sum += __half2float(A[a_index]) * __half2float(B[b_index]); + } else { + sum += A[a_index] * B[b_index]; + } } vals[j] = sum; } @@ -183,29 +211,54 @@ class MaskedMatmulTest : public ::testing::TestWithParam(handle, 1, a_size + b_size); + auto blobs_a_b = raft::make_device_matrix(handle, 1, a_size + b_size); auto labels = raft::make_device_vector(handle, 1); - raft::random::make_blobs(blobs_a_b.data_handle(), - labels.data_handle(), - 1, - a_size + b_size, - 1, - stream, - false, - nullptr, - nullptr, - value_t(1.0), - false, - value_t(-1.0f), - value_t(1.0f), - uint64_t(2024)); - - raft::copy(a_data_h.data(), blobs_a_b.data_handle(), a_size, stream); - raft::copy(b_data_h.data(), blobs_a_b.data_handle() + a_size, b_size, stream); - - raft::copy(a_data_d.data(), blobs_a_b.data_handle(), a_size, stream); - raft::copy(b_data_d.data(), blobs_a_b.data_handle() + a_size, b_size, stream); + raft::random::make_blobs(blobs_a_b.data_handle(), + labels.data_handle(), + 1, + a_size + b_size, + 1, + stream, + false, + nullptr, + nullptr, + output_t(1.0), + false, + output_t(-1.0f), + output_t(1.0f), + uint64_t(2024)); + + if constexpr ((std::is_same_v && std::is_same_v)) { + { + thrust::device_ptr d_output_ptr = + thrust::device_pointer_cast(blobs_a_b.data_handle()); + thrust::device_ptr d_value_ptr = thrust::device_pointer_cast(a_data_d.data()); + thrust::transform(thrust::cuda::par.on(stream), + d_output_ptr, + d_output_ptr + a_size, + d_value_ptr, + float_to_half()); + } + { + thrust::device_ptr d_output_ptr = + thrust::device_pointer_cast(blobs_a_b.data_handle() + a_size); + thrust::device_ptr d_value_ptr = thrust::device_pointer_cast(b_data_d.data()); + thrust::transform(thrust::cuda::par.on(stream), + d_output_ptr, + d_output_ptr + b_size, + d_value_ptr, + float_to_half()); + } + raft::copy(a_data_h.data(), a_data_d.data(), a_size, stream); + raft::copy(b_data_h.data(), b_data_d.data(), b_size, stream); + } else { + raft::copy(a_data_h.data(), blobs_a_b.data_handle(), a_size, stream); + raft::copy(b_data_h.data(), blobs_a_b.data_handle() + a_size, b_size, stream); + + raft::copy(a_data_d.data(), blobs_a_b.data_handle(), a_size, stream); + raft::copy(b_data_d.data(), blobs_a_b.data_handle() + a_size, b_size, stream); + } resource::sync_stream(handle); @@ -213,7 +266,7 @@ class MaskedMatmulTest : public ::testing::TestWithParam c_indptr_h(params.m + 1); std::vector c_indices_h(c_true_nnz); - std::vector c_data_h(c_true_nnz); + std::vector c_data_h(c_true_nnz); cpu_convert_to_csr(bitmap_h, params.m, params.n, c_indices_h, c_indptr_h); @@ -236,7 +289,13 @@ class MaskedMatmulTest : public ::testing::TestWithParam && !isCuSparseVersionGreaterThan_12_0_1()) { + GTEST_SKIP() << "Skipping all tests for half-float as cuSparse doesn't support it."; + } + make_data(); + } void Run() { @@ -255,33 +314,33 @@ class MaskedMatmulTest : public ::testing::TestWithParam(c_indices_d.size())); - auto C = raft::make_device_csr_matrix_view(c_data_d.data(), c_structure); 
+ auto C = raft::make_device_csr_matrix_view(c_data_d.data(), c_structure); raft::sparse::linalg::masked_matmul(handle, A, B, mask, C); resource::sync_stream(handle); - ASSERT_TRUE(raft::devArrMatch(c_expected_data_d.data(), - C.get_elements().data(), - c_expected_data_d.size(), - raft::CompareApprox(params.tolerance), - stream)); + ASSERT_TRUE(raft::devArrMatch(c_expected_data_d.data(), + C.get_elements().data(), + c_expected_data_d.size(), + raft::CompareApprox(params.tolerance), + stream)); - thrust::device_ptr expected_data_ptr = + thrust::device_ptr expected_data_ptr = thrust::device_pointer_cast(c_expected_data_d.data()); - value_t sum_abs = thrust::reduce(thrust::cuda::par.on(stream), - expected_data_ptr, - expected_data_ptr + c_expected_data_d.size(), - value_t(0.0f), - sum_abs_op()); - value_t avg = sum_abs / (1.0f * c_expected_data_d.size()); - - ASSERT_GE(avg, (params.tolerance * static_cast(0.001f))); + output_t sum_abs = thrust::reduce(thrust::cuda::par.on(stream), + expected_data_ptr, + expected_data_ptr + c_expected_data_d.size(), + output_t(0.0f), + sum_abs_op()); + output_t avg = sum_abs / (1.0f * c_expected_data_d.size()); + + ASSERT_GE(avg, (params.tolerance * static_cast(0.001f))); } raft::resources handle; cudaStream_t stream; - MaskedMatmulInputs params; + MaskedMatmulInputs params; rmm::device_uvector a_data_d; rmm::device_uvector b_data_d; @@ -289,40 +348,82 @@ class MaskedMatmulTest : public ::testing::TestWithParam c_indptr_d; rmm::device_uvector c_indices_d; - rmm::device_uvector c_data_d; + rmm::device_uvector c_data_d; - rmm::device_uvector c_expected_data_d; + rmm::device_uvector c_expected_data_d; }; -using MaskedMatmulTestF = MaskedMatmulTest; +using MaskedMatmulTestF = MaskedMatmulTest; TEST_P(MaskedMatmulTestF, Result) { Run(); } -using MaskedMatmulTestD = MaskedMatmulTest; +using MaskedMatmulTestD = MaskedMatmulTest; TEST_P(MaskedMatmulTestD, Result) { Run(); } -const std::vector> sddmm_inputs_f = { +using MaskedMatmulTestH = MaskedMatmulTest; +TEST_P(MaskedMatmulTestH, Result) { Run(); } + +const std::vector> sddmm_inputs_f = { + {0.001f, 2, 255, 1023, 0.19, 1234ULL}, + {0.001f, 2, 255, 1023 * 2, 0.19, 1234ULL}, + {0.001f, 2, 255, 1023 * 3, 0.38, 1234ULL}, + {0.0001f, 10, 255, 13000, 0.01, 1234ULL}, {0.0001f, 10, 5, 32, 0.1, 1234ULL}, - {0.0001f, 1024, 32, 1024, 0.1, 1234ULL}, + {0.001f, 11, 255, 1023, 0.19, 1234ULL}, + {0.001f, 11, 255, 1023 * 2, 0.19, 1234ULL}, + {0.001f, 11, 255, 1023 * 3, 0.38, 1234ULL}, {0.0003f, 32, 1024, 1024, 0.2, 1234ULL}, + {0.0001f, 1024, 32, 1024, 0.1, 1234ULL}, {0.001f, 1024, 1024, 1024, 0.19, 1234ULL}, + {0.001f, 1023, 1023, 1023 * 3, 0.38, 1234ULL}, + {0.001f, 1025, 1025, 1025 * 3, 0.31, 1234ULL}, {0.0001f, 1024, 1024, 32, 0.3, 1234ULL}, {0.0001f, 1024, 32, 1024, 0.4, 1234ULL}, - {0.0003f, 32, 1024, 1024, 0.19, 1234ULL}, + {0.0003f, 31, 1025, 1025, 0.19, 1234ULL}, {0.001f, 1024, 1024, 1024, 0.1, 1234ULL}}; -const std::vector> sddmm_inputs_d = { - {0.0001f, 10, 5, 32, 0.01, 1234ULL}, - {0.0001f, 1024, 32, 1024, 0.1, 1234ULL}, +const std::vector> sddmm_inputs_d = { + {0.0001f, 2, 255, 1023, 0.19, 1234ULL}, + {0.0001f, 2, 255, 1023 * 2, 0.19, 1234ULL}, + {0.0001f, 2, 255, 1023 * 3, 0.38, 1234ULL}, + {0.0001f, 10, 255, 13000, 0.01, 1234ULL}, + {0.0001f, 10, 5, 32, 0.1, 1234ULL}, + {0.0001f, 11, 255, 1023, 0.19, 1234ULL}, + {0.0001f, 11, 255, 1023 * 2, 0.19, 1234ULL}, + {0.0001f, 11, 255, 1023 * 3, 0.38, 1234ULL}, {0.0001f, 32, 1024, 1024, 0.2, 1234ULL}, + {0.0001f, 1024, 32, 1024, 0.1, 1234ULL}, {0.0001f, 1024, 1024, 1024, 
0.19, 1234ULL}, + {0.0001f, 1023, 1023, 1023 * 3, 0.38, 1234ULL}, + {0.0001f, 1025, 1025, 1025 * 3, 0.31, 1234ULL}, {0.0001f, 1024, 1024, 32, 0.3, 1234ULL}, {0.0001f, 1024, 32, 1024, 0.4, 1234ULL}, - {0.0001f, 32, 1024, 1024, 0.19, 1234ULL}, + {0.0001f, 31, 1025, 1025, 0.19, 1234ULL}, {0.0001f, 1024, 1024, 1024, 0.1, 1234ULL}}; +const std::vector> sddmm_inputs_h = { + {0.001f, 2, 255, 1023, 0.19, 1234ULL}, + {0.001f, 2, 255, 1023 * 2, 0.19, 1234ULL}, + {0.001f, 2, 255, 1023 * 3, 0.38, 1234ULL}, + {0.0001f, 10, 255, 13000, 0.01, 1234ULL}, + {0.0001f, 10, 5, 32, 0.1, 1234ULL}, + {0.001f, 11, 255, 1023, 0.19, 1234ULL}, + {0.001f, 11, 255, 1023 * 2, 0.19, 1234ULL}, + {0.001f, 11, 255, 1023 * 3, 0.38, 1234ULL}, + {0.0003f, 32, 1024, 1024, 0.2, 1234ULL}, + {0.0001f, 1024, 32, 1024, 0.1, 1234ULL}, + {0.001f, 1024, 1024, 1024, 0.19, 1234ULL}, + {0.001f, 1023, 1023, 1023 * 3, 0.38, 1234ULL}, + {0.001f, 1025, 1025, 1025 * 3, 0.31, 1234ULL}, + {0.0001f, 1024, 1024, 32, 0.3, 1234ULL}, + {0.0001f, 1024, 32, 1024, 0.4, 1234ULL}, + {0.0003f, 31, 1025, 1025, 0.19, 1234ULL}, + {0.001f, 1024, 1024, 1024, 0.1, 1234ULL}}; + INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, MaskedMatmulTestF, ::testing::ValuesIn(sddmm_inputs_f)); INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, MaskedMatmulTestD, ::testing::ValuesIn(sddmm_inputs_d)); +INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, MaskedMatmulTestH, ::testing::ValuesIn(sddmm_inputs_h)); + } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/sddmm.cu b/cpp/test/sparse/sddmm.cu index 8ff20581c9..26c2c519dd 100644 --- a/cpp/test/sparse/sddmm.cu +++ b/cpp/test/sparse/sddmm.cu @@ -22,8 +22,12 @@ #include #include +#include +#include #include +#include +#include #include #include @@ -32,16 +36,16 @@ namespace raft { namespace sparse { -template +template struct SDDMMInputs { - ValueType tolerance; + OutputType tolerance; IndexType m; IndexType k; IndexType n; - ValueType alpha; - ValueType beta; + OutputType alpha; + OutputType beta; bool transpose_a; bool transpose_b; @@ -59,6 +63,10 @@ struct sum_abs_op { } }; +struct float_to_half { + __host__ __device__ __half operator()(const float x) const { return __float2half(x); } +}; + template ::std::ostream& operator<<(::std::ostream& os, const SDDMMInputs& params) { @@ -72,11 +80,12 @@ template template -class SDDMMTest : public ::testing::TestWithParam> { + typename LayoutPolicyB = raft::layout_c_contiguous, + typename OutputType = ValueType> +class SDDMMTest : public ::testing::TestWithParam> { public: SDDMMTest() - : params(::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(resource::get_cuda_stream(handle)), a_data_d(0, resource::get_cuda_stream(handle)), b_data_d(0, resource::get_cuda_stream(handle)), @@ -88,9 +97,25 @@ class SDDMMTest : public ::testing::TestWithParam 12) || (major == 12 && minor > 0) || (major == 12 && minor == 0 && patch >= 2); + } + IndexType create_sparse_matrix(IndexType m, IndexType n, - ValueType sparsity, + OutputType sparsity, std::vector& matrix) { IndexType total_elements = static_cast(m * n); @@ -119,7 +144,7 @@ class SDDMMTest : public ::testing::TestWithParam& matrix, IndexType rows, IndexType cols, - std::vector& values, + std::vector& values, std::vector& indices, std::vector& indptr) { @@ -130,7 +155,7 @@ class SDDMMTest : public ::testing::TestWithParam(1.0); + values[offset_values] = static_cast(1.0); indices[offset_values] = static_cast(j); offset_values++; } @@ -141,7 +166,7 @@ class SDDMMTest : public ::testing::TestWithParam& A, 
const std::vector& B, - std::vector& vals, + std::vector& vals, const std::vector& cols, const std::vector& row_ptrs, bool is_row_major_A, @@ -158,11 +183,15 @@ class SDDMMTest : public ::testing::TestWithParam && std::is_same_v)) { + sum += __half2float(A[a_index]) * __half2float(B[b_index]); + } else { + sum += A[a_index] * B[b_index]; + } } vals[j] = params.alpha * sum + params.beta * vals[j]; } @@ -181,29 +210,53 @@ class SDDMMTest : public ::testing::TestWithParam(handle, 1, a_size + b_size); + auto blobs_a_b = raft::make_device_matrix(handle, 1, a_size + b_size); auto labels = raft::make_device_vector(handle, 1); - raft::random::make_blobs(blobs_a_b.data_handle(), - labels.data_handle(), - 1, - a_size + b_size, - 1, - stream, - false, - nullptr, - nullptr, - ValueType(1.0), - false, - ValueType(-1.0f), - ValueType(1.0f), - uint64_t(2024)); - - raft::copy(a_data_h.data(), blobs_a_b.data_handle(), a_size, stream); - raft::copy(b_data_h.data(), blobs_a_b.data_handle() + a_size, b_size, stream); - - raft::copy(a_data_d.data(), blobs_a_b.data_handle(), a_size, stream); - raft::copy(b_data_d.data(), blobs_a_b.data_handle() + a_size, b_size, stream); + raft::random::make_blobs(blobs_a_b.data_handle(), + labels.data_handle(), + 1, + a_size + b_size, + 1, + stream, + false, + nullptr, + nullptr, + OutputType(1.0), + false, + OutputType(-1.0f), + OutputType(1.0f), + uint64_t(2024)); + if constexpr ((std::is_same_v && std::is_same_v)) { + { + thrust::device_ptr d_output_ptr = + thrust::device_pointer_cast(blobs_a_b.data_handle()); + thrust::device_ptr d_value_ptr = thrust::device_pointer_cast(a_data_d.data()); + thrust::transform(thrust::cuda::par.on(stream), + d_output_ptr, + d_output_ptr + a_size, + d_value_ptr, + float_to_half()); + } + { + thrust::device_ptr d_output_ptr = + thrust::device_pointer_cast(blobs_a_b.data_handle() + a_size); + thrust::device_ptr d_value_ptr = thrust::device_pointer_cast(b_data_d.data()); + thrust::transform(thrust::cuda::par.on(stream), + d_output_ptr, + d_output_ptr + b_size, + d_value_ptr, + float_to_half()); + } + raft::copy(a_data_h.data(), a_data_d.data(), a_size, stream); + raft::copy(b_data_h.data(), b_data_d.data(), b_size, stream); + } else { + raft::copy(a_data_h.data(), blobs_a_b.data_handle(), a_size, stream); + raft::copy(b_data_h.data(), blobs_a_b.data_handle() + a_size, b_size, stream); + + raft::copy(a_data_d.data(), blobs_a_b.data_handle(), a_size, stream); + raft::copy(b_data_d.data(), blobs_a_b.data_handle() + a_size, b_size, stream); + } resource::sync_stream(handle); @@ -213,7 +266,7 @@ class SDDMMTest : public ::testing::TestWithParam c_indptr_h(params.m + 1); std::vector c_indices_h(c_true_nnz); - std::vector c_data_h(c_true_nnz); + std::vector c_data_h(c_true_nnz); convert_to_csr(c_dense_data_h, params.m, params.n, c_data_h, c_indices_h, c_indptr_h); @@ -238,7 +291,13 @@ class SDDMMTest : public ::testing::TestWithParam && !isCuSparseVersionGreaterThan_12_0_1()) { + GTEST_SKIP() << "Skipping all tests for half-float as cuSparse doesn't support it."; + } + make_data(); + } void Run() { @@ -258,7 +317,7 @@ class SDDMMTest : public ::testing::TestWithParam(c_indices_d.size())); - auto c = raft::make_device_csr_matrix_view(c_data_d.data(), c_structure); + auto c = raft::make_device_csr_matrix_view(c_data_d.data(), c_structure); auto op_a = params.transpose_a ? 
raft::linalg::Operation::TRANSPOSE : raft::linalg::Operation::NON_TRANSPOSE; @@ -271,41 +330,41 @@ class SDDMMTest : public ::testing::TestWithParam(¶ms.alpha), - raft::make_host_scalar_view(¶ms.beta)); + raft::make_host_scalar_view(¶ms.alpha), + raft::make_host_scalar_view(¶ms.beta)); resource::sync_stream(handle); - ASSERT_TRUE(raft::devArrMatch(c_expected_data_d.data(), - c.get_elements().data(), - c_expected_data_d.size(), - raft::CompareApprox(params.tolerance), - stream)); + ASSERT_TRUE(raft::devArrMatch(c_expected_data_d.data(), + c.get_elements().data(), + c_expected_data_d.size(), + raft::CompareApprox(params.tolerance), + stream)); - thrust::device_ptr expected_data_ptr = + thrust::device_ptr expected_data_ptr = thrust::device_pointer_cast(c_expected_data_d.data()); - ValueType sum_abs = thrust::reduce(thrust::cuda::par.on(stream), - expected_data_ptr, - expected_data_ptr + c_expected_data_d.size(), - ValueType(0.0f), - sum_abs_op()); - ValueType avg = sum_abs / (1.0f * c_expected_data_d.size()); - - ASSERT_GE(avg, (params.tolerance * static_cast(0.001f))); + OutputType sum_abs = thrust::reduce(thrust::cuda::par.on(stream), + expected_data_ptr, + expected_data_ptr + c_expected_data_d.size(), + OutputType(0.0f), + sum_abs_op()); + OutputType avg = sum_abs / (1.0f * c_expected_data_d.size()); + + ASSERT_GE(avg, (params.tolerance * static_cast(0.001f))); } raft::resources handle; cudaStream_t stream; - SDDMMInputs params; + SDDMMInputs params; rmm::device_uvector a_data_d; rmm::device_uvector b_data_d; rmm::device_uvector c_indptr_d; rmm::device_uvector c_indices_d; - rmm::device_uvector c_data_d; + rmm::device_uvector c_data_d; - rmm::device_uvector c_expected_data_d; + rmm::device_uvector c_expected_data_d; }; using SDDMMTestF_Row_Col = SDDMMTest; @@ -332,6 +391,18 @@ TEST_P(SDDMMTestD_Row_Row, Result) { Run(); } using SDDMMTestD_Col_Col = SDDMMTest; TEST_P(SDDMMTestD_Col_Col, Result) { Run(); } +using SDDMMTestHF_Row_Col = SDDMMTest; +TEST_P(SDDMMTestHF_Row_Col, Result) { Run(); } + +using SDDMMTestHF_Col_Row = SDDMMTest; +TEST_P(SDDMMTestHF_Col_Row, Result) { Run(); } + +using SDDMMTestHF_Row_Row = SDDMMTest; +TEST_P(SDDMMTestHF_Row_Row, Result) { Run(); } + +using SDDMMTestHF_Col_Col = SDDMMTest; +TEST_P(SDDMMTestHF_Col_Col, Result) { Run(); } + const std::vector> sddmm_inputs_f = { {0.0001f, 10, 5, 32, 1.0, 0.0, false, false, 0.01, 1234ULL}, {0.0001f, 1024, 32, 1024, 0.3, 0.0, true, false, 0.1, 1234ULL}, @@ -352,6 +423,16 @@ const std::vector> sddmm_inputs_d = { {0.0001f, 32, 1024, 1024, 2.0, 0.2, false, true, 0.19, 1234ULL}, {0.0001f, 1024, 1024, 1024, 0.0, 1.2, true, true, 0.1, 1234ULL}}; +const std::vector> sddmm_inputs_h_f = { + {0.0001f, 10, 5, 32, 1.0, 0.0, false, false, 0.01, 1234ULL}, + {0.0001f, 1024, 32, 1024, 0.3, 0.0, true, false, 0.1, 1234ULL}, + {0.0003f, 32, 1024, 1024, 1.0, 0.3, false, true, 0.2, 1234ULL}, + {0.001f, 1024, 1024, 1024, 0.2, 0.2, true, true, 0.19, 1234ULL}, + {0.0001f, 1024, 1024, 32, 0.1, 0.2, false, false, 0.3, 1234ULL}, + {0.0001f, 1024, 32, 1024, 1.0, 0.3, true, false, 0.4, 1234ULL}, + {0.0003f, 32, 1024, 1024, 2.0, 0.2, false, true, 0.19, 1234ULL}, + {0.001f, 1024, 1024, 1024, 0.0, 1.2, true, true, 0.1, 1234ULL}}; + INSTANTIATE_TEST_CASE_P(SDDMMTest, SDDMMTestF_Row_Col, ::testing::ValuesIn(sddmm_inputs_f)); INSTANTIATE_TEST_CASE_P(SDDMMTest, SDDMMTestF_Col_Row, ::testing::ValuesIn(sddmm_inputs_f)); INSTANTIATE_TEST_CASE_P(SDDMMTest, SDDMMTestF_Row_Row, ::testing::ValuesIn(sddmm_inputs_f)); @@ -362,5 +443,10 @@ 
INSTANTIATE_TEST_CASE_P(SDDMMTest, SDDMMTestD_Col_Row, ::testing::ValuesIn(sddmm INSTANTIATE_TEST_CASE_P(SDDMMTest, SDDMMTestD_Row_Row, ::testing::ValuesIn(sddmm_inputs_d)); INSTANTIATE_TEST_CASE_P(SDDMMTest, SDDMMTestD_Col_Col, ::testing::ValuesIn(sddmm_inputs_d)); +INSTANTIATE_TEST_CASE_P(SDDMMTest, SDDMMTestHF_Row_Col, ::testing::ValuesIn(sddmm_inputs_h_f)); +INSTANTIATE_TEST_CASE_P(SDDMMTest, SDDMMTestHF_Col_Row, ::testing::ValuesIn(sddmm_inputs_h_f)); +INSTANTIATE_TEST_CASE_P(SDDMMTest, SDDMMTestHF_Row_Row, ::testing::ValuesIn(sddmm_inputs_h_f)); +INSTANTIATE_TEST_CASE_P(SDDMMTest, SDDMMTestHF_Col_Col, ::testing::ValuesIn(sddmm_inputs_h_f)); + } // namespace sparse } // namespace raft From f1cc0fb79d2f40f6cc266c60928ed70ea64c843a Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Thu, 22 Aug 2024 11:51:31 -0400 Subject: [PATCH 07/31] Use CUDA math wheels (#2415) Use CUDA math wheels to reduce wheel size by not statically linking CUDA math libraries. Contributes to https://github.com/rapidsai/build-planning/issues/35 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Bradley Dice (https://github.com/bdice) - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/raft/pull/2415 --- ci/build_wheel.sh | 21 ++++++++++++++++++++- ci/build_wheel_pylibraft.sh | 11 ++++++++++- dependencies.yaml | 31 +++++++++++++++++++++++++++++++ python/pylibraft/CMakeLists.txt | 18 ++++++++++++++++++ python/pylibraft/pyproject.toml | 6 +++++- 5 files changed, 84 insertions(+), 3 deletions(-) diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 62d93a668e..e7ae52f33a 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -20,10 +20,29 @@ rapids-generate-version > VERSION cd "${package_dir}" +case "${RAPIDS_CUDA_VERSION}" in + 12.*) + EXCLUDE_ARGS=( + --exclude "libcublas.so.12" + --exclude "libcublasLt.so.12" + --exclude "libcurand.so.10" + --exclude "libcusolver.so.11" + --exclude "libcusparse.so.12" + --exclude "libnvJitLink.so.12" + --exclude "libucp.so.0" + ) + ;; + 11.*) + EXCLUDE_ARGS=( + --exclude "libucp.so.0" + ) + ;; +esac + # Hardcode the output dir python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check mkdir -p final_dist -python -m auditwheel repair -w final_dist --exclude "libucp.so.0" dist/* +python -m auditwheel repair -w final_dist "${EXCLUDE_ARGS[@]}" dist/* RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist diff --git a/ci/build_wheel_pylibraft.sh b/ci/build_wheel_pylibraft.sh index 895c311f46..ce9f0ed172 100755 --- a/ci/build_wheel_pylibraft.sh +++ b/ci/build_wheel_pylibraft.sh @@ -3,7 +3,16 @@ set -euo pipefail +case "${RAPIDS_CUDA_VERSION}" in + 12.*) + EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=ON" + ;; + 11.*) + EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=OFF" + ;; +esac + # Set up skbuild options. 
Enable sccache in skbuild config options -export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_RAFT_CPP=OFF" +export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_RAFT_CPP=OFF${EXTRA_CMAKE_ARGS}" ci/build_wheel.sh pylibraft python/pylibraft diff --git a/dependencies.yaml b/dependencies.yaml index 630b5a50c9..92c6d98414 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -83,6 +83,7 @@ files: extras: table: project includes: + - cuda_wheels - run_pylibraft py_test_pylibraft: output: pyproject @@ -377,6 +378,36 @@ dependencies: - *libcusolver114 - *libcusparse_dev114 - *libcusparse114 + cuda_wheels: + specific: + - output_types: pyproject + matrices: + - matrix: + cuda: "12.*" + use_cuda_wheels: "true" + packages: + - nvidia-cublas-cu12 + - nvidia-curand-cu12 + - nvidia-cusolver-cu12 + - nvidia-cusparse-cu12 + # CUDA 11 does not provide wheels, so use the system libraries instead + - matrix: + cuda: "11.*" + use_cuda_wheels: "true" + packages: + # if use_cuda_wheels=false is provided, do not add dependencies on any CUDA wheels + # (e.g. for DLFW and pip devcontainers) + - matrix: + use_cuda_wheels: "false" + packages: + # if no matching matrix selectors passed, list the unsuffixed packages + # (just as a source of documentation, as this populates pyproject.toml in source control) + - matrix: + packages: + - nvidia-cublas + - nvidia-curand + - nvidia-cusolver + - nvidia-cusparse depends_on_cupy: common: diff --git a/python/pylibraft/CMakeLists.txt b/python/pylibraft/CMakeLists.txt index 6cbe8e4cbf..c286d3debf 100644 --- a/python/pylibraft/CMakeLists.txt +++ b/python/pylibraft/CMakeLists.txt @@ -30,6 +30,7 @@ project( option(FIND_RAFT_CPP "Search for existing RAFT C++ installations before defaulting to local files" ON ) +option(USE_CUDA_MATH_WHEELS "Use the CUDA math wheels instead of the system libraries" OFF) # If the user requested it we attempt to find RAFT. if(FIND_RAFT_CPP) @@ -48,15 +49,32 @@ endif() include(rapids-cython-core) if(NOT raft_FOUND) + find_package(CUDAToolkit REQUIRED) + set(BUILD_TESTS OFF) set(BUILD_PRIMS_BENCH OFF) set(BUILD_ANN_BENCH OFF) set(RAFT_COMPILE_LIBRARY ON) set(CUDA_STATIC_RUNTIME ON) set(CUDA_STATIC_MATH_LIBRARIES ON) + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.0) + set(CUDA_STATIC_MATH_LIBRARIES OFF) + elseif(USE_CUDA_MATH_WHEELS) + message(FATAL_ERROR "Cannot use CUDA math wheels with CUDA < 12.0") + endif() add_subdirectory(../../cpp raft-cpp EXCLUDE_FROM_ALL) + if(NOT CUDA_STATIC_MATH_LIBRARIES AND USE_CUDA_MATH_WHEELS) + set_property(TARGET raft_lib PROPERTY INSTALL_RPATH + "$ORIGIN/../nvidia/cublas/lib" + "$ORIGIN/../nvidia/curand/lib" + "$ORIGIN/../nvidia/cusolver/lib" + "$ORIGIN/../nvidia/cusparse/lib" + "$ORIGIN/../nvidia/nvjitlink/lib" + ) + endif() + # When building the C++ libraries from source we must copy libraft.so alongside the # pairwise_distance and random Cython libraries TODO: when we have a single 'compiled' raft # library, we shouldn't need this diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml index 4c1d8e8fa1..9a826e53c6 100644 --- a/python/pylibraft/pyproject.toml +++ b/python/pylibraft/pyproject.toml @@ -33,6 +33,10 @@ requires-python = ">=3.9" dependencies = [ "cuda-python", "numpy>=1.23,<2.0a0", + "nvidia-cublas", + "nvidia-curand", + "nvidia-cusolver", + "nvidia-cusparse", "rmm==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
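# NOTE: the unsuffixed "nvidia-cublas"-style names above are the placeholder
# entries kept in source control; rapids-dependency-file-generator swaps in the
# CUDA-suffixed wheels (for example nvidia-cublas-cu12) when wheels are built
# with the use_cuda_wheels matrix entry, matching the cuda_wheels section added
# to dependencies.yaml earlier in this patch.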
classifiers = [ @@ -124,7 +128,7 @@ requires = [ "rmm==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. dependencies-file = "../../dependencies.yaml" -matrix-entry = "cuda_suffixed=true" +matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" [tool.pytest.ini_options] filterwarnings = [ From 2f587b13b62d2dd8b15bf6d2902c65d67e7afc7b Mon Sep 17 00:00:00 2001 From: Jinsol Park Date: Thu, 22 Aug 2024 17:17:30 -0700 Subject: [PATCH 08/31] [FEA] Batching NN Descent (#2403) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR implements batching NN Descent. It will be helpful for reducing device memory usages for large datasets (specifically if the dataset is kept on host). `index_params` now has... - `n_clusters`: number of clusters to make. Larger clusters reduce device memory usage. Default is 1, in which case it doesn't do the batched NND. ### Notes - The batching approach may have duplicate indices in the knn graph (in rare cases) because sometimes distances calculated for the same pair may be slightly different. This results in putting the same index far apart after sorting by distances, making it difficult to get unique indices (which is done by looking at 2 indices before the current one). - handled by adding a `max_duplicates` for `check_unique_indices` in tests ### Benchmarks - Dataset for NND (no batch and batch) is on host - Dataset for brute force knn is on device (but still won't be able to run with large datasets even if the data is put on the host because it brings the entire dataset to device anyway) - The dataset is just a slice of the wiki-all dataset (88M, 768) to test for different sizes Screenshot 2024-08-02 at 8 35 58 AM Authors: - Jinsol Park (https://github.com/jinsolp) Approvers: - Divye Gala (https://github.com/divyegala) - Corey J. 
Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/2403 --- .../raft/neighbors/detail/nn_descent.cuh | 48 +- .../neighbors/detail/nn_descent_batch.cuh | 701 ++++++++++++++++++ cpp/include/raft/neighbors/nn_descent.cuh | 30 +- .../raft/neighbors/nn_descent_types.hpp | 23 + cpp/test/CMakeLists.txt | 36 +- cpp/test/neighbors/ann_nn_descent.cuh | 155 +++- .../test_batch_float_uint32_t.cu | 30 + cpp/test/neighbors/ann_utils.cuh | 18 +- 8 files changed, 981 insertions(+), 60 deletions(-) create mode 100644 cpp/include/raft/neighbors/detail/nn_descent_batch.cuh create mode 100644 cpp/test/neighbors/ann_nn_descent/test_batch_float_uint32_t.cu diff --git a/cpp/include/raft/neighbors/detail/nn_descent.cuh b/cpp/include/raft/neighbors/detail/nn_descent.cuh index 9c37ee146d..02610f9afb 100644 --- a/cpp/include/raft/neighbors/detail/nn_descent.cuh +++ b/cpp/include/raft/neighbors/detail/nn_descent.cuh @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -344,7 +345,9 @@ struct GnndGraph { ~GnndGraph(); }; -template +template > class GNND { public: GNND(raft::resources const& res, const BuildConfig& build_config); @@ -356,9 +359,10 @@ class GNND { Index_t* output_graph, bool return_distances, DistData_t* output_distances, - epilogue_op distance_epilogue = raft::identity_op()); + epilogue_op distance_epilogue = DistEpilogue()); ~GNND() = default; using ID_t = InternalID_t; + void reset(raft::resources const& res); private: void add_reverse_edges(Index_t* graph_ptr, @@ -366,7 +370,8 @@ class GNND { Index_t* d_rev_graph_ptr, int2* list_sizes, cudaStream_t stream = 0); - void local_join(cudaStream_t stream = 0, epilogue_op distance_epilogue = raft::identity_op()); + void local_join(cudaStream_t stream = 0, + epilogue_op distance_epilogue = DistEpilogue()); raft::resources const& res; @@ -701,7 +706,7 @@ __device__ __forceinline__ void remove_duplicates( // is 1024 and 1536 respectively, which means the bounds don't work anymore template , - typename epilogue_op = raft::identity_op> + typename epilogue_op = DistEpilogue> RAFT_KERNEL #ifdef __CUDA_ARCH__ #if (__CUDA_ARCH__) == 750 || ((__CUDA_ARCH__) >= 860 && (__CUDA_ARCH__) <= 890) @@ -1183,18 +1188,23 @@ GNND::GNND(raft::resources const& res, d_list_sizes_old_{raft::make_device_vector(res, nrow_)} { static_assert(NUM_SAMPLES <= 32); - - thrust::fill(thrust::device, - dists_buffer_.data_handle(), - dists_buffer_.data_handle() + dists_buffer_.size(), - std::numeric_limits::max()); - thrust::fill(thrust::device, - reinterpret_cast(graph_buffer_.data_handle()), - reinterpret_cast(graph_buffer_.data_handle()) + graph_buffer_.size(), - std::numeric_limits::max()); - thrust::fill(thrust::device, d_locks_.data_handle(), d_locks_.data_handle() + d_locks_.size(), 0); + raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); + auto graph_buffer_view = raft::make_device_matrix_view( + reinterpret_cast(graph_buffer_.data_handle()), nrow_, DEGREE_ON_DEVICE); + raft::matrix::fill(res, graph_buffer_view, std::numeric_limits::max()); + raft::matrix::fill(res, d_locks_.view(), 0); }; +template +void GNND::reset(raft::resources const& res) +{ + raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); + auto graph_buffer_view = raft::make_device_matrix_view( + reinterpret_cast(graph_buffer_.data_handle()), nrow_, DEGREE_ON_DEVICE); + raft::matrix::fill(res, graph_buffer_view, std::numeric_limits::max()); + raft::matrix::fill(res, d_locks_.view(), 0); +} + template void 
GNND::add_reverse_edges(Index_t* graph_ptr, Index_t* h_rev_graph_ptr, @@ -1246,6 +1256,7 @@ void GNND::build(Data_t* data, cudaStream_t stream = raft::resource::get_cuda_stream(res); nrow_ = nrow; + graph_.nrow = nrow; graph_.h_graph = (InternalID_t*)output_graph; cudaPointerAttributes data_ptr_attr; @@ -1384,6 +1395,7 @@ void GNND::build(Data_t* data, static_cast(build_config_.output_graph_degree)}; raft::matrix::slice( res, raft::make_const_mdspan(graph_d_dists.view()), output_dist_view, coords); + raft::resource::sync_stream(res); } Index_t* graph_shrink_buffer = (Index_t*)graph_.h_dists.data_handle(); @@ -1414,14 +1426,14 @@ void GNND::build(Data_t* data, template , typename Accessor = host_device_accessor, memory_type::host>> void build(raft::resources const& res, const index_params& params, mdspan, row_major, Accessor> dataset, index& idx, - epilogue_op distance_epilogue = raft::identity_op()) + epilogue_op distance_epilogue = DistEpilogue()) { RAFT_EXPECTS(dataset.extent(0) < std::numeric_limits::max() - 1, "The dataset size for GNND should be less than %d", @@ -1491,13 +1503,13 @@ void build(raft::resources const& res, template , typename Accessor = host_device_accessor, memory_type::host>> index build(raft::resources const& res, const index_params& params, mdspan, row_major, Accessor> dataset, - epilogue_op distance_epilogue = raft::identity_op()) + epilogue_op distance_epilogue = DistEpilogue()) { size_t intermediate_degree = params.intermediate_graph_degree; size_t graph_degree = params.graph_degree; diff --git a/cpp/include/raft/neighbors/detail/nn_descent_batch.cuh b/cpp/include/raft/neighbors/detail/nn_descent_batch.cuh new file mode 100644 index 0000000000..78467c9741 --- /dev/null +++ b/cpp/include/raft/neighbors/detail/nn_descent_batch.cuh @@ -0,0 +1,701 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#undef RAFT_EXPLICIT_INSTANTIATE_ONLY + +#include "../nn_descent_types.hpp" +#include "nn_descent.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace raft::neighbors::experimental::nn_descent::detail { + +// +// Run balanced kmeans on a subsample of the dataset to get centroids +// +template , memory_type::host>> +void get_balanced_kmeans_centroids( + raft::resources const& res, + raft::distance::DistanceType metric, + mdspan, row_major, Accessor> dataset, + raft::device_matrix_view centroids) +{ + size_t num_rows = static_cast(dataset.extent(0)); + size_t num_cols = static_cast(dataset.extent(1)); + size_t n_clusters = centroids.extent(0); + size_t num_subsamples = + std::min(static_cast(num_rows / n_clusters), static_cast(num_rows * 0.1)); + + auto d_subsample_dataset = + raft::make_device_matrix(res, num_subsamples, num_cols); + raft::matrix::sample_rows( + res, raft::random::RngState{0}, dataset, d_subsample_dataset.view()); + + raft::cluster::kmeans_balanced_params kmeans_params; + kmeans_params.metric = metric; + + auto d_subsample_dataset_const_view = + raft::make_device_matrix_view( + d_subsample_dataset.data_handle(), num_subsamples, num_cols); + raft::cluster::kmeans_balanced::fit( + res, kmeans_params, d_subsample_dataset_const_view, centroids); +} + +// +// Get the top k closest centroid indices for each data point +// Loads the data in batches onto device if data is on host for memory efficiency +// +template +void get_global_nearest_k( + raft::resources const& res, + size_t k, + size_t num_rows, + size_t n_clusters, + const T* dataset, + raft::host_matrix_view global_nearest_cluster, + raft::device_matrix_view centroids, + raft::distance::DistanceType metric) +{ + size_t num_cols = centroids.extent(1); + + cudaPointerAttributes attr; + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, dataset)); + float* ptr = reinterpret_cast(attr.devicePointer); + + if (ptr == nullptr) { // data on host + size_t num_batches = n_clusters; + size_t batch_size = (num_rows + n_clusters) / n_clusters; + + auto d_dataset_batch = + raft::make_device_matrix(res, batch_size, num_cols); + + auto nearest_clusters_idx = + raft::make_device_matrix(res, batch_size, k); + auto nearest_clusters_dist = + raft::make_device_matrix(res, batch_size, k); + + for (size_t i = 0; i < num_batches; i++) { + size_t batch_size_ = batch_size; + + if (i == num_batches - 1) { batch_size_ = num_rows - batch_size * i; } + raft::copy(d_dataset_batch.data_handle(), + dataset + i * batch_size * num_cols, + batch_size_ * num_cols, + resource::get_cuda_stream(res)); + + raft::neighbors::brute_force::fused_l2_knn( + res, + raft::make_const_mdspan(centroids), + raft::make_const_mdspan(d_dataset_batch.view()), + nearest_clusters_idx.view(), + nearest_clusters_dist.view(), + metric); + raft::copy(global_nearest_cluster.data_handle() + i * batch_size * k, + nearest_clusters_idx.data_handle(), + batch_size_ * k, + resource::get_cuda_stream(res)); + } + } else { // data on device + auto nearest_clusters_idx = + raft::make_device_matrix(res, num_rows, k); + auto nearest_clusters_dist = + raft::make_device_matrix(res, num_rows, k); + + raft::neighbors::brute_force::fused_l2_knn( + res, + raft::make_const_mdspan(centroids), + raft::make_device_matrix_view(dataset, num_rows, num_cols), + nearest_clusters_idx.view(), + nearest_clusters_dist.view(), + metric); + + 
raft::copy(global_nearest_cluster.data_handle(), + nearest_clusters_idx.data_handle(), + num_rows * k, + resource::get_cuda_stream(res)); + } +} + +// +// global_nearest_cluster [num_rows X k=2] : top 2 closest clusters for each data point +// inverted_indices [num_rows x k vector] : sparse vector for data indices for each cluster +// cluster_size [n_cluster] : cluster size for each cluster +// offset [n_cluster] : offset in inverted_indices for each cluster +// Loads the data in batches onto device if data is on host for memory efficiency +// +template +void get_inverted_indices(raft::resources const& res, + size_t n_clusters, + size_t& max_cluster_size, + size_t& min_cluster_size, + raft::host_matrix_view global_nearest_cluster, + raft::host_vector_view inverted_indices, + raft::host_vector_view cluster_size, + raft::host_vector_view offset) +{ + // build sparse inverted indices and get number of data points for each cluster + size_t num_rows = global_nearest_cluster.extent(0); + size_t k = global_nearest_cluster.extent(1); + + auto local_offset = raft::make_host_vector(n_clusters); + + max_cluster_size = 0; + min_cluster_size = std::numeric_limits::max(); + + thrust::fill( + thrust::host, cluster_size.data_handle(), cluster_size.data_handle() + n_clusters, 0); + thrust::fill( + thrust::host, local_offset.data_handle(), local_offset.data_handle() + n_clusters, 0); + + // TODO: this part isn't really a bottleneck but maybe worth trying omp parallel + // for with atomic add + for (size_t i = 0; i < num_rows; i++) { + for (size_t j = 0; j < k; j++) { + IdxT cluster_id = global_nearest_cluster(i, j); + cluster_size(cluster_id) += 1; + } + } + + offset(0) = 0; + for (size_t i = 1; i < n_clusters; i++) { + offset(i) = offset(i - 1) + cluster_size(i - 1); + } + for (size_t i = 0; i < num_rows; i++) { + for (size_t j = 0; j < k; j++) { + IdxT cluster_id = global_nearest_cluster(i, j); + inverted_indices(offset(cluster_id) + local_offset(cluster_id)) = i; + local_offset(cluster_id) += 1; + } + } + + max_cluster_size = static_cast( + *std::max_element(cluster_size.data_handle(), cluster_size.data_handle() + n_clusters)); + min_cluster_size = static_cast( + *std::min_element(cluster_size.data_handle(), cluster_size.data_handle() + n_clusters)); +} + +template +struct KeyValuePair { + KeyType key; + ValueType value; +}; + +template +struct CustomKeyComparator { + __device__ bool operator()(const KeyValuePair& a, + const KeyValuePair& b) const + { + if (a.key == b.key) { return a.value < b.value; } + return a.key < b.key; + } +}; + +template +RAFT_KERNEL merge_subgraphs(IdxT* cluster_data_indices, + size_t graph_degree, + size_t num_cluster_in_batch, + float* global_distances, + float* batch_distances, + IdxT* global_indices, + IdxT* batch_indices) +{ + size_t batch_row = blockIdx.x; + typedef cub::BlockMergeSort, BLOCK_SIZE, ITEMS_PER_THREAD> + BlockMergeSortType; + __shared__ typename cub::BlockMergeSort, BLOCK_SIZE, ITEMS_PER_THREAD>:: + TempStorage tmpSmem; + + extern __shared__ char sharedMem[]; + float* blockKeys = reinterpret_cast(sharedMem); + IdxT* blockValues = reinterpret_cast(&sharedMem[graph_degree * 2 * sizeof(float)]); + int16_t* uniqueMask = + reinterpret_cast(&sharedMem[graph_degree * 2 * (sizeof(float) + sizeof(IdxT))]); + + if (batch_row < num_cluster_in_batch) { + // load batch or global depending on threadIdx + size_t global_row = cluster_data_indices[batch_row]; + + KeyValuePair threadKeyValuePair[ITEMS_PER_THREAD]; + + size_t halfway = BLOCK_SIZE / 2; + size_t do_global = 
threadIdx.x < halfway; + + float* distances; + IdxT* indices; + + if (do_global) { + distances = global_distances; + indices = global_indices; + } else { + distances = batch_distances; + indices = batch_indices; + } + + size_t idxBase = (threadIdx.x * do_global + (threadIdx.x - halfway) * (1lu - do_global)) * + static_cast(ITEMS_PER_THREAD); + size_t arrIdxBase = (global_row * do_global + batch_row * (1lu - do_global)) * graph_degree; + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + size_t colId = idxBase + i; + if (colId < graph_degree) { + threadKeyValuePair[i].key = distances[arrIdxBase + colId]; + threadKeyValuePair[i].value = indices[arrIdxBase + colId]; + } else { + threadKeyValuePair[i].key = std::numeric_limits::max(); + threadKeyValuePair[i].value = std::numeric_limits::max(); + } + } + + __syncthreads(); + + BlockMergeSortType(tmpSmem).Sort(threadKeyValuePair, CustomKeyComparator{}); + + // load sorted result into shared memory to get unique values + idxBase = threadIdx.x * ITEMS_PER_THREAD; + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + size_t colId = idxBase + i; + if (colId < 2 * graph_degree) { + blockKeys[colId] = threadKeyValuePair[i].key; + blockValues[colId] = threadKeyValuePair[i].value; + } + } + + __syncthreads(); + + // get unique mask + if (threadIdx.x == 0) { uniqueMask[0] = 1; } + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + size_t colId = idxBase + i; + if (colId > 0 && colId < 2 * graph_degree) { + uniqueMask[colId] = static_cast(blockValues[colId] != blockValues[colId - 1]); + } + } + + __syncthreads(); + + // prefix sum + if (threadIdx.x == 0) { + for (int i = 1; i < 2 * graph_degree; i++) { + uniqueMask[i] += uniqueMask[i - 1]; + } + } + + __syncthreads(); + // load unique values to global memory + if (threadIdx.x == 0) { + global_distances[global_row * graph_degree] = blockKeys[0]; + global_indices[global_row * graph_degree] = blockValues[0]; + } + + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + size_t colId = idxBase + i; + if (colId > 0 && colId < 2 * graph_degree) { + bool is_unique = uniqueMask[colId] != uniqueMask[colId - 1]; + int16_t global_colId = uniqueMask[colId] - 1; + if (is_unique && static_cast(global_colId) < graph_degree) { + global_distances[global_row * graph_degree + global_colId] = blockKeys[colId]; + global_indices[global_row * graph_degree + global_colId] = blockValues[colId]; + } + } + } + } +} + +// +// builds knn graph using NN Descent and merge with global graph +// +template , + typename Accessor = + host_device_accessor, memory_type::host>> +void build_and_merge(raft::resources const& res, + const index_params& params, + size_t num_data_in_cluster, + size_t graph_degree, + size_t int_graph_node_degree, + T* cluster_data, + IdxT* cluster_data_indices, + int* int_graph, + IdxT* inverted_indices, + IdxT* global_indices_d, + float* global_distances_d, + IdxT* batch_indices_h, + IdxT* batch_indices_d, + float* batch_distances_d, + GNND& nnd, + epilogue_op distance_epilogue) +{ + nnd.build( + cluster_data, num_data_in_cluster, int_graph, true, batch_distances_d, distance_epilogue); + + // remap indices +#pragma omp parallel for + for (size_t i = 0; i < num_data_in_cluster; i++) { + for (size_t j = 0; j < graph_degree; j++) { + size_t local_idx = int_graph[i * int_graph_node_degree + j]; + batch_indices_h[i * graph_degree + j] = inverted_indices[local_idx]; + } + } + + raft::copy(batch_indices_d, + batch_indices_h, + num_data_in_cluster * graph_degree, + raft::resource::get_cuda_stream(res)); + + size_t num_elems = graph_degree * 2; 
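+  // The per-row merge below relies on cub::BlockMergeSort, so the launch picked from
+  // num_elems must satisfy BLOCK_SIZE * ITEMS_PER_THREAD >= 2 * graph_degree, i.e. it
+  // has to cover the candidates coming from the running global graph plus the freshly
+  // built batch sub-graph. The dynamic shared memory holds a float key, an IdxT value
+  // and an int16_t unique-neighbor mask entry per candidate, which is what the
+  // sharedMemSize computation below accounts for.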
+ size_t sharedMemSize = num_elems * (sizeof(float) + sizeof(IdxT) + sizeof(int16_t)); + + if (num_elems <= 128) { + merge_subgraphs + <<>>( + cluster_data_indices, + graph_degree, + num_data_in_cluster, + global_distances_d, + batch_distances_d, + global_indices_d, + batch_indices_d); + } else if (num_elems <= 512) { + merge_subgraphs + <<>>( + cluster_data_indices, + graph_degree, + num_data_in_cluster, + global_distances_d, + batch_distances_d, + global_indices_d, + batch_indices_d); + } else if (num_elems <= 1024) { + merge_subgraphs + <<>>( + cluster_data_indices, + graph_degree, + num_data_in_cluster, + global_distances_d, + batch_distances_d, + global_indices_d, + batch_indices_d); + } else if (num_elems <= 2048) { + merge_subgraphs + <<>>( + cluster_data_indices, + graph_degree, + num_data_in_cluster, + global_distances_d, + batch_distances_d, + global_indices_d, + batch_indices_d); + } else { + // this is as far as we can get due to the shared mem usage of cub::BlockMergeSort + RAFT_FAIL("The degree of knn is too large (%lu). It must be smaller than 1024", graph_degree); + } + raft::resource::sync_stream(res); +} + +// +// For each cluster, gather the data samples that belong to that cluster, and +// call build_and_merge +// +template > +void cluster_nnd(raft::resources const& res, + const index_params& params, + size_t graph_degree, + size_t extended_graph_degree, + size_t max_cluster_size, + raft::host_matrix_view dataset, + IdxT* offsets, + IdxT* cluster_size, + IdxT* cluster_data_indices, + int* int_graph, + IdxT* inverted_indices, + IdxT* global_indices_h, + float* global_distances_h, + IdxT* batch_indices_h, + IdxT* batch_indices_d, + float* batch_distances_d, + const BuildConfig& build_config, + epilogue_op distance_epilogue) +{ + size_t num_rows = dataset.extent(0); + size_t num_cols = dataset.extent(1); + + GNND nnd(res, build_config); + + auto cluster_data_matrix = + raft::make_host_matrix(max_cluster_size, num_cols); + + for (size_t cluster_id = 0; cluster_id < params.n_clusters; cluster_id++) { + RAFT_LOG_DEBUG( + "# Data on host. 
Running clusters: %lu / %lu", cluster_id + 1, params.n_clusters); + size_t num_data_in_cluster = cluster_size[cluster_id]; + size_t offset = offsets[cluster_id]; + +#pragma omp parallel for + for (size_t i = 0; i < num_data_in_cluster; i++) { + for (size_t j = 0; j < num_cols; j++) { + size_t global_row = (inverted_indices + offset)[i]; + cluster_data_matrix(i, j) = dataset(global_row, j); + } + } + + distance_epilogue.preprocess_for_batch(cluster_data_indices + offset, num_data_in_cluster); + + build_and_merge(res, + params, + num_data_in_cluster, + graph_degree, + extended_graph_degree, + cluster_data_matrix.data_handle(), + cluster_data_indices + offset, + int_graph, + inverted_indices + offset, + global_indices_h, + global_distances_h, + batch_indices_h, + batch_indices_d, + batch_distances_d, + nnd, + distance_epilogue); + nnd.reset(res); + } +} + +template > +void cluster_nnd(raft::resources const& res, + const index_params& params, + size_t graph_degree, + size_t extended_graph_degree, + size_t max_cluster_size, + raft::device_matrix_view dataset, + IdxT* offsets, + IdxT* cluster_size, + IdxT* cluster_data_indices, + int* int_graph, + IdxT* inverted_indices, + IdxT* global_indices_h, + float* global_distances_h, + IdxT* batch_indices_h, + IdxT* batch_indices_d, + float* batch_distances_d, + const BuildConfig& build_config, + epilogue_op distance_epilogue) +{ + size_t num_rows = dataset.extent(0); + size_t num_cols = dataset.extent(1); + + GNND nnd(res, build_config); + + auto cluster_data_matrix = + raft::make_device_matrix(res, max_cluster_size, num_cols); + + for (size_t cluster_id = 0; cluster_id < params.n_clusters; cluster_id++) { + RAFT_LOG_DEBUG( + "# Data on device. Running clusters: %lu / %lu", cluster_id + 1, params.n_clusters); + size_t num_data_in_cluster = cluster_size[cluster_id]; + size_t offset = offsets[cluster_id]; + + auto cluster_data_view = raft::make_device_matrix_view( + cluster_data_matrix.data_handle(), num_data_in_cluster, num_cols); + auto cluster_data_indices_view = raft::make_device_vector_view( + cluster_data_indices + offset, num_data_in_cluster); + distance_epilogue.preprocess_for_batch(cluster_data_indices + offset, num_data_in_cluster); + + auto dataset_IdxT = + raft::make_device_matrix_view(dataset.data_handle(), num_rows, num_cols); + raft::matrix::gather(res, dataset_IdxT, cluster_data_indices_view, cluster_data_view); + + build_and_merge(res, + params, + num_data_in_cluster, + graph_degree, + extended_graph_degree, + cluster_data_view.data_handle(), + cluster_data_indices + offset, + int_graph, + inverted_indices + offset, + global_indices_h, + global_distances_h, + batch_indices_h, + batch_indices_d, + batch_distances_d, + nnd, + distance_epilogue); + nnd.reset(res); + } +} + +template , + typename Accessor = + host_device_accessor, memory_type::host>> +index batch_build(raft::resources const& res, + const index_params& params, + mdspan, row_major, Accessor> dataset, + epilogue_op distance_epilogue = DistEpilogue()) +{ + size_t graph_degree = params.graph_degree; + size_t intermediate_degree = params.intermediate_graph_degree; + + size_t num_rows = static_cast(dataset.extent(0)); + size_t num_cols = static_cast(dataset.extent(1)); + + auto centroids = + raft::make_device_matrix(res, params.n_clusters, num_cols); + get_balanced_kmeans_centroids(res, params.metric, dataset, centroids.view()); + + size_t k = 2; + auto global_nearest_cluster = raft::make_host_matrix(num_rows, k); + get_global_nearest_k(res, + k, + num_rows, + params.n_clusters, + 
dataset.data_handle(), + global_nearest_cluster.view(), + centroids.view(), + params.metric); + + auto inverted_indices = raft::make_host_vector(num_rows * k); + auto cluster_size = raft::make_host_vector(params.n_clusters); + auto offset = raft::make_host_vector(params.n_clusters); + + size_t max_cluster_size, min_cluster_size; + get_inverted_indices(res, + params.n_clusters, + max_cluster_size, + min_cluster_size, + global_nearest_cluster.view(), + inverted_indices.view(), + cluster_size.view(), + offset.view()); + + if (intermediate_degree >= min_cluster_size) { + RAFT_LOG_WARN( + "Intermediate graph degree cannot be larger than minimum cluster size, reducing it to %lu", + dataset.extent(0)); + intermediate_degree = min_cluster_size - 1; + } + if (intermediate_degree < graph_degree) { + RAFT_LOG_WARN( + "Graph degree (%lu) cannot be larger than intermediate graph degree (%lu), reducing " + "graph_degree.", + graph_degree, + intermediate_degree); + graph_degree = intermediate_degree; + } + + size_t extended_graph_degree = + align32::roundUp(static_cast(graph_degree * (graph_degree <= 32 ? 1.0 : 1.3))); + size_t extended_intermediate_degree = align32::roundUp( + static_cast(intermediate_degree * (intermediate_degree <= 32 ? 1.0 : 1.3))); + + auto int_graph = raft::make_host_matrix( + max_cluster_size, static_cast(extended_graph_degree)); + + BuildConfig build_config{.max_dataset_size = max_cluster_size, + .dataset_dim = num_cols, + .node_degree = extended_graph_degree, + .internal_node_degree = extended_intermediate_degree, + .max_iterations = params.max_iterations, + .termination_threshold = params.termination_threshold, + .output_graph_degree = graph_degree}; + + auto global_indices_h = raft::make_managed_matrix(res, num_rows, graph_degree); + auto global_distances_h = raft::make_managed_matrix(res, num_rows, graph_degree); + + thrust::fill(thrust::host, + global_indices_h.data_handle(), + global_indices_h.data_handle() + num_rows * graph_degree, + std::numeric_limits::max()); + thrust::fill(thrust::host, + global_distances_h.data_handle(), + global_distances_h.data_handle() + num_rows * graph_degree, + std::numeric_limits::max()); + + auto batch_indices_h = + raft::make_host_matrix(max_cluster_size, graph_degree); + auto batch_indices_d = + raft::make_device_matrix(res, max_cluster_size, graph_degree); + auto batch_distances_d = + raft::make_device_matrix(res, max_cluster_size, graph_degree); + + auto cluster_data_indices = raft::make_device_vector(res, num_rows * k); + raft::copy(cluster_data_indices.data_handle(), + inverted_indices.data_handle(), + num_rows * k, + resource::get_cuda_stream(res)); + + cluster_nnd(res, + params, + graph_degree, + extended_graph_degree, + max_cluster_size, + dataset, + offset.data_handle(), + cluster_size.data_handle(), + cluster_data_indices.data_handle(), + int_graph.data_handle(), + inverted_indices.data_handle(), + global_indices_h.data_handle(), + global_distances_h.data_handle(), + batch_indices_h.data_handle(), + batch_indices_d.data_handle(), + batch_distances_d.data_handle(), + build_config, + distance_epilogue); + + index global_idx{ + res, dataset.extent(0), static_cast(graph_degree), params.return_distances}; + + raft::copy(global_idx.graph().data_handle(), + global_indices_h.data_handle(), + num_rows * graph_degree, + raft::resource::get_cuda_stream(res)); + if (params.return_distances && global_idx.distances().has_value()) { + raft::copy(global_idx.distances().value().data_handle(), + global_distances_h.data_handle(), + num_rows * 
graph_degree, + raft::resource::get_cuda_stream(res)); + } + return global_idx; +} + +} // namespace raft::neighbors::experimental::nn_descent::detail diff --git a/cpp/include/raft/neighbors/nn_descent.cuh b/cpp/include/raft/neighbors/nn_descent.cuh index a46a2006d6..6c08546d3f 100644 --- a/cpp/include/raft/neighbors/nn_descent.cuh +++ b/cpp/include/raft/neighbors/nn_descent.cuh @@ -17,9 +17,11 @@ #pragma once #include "detail/nn_descent.cuh" +#include "detail/nn_descent_batch.cuh" #include #include +#include namespace raft::neighbors::experimental::nn_descent { @@ -57,13 +59,17 @@ namespace raft::neighbors::experimental::nn_descent { * @param[in] distance_epilogue epilogue operation for distances * @return index index containing all-neighbors knn graph in host memory */ -template +template > index build(raft::resources const& res, index_params const& params, raft::device_matrix_view dataset, - epilogue_op distance_epilogue = raft::identity_op()) + epilogue_op distance_epilogue = DistEpilogue()) { - return detail::build(res, params, dataset, distance_epilogue); + if (params.n_clusters > 1) { + return detail::batch_build(res, params, dataset, distance_epilogue); + } else { + return detail::build(res, params, dataset, distance_epilogue); + } } /** @@ -98,12 +104,12 @@ index build(raft::resources const& res, * in host memory * @param[in] distance_epilogue epilogue operation for distances */ -template +template > void build(raft::resources const& res, index_params const& params, raft::device_matrix_view dataset, index& idx, - epilogue_op distance_epilogue = raft::identity_op()) + epilogue_op distance_epilogue = DistEpilogue()) { detail::build(res, params, dataset, idx, distance_epilogue); } @@ -137,13 +143,17 @@ void build(raft::resources const& res, * @param[in] distance_epilogue epilogue operation for distances * @return index index containing all-neighbors knn graph in host memory */ -template +template > index build(raft::resources const& res, index_params const& params, raft::host_matrix_view dataset, - epilogue_op distance_epilogue = raft::identity_op()) + epilogue_op distance_epilogue = DistEpilogue()) { - return detail::build(res, params, dataset, distance_epilogue); + if (params.n_clusters > 1) { + return detail::batch_build(res, params, dataset, distance_epilogue); + } else { + return detail::build(res, params, dataset, distance_epilogue); + } } /** @@ -178,12 +188,12 @@ index build(raft::resources const& res, * in host memory * @param[in] distance_epilogue epilogue operation for distances */ -template +template > void build(raft::resources const& res, index_params const& params, raft::host_matrix_view dataset, index& idx, - epilogue_op distance_epilogue = raft::identity_op()) + epilogue_op distance_epilogue = DistEpilogue()) { detail::build(res, params, dataset, idx, distance_epilogue); } diff --git a/cpp/include/raft/neighbors/nn_descent_types.hpp b/cpp/include/raft/neighbors/nn_descent_types.hpp index 5d23ff2c2e..eb01a423be 100644 --- a/cpp/include/raft/neighbors/nn_descent_types.hpp +++ b/cpp/include/raft/neighbors/nn_descent_types.hpp @@ -48,6 +48,20 @@ namespace raft::neighbors::experimental::nn_descent { * `max_iterations`: The number of iterations that nn-descent will refine * the graph for. 
More iterations produce a better quality graph at cost of performance * `termination_threshold`: The delta at which nn-descent will terminate its iterations + * `return_distances`: boolean whether to return distances + * `n_clusters`: NN Descent offers batching a dataset to save GPU memory usage. + * Increase `n_clusters` to save GPU memory and run NN Descent with large datasets. + * Most effective when data is put on CPU memory. + * Setting this number too big may results in too much overhead of doing multiple + * iterations of graph building. Recommend starting at 4 and continue to increase + * depending on desired GPU memory usages. + * (Specifically, with n_clusters > 1, the NN Descent build algorithm will first + * find n_clusters number of cluster centroids of the dataset, then consider data + * points that belong to each cluster as a batch. + * Then we build knn subgraphs on each batch of the entire data. This is especially + * useful when the dataset is put on host, since only a subset of the data will + * be on GPU at once, enabling running NN Descent with large datasets that do not + * fit on the GPU as a whole.) * */ struct index_params : ann::index_params { @@ -56,6 +70,7 @@ struct index_params : ann::index_params { size_t max_iterations = 20; // Number of nn-descent iterations. float termination_threshold = 0.0001; // Termination threshold of nn-descent. bool return_distances = false; // return distances if true + size_t n_clusters = 1; // defaults to not using any batching }; /** @@ -178,6 +193,14 @@ struct index : ann::index { bool return_distances_; }; +template +struct DistEpilogue : raft::identity_op { + __host__ void preprocess_for_batch(value_idx* cluster_indices, size_t num_data_in_cluster) + { + return; + } +}; + /** @} */ } // namespace raft::neighbors::experimental::nn_descent diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index e3af6ebb78..a497e6d3ba 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -96,17 +96,8 @@ endfunction() if(BUILD_TESTS) ConfigureTest( - NAME - CLUSTER_TEST - PATH - cluster/kmeans.cu - cluster/kmeans_balanced.cu - cluster/kmeans_find_k.cu - cluster/cluster_solvers.cu - cluster/linkage.cu - cluster/spectral.cu - LIB - EXPLICIT_INSTANTIATE_ONLY + NAME CLUSTER_TEST PATH cluster/kmeans.cu cluster/kmeans_balanced.cu cluster/kmeans_find_k.cu + cluster/cluster_solvers.cu cluster/linkage.cu cluster/spectral.cu LIB EXPLICIT_INSTANTIATE_ONLY ) ConfigureTest( @@ -144,8 +135,8 @@ if(BUILD_TESTS) ) ConfigureTest( - NAME CORE_TEST PATH core/stream_view.cpp core/mdspan_copy.cpp LIB - EXPLICIT_INSTANTIATE_ONLY NOCUDA + NAME CORE_TEST PATH core/stream_view.cpp core/mdspan_copy.cpp LIB EXPLICIT_INSTANTIATE_ONLY + NOCUDA ) ConfigureTest( @@ -301,8 +292,8 @@ if(BUILD_TESTS) ) ConfigureTest( - NAME SOLVERS_TEST PATH cluster/cluster_solvers_deprecated.cu linalg/eigen_solvers.cu - lap/lap.cu sparse/mst.cu LIB EXPLICIT_INSTANTIATE_ONLY + NAME SOLVERS_TEST PATH cluster/cluster_solvers_deprecated.cu linalg/eigen_solvers.cu lap/lap.cu + sparse/mst.cu LIB EXPLICIT_INSTANTIATE_ONLY ) ConfigureTest( @@ -331,19 +322,13 @@ if(BUILD_TESTS) ) ConfigureTest( - NAME SPARSE_DIST_TEST PATH sparse/dist_coo_spmv.cu sparse/distance.cu - sparse/gram.cu LIB EXPLICIT_INSTANTIATE_ONLY + NAME SPARSE_DIST_TEST PATH sparse/dist_coo_spmv.cu sparse/distance.cu sparse/gram.cu LIB + EXPLICIT_INSTANTIATE_ONLY ) ConfigureTest( - NAME - SPARSE_NEIGHBORS_TEST - PATH - sparse/neighbors/cross_component_nn.cu - sparse/neighbors/brute_force.cu - 
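For context on the batching parameters documented in nn_descent_types.hpp above, the following is a minimal usage sketch of the batched build path on a host-resident dataset; the function name, variable names and parameter values are illustrative and not taken from this patch:

#include <raft/core/host_mdspan.hpp>
#include <raft/core/resources.hpp>
#include <raft/distance/distance_types.hpp>
#include <raft/neighbors/nn_descent.cuh>

void build_batched_knn_graph(raft::resources const& res,
                             const float* host_data, int64_t n_rows, int64_t dim)
{
  namespace nnd = raft::neighbors::experimental::nn_descent;

  nnd::index_params params;
  params.metric                    = raft::distance::DistanceType::L2Expanded;
  params.graph_degree              = 64;   // illustrative values
  params.intermediate_graph_degree = 128;
  params.return_distances          = true;
  params.n_clusters                = 4;    // > 1 selects the batched code path

  auto dataset =
    raft::make_host_matrix_view<const float, int64_t>(host_data, n_rows, dim);

  // Only one cluster batch is resident on the GPU at a time, so peak device
  // memory is driven by the largest cluster rather than by the whole dataset.
  auto idx = nnd::build(res, params, dataset);

  // idx.graph() is an n_rows x graph_degree host matrix of neighbor ids;
  // idx.distances() holds the matching distances because return_distances is set.
}

The build entry points also gain a distance_epilogue argument that defaults to DistEpilogue, an identity op whose preprocess_for_batch hook is called once per cluster batch with that batch's global row indices, giving custom epilogues a chance to prepare per-batch state before distances are computed.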
sparse/neighbors/knn_graph.cu - LIB - EXPLICIT_INSTANTIATE_ONLY + NAME SPARSE_NEIGHBORS_TEST PATH sparse/neighbors/cross_component_nn.cu + sparse/neighbors/brute_force.cu sparse/neighbors/knn_graph.cu LIB EXPLICIT_INSTANTIATE_ONLY ) ConfigureTest( @@ -455,6 +440,7 @@ if(BUILD_TESTS) neighbors/ann_nn_descent/test_float_uint32_t.cu neighbors/ann_nn_descent/test_int8_t_uint32_t.cu neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu + neighbors/ann_nn_descent/test_batch_float_uint32_t.cu LIB EXPLICIT_INSTANTIATE_ONLY GPUS diff --git a/cpp/test/neighbors/ann_nn_descent.cuh b/cpp/test/neighbors/ann_nn_descent.cuh index f74cadb415..2f9d4e252b 100644 --- a/cpp/test/neighbors/ann_nn_descent.cuh +++ b/cpp/test/neighbors/ann_nn_descent.cuh @@ -42,6 +42,15 @@ struct AnnNNDescentInputs { double min_recall; }; +struct AnnNNDescentBatchInputs { + std::pair recall_cluster; + int n_rows; + int dim; + int graph_degree; + raft::distance::DistanceType metric; + bool host_dataset; +}; + inline ::std::ostream& operator<<(::std::ostream& os, const AnnNNDescentInputs& p) { os << "dataset shape=" << p.n_rows << "x" << p.dim << ", graph_degree=" << p.graph_degree @@ -50,6 +59,14 @@ inline ::std::ostream& operator<<(::std::ostream& os, const AnnNNDescentInputs& return os; } +inline ::std::ostream& operator<<(::std::ostream& os, const AnnNNDescentBatchInputs& p) +{ + os << "dataset shape=" << p.n_rows << "x" << p.dim << ", graph_degree=" << p.graph_degree + << ", metric=" << static_cast(p.metric) << (p.host_dataset ? ", host" : ", device") + << ", clusters=" << p.recall_cluster.second << std::endl; + return os; +} + template class AnnNNDescentTest : public ::testing::TestWithParam { public: @@ -105,7 +122,9 @@ class AnnNNDescentTest : public ::testing::TestWithParam { raft::copy(database_host.data_handle(), database.data(), database.size(), stream_); auto database_host_view = raft::make_host_matrix_view( (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim); - auto index = nn_descent::build(handle_, index_params, database_host_view); + index index{handle_, ps.n_rows, static_cast(ps.graph_degree), true}; + nn_descent::build( + handle_, index_params, database_host_view, index, DistEpilogue()); raft::copy( indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); if (index.distances().has_value()) { @@ -116,7 +135,9 @@ class AnnNNDescentTest : public ::testing::TestWithParam { } } else { - auto index = nn_descent::build(handle_, index_params, database_view); + index index{handle_, ps.n_rows, static_cast(ps.graph_degree), true}; + nn_descent::build( + handle_, index_params, database_view, index, DistEpilogue()); raft::copy( indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); if (index.distances().has_value()) { @@ -168,6 +189,127 @@ class AnnNNDescentTest : public ::testing::TestWithParam { rmm::device_uvector database; }; +template +class AnnNNDescentBatchTest : public ::testing::TestWithParam { + public: + AnnNNDescentBatchTest() + : stream_(resource::get_cuda_stream(handle_)), + ps(::testing::TestWithParam::GetParam()), + database(0, stream_) + { + } + + void testNNDescentBatch() + { + size_t queries_size = ps.n_rows * ps.graph_degree; + std::vector indices_NNDescent(queries_size); + std::vector distances_NNDescent(queries_size); + std::vector indices_naive(queries_size); + std::vector distances_naive(queries_size); + + { + rmm::device_uvector distances_naive_dev(queries_size, stream_); + rmm::device_uvector indices_naive_dev(queries_size, stream_); + 
naive_knn(handle_, + distances_naive_dev.data(), + indices_naive_dev.data(), + database.data(), + database.data(), + ps.n_rows, + ps.n_rows, + ps.dim, + ps.graph_degree, + ps.metric); + update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_); + update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_); + resource::sync_stream(handle_); + } + + { + { + nn_descent::index_params index_params; + index_params.metric = ps.metric; + index_params.graph_degree = ps.graph_degree; + index_params.intermediate_graph_degree = 2 * ps.graph_degree; + index_params.max_iterations = 10; + index_params.return_distances = true; + index_params.n_clusters = ps.recall_cluster.second; + + auto database_view = raft::make_device_matrix_view( + (const DataT*)database.data(), ps.n_rows, ps.dim); + + { + if (ps.host_dataset) { + auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); + raft::copy(database_host.data_handle(), database.data(), database.size(), stream_); + auto database_host_view = raft::make_host_matrix_view( + (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim); + auto index = nn_descent::build( + handle_, index_params, database_host_view, DistEpilogue()); + raft::copy( + indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + index.distances().value().data_handle(), + queries_size, + stream_); + } + + } else { + auto index = nn_descent::build( + handle_, index_params, database_view, DistEpilogue()); + raft::copy( + indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + index.distances().value().data_handle(), + queries_size, + stream_); + } + }; + } + resource::sync_stream(handle_); + } + double min_recall = ps.recall_cluster.first; + EXPECT_TRUE(eval_neighbours(indices_naive, + indices_NNDescent, + distances_naive, + distances_NNDescent, + ps.n_rows, + ps.graph_degree, + 0.01, + min_recall, + true, + static_cast(ps.graph_degree * 0.1))); + } + } + + void SetUp() override + { + database.resize(((size_t)ps.n_rows) * ps.dim, stream_); + raft::random::RngState r(1234ULL); + if constexpr (std::is_same{}) { + raft::random::normal(handle_, r, database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0)); + } else { + raft::random::uniformInt( + handle_, r, database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20)); + } + resource::sync_stream(handle_); + } + + void TearDown() override + { + resource::sync_stream(handle_); + database.resize(0, stream_); + } + + private: + raft::resources handle_; + rmm::cuda_stream_view stream_; + AnnNNDescentBatchInputs ps; + rmm::device_uvector database; +}; + const std::vector inputs = raft::util::itertools::product( {1000, 2000}, // n_rows {3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, // dim @@ -176,4 +318,13 @@ const std::vector inputs = raft::util::itertools::product inputsBatch = + raft::util::itertools::product( + {std::make_pair(0.9, 3lu), std::make_pair(0.9, 2lu)}, // min_recall, n_clusters + {4000, 5000}, // n_rows + {192, 512}, // dim + {32, 64}, // graph_degree + {raft::distance::DistanceType::L2Expanded}, + {false, true}); + } // namespace raft::neighbors::experimental::nn_descent diff --git a/cpp/test/neighbors/ann_nn_descent/test_batch_float_uint32_t.cu b/cpp/test/neighbors/ann_nn_descent/test_batch_float_uint32_t.cu new file mode 100644 index 0000000000..c6f56e8c39 --- 
/dev/null +++ b/cpp/test/neighbors/ann_nn_descent/test_batch_float_uint32_t.cu @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../ann_nn_descent.cuh" + +#include + +namespace raft::neighbors::experimental::nn_descent { + +typedef AnnNNDescentBatchTest AnnNNDescentBatchTestF_U32; +TEST_P(AnnNNDescentBatchTestF_U32, AnnNNDescentBatch) { this->testNNDescentBatch(); } + +INSTANTIATE_TEST_CASE_P(AnnNNDescentBatchTest, + AnnNNDescentBatchTestF_U32, + ::testing::ValuesIn(inputsBatch)); + +} // namespace raft::neighbors::experimental::nn_descent diff --git a/cpp/test/neighbors/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh index 2139e97428..82e3ace9da 100644 --- a/cpp/test/neighbors/ann_utils.cuh +++ b/cpp/test/neighbors/ann_utils.cuh @@ -153,9 +153,13 @@ auto calc_recall(const std::vector& expected_idx, /** check uniqueness of indices */ template -auto check_unique_indices(const std::vector& actual_idx, size_t rows, size_t cols) +auto check_unique_indices(const std::vector& actual_idx, + size_t rows, + size_t cols, + size_t max_duplicates) { size_t max_count; + size_t dup_count = 0lu; std::set unique_indices; for (size_t i = 0; i < rows; ++i) { unique_indices.clear(); @@ -168,8 +172,11 @@ auto check_unique_indices(const std::vector& actual_idx, size_t rows, size_t } else if (unique_indices.find(act_idx) == unique_indices.end()) { unique_indices.insert(act_idx); } else { - return testing::AssertionFailure() - << "Duplicated index " << act_idx << " at k " << k << " for query " << i << "! "; + dup_count++; + if (dup_count > max_duplicates) { + return testing::AssertionFailure() + << "Duplicated index " << act_idx << " at k " << k << " for query " << i << "! "; + } } } } @@ -252,7 +259,8 @@ auto eval_neighbours(const std::vector& expected_idx, size_t cols, double eps, double min_recall, - bool test_unique = true) -> testing::AssertionResult + bool test_unique = true, + size_t max_duplicates = 0) -> testing::AssertionResult { auto [actual_recall, match_count, total_count] = calc_recall(expected_idx, actual_idx, expected_dist, actual_dist, rows, cols, eps); @@ -270,7 +278,7 @@ auto eval_neighbours(const std::vector& expected_idx, << min_recall << "); eps = " << eps << ". "; } if (test_unique) - return check_unique_indices(actual_idx, rows, cols); + return check_unique_indices(actual_idx, rows, cols, max_duplicates); else return testing::AssertionSuccess(); } From 2dafe79af0216ad7189415ca3e2864277194c3e9 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Fri, 23 Aug 2024 07:49:30 +0200 Subject: [PATCH 09/31] Remove NumPy <2 pin (#2414) This PR removes the NumPy<2 pin which is expected to work for RAPIDS projects once CuPy 13.3.0 is released (CuPy 13.2.0 had some issues preventing the use with NumPy 2). 
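In concrete terms, here is a condensed sketch of the loosened specifier as it appears in the conda environment files and pyproject.toml changed below (only the ceiling moves): the new range admits both NumPy 1.x and 2.x while still guarding against a future NumPy 3.

```yaml
dependencies:
  - numpy>=1.23,<3.0a0   # previously numpy>=1.23,<2.0a0; now allows NumPy 2.x
```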
Authors: - Sebastian Berg (https://github.com/seberg) - https://github.com/jakirkham Approvers: - James Lamb (https://github.com/jameslamb) - https://github.com/jakirkham URL: https://github.com/rapidsai/raft/pull/2414 --- conda/environments/all_cuda-118_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/pylibraft/meta.yaml | 2 +- dependencies.yaml | 3 +-- python/pylibraft/pyproject.toml | 2 +- python/raft-dask/pyproject.toml | 1 - 8 files changed, 7 insertions(+), 9 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index ef8524dce3..462874a7e7 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -39,7 +39,7 @@ dependencies: - nccl>=2.9.9 - ninja - numba>=0.57 -- numpy>=1.23,<2.0a0 +- numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-aarch64=11.8 - pre-commit diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 6ffb27bb29..cfd974a6a8 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -39,7 +39,7 @@ dependencies: - nccl>=2.9.9 - ninja - numba>=0.57 -- numpy>=1.23,<2.0a0 +- numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-64=11.8 - pre-commit diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index fd0e380a1c..82e391e9ae 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -36,7 +36,7 @@ dependencies: - nccl>=2.9.9 - ninja - numba>=0.57 -- numpy>=1.23,<2.0a0 +- numpy>=1.23,<3.0a0 - numpydoc - pre-commit - pydata-sphinx-theme diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index ad4ecb7ff2..0389427d13 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -36,7 +36,7 @@ dependencies: - nccl>=2.9.9 - ninja - numba>=0.57 -- numpy>=1.23,<2.0a0 +- numpy>=1.23,<3.0a0 - numpydoc - pre-commit - pydata-sphinx-theme diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml index 4ef85fc0e5..9d91af712e 100644 --- a/conda/recipes/pylibraft/meta.yaml +++ b/conda/recipes/pylibraft/meta.yaml @@ -65,7 +65,7 @@ requirements: {% endif %} - libraft {{ version }} - libraft-headers {{ version }} - - numpy >=1.23,<2.0a0 + - numpy >=1.23,<3.0a0 - python x.x - rmm ={{ minor_version }} diff --git a/dependencies.yaml b/dependencies.yaml index 92c6d98414..a5b818e0f3 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -473,7 +473,7 @@ dependencies: common: - output_types: [conda, pyproject] packages: - - &numpy numpy>=1.23,<2.0a0 + - numpy>=1.23,<3.0a0 - output_types: [conda] packages: - *rmm_unsuffixed @@ -513,7 +513,6 @@ dependencies: - dask-cuda==24.10.*,>=0.0.0a0 - joblib>=0.11 - numba>=0.57 - - *numpy - rapids-dask-dependency==24.10.*,>=0.0.0a0 - output_types: conda packages: diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml index 9a826e53c6..9a6e454e7b 100644 --- a/python/pylibraft/pyproject.toml +++ b/python/pylibraft/pyproject.toml @@ -32,7 +32,7 @@ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "cuda-python", - "numpy>=1.23,<2.0a0", + 
"numpy>=1.23,<3.0a0", "nvidia-cublas", "nvidia-curand", "nvidia-cusolver", diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml index 4fadfa5c9f..4e2e45206f 100644 --- a/python/raft-dask/pyproject.toml +++ b/python/raft-dask/pyproject.toml @@ -35,7 +35,6 @@ dependencies = [ "distributed-ucxx==0.40.*,>=0.0.0a0", "joblib>=0.11", "numba>=0.57", - "numpy>=1.23,<2.0a0", "pylibraft==24.10.*,>=0.0.0a0", "rapids-dask-dependency==24.10.*,>=0.0.0a0", "ucx-py==0.40.*,>=0.0.0a0", From 73dc36c5304b338ee8d12d775522dbf4d75a014a Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 23 Aug 2024 00:51:31 -0500 Subject: [PATCH 10/31] Drop Python 3.9 support (#2417) Contributes to https://github.com/rapidsai/build-planning/issues/88 Finishes the work of dropping Python 3.9 support. This project stopped building / testing against Python 3.9 as of https://github.com/rapidsai/shared-workflows/pull/235. This PR updates configuration and docs to reflect that. ## Notes for Reviewers ### How I tested this Checked that there were no remaining uses like this: ```shell git grep -E '3\.9' git grep '39' git grep 'py39' ``` And similar for variations on Python 3.8 (to catch things that were missed the last time this was done). Authors: - James Lamb (https://github.com/jameslamb) Approvers: - https://github.com/jakirkham URL: https://github.com/rapidsai/raft/pull/2417 --- dependencies.yaml | 6 +----- pyproject.toml | 2 +- python/pylibraft/pyproject.toml | 3 +-- python/raft-ann-bench/pyproject.toml | 3 +-- python/raft-dask/pyproject.toml | 3 +-- 5 files changed, 5 insertions(+), 12 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index a5b818e0f3..e4e361548f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -454,10 +454,6 @@ dependencies: specific: - output_types: conda matrices: - - matrix: - py: "3.9" - packages: - - python=3.9 - matrix: py: "3.10" packages: @@ -468,7 +464,7 @@ dependencies: - python=3.11 - matrix: packages: - - python>=3.9,<3.12 + - python>=3.10,<3.12 run_pylibraft: common: - output_types: [conda, pyproject] diff --git a/pyproject.toml b/pyproject.toml index 1e4ba0b369..5042113388 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.black] line-length = 79 -target-version = ["py39"] +target-version = ["py310"] include = '\.py?$' force-exclude = ''' /( diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml index 9a6e454e7b..14f2ba7d2f 100644 --- a/python/pylibraft/pyproject.toml +++ b/python/pylibraft/pyproject.toml @@ -29,7 +29,7 @@ authors = [ { name = "NVIDIA Corporation" }, ] license = { text = "Apache 2.0" } -requires-python = ">=3.9" +requires-python = ">=3.10" dependencies = [ "cuda-python", "numpy>=1.23,<3.0a0", @@ -42,7 +42,6 @@ dependencies = [ classifiers = [ "Intended Audience :: Developers", "Programming Language :: Python", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", ] diff --git a/python/raft-ann-bench/pyproject.toml b/python/raft-ann-bench/pyproject.toml index d22dd567fe..fa5781893b 100644 --- a/python/raft-ann-bench/pyproject.toml +++ b/python/raft-ann-bench/pyproject.toml @@ -16,7 +16,7 @@ authors = [ { name = "NVIDIA Corporation" }, ] license = { text = "Apache 2.0" } -requires-python = ">=3.9" +requires-python = ">=3.10" dependencies = [ ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ @@ -25,7 +25,6 @@ classifiers = [ "Topic :: Scientific/Engineering", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", ] diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml index 4e2e45206f..44012b5f10 100644 --- a/python/raft-dask/pyproject.toml +++ b/python/raft-dask/pyproject.toml @@ -29,7 +29,7 @@ authors = [ { name = "NVIDIA Corporation" }, ] license = { text = "Apache 2.0" } -requires-python = ">=3.9" +requires-python = ">=3.10" dependencies = [ "dask-cuda==24.10.*,>=0.0.0a0", "distributed-ucxx==0.40.*,>=0.0.0a0", @@ -42,7 +42,6 @@ dependencies = [ classifiers = [ "Intended Audience :: Developers", "Programming Language :: Python", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", ] From e17dcdfdbe6985d9a9dfb4bd76d047dc08481630 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Tue, 27 Aug 2024 15:50:10 -0400 Subject: [PATCH 11/31] Update rapidsai/pre-commit-hooks (#2420) This PR updates rapidsai/pre-commit-hooks to the version 0.4.0. Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/raft/pull/2420 --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9aed6bf387..88be628e55 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -99,7 +99,7 @@ repos: hooks: - id: check-json - repo: https://github.com/rapidsai/pre-commit-hooks - rev: v0.3.1 + rev: v0.4.0 hooks: - id: verify-copyright files: | From e04a640e489da027986f189608098308ae040ed5 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Thu, 29 Aug 2024 13:50:19 -0400 Subject: [PATCH 12/31] Exclude any kernel symbol that uses cutlass (#2425) Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/raft/pull/2425 --- .github/workflows/pr.yaml | 2 +- .github/workflows/test.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 98cb724623..381ca6b378 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -51,7 +51,7 @@ jobs: with: build_type: pull-request enable_check_symbols: true - symbol_exclusions: _ZN\d+raft_cutlass + symbol_exclusions: raft_cutlass conda-python-build: needs: conda-cpp-build secrets: inherit diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index f6ca417dcf..ad0456d526 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -23,7 +23,7 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} enable_check_symbols: true - symbol_exclusions: _ZN\d+raft_cutlass + symbol_exclusions: raft_cutlass conda-cpp-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 From 8a71b9831cc2d5701f5dbbe0229a5c677b032e6f Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 9 Sep 2024 12:15:18 -0700 Subject: [PATCH 13/31] Add support for Python 3.12 (#2428) Contributes to https://github.com/rapidsai/build-planning/issues/40 This PR adds support for Python 3.12. ## Notes for Reviewers This is part of ongoing work to add Python 3.12 support across RAPIDS. 
It temporarily introduces a build/test matrix including Python 3.12, from https://github.com/rapidsai/shared-workflows/pull/213. A follow-up PR will revert back to pointing at the `branch-24.10` branch of `shared-workflows` once all RAPIDS repos have added Python 3.12 support. ### This will fail until all dependencies have been updates to Python 3.12 CI here is expected to fail until all of this project's upstream dependencies support Python 3.12. This can be merged whenever all CI jobs are passing. Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/raft/pull/2428 --- .github/workflows/build.yaml | 16 ++++++++-------- .github/workflows/pr.yaml | 26 +++++++++++++------------- .github/workflows/test.yaml | 10 +++++----- dependencies.yaml | 6 +++++- python/pylibraft/pyproject.toml | 1 + python/raft-ann-bench/pyproject.toml | 1 + python/raft-dask/pyproject.toml | 1 + 7 files changed, 34 insertions(+), 27 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 2b0ae5099c..037d7cc05f 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@python-3.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-pylibraft: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -88,7 +88,7 @@ jobs: package-name: pylibraft wheel-build-raft-dask: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -98,7 +98,7 @@ jobs: 
wheel-publish-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 381ca6b378..26c6928f77 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -25,29 +25,29 @@ jobs: - wheel-tests-raft-dask - devcontainer secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@python-3.12 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.12 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12 with: build_type: pull-request node_type: cpu16 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12 with: build_type: pull-request enable_check_symbols: true @@ -55,19 +55,19 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -77,34 +77,34 @@ jobs: wheel-build-pylibraft: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 with: build_type: pull-request script: ci/build_wheel_pylibraft.sh wheel-tests-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 with: build_type: pull-request script: ci/test_wheel_pylibraft.sh wheel-build-raft-dask: needs: wheel-tests-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 with: build_type: pull-request script: "ci/build_wheel_raft_dask.sh" 
wheel-tests-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 with: build_type: pull-request script: ci/test_wheel_raft_dask.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@python-3.12 with: arch: '["amd64"]' cuda: '["12.5"]' diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index ad0456d526..90518c1132 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -26,7 +26,7 @@ jobs: symbol_exclusions: raft_cutlass conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -42,7 +42,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibraft: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: script: ci/test_wheel_pylibraft.sh wheel-tests-raft-dask: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/dependencies.yaml b/dependencies.yaml index e4e361548f..8f5c69245f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -463,8 +463,12 @@ dependencies: packages: - python=3.11 - matrix: + py: "3.12" packages: - - python>=3.10,<3.12 + - python=3.12 + - matrix: + packages: + - python>=3.10,<3.13 run_pylibraft: common: - output_types: [conda, pyproject] diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml index 14f2ba7d2f..a540915585 100644 --- a/python/pylibraft/pyproject.toml +++ b/python/pylibraft/pyproject.toml @@ -44,6 +44,7 @@ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ] [project.optional-dependencies] diff --git a/python/raft-ann-bench/pyproject.toml b/python/raft-ann-bench/pyproject.toml index fa5781893b..0e4fda1f00 100644 --- a/python/raft-ann-bench/pyproject.toml +++ b/python/raft-ann-bench/pyproject.toml @@ -27,6 +27,7 @@ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ] [project.urls] diff --git 
a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml index 44012b5f10..d1f577120f 100644 --- a/python/raft-dask/pyproject.toml +++ b/python/raft-dask/pyproject.toml @@ -44,6 +44,7 @@ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ] [project.optional-dependencies] From ac53a0fdc35fe36a11a3ac4debd4cc9a4076fe7f Mon Sep 17 00:00:00 2001 From: Micka Date: Tue, 10 Sep 2024 17:20:52 +0200 Subject: [PATCH 14/31] [BUG] Fix bitset function visibility (#2429) `raft::ceildiv` is also being replaced with `raft::div_rounding_up_safe` to avoid including CUDA headers when not needed. Authors: - Micka (https://github.com/lowener) Approvers: - rhdong (https://github.com/rhdong) - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/2429 --- cpp/include/raft/core/bitmap.cuh | 6 +++--- cpp/include/raft/core/bitset.cuh | 24 ++++++------------------ cpp/include/raft/core/bitset.hpp | 11 +++++++++-- 3 files changed, 18 insertions(+), 23 deletions(-) diff --git a/cpp/include/raft/core/bitmap.cuh b/cpp/include/raft/core/bitmap.cuh index cafd1977ab..024b1244a6 100644 --- a/cpp/include/raft/core/bitmap.cuh +++ b/cpp/include/raft/core/bitmap.cuh @@ -35,9 +35,9 @@ _RAFT_HOST_DEVICE inline bool bitmap_view::test(const index_t } template -_RAFT_HOST_DEVICE void bitmap_view::set(const index_t row, - const index_t col, - bool new_value) const +_RAFT_DEVICE void bitmap_view::set(const index_t row, + const index_t col, + bool new_value) const { set(row * cols_ + col, new_value); } diff --git a/cpp/include/raft/core/bitset.cuh b/cpp/include/raft/core/bitset.cuh index 0cdb4c1fb6..b6e6128eca 100644 --- a/cpp/include/raft/core/bitset.cuh +++ b/cpp/include/raft/core/bitset.cuh @@ -46,8 +46,8 @@ _RAFT_HOST_DEVICE bool bitset_view::operator[](const index_t } template -_RAFT_HOST_DEVICE void bitset_view::set(const index_t sample_index, - bool set_value) const +_RAFT_DEVICE void bitset_view::set(const index_t sample_index, + bool set_value) const { const index_t bit_element = sample_index / bitset_element_size; const index_t bit_index = sample_index % bitset_element_size; @@ -60,18 +60,12 @@ _RAFT_HOST_DEVICE void bitset_view::set(const index_t sample_ } } -template -_RAFT_HOST_DEVICE inline index_t bitset_view::n_elements() const -{ - return raft::ceildiv(bitset_len_, bitset_element_size); -} - template bitset::bitset(const raft::resources& res, raft::device_vector_view mask_index, index_t bitset_len, bool default_value) - : bitset_{std::size_t(raft::ceildiv(bitset_len, bitset_element_size)), + : bitset_{std::size_t(raft::div_rounding_up_safe(bitset_len, bitset_element_size)), raft::resource::get_cuda_stream(res)}, bitset_len_{bitset_len} { @@ -83,26 +77,20 @@ template bitset::bitset(const raft::resources& res, index_t bitset_len, bool default_value) - : bitset_{std::size_t(raft::ceildiv(bitset_len, bitset_element_size)), + : bitset_{std::size_t(raft::div_rounding_up_safe(bitset_len, bitset_element_size)), raft::resource::get_cuda_stream(res)}, bitset_len_{bitset_len} { reset(res, default_value); } -template -index_t bitset::n_elements() const -{ - return raft::ceildiv(bitset_len_, bitset_element_size); -} - template void bitset::resize(const raft::resources& res, index_t new_bitset_len, bool default_value) { - auto old_size = raft::ceildiv(bitset_len_, bitset_element_size); - auto new_size = raft::ceildiv(new_bitset_len, bitset_element_size); + auto 
old_size = raft::div_rounding_up_safe(bitset_len_, bitset_element_size); + auto new_size = raft::div_rounding_up_safe(new_bitset_len, bitset_element_size); bitset_.resize(new_size); bitset_len_ = new_bitset_len; if (old_size < new_size) { diff --git a/cpp/include/raft/core/bitset.hpp b/cpp/include/raft/core/bitset.hpp index 0df12f25e6..3608ee43fa 100644 --- a/cpp/include/raft/core/bitset.hpp +++ b/cpp/include/raft/core/bitset.hpp @@ -20,6 +20,7 @@ #include #include #include +#include namespace raft::core { /** @@ -89,7 +90,10 @@ struct bitset_view { /** * @brief Get the number of elements used by the bitset representation. */ - inline _RAFT_HOST_DEVICE auto n_elements() const -> index_t; + inline _RAFT_HOST_DEVICE auto n_elements() const -> index_t + { + return raft::div_rounding_up_safe(bitset_len_, bitset_element_size); + } inline auto to_mdspan() -> raft::device_vector_view { @@ -173,7 +177,10 @@ struct bitset { /** * @brief Get the number of elements used by the bitset representation. */ - inline auto n_elements() const -> index_t; + inline auto n_elements() const -> index_t + { + return raft::div_rounding_up_safe(bitset_len_, bitset_element_size); + } /** @brief Get an mdspan view of the current bitset */ inline auto to_mdspan() -> raft::device_vector_view From 20718412dfa2026b94de2de2e822b73fe26a3bbc Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Tue, 17 Sep 2024 11:56:06 -0700 Subject: [PATCH 15/31] Allow coo_sort to work on int64_t indices (#2432) Authors: - Ben Frederickson (https://github.com/benfred) Approvers: - Divye Gala (https://github.com/divyegala) URL: https://github.com/rapidsai/raft/pull/2432 --- cpp/include/raft/sparse/op/detail/sort.h | 10 +++++----- cpp/include/raft/sparse/op/sort.cuh | 14 +++++++------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/include/raft/sparse/op/detail/sort.h b/cpp/include/raft/sparse/op/detail/sort.h index 85ae825035..02287c2367 100644 --- a/cpp/include/raft/sparse/op/detail/sort.h +++ b/cpp/include/raft/sparse/op/detail/sort.h @@ -68,8 +68,8 @@ struct TupleComp { * @param vals vals array from coo matrix * @param stream: cuda stream to use */ -template -void coo_sort(int m, int n, int nnz, int* rows, int* cols, T* vals, cudaStream_t stream) +template +void coo_sort(IdxT m, IdxT n, IdxT nnz, IdxT* rows, IdxT* cols, T* vals, cudaStream_t stream) { auto coo_indices = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); @@ -83,10 +83,10 @@ void coo_sort(int m, int n, int nnz, int* rows, int* cols, T* vals, cudaStream_t * @param in: COO to sort by row * @param stream: the cuda stream to use */ -template -void coo_sort(COO* const in, cudaStream_t stream) +template +void coo_sort(COO* const in, cudaStream_t stream) { - coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), stream); + coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), stream); } /** diff --git a/cpp/include/raft/sparse/op/sort.cuh b/cpp/include/raft/sparse/op/sort.cuh index c6c3c2e220..5b8a792429 100644 --- a/cpp/include/raft/sparse/op/sort.cuh +++ b/cpp/include/raft/sparse/op/sort.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -37,8 +37,8 @@ namespace op { * @param vals vals array from coo matrix * @param stream: cuda stream to use */ -template -void coo_sort(int m, int n, int nnz, int* rows, int* cols, T* vals, cudaStream_t stream) +template +void coo_sort(IdxT m, IdxT n, IdxT nnz, IdxT* rows, IdxT* cols, T* vals, cudaStream_t stream) { detail::coo_sort(m, n, nnz, rows, cols, vals, stream); } @@ -49,10 +49,10 @@ void coo_sort(int m, int n, int nnz, int* rows, int* cols, T* vals, cudaStream_t * @param in: COO to sort by row * @param stream: the cuda stream to use */ -template -void coo_sort(COO* const in, cudaStream_t stream) +template +void coo_sort(COO* const in, cudaStream_t stream) { - coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), stream); + coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), stream); } /** @@ -75,4 +75,4 @@ void coo_sort_by_weight( }; // end NAMESPACE sparse }; // end NAMESPACE raft -#endif \ No newline at end of file +#endif From e60cd7d063c3ab7830959d5aef465461f2daf99f Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 18 Sep 2024 03:05:44 -0500 Subject: [PATCH 16/31] Update to flake8 7.1.1. (#2435) We need to update flake8 to fix a false-positive that appears with older flake8 versions on Python 3.12. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/raft/pull/2435 --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 88be628e55..458d8b1b51 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,7 +18,7 @@ repos: # Explicitly specify the pyproject.toml at the repo root, not per-project. 
args: ["--config", "pyproject.toml"] - repo: https://github.com/PyCQA/flake8 - rev: 5.0.4 + rev: 7.1.1 hooks: - id: flake8 args: ["--config=.flake8"] From c5dcba0333baa4daf41292a8901044442c8d79c8 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 18 Sep 2024 14:43:38 -0500 Subject: [PATCH 17/31] Use CI workflow branch 'branch-24.10' again [skip ci] (#2437) --- .github/workflows/build.yaml | 16 ++++++++-------- .github/workflows/pr.yaml | 26 +++++++++++++------------- .github/workflows/test.yaml | 10 +++++----- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 037d7cc05f..2b0ae5099c 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-pylibraft: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -88,7 +88,7 @@ jobs: package-name: pylibraft wheel-build-raft-dask: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -98,7 +98,7 @@ jobs: wheel-publish-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 26c6928f77..381ca6b378 100644 --- 
a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -25,29 +25,29 @@ jobs: - wheel-tests-raft-dask - devcontainer secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: pull-request node_type: cpu16 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: build_type: pull-request enable_check_symbols: true @@ -55,19 +55,19 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -77,34 +77,34 @@ jobs: wheel-build-pylibraft: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: ci/build_wheel_pylibraft.sh wheel-tests-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: pull-request script: ci/test_wheel_pylibraft.sh wheel-build-raft-dask: needs: wheel-tests-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: "ci/build_wheel_raft_dask.sh" wheel-tests-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: pull-request script: ci/test_wheel_raft_dask.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@python-3.12 + uses: 
rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 with: arch: '["amd64"]' cuda: '["12.5"]' diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 90518c1132..ad0456d526 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -26,7 +26,7 @@ jobs: symbol_exclusions: raft_cutlass conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -42,7 +42,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibraft: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: script: ci/test_wheel_pylibraft.sh wheel-tests-raft-dask: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} From cd02a8f2b8dc22ec2115e80e559a4ab98d402c6e Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Fri, 20 Sep 2024 13:05:44 -0400 Subject: [PATCH 18/31] Fix sed syntax [skip-ci] (#2441) --- ci/release/update-version.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 075eb896f2..5bb98511cf 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -58,7 +58,7 @@ for FILE in dependencies.yaml conda/environments/*.yaml; do sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}" done for DEP in "${UCXX_DEPENDENCIES[@]}"; do - sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_UCXX_SHORT_TAG_PEP440}.*/,>=0.0.0a0/g" "${FILE}" + sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_UCXX_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}" done done for FILE in python/*/pyproject.toml; do From f49567e1fb9f722f8864bc84880745b9b776eb61 Mon Sep 17 00:00:00 2001 From: Micka Date: Sat, 21 Sep 2024 01:09:00 +0200 Subject: [PATCH 19/31] Use runtime check of cudart version for eig (#2430) Authors: - Micka (https://github.com/lowener) Approvers: - Bradley Dice (https://github.com/bdice) - Corey J. 
Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/2430 --- cpp/include/raft/linalg/detail/eig.cuh | 31 ++++++++++++++------------ 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/cpp/include/raft/linalg/detail/eig.cuh b/cpp/include/raft/linalg/detail/eig.cuh index ba7ed3dcdf..561187178c 100644 --- a/cpp/include/raft/linalg/detail/eig.cuh +++ b/cpp/include/raft/linalg/detail/eig.cuh @@ -95,16 +95,19 @@ void eigDC(raft::resources const& handle, return; #endif -#if CUDART_VERSION <= 12040 - // Use a new stream instead of `cudaStreamPerThread` to avoid cusolver bug # 4580093. + int cudart_version = 0; + RAFT_CUDA_TRY(cudaRuntimeGetVersion(&cudart_version)); + cudaStream_t stream_new; + cudaEvent_t sync_event = resource::detail::get_cuda_stream_sync_event(handle); rmm::cuda_stream stream_new_wrapper; - cudaStream_t stream_new = stream_new_wrapper.value(); - cudaEvent_t sync_event = resource::detail::get_cuda_stream_sync_event(handle); - RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream)); - RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_new, sync_event)); -#else - cudaStream_t stream_new = stream; -#endif + if (cudart_version < 12050) { + // Use a new stream instead of `cudaStreamPerThread` to avoid cusolver bug # 4580093. + stream_new = stream_new_wrapper.value(); + RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream)); + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_new, sync_event)); + } else { + stream_new = stream; + } cusolverDnHandle_t cusolverH = resource::get_cusolver_dn_handle(handle); cusolverDnParams_t dn_params = nullptr; @@ -152,11 +155,11 @@ void eigDC(raft::resources const& handle, "eig.cuh: eigensolver couldn't converge to a solution. " "This usually occurs when some of the features do not vary enough."); -#if CUDART_VERSION <= 12040 - // Synchronize the created stream with the original stream before return - RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream_new)); - RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, sync_event)); -#endif + if (cudart_version < 12050) { + // Synchronize the created stream with the original stream before return + RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream_new)); + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, sync_event)); + } } enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT }; From b79f15d2f229849bc02425b2e4ffd7bd3db89d4c Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 23 Sep 2024 16:09:01 -0500 Subject: [PATCH 20/31] Update fmt (to 11.0.2) and spdlog (to 1.14.1). (#2433) * Update fmt (to 11.0.2) and spdlog (to 1.14.1). * use rmm and ucxx CI artifacts * try using librmm wheels * try again to use rmm wheels * rapids-get-pr-wheel-artifact was missing RAPIDS_PY_WHEEL_NAME * ok you do not need to provide the Python slug yourself for rapids-get-pr-wheel-artifact * constraints need 'file://' protocol * try suppressing unreachable-code diagnostics from nvcc (this should be narrowed down / upstreamed before merging) * fix checks * Revert "try suppressing unreachable-code diagnostics from nvcc (this should be narrowed down / upstreamed before merging)" This reverts commit 3ba2201d678cc8befdfcaa0e89630a95a00a78b2. 
* copyright * move rapids-cmake overrides [skip ci] * kick off a build * fix dependency graph * devcontainer * run all CI * remove testing-only changes [skip ci] --- conda/recipes/libraft/conda_build_config.yaml | 4 ++-- .../raft-ann-bench-cpu/conda_build_config.yaml | 4 ++-- cpp/cmake/thirdparty/get_spdlog.cmake | 11 +---------- 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index 00b133c821..5c0047fb9c 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -73,7 +73,7 @@ cuda11_cuda_profiler_api_run_version: - ">=11.4.240,<12" spdlog_version: - - ">=1.12.0,<1.13" + - ">=1.14.1,<1.15" fmt_version: - - ">=10.1.1,<11" + - ">=11.0.2,<12" diff --git a/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml b/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml index 70d1f0490e..ed6f708e14 100644 --- a/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml +++ b/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml @@ -23,7 +23,7 @@ nlohmann_json_version: - ">=3.11.2" spdlog_version: - - ">=1.12.0,<1.13" + - ">=1.14.1,<1.15" fmt_version: - - ">=10.1.1,<11" + - ">=11.0.2,<12" diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake index 7be7804c7e..57e38c2638 100644 --- a/cpp/cmake/thirdparty/get_spdlog.cmake +++ b/cpp/cmake/thirdparty/get_spdlog.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -19,15 +19,6 @@ function(find_and_configure_spdlog) rapids_cpm_spdlog(FMT_OPTION "EXTERNAL_FMT_HO" INSTALL_EXPORT_SET rmm-exports) rapids_export_package(BUILD spdlog rmm-exports) - if(spdlog_ADDED) - rapids_export( - BUILD spdlog - EXPORT_SET spdlog - GLOBAL_TARGETS spdlog spdlog_header_only - NAMESPACE spdlog::) - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root(BUILD spdlog [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET rmm-exports) - endif() endfunction() find_and_configure_spdlog() \ No newline at end of file From 878cefc923f25df07fc362126d1c269bdff30c0c Mon Sep 17 00:00:00 2001 From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com> Date: Tue, 24 Sep 2024 14:10:57 -0500 Subject: [PATCH 21/31] update update-version.sh to use packaging lib (#2447) --- ci/release/update-version.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 5bb98511cf..032b88b4aa 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -25,8 +25,8 @@ NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} NEXT_UCXX_SHORT_TAG="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})" # Need to distutils-normalize the original version -NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") -NEXT_UCXX_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_UCXX_SHORT_TAG}'))") +NEXT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_SHORT_TAG}'))") +NEXT_UCXX_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_UCXX_SHORT_TAG}'))") echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" From 17757b84c38dfa671abe5e8794fd2bd9bc636772 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 25 Sep 2024 13:49:36 -0500 Subject: [PATCH 22/31] Switch traceback to `native` (#2446) In cudf we have observed a ~10% speed up of pytest suite execution by switching pytest traceback to `--native`: ``` currently: 102474 passed, 2117 skipped, 902 xfailed in 892.16s (0:14:52) --tb=short: 102474 passed, 2117 skipped, 902 xfailed in 898.99s (0:14:58) --tb=no: 102474 passed, 2117 skipped, 902 xfailed in 815.98s (0:13:35) --tb=native: 102474 passed, 2117 skipped, 902 xfailed in 820.92s (0:13:40) ``` This PR makes a similar change to `raft` repo. xref: https://github.com/rapidsai/cudf/pull/16851 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/2446 --- python/pylibraft/pylibraft/test/pytest.ini | 5 +++++ python/raft-dask/pytest.ini | 1 + python/raft-dask/raft_dask/test/pytest.ini | 5 +++++ 3 files changed, 11 insertions(+) create mode 100644 python/pylibraft/pylibraft/test/pytest.ini create mode 100644 python/raft-dask/raft_dask/test/pytest.ini diff --git a/python/pylibraft/pylibraft/test/pytest.ini b/python/pylibraft/pylibraft/test/pytest.ini new file mode 100644 index 0000000000..bf70c06f84 --- /dev/null +++ b/python/pylibraft/pylibraft/test/pytest.ini @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +[pytest] +addopts = --tb=native + diff --git a/python/raft-dask/pytest.ini b/python/raft-dask/pytest.ini index 2467e2089a..e09c2b173d 100644 --- a/python/raft-dask/pytest.ini +++ b/python/raft-dask/pytest.ini @@ -10,3 +10,4 @@ markers = nccl: marks a test as using NCCL ucx: marks a test as using UCX-Py ucxx: marks a test as using UCXX +addopts = --tb=native diff --git a/python/raft-dask/raft_dask/test/pytest.ini b/python/raft-dask/raft_dask/test/pytest.ini new file mode 100644 index 0000000000..bf70c06f84 --- /dev/null +++ b/python/raft-dask/raft_dask/test/pytest.ini @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native + From 0f8b09720054e5cf7a65dddfa02a2f06e7ba8adb Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Wed, 25 Sep 2024 20:53:54 -0400 Subject: [PATCH 23/31] Disable NN Descent Batch tests temporarily (#2453) Linked issue https://github.com/rapidsai/raft/issues/2450 Authors: - Divye Gala (https://github.com/divyegala) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/2453 --- cpp/test/CMakeLists.txt | 4 +++- cpp/test/neighbors/ann_nn_descent.cuh | 18 ++++++++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index a497e6d3ba..5d504d2100 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -440,7 +440,9 @@ if(BUILD_TESTS) neighbors/ann_nn_descent/test_float_uint32_t.cu neighbors/ann_nn_descent/test_int8_t_uint32_t.cu neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu - neighbors/ann_nn_descent/test_batch_float_uint32_t.cu + # TODO: Investigate why this test is failing Reference issue + # https://github.com/rapidsai/raft/issues/2450 + # neighbors/ann_nn_descent/test_batch_float_uint32_t.cu LIB EXPLICIT_INSTANTIATE_ONLY GPUS diff --git a/cpp/test/neighbors/ann_nn_descent.cuh b/cpp/test/neighbors/ann_nn_descent.cuh index 2f9d4e252b..5070d83b15 100644 --- a/cpp/test/neighbors/ann_nn_descent.cuh +++ b/cpp/test/neighbors/ann_nn_descent.cuh @@ -318,13 +318,15 @@ const std::vector inputs = raft::util::itertools::product inputsBatch = - raft::util::itertools::product( - {std::make_pair(0.9, 3lu), std::make_pair(0.9, 2lu)}, // min_recall, n_clusters - {4000, 5000}, // n_rows - {192, 512}, // dim - {32, 64}, // graph_degree - {raft::distance::DistanceType::L2Expanded}, - {false, true}); +// TODO: Investigate why this test is failing +// Reference issue https://github.com/rapidsai/raft/issues/2450 +// const std::vector inputsBatch = +// raft::util::itertools::product( +// {std::make_pair(0.9, 3lu), std::make_pair(0.9, 2lu)}, // min_recall, n_clusters +// {4000, 5000}, // n_rows +// {192, 512}, // dim +// {32, 64}, // graph_degree +// {raft::distance::DistanceType::L2Expanded}, +// {false, true}); } // namespace raft::neighbors::experimental::nn_descent From 2e705fbf5bfa9af148871ba509ac55593da9a098 Mon Sep 17 00:00:00 2001 From: Paul Taylor <178183+trxcllnt@users.noreply.github.com> Date: Wed, 25 Sep 2024 23:01:05 -0700 Subject: [PATCH 24/31] Add missing `cuda_suffixed: true` (#2440) This change ensures `rmm-cuXX` is not in the package dependencies when building the pylibraft wheel with `-C "rapidsai.matrix-entry=cuda_suffixed=false`. 
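
For illustration only, a minimal sketch of how such a build might be invoked locally (assuming a checkout of this repo and a pip recent enough to support `-C`/`--config-settings`; the path and flags below are illustrative, not the exact CI command):

```sh
# Hypothetical local invocation: build the pylibraft wheel without the
# CUDA-suffixed rmm dependency (e.g. rmm-cu12), passing the matrix entry
# quoted above through to the build backend via pip config settings.
pip wheel python/pylibraft --no-deps -C "rapidsai.matrix-entry=cuda_suffixed=false"
```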
Authors: - Paul Taylor (https://github.com/trxcllnt) - James Lamb (https://github.com/jameslamb) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/raft/pull/2440 --- dependencies.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index 8f5c69245f..91fb176b56 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -499,10 +499,14 @@ dependencies: - *cuda_python - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - *rmm_cu12 - - matrix: {cuda: "11.*"} + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - *rmm_cu11 - {matrix: null, packages: [*rmm_unsuffixed]} From 0284b424367f07220de133715043a963b341485a Mon Sep 17 00:00:00 2001 From: rhdong Date: Thu, 26 Sep 2024 07:42:29 -0700 Subject: [PATCH 25/31] [Feat] add `repeat`, `sparsity`, `eval_n_elements` APIs to `bitset` (#2439) - This PR is a part of the feature that applies the prefilter brute-force in Cagra. Authors: - rhdong (https://github.com/rhdong) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Micka (https://github.com/lowener) URL: https://github.com/rapidsai/raft/pull/2439 --- cpp/bench/prims/util/popc.cu | 7 +- cpp/include/raft/core/bitset.cuh | 107 +++++++++++++++++++++++- cpp/include/raft/core/bitset.hpp | 76 +++++++++++++++++ cpp/include/raft/util/detail/popc.cuh | 6 +- cpp/include/raft/util/popc.cuh | 4 +- cpp/test/core/bitset.cu | 114 +++++++++++++++++++++++--- cpp/test/util/popc.cu | 22 +++-- 7 files changed, 311 insertions(+), 25 deletions(-) diff --git a/cpp/bench/prims/util/popc.cu b/cpp/bench/prims/util/popc.cu index 249dc13d1e..c6249fb2bd 100644 --- a/cpp/bench/prims/util/popc.cu +++ b/cpp/bench/prims/util/popc.cu @@ -89,10 +89,9 @@ struct popc_bench : public fixture { auto bits_view = raft::make_device_vector_view(bits_d.data_handle(), bits_d.size()); - index_t max_len = params.n_rows * params.n_cols; - auto max_len_view = raft::make_host_scalar_view(&max_len); - auto nnz_actual_view = - nnz_actual_d.view(); // raft::make_device_scalar_view(nnz_actual_d.data_handle()); + index_t max_len = params.n_rows * params.n_cols; + auto max_len_view = raft::make_host_scalar_view(&max_len); + auto nnz_actual_view = nnz_actual_d.view(); raft::popc(this->handle, bits_view, max_len_view, nnz_actual_view); }); } diff --git a/cpp/include/raft/core/bitset.cuh b/cpp/include/raft/core/bitset.cuh index b6e6128eca..d1bffdb81e 100644 --- a/cpp/include/raft/core/bitset.cuh +++ b/cpp/include/raft/core/bitset.cuh @@ -26,6 +26,8 @@ #include #include +#include + #include namespace raft::core { @@ -60,6 +62,109 @@ _RAFT_DEVICE void bitset_view::set(const index_t sample_index } } +template +void bitset_view::count(const raft::resources& res, + raft::device_scalar_view count_gpu_scalar) const +{ + auto max_len = raft::make_host_scalar_view(&bitset_len_); + auto values = raft::make_device_vector_view(bitset_ptr_, n_elements()); + raft::popc(res, values, max_len, count_gpu_scalar); +} + +template +RAFT_KERNEL bitset_repeat_kernel(const bitset_t* src, + bitset_t* output, + index_t src_bit_len, + index_t repeat_times) +{ + constexpr index_t bits_per_element = sizeof(bitset_t) * 8; + int output_idx = blockIdx.x * blockDim.x + threadIdx.x; + + index_t total_bits = src_bit_len * repeat_times; + index_t output_size = (total_bits + bits_per_element - 1) / bits_per_element; + index_t src_size = (src_bit_len + bits_per_element - 1) / bits_per_element; 
+ + if (output_idx < output_size) { + bitset_t result = 0; + index_t bit_written = 0; + + index_t start_bit = output_idx * bits_per_element; + + while (bit_written < bits_per_element && start_bit + bit_written < total_bits) { + index_t bit_idx = (start_bit + bit_written) % src_bit_len; + index_t src_word_idx = bit_idx / bits_per_element; + index_t src_offset = bit_idx % bits_per_element; + + index_t remaining_bits = min(bits_per_element - bit_written, src_bit_len - bit_idx); + + bitset_t src_value = (src[src_word_idx] >> src_offset); + + if (src_offset + remaining_bits > bits_per_element) { + bitset_t next_value = src[(src_word_idx + 1) % src_size]; + src_value |= (next_value << (bits_per_element - src_offset)); + } + src_value &= ((bitset_t{1} << remaining_bits) - 1); + result |= (src_value << bit_written); + bit_written += remaining_bits; + } + output[output_idx] = result; + } +} + +template +void bitset_repeat(raft::resources const& handle, + const bitset_t* d_src, + bitset_t* d_output, + index_t src_bit_len, + index_t repeat_times) +{ + if (src_bit_len == 0 || repeat_times == 0) return; + auto stream = resource::get_cuda_stream(handle); + + constexpr index_t bits_per_element = sizeof(bitset_t) * 8; + const index_t total_bits = src_bit_len * repeat_times; + const index_t output_size = (total_bits + bits_per_element - 1) / bits_per_element; + + int threadsPerBlock = 128; + int blocksPerGrid = (output_size + threadsPerBlock - 1) / threadsPerBlock; + bitset_repeat_kernel<<>>( + d_src, d_output, src_bit_len, repeat_times); + + return; +} + +template +void bitset_view::repeat(const raft::resources& res, + index_t times, + bitset_t* output_device_ptr) const +{ + auto thrust_policy = raft::resource::get_thrust_policy(res); + constexpr index_t bits_per_element = sizeof(bitset_t) * 8; + + if (bitset_len_ % bits_per_element == 0) { + index_t num_elements_to_copy = bitset_len_ / bits_per_element; + + for (index_t i = 0; i < times; ++i) { + raft::copy(output_device_ptr + i * num_elements_to_copy, + bitset_ptr_, + num_elements_to_copy, + raft::resource::get_cuda_stream(res)); + } + } else { + bitset_repeat(res, bitset_ptr_, output_device_ptr, bitset_len_, times); + } +} + +template +double bitset_view::sparsity(const raft::resources& res) const +{ + index_t size_h = this->size(); + if (0 == size_h) { return static_cast(1.0); } + index_t count_h = this->count(res); + + return static_cast((1.0 * (size_h - count_h)) / (1.0 * size_h)); +} + template bitset::bitset(const raft::resources& res, raft::device_vector_view mask_index, @@ -155,7 +260,7 @@ template void bitset::count(const raft::resources& res, raft::device_scalar_view count_gpu_scalar) { - auto max_len = raft::make_host_scalar_view(&bitset_len_); + auto max_len = raft::make_host_scalar_view(&bitset_len_); auto values = raft::make_device_vector_view(bitset_.data(), n_elements()); raft::popc(res, values, max_len, count_gpu_scalar); diff --git a/cpp/include/raft/core/bitset.hpp b/cpp/include/raft/core/bitset.hpp index 3608ee43fa..be828def87 100644 --- a/cpp/include/raft/core/bitset.hpp +++ b/cpp/include/raft/core/bitset.hpp @@ -22,6 +22,8 @@ #include #include +#include + namespace raft::core { /** * @defgroup bitset Bitset @@ -103,6 +105,80 @@ struct bitset_view { { return raft::make_device_vector_view(bitset_ptr_, n_elements()); } + /** + * @brief Returns the number of bits set to true in count_gpu_scalar. 
+ * + * @param[in] res RAFT resources + * @param[out] count_gpu_scalar Device scalar to store the count + */ + void count(const raft::resources& res, raft::device_scalar_view count_gpu_scalar) const; + /** + * @brief Returns the number of bits set to true. + * + * @param res RAFT resources + * @return index_t Number of bits set to true + */ + auto count(const raft::resources& res) const -> index_t + { + auto count_gpu_scalar = raft::make_device_scalar(res, 0.0); + count(res, count_gpu_scalar.view()); + index_t count_cpu = 0; + raft::update_host( + &count_cpu, count_gpu_scalar.data_handle(), 1, resource::get_cuda_stream(res)); + resource::sync_stream(res); + return count_cpu; + } + + /** + * @brief Repeats the bitset data and copies it to the output device pointer. + * + * This function takes the original bitset data stored in the device memory + * and repeats it a specified number of times into a new location in the device memory. + * The bits are copied bit-by-bit to ensure that even if the number of bits (bitset_len_) + * is not a multiple of the bitset element size (e.g., 32 for uint32_t), the bits are + * tightly packed without any gaps between rows. + * + * @param res RAFT resources for managing CUDA streams and execution policies. + * @param times Number of times the bitset data should be repeated in the output. + * @param output_device_ptr Device pointer where the repeated bitset data will be stored. + * + * The caller must ensure that the output device pointer has enough memory allocated + * to hold `times * bitset_len` bits, where `bitset_len` is the number of bits in the original + * bitset. This function uses Thrust parallel algorithms to efficiently perform the operation on + * the GPU. + */ + void repeat(const raft::resources& res, index_t times, bitset_t* output_device_ptr) const; + + /** + * @brief Calculate the sparsity (fraction of 0s) of the bitset. + * + * This function computes the sparsity of the bitset, defined as the ratio of unset bits (0s) + * to the total number of bits in the set. If the total number of bits is zero, the function + * returns 1.0, indicating the set is fully sparse. + * + * @param res RAFT resources for managing CUDA streams and execution policies. + * @return double The sparsity of the bitset, i.e., the fraction of unset bits. + * + * This API will synchronize on the stream of `res`. + */ + double sparsity(const raft::resources& res) const; + + /** + * @brief Calculates the number of `bitset_t` elements required to store a bitset. + * + * This function computes the number of `bitset_t` elements needed to store a bitset, ensuring + * that all bits are accounted for. If the bitset length is not a multiple of the `bitset_t` size + * (in bits), the calculation rounds up to include the remaining bits in an additional `bitset_t` + * element. + * + * @param bitset_len The total length of the bitset in bits. + * @return size_t The number of `bitset_t` elements required to store the bitset. 
+ */ + static inline size_t eval_n_elements(size_t bitset_len) + { + const size_t bits_per_element = sizeof(bitset_t) * 8; + return (bitset_len + bits_per_element - 1) / bits_per_element; + } private: bitset_t* bitset_ptr_; diff --git a/cpp/include/raft/util/detail/popc.cuh b/cpp/include/raft/util/detail/popc.cuh index 20b4814216..f335be6fd0 100644 --- a/cpp/include/raft/util/detail/popc.cuh +++ b/cpp/include/raft/util/detail/popc.cuh @@ -36,12 +36,12 @@ namespace raft::detail { */ template void popc(const raft::resources& res, - device_vector_view values, - raft::host_scalar_view max_len, + device_vector_view values, + raft::host_scalar_view max_len, raft::device_scalar_view counter) { auto values_size = values.size(); - auto values_matrix = raft::make_device_matrix_view( + auto values_matrix = raft::make_device_matrix_view( values.data_handle(), values_size, 1); auto counter_vector = raft::make_device_vector_view(counter.data_handle(), 1); diff --git a/cpp/include/raft/util/popc.cuh b/cpp/include/raft/util/popc.cuh index 153694e45e..d4bc01e274 100644 --- a/cpp/include/raft/util/popc.cuh +++ b/cpp/include/raft/util/popc.cuh @@ -31,8 +31,8 @@ namespace raft { */ template void popc(const raft::resources& res, - device_vector_view values, - raft::host_scalar_view max_len, + device_vector_view values, + raft::host_scalar_view max_len, raft::device_scalar_view counter) { detail::popc(res, values, max_len, counter); diff --git a/cpp/test/core/bitset.cu b/cpp/test/core/bitset.cu index b799297e8c..ac601274c1 100644 --- a/cpp/test/core/bitset.cu +++ b/cpp/test/core/bitset.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -32,12 +32,13 @@ struct test_spec_bitset { uint64_t bitset_len; uint64_t mask_len; uint64_t query_len; + uint64_t repeat_times; }; auto operator<<(std::ostream& os, const test_spec_bitset& ss) -> std::ostream& { os << "bitset{bitset_len: " << ss.bitset_len << ", mask_len: " << ss.mask_len - << ", query_len: " << ss.query_len << "}"; + << ", query_len: " << ss.query_len << ", repeat_times: " << ss.repeat_times << "}"; return os; } @@ -80,6 +81,48 @@ void flip_cpu_bitset(std::vector& bitset) } } +template +void repeat_cpu_bitset(std::vector& input, + size_t input_bits, + size_t repeat, + std::vector& output) +{ + const size_t output_bits = input_bits * repeat; + const size_t output_units = (output_bits + sizeof(bitset_t) * 8 - 1) / (sizeof(bitset_t) * 8); + + std::memset(output.data(), 0, output_units * sizeof(bitset_t)); + + size_t output_bit_index = 0; + + for (size_t r = 0; r < repeat; ++r) { + for (size_t i = 0; i < input_bits; ++i) { + size_t input_unit_index = i / (sizeof(bitset_t) * 8); + size_t input_bit_offset = i % (sizeof(bitset_t) * 8); + bool bit = (input[input_unit_index] >> input_bit_offset) & 1; + + size_t output_unit_index = output_bit_index / (sizeof(bitset_t) * 8); + size_t output_bit_offset = output_bit_index % (sizeof(bitset_t) * 8); + + output[output_unit_index] |= (static_cast(bit) << output_bit_offset); + + ++output_bit_index; + } + } +} + +template +double sparsity_cpu_bitset(std::vector& data, size_t total_bits) +{ + size_t one_count = 0; + for (size_t i = 0; i < total_bits; ++i) { + size_t unit_index = i / (sizeof(bitset_t) * 8); + size_t bit_offset = i % (sizeof(bitset_t) * 8); + bool bit = (data[unit_index] >> bit_offset) & 1; + if (bit == 1) { ++one_count; } + } + return static_cast((total_bits - one_count) / (1.0 * total_bits)); +} + template class BitsetTest : public testing::TestWithParam { protected: @@ -87,13 +130,19 @@ class BitsetTest : public testing::TestWithParam { const test_spec_bitset spec; std::vector bitset_result; std::vector bitset_ref; + std::vector bitset_repeat_ref; + std::vector bitset_repeat_result; raft::resources res; public: explicit BitsetTest() : spec(testing::TestWithParam::GetParam()), bitset_result(raft::ceildiv(spec.bitset_len, uint64_t(bitset_element_size))), - bitset_ref(raft::ceildiv(spec.bitset_len, uint64_t(bitset_element_size))) + bitset_ref(raft::ceildiv(spec.bitset_len, uint64_t(bitset_element_size))), + bitset_repeat_ref( + raft::ceildiv(spec.bitset_len * spec.repeat_times, uint64_t(bitset_element_size))), + bitset_repeat_result( + raft::ceildiv(spec.bitset_len * spec.repeat_times, uint64_t(bitset_element_size))) { } @@ -145,6 +194,50 @@ class BitsetTest : public testing::TestWithParam { resource::sync_stream(res, stream); ASSERT_TRUE(hostVecMatch(bitset_ref, bitset_result, raft::Compare())); + // test sparsity, repeat and eval_n_elements + { + auto my_bitset_view = my_bitset.view(); + auto sparsity_result = my_bitset_view.sparsity(res); + auto sparsity_ref = sparsity_cpu_bitset(bitset_ref, size_t(spec.bitset_len)); + ASSERT_EQ(sparsity_result, sparsity_ref); + + auto eval_n_elements = + bitset_view::eval_n_elements(spec.bitset_len * spec.repeat_times); + ASSERT_EQ(bitset_repeat_ref.size(), eval_n_elements); + + auto repeat_device = raft::make_device_vector(res, eval_n_elements); + RAFT_CUDA_TRY(cudaMemsetAsync( + repeat_device.data_handle(), 0, eval_n_elements * sizeof(bitset_t), stream)); + repeat_cpu_bitset( + bitset_ref, size_t(spec.bitset_len), size_t(spec.repeat_times), bitset_repeat_ref); + + 
my_bitset_view.repeat(res, index_t(spec.repeat_times), repeat_device.data_handle()); + + ASSERT_EQ(bitset_repeat_ref.size(), repeat_device.size()); + update_host( + bitset_repeat_result.data(), repeat_device.data_handle(), repeat_device.size(), stream); + ASSERT_EQ(bitset_repeat_ref.size(), bitset_repeat_result.size()); + + index_t errors = 0; + static constexpr index_t len_per_item = sizeof(bitset_t) * 8; + bitset_t tail_len = (index_t(spec.bitset_len * spec.repeat_times) % len_per_item); + bitset_t tail_mask = + tail_len ? (bitset_t)((bitset_t{1} << tail_len) - bitset_t{1}) : ~bitset_t{0}; + for (index_t i = 0; i < bitset_repeat_ref.size(); i++) { + if (i == bitset_repeat_ref.size() - 1) { + errors += (bitset_repeat_ref[i] & tail_mask) != (bitset_repeat_result[i] & tail_mask); + } else { + errors += (bitset_repeat_ref[i] != bitset_repeat_result[i]); + } + } + ASSERT_EQ(errors, 0); + + // recheck the sparsity after repeat + sparsity_result = + sparsity_cpu_bitset(bitset_repeat_result, size_t(spec.bitset_len * spec.repeat_times)); + ASSERT_EQ(sparsity_result, sparsity_ref); + } + // Flip the bitset and re-test auto bitset_count = my_bitset.count(res); my_bitset.flip(res); @@ -167,13 +260,14 @@ class BitsetTest : public testing::TestWithParam { } }; -auto inputs_bitset = ::testing::Values(test_spec_bitset{32, 5, 10}, - test_spec_bitset{100, 30, 10}, - test_spec_bitset{1024, 55, 100}, - test_spec_bitset{10000, 1000, 1000}, - test_spec_bitset{1 << 15, 1 << 3, 1 << 12}, - test_spec_bitset{1 << 15, 1 << 24, 1 << 13}, - test_spec_bitset{1 << 25, 1 << 23, 1 << 14}); +auto inputs_bitset = ::testing::Values(test_spec_bitset{32, 5, 10, 101}, + test_spec_bitset{100, 30, 10, 13}, + test_spec_bitset{1024, 55, 100, 1}, + test_spec_bitset{10000, 1000, 1000, 100}, + test_spec_bitset{1 << 15, 1 << 3, 1 << 12, 5}, + test_spec_bitset{1 << 15, 1 << 24, 1 << 13, 3}, + test_spec_bitset{1 << 25, 1 << 23, 1 << 14, 3}, + test_spec_bitset{1 << 25, 1 << 23, 1 << 14, 21}); using Uint16_32 = BitsetTest; TEST_P(Uint16_32, Run) { run(); } diff --git a/cpp/test/util/popc.cu b/cpp/test/util/popc.cu index c08faacb07..28eaad2fcb 100644 --- a/cpp/test/util/popc.cu +++ b/cpp/test/util/popc.cu @@ -76,7 +76,7 @@ class PopcTest : public ::testing::TestWithParam> { index_t bit_position = index % (8 * sizeof(bits_t)); if (((element >> bit_position) & 1) == 0) { - element |= (static_cast(1) << bit_position); + element |= (static_cast(1) << bit_position); num_ones--; } } @@ -101,7 +101,7 @@ class PopcTest : public ::testing::TestWithParam> { raft::make_device_vector_view(bits_d.data(), bits_d.size()); index_t max_len = params.n_rows * params.n_cols; - auto max_len_view = raft::make_host_scalar_view(&max_len); + auto max_len_view = raft::make_host_scalar_view(&max_len); index_t nnz_actual_h = 0; rmm::device_scalar nnz_actual_d(0, stream); @@ -123,8 +123,17 @@ class PopcTest : public ::testing::TestWithParam> { index_t nnz_expected; }; -using PopcTestI32 = PopcTest; -TEST_P(PopcTestI32, Result) { Run(); } +using PopcTestI32_U32 = PopcTest; +TEST_P(PopcTestI32_U32, Result) { Run(); } + +using PopcTestI32_U64 = PopcTest; +TEST_P(PopcTestI32_U64, Result) { Run(); } + +using PopcTestI32_U16 = PopcTest; +TEST_P(PopcTestI32_U16, Result) { Run(); } + +using PopcTestI32_U8 = PopcTest; +TEST_P(PopcTestI32_U8, Result) { Run(); } template const std::vector> popc_inputs = { @@ -154,6 +163,9 @@ const std::vector> popc_inputs = { {2, 33, 0.2}, }; -INSTANTIATE_TEST_CASE_P(PopcTest, PopcTestI32, ::testing::ValuesIn(popc_inputs)); 
+INSTANTIATE_TEST_CASE_P(PopcTest, PopcTestI32_U32, ::testing::ValuesIn(popc_inputs)); +INSTANTIATE_TEST_CASE_P(PopcTest, PopcTestI32_U64, ::testing::ValuesIn(popc_inputs)); +INSTANTIATE_TEST_CASE_P(PopcTest, PopcTestI32_U16, ::testing::ValuesIn(popc_inputs)); +INSTANTIATE_TEST_CASE_P(PopcTest, PopcTestI32_U8, ::testing::ValuesIn(popc_inputs)); } // namespace raft From f37c41c54fc64a4e3689e5a61851ba3821800fee Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 26 Sep 2024 09:45:11 -0500 Subject: [PATCH 26/31] bump NCCL floor to 2.18.1.1 (#2443) Contributes to https://github.com/rapidsai/build-planning/issues/102 Some RAPIDS libraries are using `ncclCommSplit()`, which was introduced in `nccl==2.18.1.1`. This is part of a series of PRs across RAPIDS updating libraries' pins to `nccl>=2.18.1.1` to ensure they get a new-enough version that supports that. Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/raft/pull/2443 --- conda/environments/all_cuda-118_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/environments/bench_ann_cuda-118_arch-aarch64.yaml | 2 +- conda/environments/bench_ann_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/bench_ann_cuda-120_arch-aarch64.yaml | 2 +- conda/environments/bench_ann_cuda-120_arch-x86_64.yaml | 2 +- conda/recipes/libraft/conda_build_config.yaml | 2 +- conda/recipes/raft-ann-bench/conda_build_config.yaml | 2 +- conda/recipes/raft-dask/conda_build_config.yaml | 3 +++ conda/recipes/raft-dask/meta.yaml | 4 ++-- dependencies.yaml | 2 +- 13 files changed, 16 insertions(+), 13 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index 462874a7e7..de4eb7e690 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -36,7 +36,7 @@ dependencies: - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 - libucxx==0.40.*,>=0.0.0a0 -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - numba>=0.57 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index cfd974a6a8..26f4c1efaa 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -36,7 +36,7 @@ dependencies: - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 - libucxx==0.40.*,>=0.0.0a0 -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - numba>=0.57 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index 82e391e9ae..692956502b 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -33,7 +33,7 @@ dependencies: - libcusolver-dev - libcusparse-dev - libucxx==0.40.*,>=0.0.0a0 -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - numba>=0.57 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 0389427d13..133d42bfee 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -33,7 +33,7 @@ dependencies: - libcusolver-dev - libcusparse-dev - libucxx==0.40.*,>=0.0.0a0 -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - numba>=0.57 - 
numpy>=1.23,<3.0a0 diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index eff1c56840..f99cedd627 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -32,7 +32,7 @@ dependencies: - libcusparse=11.7.5.86 - libucxx==0.40.*,>=0.0.0a0 - matplotlib -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - nlohmann_json>=3.11.2 - nvcc_linux-aarch64=11.8 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 87b19d2952..08aea32ab1 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -32,7 +32,7 @@ dependencies: - libcusparse=11.7.5.86 - libucxx==0.40.*,>=0.0.0a0 - matplotlib -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - nlohmann_json>=3.11.2 - nvcc_linux-64=11.8 diff --git a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml index ff3451c15c..572ad85ab5 100644 --- a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml @@ -29,7 +29,7 @@ dependencies: - libcusparse-dev - libucxx==0.40.*,>=0.0.0a0 - matplotlib -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - nlohmann_json>=3.11.2 - openblas diff --git a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml index 085e099ae8..9fd23edb03 100644 --- a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml @@ -29,7 +29,7 @@ dependencies: - libcusparse-dev - libucxx==0.40.*,>=0.0.0a0 - matplotlib -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - nlohmann_json>=3.11.2 - openblas diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index 5c0047fb9c..6dc06648bd 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -20,7 +20,7 @@ cmake_version: - ">=3.26.4,!=3.30.0" nccl_version: - - ">=2.9.9" + - ">=2.18.1.1" glog_version: - ">=0.6.0" diff --git a/conda/recipes/raft-ann-bench/conda_build_config.yaml b/conda/recipes/raft-ann-bench/conda_build_config.yaml index db0083b583..bdb4e883ea 100644 --- a/conda/recipes/raft-ann-bench/conda_build_config.yaml +++ b/conda/recipes/raft-ann-bench/conda_build_config.yaml @@ -20,7 +20,7 @@ cmake_version: - ">=3.26.4,!=3.30.0" nccl_version: - - ">=2.9.9" + - ">=2.18.1.1" glog_version: - ">=0.6.0" diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml index e6afed2890..58e8ec3c9e 100644 --- a/conda/recipes/raft-dask/conda_build_config.yaml +++ b/conda/recipes/raft-dask/conda_build_config.yaml @@ -24,3 +24,6 @@ ucxx_version: cmake_version: - ">=3.26.4,!=3.30.0" + +nccl_version: + - ">=2.18.1.1" diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml index 74b26b5935..bc13d352b7 100644 --- a/conda/recipes/raft-dask/meta.yaml +++ b/conda/recipes/raft-dask/meta.yaml @@ -50,7 +50,7 @@ requirements: {% endif %} - cuda-version ={{ cuda_version }} - cython >=3.0.0 - - nccl >=2.9.9 + - nccl {{ nccl_version }} - pylibraft {{ version }} - python x.x - rmm ={{ minor_version }} @@ -68,7 +68,7 @@ requirements: - dask-cuda ={{ minor_version }} - rapids-dask-dependency ={{ minor_version }} - joblib >=0.11 - - nccl >=2.9.9 + - nccl 
{{ nccl_version }} - pylibraft {{ version }} - python x.x - rmm ={{ minor_version }} diff --git a/dependencies.yaml b/dependencies.yaml index 91fb176b56..e833e8519a 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -170,7 +170,7 @@ dependencies: packages: - c-compiler - cxx-compiler - - nccl>=2.9.9 + - nccl>=2.18.1.1 - libucxx==0.40.*,>=0.0.0a0 specific: - output_types: conda From 704feb1a0249e73bb0cab96968b892e8d5823b46 Mon Sep 17 00:00:00 2001 From: rhdong Date: Thu, 26 Sep 2024 09:32:41 -0700 Subject: [PATCH 27/31] [Opt] Enforce the UT Coverity and add benchmark for `transpose` (#2438) Authors: - rhdong (https://github.com/rhdong) Approvers: - Artem M. Chirkin (https://github.com/achirkin) - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/2438 --- cpp/bench/prims/CMakeLists.txt | 1 + cpp/bench/prims/linalg/transpose.cu | 85 ++++ cpp/include/raft/linalg/detail/transpose.cuh | 67 ++- cpp/test/linalg/transpose.cu | 445 ++++++++++++++----- 4 files changed, 465 insertions(+), 133 deletions(-) create mode 100644 cpp/bench/prims/linalg/transpose.cu diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index 9d80cbaac2..52c63ad73b 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -132,6 +132,7 @@ if(BUILD_PRIMS_BENCH) linalg/reduce_rows_by_key.cu linalg/reduce.cu linalg/sddmm.cu + linalg/transpose.cu main.cpp ) diff --git a/cpp/bench/prims/linalg/transpose.cu b/cpp/bench/prims/linalg/transpose.cu new file mode 100644 index 0000000000..e60e50c125 --- /dev/null +++ b/cpp/bench/prims/linalg/transpose.cu @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include + +#include + +namespace raft::bench::linalg { + +template +struct transpose_input { + IdxT rows, cols; +}; + +template +inline auto operator<<(std::ostream& os, const transpose_input& p) -> std::ostream& +{ + os << p.rows << "#" << p.cols; + return os; +} + +template +struct TransposeBench : public fixture { + TransposeBench(const transpose_input& p) + : params(p), in(p.rows * p.cols, stream), out(p.rows * p.cols, stream) + { + raft::random::RngState rng{1234}; + raft::random::uniform(handle, rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0); + } + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + loop_on_state(state, [this]() { + auto input_view = + raft::make_device_matrix_view(in.data(), params.rows, params.cols); + auto output_view = raft::make_device_vector_view(out.data(), params.rows); + raft::linalg::transpose(handle, + input_view.data_handle(), + output_view.data_handle(), + params.rows, + params.cols, + handle.get_stream()); + }); + } + + private: + transpose_input params; + rmm::device_uvector in, out; +}; // struct TransposeBench + +const std::vector> transpose_inputs_i32 = + raft::util::itertools::product>({10, 128, 256, 512, 1024}, + {10000, 100000, 1000000}); + +RAFT_BENCH_REGISTER((TransposeBench), "", transpose_inputs_i32); +RAFT_BENCH_REGISTER((TransposeBench), "", transpose_inputs_i32); + +RAFT_BENCH_REGISTER((TransposeBench), "", transpose_inputs_i32); +RAFT_BENCH_REGISTER((TransposeBench), "", transpose_inputs_i32); + +} // namespace raft::bench::linalg diff --git a/cpp/include/raft/linalg/detail/transpose.cuh b/cpp/include/raft/linalg/detail/transpose.cuh index ec60aacc9c..c5f0544b5c 100644 --- a/cpp/include/raft/linalg/detail/transpose.cuh +++ b/cpp/include/raft/linalg/detail/transpose.cuh @@ -38,7 +38,9 @@ template RAFT_KERNEL transpose_half_kernel(IndexType n_rows, IndexType n_cols, const half* __restrict__ in, - half* __restrict__ out) + half* __restrict__ out, + const IndexType stride_in, + const IndexType stride_out) { __shared__ half tile[TILE_DIM][TILE_DIM + 1]; @@ -49,7 +51,7 @@ RAFT_KERNEL transpose_half_kernel(IndexType n_rows, for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { if (x < n_cols && (y + j) < n_rows) { - tile[threadIdx.y + j][threadIdx.x] = __ldg(&in[(y + j) * n_cols + x]); + tile[threadIdx.y + j][threadIdx.x] = __ldg(&in[(y + j) * stride_in + x]); } } __syncthreads(); @@ -59,7 +61,7 @@ RAFT_KERNEL transpose_half_kernel(IndexType n_rows, for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { if (x < n_rows && (y + j) < n_cols) { - out[(y + j) * n_rows + x] = tile[threadIdx.x][threadIdx.y + j]; + out[(y + j) * stride_out + x] = tile[threadIdx.x][threadIdx.y + j]; } } __syncthreads(); @@ -67,9 +69,33 @@ RAFT_KERNEL transpose_half_kernel(IndexType n_rows, } } +/** + * @brief Transposes a matrix stored in row-major order. + * + * This function transposes a matrix of half-precision floating-point numbers (`half`). + * Both the input (`in`) and output (`out`) matrices are assumed to be stored in row-major order. + * + * @tparam IndexType The type used for indexing the matrix dimensions (e.g., int). + * @param handle The RAFT resource handle which contains resources. + * @param n_rows The number of rows in the input matrix. + * @param n_cols The number of columns in the input matrix. + * @param in Pointer to the input matrix in row-major order. 
+ * @param out Pointer to the output matrix in row-major order, where the transposed matrix will be + * stored. + * @param stride_in The stride (number of elements between consecutive rows) for the input matrix. + * Default is 1, which means the input matrix is contiguous in memory. + * @param stride_out The stride (number of elements between consecutive rows) for the output matrix. + * Default is 1, which means the output matrix is contiguous in memory. + */ + template -void transpose_half( - raft::resources const& handle, IndexType n_rows, IndexType n_cols, const half* in, half* out) +void transpose_half(raft::resources const& handle, + IndexType n_rows, + IndexType n_cols, + const half* in, + half* out, + const IndexType stride_in = 1, + const IndexType stride_out = 1) { if (n_cols == 0 || n_rows == 0) return; auto stream = resource::get_cuda_stream(handle); @@ -100,8 +126,13 @@ void transpose_half( dim3 grids(adjusted_grid_x, adjusted_grid_y); - transpose_half_kernel - <<>>(n_rows, n_cols, in, out); + if (stride_in > 1 || stride_out > 1) { + transpose_half_kernel + <<>>(n_rows, n_cols, in, out, stride_in, stride_out); + } else { + transpose_half_kernel + <<>>(n_rows, n_cols, in, out, n_cols, n_rows); + } RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -118,7 +149,7 @@ void transpose(raft::resources const& handle, int out_n_cols = n_rows; if constexpr (std::is_same_v) { - transpose_half(handle, out_n_rows, out_n_cols, in, out); + transpose_half(handle, n_cols, n_rows, in, out); } else { cublasHandle_t cublas_h = resource::get_cublas_handle(handle); RAFT_CUBLAS_TRY(cublasSetStream(cublas_h, stream)); @@ -195,9 +226,13 @@ void transpose_row_major_impl( raft::mdspan, LayoutPolicy, AccessorPolicy> in, raft::mdspan, LayoutPolicy, AccessorPolicy> out) { - auto out_n_rows = in.extent(1); - auto out_n_cols = in.extent(0); - transpose_half(handle, out_n_cols, out_n_rows, in.data_handle(), out.data_handle()); + transpose_half(handle, + in.extent(0), + in.extent(1), + in.data_handle(), + out.data_handle(), + in.stride(0), + out.stride(0)); } template @@ -233,9 +268,13 @@ void transpose_col_major_impl( raft::mdspan, LayoutPolicy, AccessorPolicy> in, raft::mdspan, LayoutPolicy, AccessorPolicy> out) { - auto out_n_rows = in.extent(1); - auto out_n_cols = in.extent(0); - transpose_half(handle, out_n_rows, out_n_cols, in.data_handle(), out.data_handle()); + transpose_half(handle, + in.extent(1), + in.extent(0), + in.data_handle(), + out.data_handle(), + in.stride(1), + out.stride(1)); } }; // end namespace detail diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu index cbe869a9a5..22fc1c1d60 100644 --- a/cpp/test/linalg/transpose.cu +++ b/cpp/test/linalg/transpose.cu @@ -29,48 +29,104 @@ #include +#include + +namespace std { +template <> +struct is_floating_point : std::true_type {}; +} // namespace std + namespace raft { namespace linalg { template -struct TranposeInputs { +void initialize_array(T* data_h, size_t size) +{ + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(0.0, 1.0); + + for (size_t i = 0; i < size; ++i) { + if constexpr (std::is_same_v) { + data_h[i] = __float2half(static_cast(dis(gen))); + } else { + data_h[i] = static_cast(dis(gen)); + } + } +} + +template +void cpu_transpose_row_major( + const T* input, T* output, int rows, int cols, int stride_in = -1, int stride_out = -1) +{ + stride_in = stride_in == -1 ? cols : stride_in; + stride_out = stride_out == -1 ? 
rows : stride_out; + if (stride_in) + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < cols; ++j) { + output[j * stride_out + i] = input[i * stride_in + j]; + } + } +} + +template +void cpu_transpose_col_major( + const T* input, T* output, int rows, int cols, int stride_in = -1, int stride_out = -1) +{ + cpu_transpose_row_major(input, output, cols, rows, stride_in, stride_out); +} + +bool validate_half(const half* h_ref, const half* h_result, half tolerance, int len) +{ + bool success = true; + for (int i = 0; i < len; ++i) { + if (raft::abs(__half2float(h_result[i]) - __half2float(h_ref[i])) >= __half2float(tolerance)) { + success = false; + break; + } + if (!success) break; + } + return success; +} + +namespace transpose_regular_test { + +template +struct TransposeInputs { T tolerance; - int len; int n_row; int n_col; unsigned long long int seed; }; template -::std::ostream& operator<<(::std::ostream& os, const TranposeInputs& dims) -{ - return os; -} - -template -class TransposeTest : public ::testing::TestWithParam> { +class TransposeTest : public ::testing::TestWithParam> { public: TransposeTest() - : params(::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(resource::get_cuda_stream(handle)), - data(params.len, stream), - data_trans_ref(params.len, stream), - data_trans(params.len, stream) + data(params.n_row * params.n_col, stream), + data_trans_ref(params.n_row * params.n_col, stream), + data_trans(params.n_row * params.n_col, stream) { } protected: void SetUp() override { - int len = params.len; - ASSERT(params.len == 9, "This test works only with len=9!"); - T data_h[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; - raft::update_device(data.data(), data_h, len, stream); - T data_ref_h[] = {1.0, 4.0, 7.0, 2.0, 5.0, 8.0, 3.0, 6.0, 9.0}; - raft::update_device(data_trans_ref.data(), data_ref_h, len, stream); + int len = params.n_row * params.n_col; + std::vector data_h(len); + std::vector data_ref_h(len); + + initialize_array(data_h.data(), len); + + cpu_transpose_col_major(data_h.data(), data_ref_h.data(), params.n_row, params.n_col); + + raft::update_device(data.data(), data_h.data(), len, stream); + raft::update_device(data_trans_ref.data(), data_ref_h.data(), len, stream); transpose(handle, data.data(), data_trans.data(), params.n_row, params.n_col, stream); - transpose(data.data(), params.n_row, stream); + if (params.n_row == params.n_col) { transpose(data.data(), params.n_col, stream); } resource::sync_stream(handle, stream); } @@ -78,28 +134,45 @@ class TransposeTest : public ::testing::TestWithParam> { raft::resources handle; cudaStream_t stream; - TranposeInputs params; + TransposeInputs params; rmm::device_uvector data, data_trans, data_trans_ref; }; -const std::vector> inputsf2 = {{0.1f, 3 * 3, 3, 3, 1234ULL}}; - -const std::vector> inputsd2 = {{0.1, 3 * 3, 3, 3, 1234ULL}}; - -const std::vector> inputsh2 = {{0.1, 3 * 3, 3, 3, 1234ULL}}; +const std::vector> inputsf2 = {{0.1f, 3, 3, 1234ULL}, + {0.1f, 3, 4, 1234ULL}, + {0.1f, 300, 300, 1234ULL}, + {0.1f, 300, 4100, 1234ULL}, + {0.1f, 1, 13000, 1234ULL}, + {0.1f, 3, 130001, 1234ULL}}; + +const std::vector> inputsd2 = {{0.1f, 3, 3, 1234ULL}, + {0.1f, 3, 4, 1234ULL}, + {0.1f, 300, 300, 1234ULL}, + {0.1f, 300, 4100, 1234ULL}, + {0.1f, 1, 13000, 1234ULL}, + {0.1f, 3, 130001, 1234ULL}}; + +const std::vector> inputsh2 = {{0.1f, 3, 3, 1234ULL}, + {0.1f, 3, 4, 1234ULL}, + {0.1f, 300, 300, 1234ULL}, + {0.1f, 300, 4100, 1234ULL}, + {0.1f, 1, 13000, 1234ULL}, + {0.1f, 3, 130001, 
1234ULL}}; typedef TransposeTest TransposeTestValF; TEST_P(TransposeTestValF, Result) { ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(), data_trans.data(), - params.len, + params.n_row * params.n_col, raft::CompareApproxAbs(params.tolerance))); - ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(), - data.data(), - params.len, - raft::CompareApproxAbs(params.tolerance))); + if (params.n_row == params.n_col) { + ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(), + data.data(), + params.n_row * params.n_col, + raft::CompareApproxAbs(params.tolerance))); + } } typedef TransposeTest TransposeTestValD; @@ -107,59 +180,47 @@ TEST_P(TransposeTestValD, Result) { ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(), data_trans.data(), - params.len, - raft::CompareApproxAbs(params.tolerance))); - - ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(), - data.data(), - params.len, + params.n_row * params.n_col, raft::CompareApproxAbs(params.tolerance))); -} - -bool validate_half(const half* h_ref, const half* h_result, half tolerance, int len) -{ - bool success = true; - for (int i = 0; i < len; ++i) { - if (raft::abs(__half2float(h_result[i]) - __half2float(h_ref[i])) >= __half2float(tolerance)) { - success = false; - break; - } - if (!success) break; + if (params.n_row == params.n_col) { + ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(), + data.data(), + params.n_row * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } - return success; } typedef TransposeTest TransposeTestValH; TEST_P(TransposeTestValH, Result) { - half data_trans_ref_h[params.len]; - half data_trans_h[params.len]; - half data_h[params.len]; + auto len = params.n_row * params.n_col; - RAFT_CUDA_TRY(cudaMemcpyAsync(data_trans_ref_h, - data_trans_ref.data(), - params.len * sizeof(half), - cudaMemcpyDeviceToHost, - stream)); - - RAFT_CUDA_TRY(cudaMemcpyAsync( - data_trans_h, data_trans.data(), params.len * sizeof(half), cudaMemcpyDeviceToHost, stream)); - RAFT_CUDA_TRY(cudaMemcpyAsync( - data_h, data.data(), params.len * sizeof(half), cudaMemcpyDeviceToHost, stream)); + std::vector data_trans_ref_h(len); + std::vector data_trans_h(len); + std::vector data_h(len); + raft::copy( + data_trans_ref_h.data(), data_trans_ref.data(), len, resource::get_cuda_stream(handle)); + raft::copy(data_trans_h.data(), data_trans.data(), len, resource::get_cuda_stream(handle)); + raft::copy(data_h.data(), data.data(), len, resource::get_cuda_stream(handle)); resource::sync_stream(handle, stream); - ASSERT_TRUE(validate_half(data_trans_ref_h, data_trans_h, params.tolerance, params.len)); - ASSERT_TRUE(validate_half(data_trans_ref_h, data_h, params.tolerance, params.len)); + ASSERT_TRUE(validate_half( + data_trans_ref_h.data(), data_trans_h.data(), params.tolerance, params.n_row * params.n_col)); + + if (params.n_row == params.n_col) { + ASSERT_TRUE(validate_half( + data_trans_ref_h.data(), data_h.data(), params.tolerance, params.n_row * params.n_col)); + } } INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValF, ::testing::ValuesIn(inputsf2)); - INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValD, ::testing::ValuesIn(inputsd2)); - INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValH, ::testing::ValuesIn(inputsh2)); +} // namespace transpose_regular_test + +namespace transpose_extra_test { -namespace { /** * We hide these functions in tests for now until we have a heterogeneous mdarray * implementation. 
@@ -230,79 +291,225 @@ template } } +template +struct TransposeMdspanInputs { + int n_row; + int n_col; + T tolerance = T{0.01}; +}; + template -void test_transpose_with_mdspan() +void test_transpose_with_mdspan(const TransposeMdspanInputs& param) { + auto len = param.n_row * param.n_col; + std::vector in_h(len); + std::vector out_ref_h(len); + + initialize_array(in_h.data(), len); + raft::resources handle; - auto v = make_device_matrix(handle, 32, 3); - T k{0}; - for (size_t i = 0; i < v.extent(0); ++i) { - for (size_t j = 0; j < v.extent(1); ++j) { - v(i, j) = k++; - } + auto stream = resource::get_cuda_stream(handle); + auto in = make_device_matrix(handle, param.n_row, param.n_col); + auto out_ref = make_device_matrix(handle, param.n_row, param.n_col); + resource::sync_stream(handle, stream); + if constexpr (std::is_same_v) { + cpu_transpose_row_major(in_h.data(), out_ref_h.data(), param.n_row, param.n_col); + } else { + cpu_transpose_col_major(in_h.data(), out_ref_h.data(), param.n_row, param.n_col); } - auto out = transpose(handle, v.view()); - static_assert(std::is_same_v); - ASSERT_EQ(out.extent(0), v.extent(1)); - ASSERT_EQ(out.extent(1), v.extent(0)); + raft::copy(in.data_handle(), in_h.data(), len, resource::get_cuda_stream(handle)); + raft::copy(out_ref.data_handle(), out_ref_h.data(), len, resource::get_cuda_stream(handle)); - k = 0; - for (size_t i = 0; i < out.extent(1); ++i) { - for (size_t j = 0; j < out.extent(0); ++j) { - ASSERT_EQ(out(j, i), k++); - } + auto out = transpose(handle, in.view()); + static_assert(std::is_same_v); + ASSERT_EQ(out.extent(0), in.extent(1)); + ASSERT_EQ(out.extent(1), in.extent(0)); + if constexpr (std::is_same_v) { + std::vector out_h(len); + raft::copy(out_h.data(), out.data_handle(), len, resource::get_cuda_stream(handle)); + ASSERT_TRUE(validate_half(out_ref_h.data(), out_h.data(), param.tolerance, len)); + } else { + ASSERT_TRUE(raft::devArrMatch( + out_ref.data_handle(), out.data_handle(), len, raft::CompareApproxAbs(param.tolerance))); } } -} // namespace -TEST(TransposeTest, MDSpan) +const std::vector> inputs_mdspan_f = {{3, 3}, + {3, 4}, + {300, 300}, + {300, 4100}, + {1, 13000}, + {3, 130001}, + {4100, 300}, + {13000, 1}, + {130001, 3}}; +const std::vector> inputs_mdspan_d = {{3, 3}, + {3, 4}, + {300, 300}, + {300, 4100}, + {1, 13000}, + {3, 130001}, + {4100, 300}, + {13000, 1}, + {130001, 3}}; +const std::vector> inputs_mdspan_h = {{3, 3}, + {3, 4}, + {300, 300}, + {300, 4100}, + {1, 13000}, + {3, 130001}, + {4100, 300}, + {13000, 1}, + {130001, 3}}; + +TEST(TransposeTest, MDSpanFloat) { - test_transpose_with_mdspan(); - test_transpose_with_mdspan(); - - test_transpose_with_mdspan(); - test_transpose_with_mdspan(); + for (const auto& p : inputs_mdspan_f) { + test_transpose_with_mdspan(p); + test_transpose_with_mdspan(p); + } +} +TEST(TransposeTest, MDSpanDouble) +{ + for (const auto& p : inputs_mdspan_d) { + test_transpose_with_mdspan(p); + test_transpose_with_mdspan(p); + } +} +TEST(TransposeTest, MDSpanHalf) +{ + for (const auto& p : inputs_mdspan_h) { + test_transpose_with_mdspan(p); + test_transpose_with_mdspan(p); + } } -namespace { +template +struct TransposeSubmatrixInputs { + int n_row; + int n_col; + int row_beg; + int row_end; + int col_beg; + int col_end; + T tolerance = T{0.01}; +}; + template -void test_transpose_submatrix() +void test_transpose_submatrix(const TransposeSubmatrixInputs& param) { + auto len = param.n_row * param.n_col; + auto sub_len = (param.row_end - param.row_beg) * (param.col_end - param.col_beg); + + 
std::vector in_h(len); + std::vector out_ref_h(sub_len); + + initialize_array(in_h.data(), len); + raft::resources handle; - auto v = make_device_matrix(handle, 32, 33); - T k{0}; - size_t row_beg{3}, row_end{13}, col_beg{2}, col_end{11}; - for (size_t i = row_beg; i < row_end; ++i) { - for (size_t j = col_beg; j < col_end; ++j) { - v(i, j) = k++; - } + auto stream = resource::get_cuda_stream(handle); + + auto in = make_device_matrix(handle, param.n_row, param.n_col); + auto out_ref = make_device_matrix( + handle, (param.row_end - param.row_beg), (param.col_end - param.col_beg)); + + if constexpr (std::is_same_v) { + auto offset = param.row_beg * param.n_col + param.col_beg; + cpu_transpose_row_major(in_h.data() + offset, + out_ref_h.data(), + (param.row_end - param.row_beg), + (param.col_end - param.col_beg), + in.extent(1), + (param.row_end - param.row_beg)); + } else { + auto offset = param.col_beg * param.n_row + param.row_beg; + cpu_transpose_col_major(in_h.data() + offset, + out_ref_h.data(), + (param.row_end - param.row_beg), + (param.col_end - param.col_beg), + in.extent(0), + (param.col_end - param.col_beg)); } - auto vv = v.view(); - auto submat = std::experimental::submdspan( - vv, std::make_tuple(row_beg, row_end), std::make_tuple(col_beg, col_end)); - static_assert(std::is_same_v); + raft::copy(in.data_handle(), in_h.data(), len, resource::get_cuda_stream(handle)); + raft::copy(out_ref.data_handle(), out_ref_h.data(), sub_len, resource::get_cuda_stream(handle)); + resource::sync_stream(handle, stream); - auto out = transpose(handle, submat); - ASSERT_EQ(out.extent(0), submat.extent(1)); - ASSERT_EQ(out.extent(1), submat.extent(0)); + auto in_submat = std::experimental::submdspan(in.view(), + std::make_tuple(param.row_beg, param.row_end), + std::make_tuple(param.col_beg, param.col_end)); - k = 0; - for (size_t i = 0; i < out.extent(1); ++i) { - for (size_t j = 0; j < out.extent(0); ++j) { - ASSERT_EQ(out(j, i), k++); - } + static_assert(std::is_same_v); + auto out = transpose(handle, in_submat); + + ASSERT_EQ(out.extent(0), in_submat.extent(1)); + ASSERT_EQ(out.extent(1), in_submat.extent(0)); + + if constexpr (std::is_same_v) { + std::vector out_h(sub_len); + + raft::copy(out_h.data(), out.data_handle(), sub_len, resource::get_cuda_stream(handle)); + ASSERT_TRUE(validate_half(out_ref_h.data(), out_h.data(), param.tolerance, sub_len)); + } else { + ASSERT_TRUE(raft::devArrMatch(out_ref.data_handle(), + out.data_handle(), + sub_len, + raft::CompareApproxAbs(param.tolerance))); } } -} // namespace - -TEST(TransposeTest, SubMatrix) +const std::vector> inputs_submatrix_f = { + {3, 3, 1, 2, 0, 2}, + {3, 4, 1, 3, 2, 3}, + {300, 300, 1, 299, 2, 239}, + {300, 4100, 3, 299, 101, 4001}, + {2, 13000, 0, 1, 3, 13000}, + {3, 130001, 0, 3, 3999, 129999}, + {4100, 300, 159, 4001, 125, 300}, + {13000, 5, 0, 11111, 0, 3}, + {130001, 3, 19, 130000, 2, 3}}; +const std::vector> inputs_submatrix_d = { + {3, 3, 1, 2, 0, 2}, + {3, 4, 1, 3, 2, 3}, + {300, 300, 1, 299, 2, 239}, + {300, 4100, 3, 299, 101, 4001}, + {2, 13000, 0, 1, 3, 13000}, + {3, 130001, 0, 3, 3999, 129999}, + {4100, 300, 159, 4001, 125, 300}, + {13000, 5, 0, 11111, 0, 3}, + {130001, 3, 19, 130000, 2, 3}}; +const std::vector> inputs_submatrix_h = { + {3, 3, 1, 2, 0, 2}, + {3, 4, 1, 3, 2, 3}, + {300, 300, 1, 299, 2, 239}, + {300, 4100, 3, 299, 101, 4001}, + {2, 13000, 0, 1, 3, 13000}, + {3, 130001, 0, 3, 3999, 129999}, + {4100, 300, 159, 4001, 125, 300}, + {13000, 5, 0, 11111, 0, 3}, + {130001, 3, 19, 130000, 2, 3}}; + 
+TEST(TransposeTest, SubMatrixFloat) { - test_transpose_submatrix(); - test_transpose_submatrix(); - - test_transpose_submatrix(); - test_transpose_submatrix(); + for (const auto& p : inputs_submatrix_f) { + test_transpose_submatrix(p); + test_transpose_submatrix(p); + } } +TEST(TransposeTest, SubMatrixDouble) +{ + for (const auto& p : inputs_submatrix_d) { + test_transpose_submatrix(p); + test_transpose_submatrix(p); + } +} +TEST(TransposeTest, SubMatrixHalf) +{ + for (const auto& p : inputs_submatrix_h) { + test_transpose_submatrix(p); + test_transpose_submatrix(p); + } +} + +} // namespace transpose_extra_test } // end namespace linalg } // end namespace raft From b78a07d9d2a117c12f396f330526a8e8ed23a9ba Mon Sep 17 00:00:00 2001 From: Victor Lafargue Date: Fri, 27 Sep 2024 00:36:44 +0200 Subject: [PATCH 28/31] Adding NCCL clique to the RAFT handle (#2431) Authors: - Victor Lafargue (https://github.com/viclafargue) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/2431 --- cpp/include/raft/comms/detail/std_comms.hpp | 8 +- cpp/include/raft/comms/nccl_clique.hpp | 156 ++++++++++++++++++ .../raft/core/resource/nccl_clique.hpp | 66 ++++++++ .../raft/core/resource/resource_types.hpp | 1 + 4 files changed, 227 insertions(+), 4 deletions(-) create mode 100644 cpp/include/raft/comms/nccl_clique.hpp create mode 100644 cpp/include/raft/core/resource/nccl_clique.hpp diff --git a/cpp/include/raft/comms/detail/std_comms.hpp b/cpp/include/raft/comms/detail/std_comms.hpp index c5d64f6a29..ed869e6cae 100644 --- a/cpp/include/raft/comms/detail/std_comms.hpp +++ b/cpp/include/raft/comms/detail/std_comms.hpp @@ -310,13 +310,13 @@ class std_comms : public comms_iface { // Wait for a UCXX progress thread roundtrip, prevent waiting for longer // than 10ms for each operation, will retry in next iteration. ucxx::utils::CallbackNotifier callbackNotifierPre{}; - worker->registerGenericPre([&callbackNotifierPre]() { callbackNotifierPre.set(); }, - 10000000 /* 10ms */); + (void)worker->registerGenericPre( + [&callbackNotifierPre]() { callbackNotifierPre.set(); }, 10000000 /* 10ms */); callbackNotifierPre.wait(); ucxx::utils::CallbackNotifier callbackNotifierPost{}; - worker->registerGenericPost([&callbackNotifierPost]() { callbackNotifierPost.set(); }, - 10000000 /* 10ms */); + (void)worker->registerGenericPost( + [&callbackNotifierPost]() { callbackNotifierPost.set(); }, 10000000 /* 10ms */); callbackNotifierPost.wait(); } else { // Causes UCXX to progress through the send/recv message queue diff --git a/cpp/include/raft/comms/nccl_clique.hpp b/cpp/include/raft/comms/nccl_clique.hpp new file mode 100644 index 0000000000..c6520af753 --- /dev/null +++ b/cpp/include/raft/comms/nccl_clique.hpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include + +/** + * @brief Error checking macro for NCCL runtime API functions. 
+ * + * Invokes a NCCL runtime API function call, if the call does not return ncclSuccess, throws an + * exception detailing the NCCL error that occurred + */ +#define RAFT_NCCL_TRY(call) \ + do { \ + ncclResult_t const status = (call); \ + if (ncclSuccess != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "NCCL error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ + ncclGetErrorString(status)); \ + throw raft::logic_error(msg); \ + } \ + } while (0); + +namespace raft::comms { +void build_comms_nccl_only(raft::resources* handle, ncclComm_t nccl_comm, int num_ranks, int rank); +} + +namespace raft::comms { + +struct nccl_clique { + using pool_mr = rmm::mr::pool_memory_resource; + + /** + * Instantiates a NCCL clique with all available GPUs + * + * @param[in] percent_of_free_memory percentage of device memory to pre-allocate as memory pool + * + */ + nccl_clique(int percent_of_free_memory = 80) + : root_rank_(0), + percent_of_free_memory_(percent_of_free_memory), + per_device_pools_(0), + device_resources_(0) + { + cudaGetDeviceCount(&num_ranks_); + device_ids_.resize(num_ranks_); + std::iota(device_ids_.begin(), device_ids_.end(), 0); + nccl_comms_.resize(num_ranks_); + nccl_clique_init(); + } + + /** + * Instantiates a NCCL clique + * + * Usage example: + * @code{.cpp} + * int n_devices; + * cudaGetDeviceCount(&n_devices); + * std::vector device_ids(n_devices); + * std::iota(device_ids.begin(), device_ids.end(), 0); + * cuvs::neighbors::mg::nccl_clique& clique(device_ids); // first device is the root rank + * @endcode + * + * @param[in] device_ids list of device IDs to be used to initiate the clique + * @param[in] percent_of_free_memory percentage of device memory to pre-allocate as memory pool + * + */ + nccl_clique(const std::vector& device_ids, int percent_of_free_memory = 80) + : root_rank_(0), + num_ranks_(device_ids.size()), + percent_of_free_memory_(percent_of_free_memory), + device_ids_(device_ids), + nccl_comms_(device_ids.size()), + per_device_pools_(0), + device_resources_(0) + { + nccl_clique_init(); + } + + void nccl_clique_init() + { + RAFT_NCCL_TRY(ncclCommInitAll(nccl_comms_.data(), num_ranks_, device_ids_.data())); + + for (int rank = 0; rank < num_ranks_; rank++) { + RAFT_CUDA_TRY(cudaSetDevice(device_ids_[rank])); + + // create a pool memory resource for each device + auto old_mr = rmm::mr::get_current_device_resource(); + per_device_pools_.push_back(std::make_unique( + old_mr, rmm::percent_of_free_device_memory(percent_of_free_memory_))); + rmm::cuda_device_id id(device_ids_[rank]); + rmm::mr::set_per_device_resource(id, per_device_pools_.back().get()); + + // create a device resource handle for each device + device_resources_.emplace_back(); + + // add NCCL communications to the device resource handle + raft::comms::build_comms_nccl_only( + &device_resources_[rank], nccl_comms_[rank], num_ranks_, rank); + } + + for (int rank = 0; rank < num_ranks_; rank++) { + RAFT_CUDA_TRY(cudaSetDevice(device_ids_[rank])); + raft::resource::sync_stream(device_resources_[rank]); + } + } + + const raft::device_resources& set_current_device_to_root_rank() const + { + int root_device_id = device_ids_[root_rank_]; + RAFT_CUDA_TRY(cudaSetDevice(root_device_id)); + return device_resources_[root_rank_]; + } + + ~nccl_clique() + { +#pragma omp parallel for // necessary to avoid hangs + for (int rank = 0; rank < num_ranks_; rank++) { + cudaSetDevice(device_ids_[rank]); + ncclCommDestroy(nccl_comms_[rank]); + rmm::cuda_device_id id(device_ids_[rank]); + 
rmm::mr::set_per_device_resource(id, nullptr); + } + } + + int root_rank_; + int num_ranks_; + int percent_of_free_memory_; + std::vector device_ids_; + std::vector nccl_comms_; + std::vector> per_device_pools_; + std::vector device_resources_; +}; + +} // namespace raft::comms diff --git a/cpp/include/raft/core/resource/nccl_clique.hpp b/cpp/include/raft/core/resource/nccl_clique.hpp new file mode 100644 index 0000000000..edda5043ae --- /dev/null +++ b/cpp/include/raft/core/resource/nccl_clique.hpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include + +namespace raft::resource { + +class nccl_clique_resource : public resource { + public: + nccl_clique_resource() : clique_(std::make_unique()) {} + ~nccl_clique_resource() override {} + void* get_resource() override { return clique_.get(); } + + private: + std::unique_ptr clique_; +}; + +/** Factory that knows how to construct a specific raft::resource to populate the res_t. */ +class nccl_clique_resource_factory : public resource_factory { + public: + resource_type get_resource_type() override { return resource_type::NCCL_CLIQUE; } + resource* make_resource() override { return new nccl_clique_resource(); } +}; + +/** + * @defgroup nccl_clique_resource resource functions + * @{ + */ + +/** + * Retrieves a NCCL clique from raft res if it exists, otherwise initializes it and return it. + * + * @param[in] res the raft resources object + * @return NCCL clique + */ +inline const raft::comms::nccl_clique& get_nccl_clique(resources const& res) +{ + if (!res.has_resource_factory(resource_type::NCCL_CLIQUE)) { + res.add_resource_factory(std::make_shared()); + } + return *res.get_resource(resource_type::NCCL_CLIQUE); +}; + +/** + * @} + */ + +} // namespace raft::resource diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp index d9126251c9..4fa84c3bdb 100644 --- a/cpp/include/raft/core/resource/resource_types.hpp +++ b/cpp/include/raft/core/resource/resource_types.hpp @@ -46,6 +46,7 @@ enum resource_type { CUBLASLT_HANDLE, // cublasLt handle CUSTOM, // runtime-shared default-constructible resource LARGE_WORKSPACE_RESOURCE, // rmm device memory resource for somewhat large temporary allocations + NCCL_CLIQUE, // nccl clique LAST_KEY // reserved for the last key }; From 5ee0e79a7aa033d1f312e805fca1194cdf0483b3 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 26 Sep 2024 18:22:10 -0500 Subject: [PATCH 29/31] bump NCCL floor to 2.19 (#2458) Follow-up to #2443 As part of the work to support NumPy 2 across RAPIDS, we found reason to upgrade some libraries like `cugraph` to slightly newer NCCL (`>=2.19`). Context: https://github.com/rapidsai/build-planning/issues/102#issuecomment-2375595743 This applies that same bump here, to keep the range of NCCL versions consistent across RAPIDS. 
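As a quick local sanity check (illustrative only, not part of this change — the file name and build line below are assumptions), the NCCL version that actually gets loaded at runtime can be compared against the new floor via `ncclGetVersion()`:

```cpp
// check_nccl_floor.cpp -- hypothetical standalone check, not part of this patch.
// Build (assuming NCCL and CUDA headers/libraries are discoverable), e.g.:
//   g++ check_nccl_floor.cpp -lnccl -o check_nccl_floor
#include <nccl.h>

#include <cstdio>

int main()
{
  int version = 0;
  if (ncclGetVersion(&version) != ncclSuccess) {
    std::fprintf(stderr, "ncclGetVersion failed\n");
    return 1;
  }
  // Recent NCCL releases encode the version as major*10000 + minor*100 + patch,
  // so the >=2.19 floor corresponds to a version code of 21900.
  std::printf("loaded NCCL version code: %d (floor: 21900)\n", version);
  return version >= 21900 ? 0 : 1;
}
```

Pinning the floor once in `dependencies.yaml` keeps the conda environments and recipes consistent, and a runtime check like the above can help catch an older system NCCL shadowing the packaged one.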
Authors: - James Lamb (https://github.com/jameslamb) Approvers: - https://github.com/jakirkham - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/2458 --- conda/environments/all_cuda-118_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/environments/bench_ann_cuda-118_arch-aarch64.yaml | 2 +- conda/environments/bench_ann_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/bench_ann_cuda-120_arch-aarch64.yaml | 2 +- conda/environments/bench_ann_cuda-120_arch-x86_64.yaml | 2 +- conda/recipes/libraft/conda_build_config.yaml | 2 +- conda/recipes/raft-ann-bench/conda_build_config.yaml | 2 +- conda/recipes/raft-dask/conda_build_config.yaml | 2 +- dependencies.yaml | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index de4eb7e690..0b84772fad 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -36,7 +36,7 @@ dependencies: - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 - libucxx==0.40.*,>=0.0.0a0 -- nccl>=2.18.1.1 +- nccl>=2.19 - ninja - numba>=0.57 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 26f4c1efaa..d1c01f1b16 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -36,7 +36,7 @@ dependencies: - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 - libucxx==0.40.*,>=0.0.0a0 -- nccl>=2.18.1.1 +- nccl>=2.19 - ninja - numba>=0.57 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index 692956502b..4c506f5297 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -33,7 +33,7 @@ dependencies: - libcusolver-dev - libcusparse-dev - libucxx==0.40.*,>=0.0.0a0 -- nccl>=2.18.1.1 +- nccl>=2.19 - ninja - numba>=0.57 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 133d42bfee..a123950e3a 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -33,7 +33,7 @@ dependencies: - libcusolver-dev - libcusparse-dev - libucxx==0.40.*,>=0.0.0a0 -- nccl>=2.18.1.1 +- nccl>=2.19 - ninja - numba>=0.57 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index f99cedd627..864eb2130b 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -32,7 +32,7 @@ dependencies: - libcusparse=11.7.5.86 - libucxx==0.40.*,>=0.0.0a0 - matplotlib -- nccl>=2.18.1.1 +- nccl>=2.19 - ninja - nlohmann_json>=3.11.2 - nvcc_linux-aarch64=11.8 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 08aea32ab1..5da6eaf17e 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -32,7 +32,7 @@ dependencies: - libcusparse=11.7.5.86 - libucxx==0.40.*,>=0.0.0a0 - matplotlib -- nccl>=2.18.1.1 +- 
nccl>=2.19 - ninja - nlohmann_json>=3.11.2 - nvcc_linux-64=11.8 diff --git a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml index 572ad85ab5..65de97c170 100644 --- a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml @@ -29,7 +29,7 @@ dependencies: - libcusparse-dev - libucxx==0.40.*,>=0.0.0a0 - matplotlib -- nccl>=2.18.1.1 +- nccl>=2.19 - ninja - nlohmann_json>=3.11.2 - openblas diff --git a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml index 9fd23edb03..7e1adbc483 100644 --- a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml @@ -29,7 +29,7 @@ dependencies: - libcusparse-dev - libucxx==0.40.*,>=0.0.0a0 - matplotlib -- nccl>=2.18.1.1 +- nccl>=2.19 - ninja - nlohmann_json>=3.11.2 - openblas diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index 6dc06648bd..bc0ff1fae7 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -20,7 +20,7 @@ cmake_version: - ">=3.26.4,!=3.30.0" nccl_version: - - ">=2.18.1.1" + - ">=2.19" glog_version: - ">=0.6.0" diff --git a/conda/recipes/raft-ann-bench/conda_build_config.yaml b/conda/recipes/raft-ann-bench/conda_build_config.yaml index bdb4e883ea..47bd730daf 100644 --- a/conda/recipes/raft-ann-bench/conda_build_config.yaml +++ b/conda/recipes/raft-ann-bench/conda_build_config.yaml @@ -20,7 +20,7 @@ cmake_version: - ">=3.26.4,!=3.30.0" nccl_version: - - ">=2.18.1.1" + - ">=2.19" glog_version: - ">=0.6.0" diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml index 58e8ec3c9e..65c589fc0c 100644 --- a/conda/recipes/raft-dask/conda_build_config.yaml +++ b/conda/recipes/raft-dask/conda_build_config.yaml @@ -26,4 +26,4 @@ cmake_version: - ">=3.26.4,!=3.30.0" nccl_version: - - ">=2.18.1.1" + - ">=2.19" diff --git a/dependencies.yaml b/dependencies.yaml index e833e8519a..d0991f4d04 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -170,7 +170,7 @@ dependencies: packages: - c-compiler - cxx-compiler - - nccl>=2.18.1.1 + - nccl>=2.19 - libucxx==0.40.*,>=0.0.0a0 specific: - output_types: conda From 6c4fdfb3bfa32c3f08d03adad0d461c167f277ce Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 26 Sep 2024 21:25:22 -0400 Subject: [PATCH 30/31] Deprecating vector search APIs and updating README accordingly (#2448) I opted to deprecate just the necessary pieces, such as the `index` classes instead of deprecating every single function. Authors: - Corey J. 
Nolet (https://github.com/cjnolet) Approvers: - Ben Frederickson (https://github.com/benfred) URL: https://github.com/rapidsai/raft/pull/2448 --- README.md | 111 +-------------- cpp/include/raft/cluster/kmeans.cuh | 63 +++++---- cpp/include/raft/cluster/kmeans_balanced.cuh | 68 ++++----- cpp/include/raft/cluster/single_linkage.cuh | 33 ++--- cpp/include/raft/distance/distance-ext.cuh | 133 +++++++++--------- cpp/include/raft/neighbors/ball_cover.cuh | 1 + .../raft/neighbors/brute_force_types.hpp | 32 +++-- cpp/include/raft/neighbors/cagra_types.hpp | 14 +- cpp/include/raft/neighbors/hnsw_types.hpp | 6 +- cpp/include/raft/neighbors/ivf_flat_types.hpp | 16 ++- cpp/include/raft/neighbors/ivf_pq_types.hpp | 20 +-- .../raft/neighbors/nn_descent_types.hpp | 16 ++- cpp/include/raft/neighbors/refine-ext.cuh | 32 ++--- docs/source/conf.py | 2 +- docs/source/raft_ann_benchmarks.md | 4 + 15 files changed, 239 insertions(+), 312 deletions(-) diff --git a/README.md b/README.md index 25ce059630..af2219fdd1 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ #
 RAFT: Reusable Accelerated Functions and Tools for Vector Search and More
> [!IMPORTANT] -> The vector search and clustering algorithms in RAFT are being migrated to a new library dedicated to vector search called [cuVS](https://github.com/rapidsai/cuvs). We will continue to support the vector search algorithms in RAFT during this move, but will no longer update them after the RAPIDS 24.06 (June) release. We plan to complete the migration by RAPIDS 24.08 (August) release. +> The vector search and clustering algorithms in RAFT are being migrated to a new library dedicated to vector search called [cuVS](https://github.com/rapidsai/cuvs). We will continue to support the vector search algorithms in RAFT during this move, but will no longer update them after the RAPIDS 24.06 (June) release. We plan to complete the migration by RAPIDS 24.10 (October) release and will be removing them altogether in the 24.12 (December) release. ![RAFT tech stack](img/raft-tech-stack-vss.png) @@ -36,7 +36,7 @@ ## What is RAFT? -RAFT contains fundamental widely-used algorithms and primitives for machine learning and information retrieval. The algorithms are CUDA-accelerated and form building blocks for more easily writing high performance applications. +RAFT contains fundamental widely-used algorithms and primitives for machine learning and data mining. The algorithms are CUDA-accelerated and form building blocks for more easily writing high performance applications. By taking a primitives-based approach to algorithm development, RAFT - accelerates algorithm construction time @@ -47,12 +47,10 @@ While not exhaustive, the following general categories help summarize the accele ##### | Category | Accelerated Functions in RAFT | |-----------------------|-----------------------------------------------------------------------------------------------------------------------------------| -| **Nearest Neighbors** | vector search, neighborhood graph construction, epsilon neighborhoods, pairwise distances | -| **Basic Clustering** | spectral clustering, hierarchical clustering, k-means | -| **Solvers** | combinatorial optimization, iterative solvers | | **Data Formats** | sparse & dense, conversions, data generation | | **Dense Operations** | linear algebra, matrix and vector operations, reductions, slicing, norms, factorization, least squares, svd & eigenvalue problems | | **Sparse Operations** | linear algebra, eigenvalue problems, slicing, norms, reductions, factorization, symmetrization, components & labeling | +| **Solvers** | combinatorial optimization, iterative solvers | | **Statistics** | sampling, moments and summary statistics, metrics, model evaluation | | **Tools & Utilities** | common tools and utilities for developing CUDA applications, multi-node multi-gpu infrastructure | @@ -67,42 +65,6 @@ In addition being a C++ library, RAFT also provides 2 Python libraries: ![RAFT is a C++ header-only template library with optional shared library and lightweight Python wrappers](img/arch.png) -## Use cases - -### Vector Similarity Search - -RAFT contains state-of-the-art implementations of approximate nearest neighbors search (ANNS) algorithms on the GPU, such as: - -* [Brute force](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#brute-force). Performs a brute force nearest neighbors search without an index. -* [IVF-Flat](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#ivf-flat) and [IVF-PQ](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#ivf-pq). Use an inverted file index structure to map contents to their locations. 
IVF-PQ additionally uses product quantization to reduce the memory usage of vectors. These methods were originally popularized by the [FAISS](https://github.com/facebookresearch/faiss) library. -* [CAGRA](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#cagra) (Cuda Anns GRAph-based). Uses a fast ANNS graph construction and search implementation optimized for the GPU. CAGRA outperforms state-of-the art CPU methods (i.e. HNSW) for large batch queries, single queries, and graph construction time. - -Projects that use the RAFT ANNS algorithms for accelerating vector search include: [Milvus](https://milvus.io/), [Redis](https://redis.io/), and [Faiss](https://github.com/facebookresearch/faiss). - -Please see the example [Jupyter notebook](https://github.com/rapidsai/raft/blob/HEAD/notebooks/VectorSearch_QuestionRetrieval.ipynb) to get started RAFT for vector search in Python. - - - -### Information Retrieval - -RAFT contains a catalog of reusable primitives for composing algorithms that require fast neighborhood computations, such as - -1. Computing distances between vectors and computing kernel gramm matrices -2. Performing ball radius queries for constructing epsilon neighborhoods -3. Clustering points to partition a space for smaller and faster searches -4. Constructing neighborhood "connectivities" graphs from dense vectors - -### Machine Learning - -RAFT's primitives are used in several RAPIDS libraries, including [cuML](https://github.com/rapidsai/cuml), [cuGraph](https://github.com/rapidsai/cugraph), and [cuOpt](https://github.com/rapidsai/cuopt) to build many end-to-end machine learning algorithms that span a large spectrum of different applications, including -- data generation -- model evaluation -- classification and regression -- clustering -- manifold learning -- dimensionality reduction. - -RAFT is also used by the popular collaborative filtering library [implicit](https://github.com/benfred/implicit) for recommender systems. ## Is RAFT right for me? @@ -327,70 +289,3 @@ When citing RAFT generally, please consider referencing this Github project. year={2022} } ``` -If citing the sparse pairwise distances API, please consider using the following bibtex: -```bibtex -@article{nolet2021semiring, - title={Semiring primitives for sparse neighborhood methods on the gpu}, - author={Nolet, Corey J and Gala, Divye and Raff, Edward and Eaton, Joe and Rees, Brad and Zedlewski, John and Oates, Tim}, - journal={arXiv preprint arXiv:2104.06357}, - year={2021} -} -``` - -If citing the single-linkage agglomerative clustering APIs, please consider the following bibtex: -```bibtex -@misc{nolet2023cuslink, - title={cuSLINK: Single-linkage Agglomerative Clustering on the GPU}, - author={Corey J. 
Nolet and Divye Gala and Alex Fender and Mahesh Doijade and Joe Eaton and Edward Raff and John Zedlewski and Brad Rees and Tim Oates}, - year={2023}, - eprint={2306.16354}, - archivePrefix={arXiv}, - primaryClass={cs.LG} -} -``` - -If citing CAGRA, please consider the following bibtex: -```bibtex -@misc{ootomo2023cagra, - title={CAGRA: Highly Parallel Graph Construction and Approximate Nearest Neighbor Search for GPUs}, - author={Hiroyuki Ootomo and Akira Naruse and Corey Nolet and Ray Wang and Tamas Feher and Yong Wang}, - year={2024}, - series = {ICDE '24} -} -``` - -If citing the k-selection routines, please consider the following bibtex: - -```bibtex -@proceedings{10.1145/3581784, - title = {Parallel Top-K Algorithms on GPU: A Comprehensive Study and New Methods}, - author={Jingrong Zhang, Akira Naruse, Xipeng Li, and Yong Wang}, - year = {2023}, - isbn = {9798400701092}, - publisher = {Association for Computing Machinery}, - address = {New York, NY, USA}, - location = {Denver, CO, USA}, - series = {SC '23} -} -``` - -If citing the nearest neighbors descent API, please consider the following bibtex: -```bibtex -@inproceedings{10.1145/3459637.3482344, - author = {Wang, Hui and Zhao, Wan-Lei and Zeng, Xiangxiang and Yang, Jianye}, - title = {Fast K-NN Graph Construction by GPU Based NN-Descent}, - year = {2021}, - isbn = {9781450384469}, - publisher = {Association for Computing Machinery}, - address = {New York, NY, USA}, - url = {https://doi.org/10.1145/3459637.3482344}, - doi = {10.1145/3459637.3482344}, - abstract = {NN-Descent is a classic k-NN graph construction approach. It is still widely employed in machine learning, computer vision, and information retrieval tasks due to its efficiency and genericness. However, the current design only works well on CPU. In this paper, NN-Descent has been redesigned to adapt to the GPU architecture. A new graph update strategy called selective update is proposed. It reduces the data exchange between GPU cores and GPU global memory significantly, which is the processing bottleneck under GPU computation architecture. This redesign leads to full exploitation of the parallelism of the GPU hardware. In the meantime, the genericness, as well as the simplicity of NN-Descent, are well-preserved. Moreover, a procedure that allows to k-NN graph to be merged efficiently on GPU is proposed. It makes the construction of high-quality k-NN graphs for out-of-GPU-memory datasets tractable. Our approach is 100-250\texttimes{} faster than the single-thread NN-Descent and is 2.5-5\texttimes{} faster than the existing GPU-based approaches as we tested on million as well as billion scale datasets.}, - booktitle = {Proceedings of the 30th ACM International Conference on Information \& Knowledge Management}, - pages = {1929–1938}, - numpages = {10}, - keywords = {high-dimensional, nn-descent, gpu, k-nearest neighbor graph}, - location = {Virtual Event, Queensland, Australia}, - series = {CIKM '21} -} -``` diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh index eb28cc1626..38318e8ec8 100644 --- a/cpp/include/raft/cluster/kmeans.cuh +++ b/cpp/include/raft/cluster/kmeans.cuh @@ -86,13 +86,14 @@ using KeyValueIndexOp = detail::KeyValueIndexOp; * @param[out] n_iter Number of iterations run. 
*/ template -void fit(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - std::optional> sample_weight, - raft::device_matrix_view centroids, - raft::host_scalar_view inertia, - raft::host_scalar_view n_iter) +[[deprecated("Use cuVS instead")]] void fit( + raft::resources const& handle, + const KMeansParams& params, + raft::device_matrix_view X, + std::optional> sample_weight, + raft::device_matrix_view centroids, + raft::host_scalar_view inertia, + raft::host_scalar_view n_iter) { detail::kmeans_fit(handle, params, X, sample_weight, centroids, inertia, n_iter); } @@ -150,14 +151,15 @@ void fit(raft::resources const& handle, * their closest cluster center. */ template -void predict(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - std::optional> sample_weight, - raft::device_matrix_view centroids, - raft::device_vector_view labels, - bool normalize_weight, - raft::host_scalar_view inertia) +[[deprecated("Use cuVS instead")]] void predict( + raft::resources const& handle, + const KMeansParams& params, + raft::device_matrix_view X, + std::optional> sample_weight, + raft::device_matrix_view centroids, + raft::device_vector_view labels, + bool normalize_weight, + raft::host_scalar_view inertia) { detail::kmeans_predict( handle, params, X, sample_weight, centroids, labels, normalize_weight, inertia); @@ -213,14 +215,15 @@ void predict(raft::resources const& handle, * @param[out] n_iter Number of iterations run. */ template -void fit_predict(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - std::optional> sample_weight, - std::optional> centroids, - raft::device_vector_view labels, - raft::host_scalar_view inertia, - raft::host_scalar_view n_iter) +[[deprecated("Use cuVS instead")]] void fit_predict( + raft::resources const& handle, + const KMeansParams& params, + raft::device_matrix_view X, + std::optional> sample_weight, + std::optional> centroids, + raft::device_vector_view labels, + raft::host_scalar_view inertia, + raft::host_scalar_view n_iter) { detail::kmeans_fit_predict( handle, params, X, sample_weight, centroids, labels, inertia, n_iter); @@ -252,13 +255,13 @@ void transform(raft::resources const& handle, } template -void transform(raft::resources const& handle, - const KMeansParams& params, - const DataT* X, - const DataT* centroids, - IndexT n_samples, - IndexT n_features, - DataT* X_new) +[[deprecated("Use cuVS instead")]] void transform(raft::resources const& handle, + const KMeansParams& params, + const DataT* X, + const DataT* centroids, + IndexT n_samples, + IndexT n_features, + DataT* X_new) { detail::kmeans_transform( handle, params, X, centroids, n_samples, n_features, X_new); diff --git a/cpp/include/raft/cluster/kmeans_balanced.cuh b/cpp/include/raft/cluster/kmeans_balanced.cuh index a1a182608b..7479047fce 100644 --- a/cpp/include/raft/cluster/kmeans_balanced.cuh +++ b/cpp/include/raft/cluster/kmeans_balanced.cuh @@ -73,11 +73,11 @@ namespace raft::cluster::kmeans_balanced { * datatype. If DataT == MathT, this must be the identity. 
*/ template -void fit(const raft::resources& handle, - kmeans_balanced_params const& params, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - MappingOpT mapping_op = raft::identity_op()) +[[deprecated("Use cuVS instead")]] void fit(const raft::resources& handle, + kmeans_balanced_params const& params, + raft::device_matrix_view X, + raft::device_matrix_view centroids, + MappingOpT mapping_op = raft::identity_op()) { RAFT_EXPECTS(X.extent(1) == centroids.extent(1), "Number of features in dataset and centroids are different"); @@ -131,12 +131,13 @@ template -void predict(const raft::resources& handle, - kmeans_balanced_params const& params, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - raft::device_vector_view labels, - MappingOpT mapping_op = raft::identity_op()) +[[deprecated("Use cuVS instead")]] void predict( + const raft::resources& handle, + kmeans_balanced_params const& params, + raft::device_matrix_view X, + raft::device_matrix_view centroids, + raft::device_vector_view labels, + MappingOpT mapping_op = raft::identity_op()) { RAFT_EXPECTS(X.extent(0) == labels.extent(0), "Number of rows in dataset and labels are different"); @@ -196,12 +197,13 @@ template -void fit_predict(const raft::resources& handle, - kmeans_balanced_params const& params, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - raft::device_vector_view labels, - MappingOpT mapping_op = raft::identity_op()) +[[deprecated("Use cuVS instead")]] void fit_predict( + const raft::resources& handle, + kmeans_balanced_params const& params, + raft::device_matrix_view X, + raft::device_matrix_view centroids, + raft::device_vector_view labels, + MappingOpT mapping_op = raft::identity_op()) { auto centroids_const = raft::make_device_matrix_view( centroids.data_handle(), centroids.extent(0), centroids.extent(1)); @@ -255,14 +257,15 @@ template -void build_clusters(const raft::resources& handle, - const kmeans_balanced_params& params, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - raft::device_vector_view labels, - raft::device_vector_view cluster_sizes, - MappingOpT mapping_op = raft::identity_op(), - std::optional> X_norm = std::nullopt) +[[deprecated("Use cuVS instead")]] void build_clusters( + const raft::resources& handle, + const kmeans_balanced_params& params, + raft::device_matrix_view X, + raft::device_matrix_view centroids, + raft::device_vector_view labels, + raft::device_vector_view cluster_sizes, + MappingOpT mapping_op = raft::identity_op(), + std::optional> X_norm = std::nullopt) { RAFT_EXPECTS(X.extent(0) == labels.extent(0), "Number of rows in dataset and labels are different"); @@ -334,13 +337,14 @@ template -void calc_centers_and_sizes(const raft::resources& handle, - raft::device_matrix_view X, - raft::device_vector_view labels, - raft::device_matrix_view centroids, - raft::device_vector_view cluster_sizes, - bool reset_counters = true, - MappingOpT mapping_op = raft::identity_op()) +[[deprecated("Use cuVS instead")]] void calc_centers_and_sizes( + const raft::resources& handle, + raft::device_matrix_view X, + raft::device_vector_view labels, + raft::device_matrix_view centroids, + raft::device_vector_view cluster_sizes, + bool reset_counters = true, + MappingOpT mapping_op = raft::identity_op()) { RAFT_EXPECTS(X.extent(0) == labels.extent(0), "Number of rows in dataset and labels are different"); diff --git a/cpp/include/raft/cluster/single_linkage.cuh b/cpp/include/raft/cluster/single_linkage.cuh index d9eba6edc5..067445c542 
100644 --- a/cpp/include/raft/cluster/single_linkage.cuh +++ b/cpp/include/raft/cluster/single_linkage.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -50,14 +50,14 @@ namespace raft::cluster { template -void single_linkage(raft::resources const& handle, - const value_t* X, - size_t m, - size_t n, - raft::distance::DistanceType metric, - linkage_output* out, - int c, - size_t n_clusters) +[[deprecated("Use cuVS instead")]] void single_linkage(raft::resources const& handle, + const value_t* X, + size_t m, + size_t n, + raft::distance::DistanceType metric, + linkage_output* out, + int c, + size_t n_clusters) { detail::single_linkage( handle, X, m, n, metric, out, c, n_clusters); @@ -87,13 +87,14 @@ constexpr int DEFAULT_CONST_C = 15; control of k. The algorithm will set `k = log(n) + c` */ template -void single_linkage(raft::resources const& handle, - raft::device_matrix_view X, - raft::device_matrix_view dendrogram, - raft::device_vector_view labels, - raft::distance::DistanceType metric, - size_t n_clusters, - std::optional c = std::make_optional(DEFAULT_CONST_C)) +[[deprecated("Use cuVS instead")]] void single_linkage( + raft::resources const& handle, + raft::device_matrix_view X, + raft::device_matrix_view dendrogram, + raft::device_vector_view labels, + raft::distance::DistanceType metric, + size_t n_clusters, + std::optional c = std::make_optional(DEFAULT_CONST_C)) { linkage_output out_arrs; out_arrs.children = dendrogram.data_handle(); diff --git a/cpp/include/raft/distance/distance-ext.cuh b/cpp/include/raft/distance/distance-ext.cuh index 2d41e029fe..dcbfbfdbc3 100644 --- a/cpp/include/raft/distance/distance-ext.cuh +++ b/cpp/include/raft/distance/distance-ext.cuh @@ -35,42 +35,43 @@ template -void distance(raft::resources const& handle, - const DataT* x, - const DataT* y, - OutT* dist, - IdxT m, - IdxT n, - IdxT k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - bool isRowMajor = true, - DataT metric_arg = 2.0f) RAFT_EXPLICIT; +[[deprecated("Use cuVS instead")]] void distance(raft::resources const& handle, + const DataT* x, + const DataT* y, + OutT* dist, + IdxT m, + IdxT n, + IdxT k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + bool isRowMajor = true, + DataT metric_arg = 2.0f) RAFT_EXPLICIT; template -void distance(raft::resources const& handle, - const DataT* x, - const DataT* y, - OutT* dist, - IdxT m, - IdxT n, - IdxT k, - void* workspace, - size_t worksize, - bool isRowMajor = true, - DataT metric_arg = 2.0f) RAFT_EXPLICIT; +[[deprecated("Use cuVS instead")]] void distance(raft::resources const& handle, + const DataT* x, + const DataT* y, + OutT* dist, + IdxT m, + IdxT n, + IdxT k, + void* workspace, + size_t worksize, + bool isRowMajor = true, + DataT metric_arg = 2.0f) RAFT_EXPLICIT; template -size_t getWorkspaceSize(const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k) RAFT_EXPLICIT; +[[deprecated("Use cuVS instead")]] size_t getWorkspaceSize( + const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k) RAFT_EXPLICIT; template -void distance(raft::resources const& handle, - const DataT* x, - const DataT* y, - OutT* dist, - IdxT m, - IdxT n, - IdxT k, - bool isRowMajor = true, - DataT metric_arg = 2.0f) RAFT_EXPLICIT; +[[deprecated("Use cuVS instead")]] void distance(raft::resources const& handle, + const DataT* x, + const DataT* y, + 
OutT* dist, + IdxT m, + IdxT n, + IdxT k, + bool isRowMajor = true, + DataT metric_arg = 2.0f) RAFT_EXPLICIT; template -void pairwise_distance(raft::resources const& handle, - const Type* x, - const Type* y, - Type* dist, - IdxT m, - IdxT n, - IdxT k, - rmm::device_uvector& workspace, - raft::distance::DistanceType metric, - bool isRowMajor = true, - Type metric_arg = 2.0f) RAFT_EXPLICIT; +[[deprecated("Use cuVS instead")]] void pairwise_distance(raft::resources const& handle, + const Type* x, + const Type* y, + Type* dist, + IdxT m, + IdxT n, + IdxT k, + rmm::device_uvector& workspace, + raft::distance::DistanceType metric, + bool isRowMajor = true, + Type metric_arg = 2.0f) RAFT_EXPLICIT; template -void pairwise_distance(raft::resources const& handle, - const Type* x, - const Type* y, - Type* dist, - IdxT m, - IdxT n, - IdxT k, - raft::distance::DistanceType metric, - bool isRowMajor = true, - Type metric_arg = 2.0f) RAFT_EXPLICIT; +[[deprecated("Use cuVS instead")]] void pairwise_distance(raft::resources const& handle, + const Type* x, + const Type* y, + Type* dist, + IdxT m, + IdxT n, + IdxT k, + raft::distance::DistanceType metric, + bool isRowMajor = true, + Type metric_arg = 2.0f) RAFT_EXPLICIT; template -void distance(raft::resources const& handle, - raft::device_matrix_view const x, - raft::device_matrix_view const y, - raft::device_matrix_view dist, - DataT metric_arg = 2.0f) RAFT_EXPLICIT; +[[deprecated("Use cuVS instead")]] void distance( + raft::resources const& handle, + raft::device_matrix_view const x, + raft::device_matrix_view const y, + raft::device_matrix_view dist, + DataT metric_arg = 2.0f) RAFT_EXPLICIT; template -void pairwise_distance(raft::resources const& handle, - device_matrix_view const x, - device_matrix_view const y, - device_matrix_view dist, - raft::distance::DistanceType metric, - Type metric_arg = 2.0f) RAFT_EXPLICIT; +[[deprecated("Use cuVS instead")]] void pairwise_distance( + raft::resources const& handle, + device_matrix_view const x, + device_matrix_view const y, + device_matrix_view dist, + raft::distance::DistanceType metric, + Type metric_arg = 2.0f) RAFT_EXPLICIT; }; // namespace distance }; // namespace raft diff --git a/cpp/include/raft/neighbors/ball_cover.cuh b/cpp/include/raft/neighbors/ball_cover.cuh index 20c88f3318..09938020b9 100644 --- a/cpp/include/raft/neighbors/ball_cover.cuh +++ b/cpp/include/raft/neighbors/ball_cover.cuh @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #pragma once #ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY #include "ball_cover-inl.cuh" diff --git a/cpp/include/raft/neighbors/brute_force_types.hpp b/cpp/include/raft/neighbors/brute_force_types.hpp index a8f073edc6..4511f8d8ba 100644 --- a/cpp/include/raft/neighbors/brute_force_types.hpp +++ b/cpp/include/raft/neighbors/brute_force_types.hpp @@ -94,12 +94,14 @@ struct index : ann::index { * the dataset. If the dataset is in host memory, it will be copied to the device and the * index will own the device memory. 
*/ + template - index(raft::resources const& res, - mdspan, row_major, data_accessor> dataset, - std::optional>&& norms, - raft::distance::DistanceType metric, - T metric_arg = 0.0) + [[deprecated("Use cuVS instead")]] index( + raft::resources const& res, + mdspan, row_major, data_accessor> dataset, + std::optional>&& norms, + raft::distance::DistanceType metric, + T metric_arg = 0.0) : ann::index(), metric_(metric), dataset_(make_device_matrix(res, 0, 0)), @@ -116,11 +118,12 @@ struct index : ann::index { * This class stores a non-owning reference to the dataset and norms here. * Having precomputed norms gives us a performance advantage at query time. */ - index(raft::resources const& res, - raft::device_matrix_view dataset_view, - std::optional> norms_view, - raft::distance::DistanceType metric, - T metric_arg = 0.0) + [[deprecated("Use cuVS instead")]] index( + raft::resources const& res, + raft::device_matrix_view dataset_view, + std::optional> norms_view, + raft::distance::DistanceType metric, + T metric_arg = 0.0) : ann::index(), metric_(metric), dataset_(make_device_matrix(res, 0, 0)), @@ -131,10 +134,11 @@ struct index : ann::index { } template - index(raft::resources const& res, - index_params const& params, - mdspan, row_major, data_accessor> dataset, - std::optional>&& norms = std::nullopt) + [[deprecated("Use cuVS instead")]] index( + raft::resources const& res, + index_params const& params, + mdspan, row_major, data_accessor> dataset, + std::optional>&& norms = std::nullopt) : ann::index(), metric_(params.metric), dataset_(make_device_matrix(res, 0, 0)), diff --git a/cpp/include/raft/neighbors/cagra_types.hpp b/cpp/include/raft/neighbors/cagra_types.hpp index 97c9c0d098..bc7c380db1 100644 --- a/cpp/include/raft/neighbors/cagra_types.hpp +++ b/cpp/include/raft/neighbors/cagra_types.hpp @@ -201,8 +201,9 @@ struct index : ann::index { ~index() = default; /** Construct an empty index. */ - index(raft::resources const& res, - raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded) + [[deprecated("Use cuVS instead")]] index( + raft::resources const& res, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded) : ann::index(), metric_(metric), graph_(make_device_matrix(res, 0, 0)), @@ -265,10 +266,11 @@ struct index : ann::index { * */ template - index(raft::resources const& res, - raft::distance::DistanceType metric, - mdspan, row_major, data_accessor> dataset, - mdspan, row_major, graph_accessor> knn_graph) + [[deprecated("Use cuVS instead")]] index( + raft::resources const& res, + raft::distance::DistanceType metric, + mdspan, row_major, data_accessor> dataset, + mdspan, row_major, graph_accessor> knn_graph) : ann::index(), metric_(metric), graph_(make_device_matrix(res, 0, 0)), diff --git a/cpp/include/raft/neighbors/hnsw_types.hpp b/cpp/include/raft/neighbors/hnsw_types.hpp index f90de6f01b..f78571f491 100644 --- a/cpp/include/raft/neighbors/hnsw_types.hpp +++ b/cpp/include/raft/neighbors/hnsw_types.hpp @@ -38,7 +38,6 @@ struct search_params : ann::search_params { int num_threads = 0; // number of host threads to use for concurrent searches. Value of 0 // automatically maximizes parallelism }; - template struct index : ann::index { public: @@ -51,7 +50,10 @@ struct index : ann::index { * @param[in] dim dimensions of the training dataset * @param[in] metric distance metric to search. 
Supported metrics ("L2Expanded", "InnerProduct") */ - index(int dim, raft::distance::DistanceType metric) : dim_{dim}, metric_{metric} {} + [[deprecated("Use cuVS instead")]] index(int dim, raft::distance::DistanceType metric) + : dim_{dim}, metric_{metric} + { + } /** @brief Get underlying index diff --git a/cpp/include/raft/neighbors/ivf_flat_types.hpp b/cpp/include/raft/neighbors/ivf_flat_types.hpp index 7605bd82a3..2cafceb512 100644 --- a/cpp/include/raft/neighbors/ivf_flat_types.hpp +++ b/cpp/include/raft/neighbors/ivf_flat_types.hpp @@ -261,12 +261,12 @@ struct index : ann::index { ~index() = default; /** Construct an empty index. It needs to be trained and then populated. */ - index(raft::resources const& res, - raft::distance::DistanceType metric, - uint32_t n_lists, - bool adaptive_centers, - bool conservative_memory_allocation, - uint32_t dim) + [[deprecated("Use cuVS instead")]] index(raft::resources const& res, + raft::distance::DistanceType metric, + uint32_t n_lists, + bool adaptive_centers, + bool conservative_memory_allocation, + uint32_t dim) : ann::index(), veclen_(calculate_veclen(dim)), metric_(metric), @@ -285,7 +285,9 @@ struct index : ann::index { } /** Construct an empty index. It needs to be trained and then populated. */ - index(raft::resources const& res, const index_params& params, uint32_t dim) + [[deprecated("Use cuVS instead")]] index(raft::resources const& res, + const index_params& params, + uint32_t dim) : index(res, params.metric, params.n_lists, diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp index 3ee350c6fb..d5906d621c 100644 --- a/cpp/include/raft/neighbors/ivf_pq_types.hpp +++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp @@ -361,14 +361,14 @@ struct index : ann::index { ~index() = default; /** Construct an empty index. It needs to be trained and then populated. */ - index(raft::resources const& handle, - raft::distance::DistanceType metric, - codebook_gen codebook_kind, - uint32_t n_lists, - uint32_t dim, - uint32_t pq_bits = 8, - uint32_t pq_dim = 0, - bool conservative_memory_allocation = false) + [[deprecated("Use cuVS instead")]] index(raft::resources const& handle, + raft::distance::DistanceType metric, + codebook_gen codebook_kind, + uint32_t n_lists, + uint32_t dim, + uint32_t pq_bits = 8, + uint32_t pq_dim = 0, + bool conservative_memory_allocation = false) : ann::index(), metric_(metric), codebook_kind_(codebook_kind), @@ -391,7 +391,9 @@ struct index : ann::index { } /** Construct an empty index. It needs to be trained and then populated. 
*/ - index(raft::resources const& handle, const index_params& params, uint32_t dim) + [[deprecated("Use cuVS instead")]] index(raft::resources const& handle, + const index_params& params, + uint32_t dim) : index(handle, params.metric, params.codebook_kind, diff --git a/cpp/include/raft/neighbors/nn_descent_types.hpp b/cpp/include/raft/neighbors/nn_descent_types.hpp index eb01a423be..9decf47f39 100644 --- a/cpp/include/raft/neighbors/nn_descent_types.hpp +++ b/cpp/include/raft/neighbors/nn_descent_types.hpp @@ -101,7 +101,10 @@ struct index : ann::index { * @param n_cols number of cols in knn-graph * @param return_distances whether to allocate and get distances information */ - index(raft::resources const& res, int64_t n_rows, int64_t n_cols, bool return_distances = false) + [[deprecated("Use cuVS instead")]] index(raft::resources const& res, + int64_t n_rows, + int64_t n_cols, + bool return_distances = false) : ann::index(), res_{res}, metric_{raft::distance::DistanceType::L2Expanded}, @@ -128,11 +131,12 @@ struct index : ann::index { * storing knn-graph distances * @param return_distances whether to allocate and get distances information */ - index(raft::resources const& res, - raft::host_matrix_view graph_view, - std::optional> distances_view = - std::nullopt, - bool return_distances = false) + [[deprecated("Use cuVS instead")]] index( + raft::resources const& res, + raft::host_matrix_view graph_view, + std::optional> distances_view = + std::nullopt, + bool return_distances = false) : ann::index(), res_{res}, metric_{raft::distance::DistanceType::L2Expanded}, diff --git a/cpp/include/raft/neighbors/refine-ext.cuh b/cpp/include/raft/neighbors/refine-ext.cuh index 7948a0e4f2..216e1b9ab5 100644 --- a/cpp/include/raft/neighbors/refine-ext.cuh +++ b/cpp/include/raft/neighbors/refine-ext.cuh @@ -29,24 +29,24 @@ namespace raft::neighbors { template -void refine(raft::resources const& handle, - raft::device_matrix_view dataset, - raft::device_matrix_view queries, - raft::device_matrix_view neighbor_candidates, - raft::device_matrix_view indices, - raft::device_matrix_view distances, - raft::distance::DistanceType metric = distance::DistanceType::L2Unexpanded) - RAFT_EXPLICIT; +[[deprecated("Use cuVS instead")]] void refine( + raft::resources const& handle, + raft::device_matrix_view dataset, + raft::device_matrix_view queries, + raft::device_matrix_view neighbor_candidates, + raft::device_matrix_view indices, + raft::device_matrix_view distances, + raft::distance::DistanceType metric = distance::DistanceType::L2Unexpanded) RAFT_EXPLICIT; template -void refine(raft::resources const& handle, - raft::host_matrix_view dataset, - raft::host_matrix_view queries, - raft::host_matrix_view neighbor_candidates, - raft::host_matrix_view indices, - raft::host_matrix_view distances, - raft::distance::DistanceType metric = distance::DistanceType::L2Unexpanded) - RAFT_EXPLICIT; +[[deprecated("Use cuVS instead")]] void refine( + raft::resources const& handle, + raft::host_matrix_view dataset, + raft::host_matrix_view queries, + raft::host_matrix_view neighbor_candidates, + raft::host_matrix_view indices, + raft::host_matrix_view distances, + raft::distance::DistanceType metric = distance::DistanceType::L2Unexpanded) RAFT_EXPLICIT; } // namespace raft::neighbors diff --git a/docs/source/conf.py b/docs/source/conf.py index 8b2040baa2..7a287b689f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -71,7 +71,7 @@ .. 
attention:: - The vector search and clustering algorithms in RAFT are being migrated to a new library dedicated to vector search called `cuVS `_. We will continue to support the vector search algorithms in RAFT during this move, but will no longer update them after the RAPIDS 24.06 (June) release. We plan to complete the migration by RAPIDS 24.08 (August) release. + The vector search and clustering algorithms in RAFT are being migrated to a new library dedicated to vector search called `cuVS `_. We will continue to support the vector search algorithms in RAFT during this move, but will no longer update them after the RAPIDS 24.06 (June) release. We plan to complete the migration by RAPIDS 24.10 (October) release and they will be removed from RAFT altogether in the 24.12 (December) release. """ diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index fc11a56ac8..7bac2047fc 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -2,6 +2,10 @@ This project provides a benchmark program for various ANN search implementations. It's especially suitable for comparing GPU implementations as well as comparing GPU against CPU. +> [!IMPORTANT] +> The vector search and clustering algorithms in RAFT are being migrated to a new library dedicated to vector search called [cuVS](https://github.com/rapidsai/cuvs). As a result, `raft-ann-bench` is being migrated to `cuvs-bench` and will be removed from RAFT altogether in the 24.12 (December) release. + + ## Table of Contents - [Installing the benchmarks](#installing-the-benchmarks) From ec12aba62032e2075ba10bdd40cf863937eca7fd Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 9 Oct 2024 09:39:25 -0400 Subject: [PATCH 31/31] Update Changelog [skip ci] --- CHANGELOG.md | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0685145dca..9caa5ef571 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,48 @@ +# raft 24.10.00 (9 Oct 2024) + +## 🚨 Breaking Changes + +- [Feat] add `repeat`, `sparsity`, `eval_n_elements` APIs to `bitset` ([#2439](https://github.com/rapidsai/raft/pull/2439)) [@rhdong](https://github.com/rhdong) + +## 🐛 Bug Fixes + +- Disable NN Descent Batch tests temporarily ([#2453](https://github.com/rapidsai/raft/pull/2453)) [@divyegala](https://github.com/divyegala) +- Fix sed syntax in `update-version.sh` ([#2441](https://github.com/rapidsai/raft/pull/2441)) [@raydouglass](https://github.com/raydouglass) +- Use runtime check of cudart version for eig ([#2430](https://github.com/rapidsai/raft/pull/2430)) [@lowener](https://github.com/lowener) +- [BUG] Fix bitset function visibility ([#2429](https://github.com/rapidsai/raft/pull/2429)) [@lowener](https://github.com/lowener) +- Exclude any kernel symbol that uses cutlass ([#2425](https://github.com/rapidsai/raft/pull/2425)) [@robertmaynard](https://github.com/robertmaynard) + +## 🚀 New Features + +- [Feat] add `repeat`, `sparsity`, `eval_n_elements` APIs to `bitset` ([#2439](https://github.com/rapidsai/raft/pull/2439)) [@rhdong](https://github.com/rhdong) +- [Opt] Enforce the UT Coverity and add benchmark for `transpose` ([#2438](https://github.com/rapidsai/raft/pull/2438)) [@rhdong](https://github.com/rhdong) +- [FEA] Support for half-float mixed precise in brute-force ([#2382](https://github.com/rapidsai/raft/pull/2382)) [@rhdong](https://github.com/rhdong) + +## 🛠️ Improvements + +- bump NCCL floor to 2.19 
([#2458](https://github.com/rapidsai/raft/pull/2458)) [@jameslamb](https://github.com/jameslamb) +- Deprecating vector search APIs and updating README accordingly ([#2448](https://github.com/rapidsai/raft/pull/2448)) [@cjnolet](https://github.com/cjnolet) +- Update update-version.sh to use packaging lib ([#2447](https://github.com/rapidsai/raft/pull/2447)) [@AyodeAwe](https://github.com/AyodeAwe) +- Switch traceback to `native` ([#2446](https://github.com/rapidsai/raft/pull/2446)) [@galipremsagar](https://github.com/galipremsagar) +- bump NCCL floor to 2.18.1.1 ([#2443](https://github.com/rapidsai/raft/pull/2443)) [@jameslamb](https://github.com/jameslamb) +- Add missing `cuda_suffixed: true` ([#2440](https://github.com/rapidsai/raft/pull/2440)) [@trxcllnt](https://github.com/trxcllnt) +- Use CI workflow branch 'branch-24.10' again ([#2437](https://github.com/rapidsai/raft/pull/2437)) [@jameslamb](https://github.com/jameslamb) +- Update to flake8 7.1.1. ([#2435](https://github.com/rapidsai/raft/pull/2435)) [@bdice](https://github.com/bdice) +- Update fmt (to 11.0.2) and spdlog (to 1.14.1). ([#2433](https://github.com/rapidsai/raft/pull/2433)) [@jameslamb](https://github.com/jameslamb) +- Allow coo_sort to work on int64_t indices ([#2432](https://github.com/rapidsai/raft/pull/2432)) [@benfred](https://github.com/benfred) +- Adding NCCL clique to the RAFT handle ([#2431](https://github.com/rapidsai/raft/pull/2431)) [@viclafargue](https://github.com/viclafargue) +- Add support for Python 3.12 ([#2428](https://github.com/rapidsai/raft/pull/2428)) [@jameslamb](https://github.com/jameslamb) +- Update rapidsai/pre-commit-hooks ([#2420](https://github.com/rapidsai/raft/pull/2420)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Drop Python 3.9 support ([#2417](https://github.com/rapidsai/raft/pull/2417)) [@jameslamb](https://github.com/jameslamb) +- Use CUDA math wheels ([#2415](https://github.com/rapidsai/raft/pull/2415)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Remove NumPy <2 pin ([#2414](https://github.com/rapidsai/raft/pull/2414)) [@seberg](https://github.com/seberg) +- Update pre-commit hooks ([#2409](https://github.com/rapidsai/raft/pull/2409)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Improve update-version.sh ([#2408](https://github.com/rapidsai/raft/pull/2408)) [@bdice](https://github.com/bdice) +- Use tool.scikit-build.cmake.version, set scikit-build-core minimum-version ([#2406](https://github.com/rapidsai/raft/pull/2406)) [@jameslamb](https://github.com/jameslamb) +- [FEA] Batching NN Descent ([#2403](https://github.com/rapidsai/raft/pull/2403)) [@jinsolp](https://github.com/jinsolp) +- Update pip devcontainers to UCX v1.17.0 ([#2401](https://github.com/rapidsai/raft/pull/2401)) [@jameslamb](https://github.com/jameslamb) +- Merge branch-24.08 into branch-24.10 ([#2397](https://github.com/rapidsai/raft/pull/2397)) [@jameslamb](https://github.com/jameslamb) + # raft 24.08.00 (7 Aug 2024) ## 🚨 Breaking Changes