diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 13103e8f7..05f11c005 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 74d62afcc..b4c507f86 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,24 +5,24 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-ucx1.17.0-openmpi-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-ucx1.17.0-openmpi-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/cuda:24.10": { + "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": { "version": "11.8", "installcuBLAS": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/ucx", diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json index d6902d3f9..4f8d628c2 100644 --- a/.devcontainer/cuda12.5-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.10-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json index 3dcf52e83..8e6ba4de8 100644 --- a/.devcontainer/cuda12.5-pip/devcontainer.json +++ b/.devcontainer/cuda12.5-pip/devcontainer.json @@ -5,24 +5,24 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": 
"rapidsai/devcontainers:24.10-cpp-cuda12.5-ucx1.17.0-openmpi-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda12.5-ucx1.17.0-openmpi-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/cuda:24.10": { + "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": { "version": "12.5", "installcuBLAS": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/ucx", diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index db20bdbc1..7ac02e365 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: rust-build: needs: cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -50,7 +50,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -59,7 +59,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -70,7 +70,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -82,7 +82,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cuvs: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -92,7 +92,7 @@ jobs: wheel-publish-cuvs: needs: wheel-build-cuvs secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 07b10e85a..78648235f 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -12,6 +12,7 @@ 
concurrency: jobs: pr-builder: needs: + - changed-files - checks - conda-cpp-build - conda-cpp-tests @@ -24,49 +25,87 @@ jobs: - wheel-tests-cuvs - devcontainer secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12 + if: always() + with: + needs: ${{ toJSON(needs) }} + changed-files: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12 + with: + files_yaml: | + test_cpp: + - '**' + - '!.devcontainer/**' + - '!.pre-commit-config.yaml' + - '!README.md' + - '!docs/**' + - '!img/**' + - '!notebooks/**' + - '!python/**' + - '!rust/**' + - '!thirdparty/LICENSES/**' + test_notebooks: + - '**' + - '!.devcontainer/**' + - '!.pre-commit-config.yaml' + - '!README.md' + - '!rust/**' + - '!thirdparty/LICENSES/**' + test_python: + - '**' + - '!.devcontainer/**' + - '!.pre-commit-config.yaml' + - '!README.md' + - '!docs/**' + - '!img/**' + - '!notebooks/**' + - '!rust/**' + - '!thirdparty/LICENSES/**' checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12 with: build_type: pull-request node_type: cpu16 conda-cpp-tests: - needs: conda-cpp-build + needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.12 with: build_type: pull-request enable_check_symbols: true - symbol_exclusions: (void (thrust::|cub::)|raft_cutlass) + symbol_exclusions: (void (thrust::|cub::)) conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12 with: build_type: pull-request conda-python-tests: - needs: conda-python-build + needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -76,7 +115,7 @@ jobs: rust-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: 
build_type: pull-request node_type: "gpu-v100-latest-1" @@ -86,20 +125,21 @@ jobs: wheel-build-cuvs: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: pull-request script: ci/build_wheel_cuvs.sh wheel-tests-cuvs: - needs: wheel-build-cuvs + needs: [wheel-build-cuvs, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: ci/test_wheel_cuvs.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.12 with: arch: '["amd64"]' cuda: '["12.5"]' diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 0821233a1..27dc99a11 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,17 +16,17 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} enable_check_symbols: true - symbol_exclusions: (void (thrust::|cub::)|raft_cutlass) + symbol_exclusions: (void (thrust::|cub::)) conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -42,7 +42,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-cuvs: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/.gitignore b/.gitignore index 97eab287d..da6eb07f6 100644 --- a/.gitignore +++ b/.gitignore @@ -75,6 +75,7 @@ compile_commands.json .clangd/ # serialized ann indexes +brute_force_index cagra_index ivf_flat_index ivf_pq_index diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ac4085744..dedb5f0ca 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -91,7 +91,10 @@ repos: - id: codespell additional_dependencies: [tomli] args: ["--toml", "pyproject.toml"] - exclude: (?x)^(^CHANGELOG.md$) + exclude: | + (?x) + ^CHANGELOG[.]md$| + ^cpp/cmake/patches/cutlass/build-export[.]patch$ - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 hooks: @@ -114,7 +117,7 @@ repos: - id: verify-alpha-spec args: ["--fix", "--mode=release"] - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.13.11 + rev: v1.16.0 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/CHANGELOG.md b/CHANGELOG.md 
index 7ce4a14c3..ed9429d55 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,67 @@ +# cuvs 24.12.00 (11 Dec 2024) + +## 🚨 Breaking Changes + +- HNSW CPU Hierarchy ([#465](https://github.com/rapidsai/cuvs/pull/465)) [@divyegala](https://github.com/divyegala) +- Use dashes in cuvs-bench package name. ([#417](https://github.com/rapidsai/cuvs/pull/417)) [@bdice](https://github.com/bdice) + +## 🐛 Bug Fixes + +- Skip IVF-PQ packing test for lists with not enough data ([#512](https://github.com/rapidsai/cuvs/pull/512)) [@achirkin](https://github.com/achirkin) +- [BUG] Fix CAGRA filter ([#489](https://github.com/rapidsai/cuvs/pull/489)) [@enp1s0](https://github.com/enp1s0) +- Add `kIsSingleSource` to `PairwiseDistanceEpilogueElementwise` ([#485](https://github.com/rapidsai/cuvs/pull/485)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Fix include errors, header, and unsafe locks in iface.hpp ([#467](https://github.com/rapidsai/cuvs/pull/467)) [@achirkin](https://github.com/achirkin) +- Fix an OOB error in device-side cuvs::neighbors::refine and CAGRA kern_prune ([#460](https://github.com/rapidsai/cuvs/pull/460)) [@achirkin](https://github.com/achirkin) +- Put a ceiling on cuda-python ([#445](https://github.com/rapidsai/cuvs/pull/445)) [@bdice](https://github.com/bdice) +- Enable NVTX in cuvs-cagra-search component ([#439](https://github.com/rapidsai/cuvs/pull/439)) [@achirkin](https://github.com/achirkin) +- BUG: CAGRA multi-cta illegal access with bad queries ([#438](https://github.com/rapidsai/cuvs/pull/438)) [@achirkin](https://github.com/achirkin) +- Fix index overflow in edge cases of CAGRA graph optimize ([#435](https://github.com/rapidsai/cuvs/pull/435)) [@achirkin](https://github.com/achirkin) +- Fix correct call to brute force in generate groundtruth of cuvs-bench ([#427](https://github.com/rapidsai/cuvs/pull/427)) [@dantegd](https://github.com/dantegd) +- Use Python for sccache hit rate computation. 
([#420](https://github.com/rapidsai/cuvs/pull/420)) [@bdice](https://github.com/bdice) +- Add `click` package to `cuvs-bench` conda recipe ([#408](https://github.com/rapidsai/cuvs/pull/408)) [@divyegala](https://github.com/divyegala) +- Fix NVTX annotations ([#400](https://github.com/rapidsai/cuvs/pull/400)) [@achirkin](https://github.com/achirkin) + +## 📖 Documentation + +- [Doc] Fix CAGRA search sample code ([#484](https://github.com/rapidsai/cuvs/pull/484)) [@enp1s0](https://github.com/enp1s0) +- Fix broken link in README.md references ([#473](https://github.com/rapidsai/cuvs/pull/473)) [@Azurethi](https://github.com/Azurethi) +- Adding tech stack to docs ([#448](https://github.com/rapidsai/cuvs/pull/448)) [@cjnolet](https://github.com/cjnolet) +- Fix Question Retrieval notebook ([#352](https://github.com/rapidsai/cuvs/pull/352)) [@lowener](https://github.com/lowener) + +## 🚀 New Features + +- Add C++ API scalar quantization ([#494](https://github.com/rapidsai/cuvs/pull/494)) [@mfoerste4](https://github.com/mfoerste4) +- HNSW CPU Hierarchy ([#465](https://github.com/rapidsai/cuvs/pull/465)) [@divyegala](https://github.com/divyegala) +- Add serialization API to brute-force ([#461](https://github.com/rapidsai/cuvs/pull/461)) [@lowener](https://github.com/lowener) +- Add Question Retrieval notebook using Milvus ([#451](https://github.com/rapidsai/cuvs/pull/451)) [@lowener](https://github.com/lowener) +- Migrate feature diff for NN Descent from RAFT to cuVS ([#421](https://github.com/rapidsai/cuvs/pull/421)) [@divyegala](https://github.com/divyegala) +- Add --no-lap-sync cmd option to ann-bench ([#405](https://github.com/rapidsai/cuvs/pull/405)) [@achirkin](https://github.com/achirkin) +- Add `InnerProduct` and `CosineExpanded` metric support in NN Descent ([#177](https://github.com/rapidsai/cuvs/pull/177)) [@divyegala](https://github.com/divyegala) + +## 🛠️ Improvements + +- Update cuvs to match raft's cutlass changes ([#516](https://github.com/rapidsai/cuvs/pull/516)) [@vyasr](https://github.com/vyasr) +- add a README for wheels ([#504](https://github.com/rapidsai/cuvs/pull/504)) [@jameslamb](https://github.com/jameslamb) +- Move check_input_array from pylibraft ([#474](https://github.com/rapidsai/cuvs/pull/474)) [@benfred](https://github.com/benfred) +- use different wheel-size thresholds based on CUDA version ([#469](https://github.com/rapidsai/cuvs/pull/469)) [@jameslamb](https://github.com/jameslamb) +- Modify cuvs-bench to be able to generate ground truth in CPU systems ([#466](https://github.com/rapidsai/cuvs/pull/466)) [@dantegd](https://github.com/dantegd) +- enforce wheel size limits, README formatting in CI ([#464](https://github.com/rapidsai/cuvs/pull/464)) [@jameslamb](https://github.com/jameslamb) +- Moving spectral embedding and kernel gramm APIs to cuVS ([#463](https://github.com/rapidsai/cuvs/pull/463)) [@cjnolet](https://github.com/cjnolet) +- Migrate sparse knn and distances code from raft ([#457](https://github.com/rapidsai/cuvs/pull/457)) [@benfred](https://github.com/benfred) +- Don't presume pointers location infers usability. 
([#441](https://github.com/rapidsai/cuvs/pull/441)) [@robertmaynard](https://github.com/robertmaynard) +- call `enable_testing` in root CMakeLists.txt ([#437](https://github.com/rapidsai/cuvs/pull/437)) [@robertmaynard](https://github.com/robertmaynard) +- CAGRA tech debt: distance descriptor and workspace memory ([#436](https://github.com/rapidsai/cuvs/pull/436)) [@achirkin](https://github.com/achirkin) +- Add ci run_ scripts needed for build infra ([#434](https://github.com/rapidsai/cuvs/pull/434)) [@robertmaynard](https://github.com/robertmaynard) +- Use environment variables in cache hit rate computation. ([#422](https://github.com/rapidsai/cuvs/pull/422)) [@bdice](https://github.com/bdice) +- Use dashes in cuvs-bench package name. ([#417](https://github.com/rapidsai/cuvs/pull/417)) [@bdice](https://github.com/bdice) +- We need to enable the c_api by default ([#416](https://github.com/rapidsai/cuvs/pull/416)) [@robertmaynard](https://github.com/robertmaynard) +- print sccache stats in builds ([#413](https://github.com/rapidsai/cuvs/pull/413)) [@jameslamb](https://github.com/jameslamb) +- make conda installs in CI stricter ([#406](https://github.com/rapidsai/cuvs/pull/406)) [@jameslamb](https://github.com/jameslamb) +- Ivf c example ([#404](https://github.com/rapidsai/cuvs/pull/404)) [@abner-ma](https://github.com/abner-ma) +- Prune workflows based on changed files ([#392](https://github.com/rapidsai/cuvs/pull/392)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- [WIP] Add pinned memory resource to C API ([#311](https://github.com/rapidsai/cuvs/pull/311)) [@ajit283](https://github.com/ajit283) +- Dynamic Batching ([#261](https://github.com/rapidsai/cuvs/pull/261)) [@achirkin](https://github.com/achirkin) + # cuvs 24.10.00 (9 Oct 2024) ## 🐛 Bug Fixes diff --git a/README.md b/README.md index 213fde632..23759f598 100755 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ Finally, faster vector search enables interactions between dense vectors and gra Below are some common use-cases for vector search + - ### Semantic search - Generative AI & Retrieval augmented generation (RAG) - Recommender systems @@ -68,6 +69,14 @@ There are several benefits to using cuVS and GPUs for vector search, including In addition to the items above, cuVS takes on the burden of keeping non-trivial accelerated code up to date as new NVIDIA architectures and CUDA versions are released. This provides a deslightful development experimence, guaranteeing that any libraries, databases, or applications built on top of it will always be getting the best performance and scale. +## cuVS Technology Stack + +cuVS is built on top of the RAPIDS RAFT library of high performance machine learning primitives and provides all the necessary routines for vector search and clustering on the GPU. + +![cuVS is built on top of low-level CUDA libraries and provides many important routines that enable vector search and clustering on the GPU](img/tech_stack.png "cuVS Technology Stack") + + + ## Installing cuVS cuVS comes with pre-built packages that can be installed through [conda](https://conda.io/projects/conda/en/latest/user-guide/getting-started.html#managing-python) and [pip](https://pip.pypa.io/en/stable/). 
Different packages are available for the different languages supported by cuVS: @@ -100,7 +109,7 @@ pip install cuvs-cu12 --extra-index-url=https://pypi.nvidia.com If installing a version that has not yet been released, the `rapidsai` channel can be replaced with `rapidsai-nightly`: ```bash -conda install -c conda-forge -c nvidia -c rapidsai-nightly cuvs=24.10 +conda install -c conda-forge -c nvidia -c rapidsai-nightly cuvs=24.12 ``` cuVS also has `pip` wheel packages that can be installed. Please see the [Build and Install Guide](https://docs.rapids.ai/api/cuvs/nightly/build/) for more information on installing the available cuVS packages and building from source. @@ -233,7 +242,7 @@ If you are interested in contributing to the cuVS library, please read our [Cont For the interested reader, many of the accelerated implementations in cuVS are also based on research papers which can provide a lot more background. We also ask you to please cite the corresponding algorithms by referencing them in your own research. - [CAGRA: Highly Parallel Graph Construction and Approximate Nearest Neighbor Search](https://arxiv.org/abs/2308.15136) -- [Top-K Algorithms on GPU: A Comprehensive Study and New Methods](https://dl.acm.org/doi/10.1145/3581784.3607062>) +- [Top-K Algorithms on GPU: A Comprehensive Study and New Methods](https://dl.acm.org/doi/10.1145/3581784.3607062) - [Fast K-NN Graph Construction by GPU Based NN-Descent](https://dl.acm.org/doi/abs/10.1145/3459637.3482344?casa_token=O_nan1B1F5cAAAAA:QHWDEhh0wmd6UUTLY9_Gv6c3XI-5DXM9mXVaUXOYeStlpxTPmV3nKvABRfoivZAaQ3n8FWyrkWw>) - [cuSLINK: Single-linkage Agglomerative Clustering on the GPU](https://arxiv.org/abs/2306.16354) - [GPU Semiring Primitives for Sparse Neighborhood Methods](https://arxiv.org/abs/2104.06357) diff --git a/VERSION b/VERSION index 7c7ba0443..af28c42b5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.10.00 +24.12.00 diff --git a/build.sh b/build.sh index b787d3a41..bd5fa649b 100755 --- a/build.sh +++ b/build.sh @@ -76,8 +76,8 @@ BUILD_REPORT_METRICS="" BUILD_REPORT_INCL_CACHE_STATS=OFF BUILD_SHARED_LIBS=ON -TEST_TARGETS="NEIGHBORS_ANN_CAGRA_TEST" -ANN_BENCH_TARGETS="CUVS_ANN_BENCH_ALL" +TEST_TARGETS="" +ANN_BENCH_TARGETS="" CACHE_ARGS="" NVTX=ON @@ -273,14 +273,6 @@ fi if hasArg tests || (( ${NUMARGS} == 0 )); then BUILD_TESTS=ON CMAKE_TARGET="${CMAKE_TARGET};${TEST_TARGETS}" - - # Force compile library when needed test targets are specified - if [[ $CMAKE_TARGET == *"CAGRA_C_TEST"* || \ - $CMAKE_TARGET == *"INTEROP_TEST"* || \ - $CMAKE_TARGET == *"NEIGHBORS_ANN_CAGRA_TEST"* ]]; then - echo "-- Enabling compiled lib for gtests" - COMPILE_LIBRARY=ON - fi fi if hasArg bench-ann || (( ${NUMARGS} == 0 )); then @@ -410,14 +402,14 @@ if (( ${NUMARGS} == 0 )) || hasArg libcuvs || hasArg docs || hasArg tests || has if [[ ${CACHE_TOOL} == "sccache" && -x "$(command -v sccache)" ]]; then COMPILE_REQUESTS=$(sccache -s | grep "Compile requests \+ [0-9]\+$" | awk '{ print $NF }') CACHE_HITS=$(sccache -s | grep "Cache hits \+ [0-9]\+$" | awk '{ print $NF }') - HIT_RATE=$(echo - | awk "{printf \"%.2f\n\", $CACHE_HITS / $COMPILE_REQUESTS * 100}") + HIT_RATE=$(COMPILE_REQUESTS="${COMPILE_REQUESTS}" CACHE_HITS="${CACHE_HITS}" python3 -c "import os; print(f'{int(os.getenv(\"CACHE_HITS\")) / int(os.getenv(\"COMPILE_REQUESTS\")):.2f}' if int(os.getenv(\"COMPILE_REQUESTS\")) else 'nan')") MSG="${MSG}
cache hit rate ${HIT_RATE} %" elif [[ ${CACHE_TOOL} == "ccache" && -x "$(command -v ccache)" ]]; then CACHE_STATS_LINE=$(ccache -s | grep "Hits: \+ [0-9]\+ / [0-9]\+" | tail -n1) if [[ ! -z "$CACHE_STATS_LINE" ]]; then CACHE_HITS=$(echo "$CACHE_STATS_LINE" - | awk '{ print $2 }') COMPILE_REQUESTS=$(echo "$CACHE_STATS_LINE" - | awk '{ print $4 }') - HIT_RATE=$(echo - | awk "{printf \"%.2f\n\", $CACHE_HITS / $COMPILE_REQUESTS * 100}") + HIT_RATE=$(COMPILE_REQUESTS="${COMPILE_REQUESTS}" CACHE_HITS="${CACHE_HITS}" python3 -c "import os; print(f'{int(os.getenv(\"CACHE_HITS\")) / int(os.getenv(\"COMPILE_REQUESTS\")):.2f}' if int(os.getenv(\"COMPILE_REQUESTS\")) else 'nan')") MSG="${MSG}
cache hit rate ${HIT_RATE} %" fi fi @@ -447,7 +439,7 @@ if (( ${NUMARGS} == 0 )) || hasArg python; then python -m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/cuvs fi -# Build and (optionally) install the cuvs_bench Python package +# Build and (optionally) install the cuvs-bench Python package if (( ${NUMARGS} == 0 )) || hasArg bench-ann; then python -m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/cuvs_bench fi diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 7bc0be5a7..db4c496cc 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -15,6 +15,10 @@ rapids-print-env rapids-logger "Begin cpp build" +sccache --zero-stats + RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild conda/recipes/libcuvs +sccache --show-adv-stats + rapids-upload-conda-to-s3 cpp diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 460cc3899..bce93c605 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -6,6 +6,9 @@ set -euo pipefail rapids-logger "Create test conda environment" . /opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION="$(rapids-version)" +export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" + rapids-dependency-file-generator \ --output conda \ --file-key docs \ @@ -28,11 +31,9 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - libcuvs cuvs + "libcuvs=${RAPIDS_VERSION}" \ + "cuvs=${RAPIDS_VERSION}" -export RAPIDS_VERSION="$(rapids-version)" -export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" -export RAPIDS_VERSION_NUMBER="$RAPIDS_VERSION_MAJOR_MINOR" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" @@ -54,4 +55,4 @@ mkdir -p "${RAPIDS_DOCS_DIR}/cuvs/"html mv _html/* "${RAPIDS_DOCS_DIR}/cuvs/html" popd -rapids-upload-docs +RAPIDS_VERSION_NUMBER="${RAPIDS_VERSION_MAJOR_MINOR}" rapids-upload-docs diff --git a/ci/build_python.sh b/ci/build_python.sh index 7b0c639af..3241a2c2b 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -24,6 +24,8 @@ version=$(rapids-generate-version) export RAPIDS_PACKAGE_VERSION=${version} echo "${version}" > VERSION +sccache --zero-stats + # TODO: Remove `--no-test` flags once importing on a CPU # node works correctly rapids-conda-retry mambabuild \ @@ -31,14 +33,20 @@ rapids-conda-retry mambabuild \ --channel "${CPP_CHANNEL}" \ conda/recipes/cuvs -# Build cuvs_bench for each cuda and python version +sccache --show-adv-stats +sccache --zero-stats + +# Build cuvs-bench for each cuda and python version rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ - conda/recipes/cuvs_bench + conda/recipes/cuvs-bench -# Build cuvs_bench_cpu only in CUDA 12 jobs since it only depends on python +sccache --show-adv-stats +sccache --zero-stats + +# Build cuvs-bench-cpu only in CUDA 12 jobs since it only depends on python # version RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" if [[ ${RAPIDS_CUDA_MAJOR} == "12" ]]; then @@ -46,7 +54,9 @@ if [[ ${RAPIDS_CUDA_MAJOR} == "12" ]]; then --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ - conda/recipes/cuvs_bench_cpu + conda/recipes/cuvs-bench-cpu + + sccache --show-adv-stats fi rapids-upload-conda-to-s3 python diff --git a/ci/build_rust.sh b/ci/build_rust.sh index 31d0de053..309501c32 100755 --- a/ci/build_rust.sh +++ b/ci/build_rust.sh @@ -6,6 +6,8 
@@ set -euo pipefail rapids-logger "Create test conda environment" . /opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION="$(rapids-version)" + rapids-dependency-file-generator \ --output conda \ --file-key rust \ @@ -32,7 +34,7 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) # installing libcuvs/libraft will speed up the rust build substantially rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ - libcuvs \ - libraft + "libcuvs=${RAPIDS_VERSION}" \ + "libraft=${RAPIDS_VERSION}" bash ./build.sh rust diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index d1030276f..4994374a8 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -32,10 +32,20 @@ case "${RAPIDS_CUDA_VERSION}" in ;; esac -# Hardcode the output dir -python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check +rapids-logger "Building '${package_name}' wheel" + +sccache --zero-stats + +python -m pip wheel \ + -w dist \ + -v \ + --no-deps \ + --disable-pip-version-check \ + . + +sccache --show-adv-stats mkdir -p final_dist python -m auditwheel repair -w final_dist "${EXCLUDE_ARGS[@]}" dist/* -RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist +RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python final_dist diff --git a/ci/build_wheel_cuvs.sh b/ci/build_wheel_cuvs.sh index e03da9f19..444657cc0 100755 --- a/ci/build_wheel_cuvs.sh +++ b/ci/build_wheel_cuvs.sh @@ -3,6 +3,8 @@ set -euo pipefail +package_dir="python/cuvs" + case "${RAPIDS_CUDA_VERSION}" in 12.*) EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=ON" @@ -15,4 +17,5 @@ esac # Set up skbuild options. Enable sccache in skbuild config options export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUVS_CPP=OFF${EXTRA_CMAKE_ARGS}" -ci/build_wheel.sh cuvs python/cuvs +ci/build_wheel.sh cuvs ${package_dir} +ci/validate_wheel.sh ${package_dir} final_dist diff --git a/ci/run_ctests.sh b/ci/run_ctests.sh new file mode 100755 index 000000000..6bf83961b --- /dev/null +++ b/ci/run_ctests.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# Support customizing the ctests' install location +cd "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/gtests/libcuvs/" + +ctest --output-on-failure --no-tests=error "$@" diff --git a/ci/run_cuvs_pytests.sh b/ci/run_cuvs_pytests.sh new file mode 100755 index 000000000..4de8927b1 --- /dev/null +++ b/ci/run_cuvs_pytests.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# Support invoking run_pytests.sh outside the script directory +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuvs/cuvs + +pytest --cache-clear --verbose "$@" tests diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 6dfc2cf71..134dc4421 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -5,6 +5,8 @@ set -euo pipefail . /opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION="$(rapids-version)" + rapids-logger "Generate C++ testing dependencies" rapids-dependency-file-generator \ --output conda \ @@ -26,7 +28,8 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ - libcuvs libcuvs-tests + "libcuvs=${RAPIDS_VERSION}" \ + "libcuvs-tests=${RAPIDS_VERSION}" rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_python.sh b/ci/test_python.sh index 93bc597cf..b9c394062 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -5,6 +5,8 @@ set -euo pipefail . 
/opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION="$(rapids-version)" + rapids-logger "Generate Python testing dependencies" rapids-dependency-file-generator \ --output conda \ @@ -31,7 +33,8 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - libcuvs cuvs + "libcuvs=${RAPIDS_VERSION}" \ + "cuvs=${RAPIDS_VERSION}" rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh new file mode 100755 index 000000000..f2b235765 --- /dev/null +++ b/ci/validate_wheel.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +package_dir=$1 +wheel_dir_relative_path=$2 + +RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" + +# some packages are much larger on CUDA 11 than on CUDA 12 +if [[ "${RAPIDS_CUDA_MAJOR}" == "11" ]]; then + PYDISTCHECK_ARGS=( + --max-allowed-size-compressed '1.4G' + ) +else + PYDISTCHECK_ARGS=( + --max-allowed-size-compressed '950M' + ) +fi + +cd "${package_dir}" + +rapids-logger "validate packages with 'pydistcheck'" + +pydistcheck \ + --inspect \ + "${PYDISTCHECK_ARGS[@]}" \ + "$(echo ${wheel_dir_relative_path}/*.whl)" + +rapids-logger "validate packages with 'twine'" + +twine check \ + --strict \ + "$(echo ${wheel_dir_relative_path}/*.whl)" diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index 5f05ab165..80bfb0c24 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -15,7 +15,7 @@ dependencies: - cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit - cupy>=12.0.0 @@ -35,7 +35,7 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- librmm==24.10.* +- librmm==24.12.*,>=0.0.0a0 - make - nccl>=2.19 - ninja @@ -45,7 +45,7 @@ dependencies: - openblas - pre-commit - pydata-sphinx-theme -- pylibraft==24.10.* +- pylibraft==24.12.*,>=0.0.0a0 - pytest-cov - pytest==7.* - rapids-build-backend>=0.3.0,<0.4.0.dev0 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index aadc23390..07937726c 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -15,7 +15,7 @@ dependencies: - cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit - cupy>=12.0.0 @@ -35,7 +35,7 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- librmm==24.10.* +- librmm==24.12.*,>=0.0.0a0 - make - nccl>=2.19 - ninja @@ -45,7 +45,7 @@ dependencies: - openblas - pre-commit - pydata-sphinx-theme -- pylibraft==24.10.* +- pylibraft==24.12.*,>=0.0.0a0 - pytest-cov - pytest==7.* - rapids-build-backend>=0.3.0,<0.4.0.dev0 diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index 75d61a8fb..b7fd6fcfa 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -17,7 +17,7 @@ dependencies: - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 - cupy>=12.0.0 - cxx-compiler @@ -32,7 +32,7 @@ dependencies: - libcurand-dev - 
libcusolver-dev - libcusparse-dev -- librmm==24.10.* +- librmm==24.12.*,>=0.0.0a0 - make - nccl>=2.19 - ninja @@ -41,7 +41,7 @@ dependencies: - openblas - pre-commit - pydata-sphinx-theme -- pylibraft==24.10.* +- pylibraft==24.12.*,>=0.0.0a0 - pytest-cov - pytest==7.* - rapids-build-backend>=0.3.0,<0.4.0.dev0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 6824f34aa..83a457465 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -17,7 +17,7 @@ dependencies: - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 - cupy>=12.0.0 - cxx-compiler @@ -32,7 +32,7 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- librmm==24.10.* +- librmm==24.12.*,>=0.0.0a0 - make - nccl>=2.19 - ninja @@ -41,7 +41,7 @@ dependencies: - openblas - pre-commit - pydata-sphinx-theme -- pylibraft==24.10.* +- pylibraft==24.12.*,>=0.0.0a0 - pytest-cov - pytest==7.* - rapids-build-backend>=0.3.0,<0.4.0.dev0 diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index 215212c9a..59d471bda 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -15,16 +15,17 @@ dependencies: - cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit +- cupy>=12.0.0 +- cuvs==24.12.*,>=0.0.0a0 - cxx-compiler - cython>=3.0.0 - dlpack>=0.8,<1.0 - gcc_linux-aarch64=11.* - glog>=0.6.0 - h5py>=3.8.0 -- hnswlib=0.6.2 - libcublas-dev=11.11.3.6 - libcublas=11.11.3.6 - libcurand-dev=10.3.0.86 @@ -33,7 +34,8 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- librmm==24.10.* +- libcuvs==24.12.*,>=0.0.0a0 +- librmm==24.12.*,>=0.0.0a0 - matplotlib - nccl>=2.19 - ninja @@ -41,7 +43,7 @@ dependencies: - nvcc_linux-aarch64=11.8 - openblas - pandas -- pylibraft==24.10.* +- pylibraft==24.12.*,>=0.0.0a0 - pyyaml - rapids-build-backend>=0.3.0,<0.4.0.dev0 - setuptools diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 5f3bc7636..31a416eb5 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -15,16 +15,17 @@ dependencies: - cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit +- cupy>=12.0.0 +- cuvs==24.12.*,>=0.0.0a0 - cxx-compiler - cython>=3.0.0 - dlpack>=0.8,<1.0 - gcc_linux-64=11.* - glog>=0.6.0 - h5py>=3.8.0 -- hnswlib=0.6.2 - libcublas-dev=11.11.3.6 - libcublas=11.11.3.6 - libcurand-dev=10.3.0.86 @@ -33,7 +34,8 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- librmm==24.10.* +- libcuvs==24.12.*,>=0.0.0a0 +- librmm==24.12.*,>=0.0.0a0 - matplotlib - nccl>=2.19 - ninja @@ -41,7 +43,7 @@ dependencies: - nvcc_linux-64=11.8 - openblas - pandas -- pylibraft==24.10.* +- pylibraft==24.12.*,>=0.0.0a0 - pyyaml - rapids-build-backend>=0.3.0,<0.4.0.dev0 - setuptools diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml index 
dbc3e35c9..3efe9ebde 100644 --- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml @@ -17,27 +17,29 @@ dependencies: - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 +- cupy>=12.0.0 +- cuvs==24.12.*,>=0.0.0a0 - cxx-compiler - cython>=3.0.0 - dlpack>=0.8,<1.0 - gcc_linux-aarch64=11.* - glog>=0.6.0 - h5py>=3.8.0 -- hnswlib=0.6.2 - libcublas-dev - libcurand-dev - libcusolver-dev - libcusparse-dev -- librmm==24.10.* +- libcuvs==24.12.*,>=0.0.0a0 +- librmm==24.12.*,>=0.0.0a0 - matplotlib - nccl>=2.19 - ninja - nlohmann_json>=3.11.2 - openblas - pandas -- pylibraft==24.10.* +- pylibraft==24.12.*,>=0.0.0a0 - pyyaml - rapids-build-backend>=0.3.0,<0.4.0.dev0 - setuptools diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml index fb4c42b43..7fbd77368 100644 --- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml @@ -17,27 +17,29 @@ dependencies: - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 +- cupy>=12.0.0 +- cuvs==24.12.*,>=0.0.0a0 - cxx-compiler - cython>=3.0.0 - dlpack>=0.8,<1.0 - gcc_linux-64=11.* - glog>=0.6.0 - h5py>=3.8.0 -- hnswlib=0.6.2 - libcublas-dev - libcurand-dev - libcusolver-dev - libcusparse-dev -- librmm==24.10.* +- libcuvs==24.12.*,>=0.0.0a0 +- librmm==24.12.*,>=0.0.0a0 - matplotlib - nccl>=2.19 - ninja - nlohmann_json>=3.11.2 - openblas - pandas -- pylibraft==24.10.* +- pylibraft==24.12.*,>=0.0.0a0 - pyyaml - rapids-build-backend>=0.3.0,<0.4.0.dev0 - setuptools diff --git a/conda/recipes/cuvs_bench_cpu/build.sh b/conda/recipes/cuvs-bench-cpu/build.sh similarity index 100% rename from conda/recipes/cuvs_bench_cpu/build.sh rename to conda/recipes/cuvs-bench-cpu/build.sh diff --git a/conda/recipes/cuvs_bench_cpu/conda_build_config.yaml b/conda/recipes/cuvs-bench-cpu/conda_build_config.yaml similarity index 100% rename from conda/recipes/cuvs_bench_cpu/conda_build_config.yaml rename to conda/recipes/cuvs-bench-cpu/conda_build_config.yaml diff --git a/conda/recipes/cuvs_bench_cpu/meta.yaml b/conda/recipes/cuvs-bench-cpu/meta.yaml similarity index 96% rename from conda/recipes/cuvs_bench_cpu/meta.yaml rename to conda/recipes/cuvs-bench-cpu/meta.yaml index 0ce5db744..016df56be 100644 --- a/conda/recipes/cuvs_bench_cpu/meta.yaml +++ b/conda/recipes/cuvs-bench-cpu/meta.yaml @@ -8,7 +8,7 @@ {% set date_string = environ['RAPIDS_DATE_STRING'] %} package: - name: cuvs_bench_cpu + name: cuvs-bench-cpu version: {{ version }} script: build.sh @@ -55,9 +55,11 @@ requirements: run: - benchmark + - click - glog {{ glog_version }} - h5py {{ h5py_version }} - matplotlib + - numpy >=1.23,<3.0a0 - pandas - pyyaml - python diff --git a/conda/recipes/cuvs_bench/build.sh b/conda/recipes/cuvs-bench/build.sh similarity index 100% rename from conda/recipes/cuvs_bench/build.sh rename to conda/recipes/cuvs-bench/build.sh diff --git a/conda/recipes/cuvs_bench/conda_build_config.yaml b/conda/recipes/cuvs-bench/conda_build_config.yaml similarity index 100% rename from conda/recipes/cuvs_bench/conda_build_config.yaml rename to conda/recipes/cuvs-bench/conda_build_config.yaml diff --git a/conda/recipes/cuvs_bench/meta.yaml b/conda/recipes/cuvs-bench/meta.yaml similarity index 97% rename from conda/recipes/cuvs_bench/meta.yaml rename to 
conda/recipes/cuvs-bench/meta.yaml index 9ecbf82bb..0681a1038 100644 --- a/conda/recipes/cuvs_bench/meta.yaml +++ b/conda/recipes/cuvs-bench/meta.yaml @@ -10,7 +10,7 @@ {% set date_string = environ['RAPIDS_DATE_STRING'] %} package: - name: cuvs_bench + name: cuvs-bench version: {{ version }} script: build.sh @@ -82,15 +82,17 @@ requirements: run: - benchmark + - click - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} - cudatoolkit {% else %} - cuda-cudart + - cupy>=12.0.0 - libcublas {% endif %} - glog {{ glog_version }} - - libcuvs {{ version }} + - cuvs {{ version }} - h5py {{ h5py_version }} - matplotlib - pandas diff --git a/conda/recipes/cuvs/meta.yaml b/conda/recipes/cuvs/meta.yaml index e7e2daf0c..560c95feb 100644 --- a/conda/recipes/cuvs/meta.yaml +++ b/conda/recipes/cuvs/meta.yaml @@ -26,6 +26,7 @@ build: - {{ compiler('cuda') }} - cuda-cudart-dev {% endif %} + - cuda-python requirements: build: @@ -42,10 +43,10 @@ requirements: - {{ stdlib("c") }} host: {% if cuda_major == "11" %} - - cuda-python >=11.7.1,<12.0a0 + - cuda-python >=11.7.1,<12.0a0,<=11.8.3 - cudatoolkit {% else %} - - cuda-python >=12.0,<13.0a0 + - cuda-python >=12.0,<13.0a0,<=12.6.0 - cuda-cudart-dev {% endif %} - cuda-version ={{ cuda_version }} @@ -60,13 +61,14 @@ requirements: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} - cudatoolkit + - cuda-python >=11.7.1,<12.0a0,<=11.8.3 {% else %} - cuda-cudart + - cuda-python >=12.0,<13.0a0,<=12.6.0 {% endif %} - pylibraft {{ minor_version }} - libcuvs {{ version }} - python x.x - - cuda-python - numpy >=1.23,<3.0a0 tests: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3e98a247e..95fb7e63b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -53,8 +53,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) option(BUILD_SHARED_LIBS "Build cuvs shared libraries" ON) option(BUILD_TESTS "Build cuvs unit-tests" ON) -option(BUILD_C_LIBRARY "Build cuVS C API library" OFF) -option(BUILD_C_TESTS "Build cuVS C API tests" OFF) +option(BUILD_C_LIBRARY "Build cuVS C API library" ON) option(BUILD_CUVS_BENCH "Build cuVS ann benchmarks" OFF) option(BUILD_CAGRA_HNSWLIB "Build CAGRA+hnswlib interface" ON) option(BUILD_MG_ALGOS "Build with multi-GPU support" ON) @@ -72,21 +71,12 @@ option(DISABLE_OPENMP "Disable OpenMP" OFF) option(CUVS_NVTX "Enable nvtx markers" OFF) option(CUVS_RAFT_CLONE_ON_PIN "Explicitly clone RAFT branch when pinned to non-feature branch" ON) -if((BUILD_TESTS OR BUILD_C_LIBRARY) AND NOT BUILD_CPU_ONLY) - -endif() - if(BUILD_CPU_ONLY) set(BUILD_SHARED_LIBS OFF) set(BUILD_TESTS OFF) set(BUILD_C_LIBRARY OFF) -endif() - -if(NOT BUILD_C_LIBRARY) - set(BUILD_C_TESTS OFF) -endif() - -if(NOT BUILD_SHARED_LIBS) + set(BUILD_CAGRA_HNSWLIB OFF) +elseif(NOT BUILD_SHARED_LIBS) set(BUILD_TESTS OFF) set(BUILD_C_LIBRARY OFF) set(BUILD_CAGRA_HNSWLIB OFF) @@ -334,6 +324,9 @@ if(BUILD_SHARED_LIBS) src/cluster/kmeans_transform_float.cu src/cluster/single_linkage_float.cu src/core/bitset.cu + src/distance/detail/kernels/gram_matrix.cu + src/distance/detail/kernels/kernel_factory.cu + src/distance/detail/kernels/kernel_matrices.cu src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu src/distance/detail/pairwise_matrix/dispatch_canberra_half_float_float_int.cu src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu @@ -379,7 +372,10 @@ if(BUILD_SHARED_LIBS) src/distance/detail/fused_distance_nn.cu src/distance/distance.cu 
src/distance/pairwise_distance.cu + src/distance/sparse_distance.cu + src/embed/spectral.cu src/neighbors/brute_force.cu + src/neighbors/brute_force_serialize.cu src/neighbors/cagra_build_float.cu src/neighbors/cagra_build_half.cu src/neighbors/cagra_build_int8.cu @@ -405,6 +401,7 @@ if(BUILD_SHARED_LIBS) src/neighbors/iface/iface_pq_uint8_t_int64_t.cu src/neighbors/detail/cagra/cagra_build.cpp src/neighbors/detail/cagra/topk_for_cagra/topk.cu + src/neighbors/dynamic_batching.cu $<$:src/neighbors/hnsw.cpp> src/neighbors/ivf_flat_index.cpp src/neighbors/ivf_flat/ivf_flat_build_extend_float_int64_t.cu @@ -446,6 +443,7 @@ if(BUILD_SHARED_LIBS) src/neighbors/nn_descent.cu src/neighbors/nn_descent_float.cu src/neighbors/nn_descent_half.cu + src/neighbors/nn_descent_index.cpp src/neighbors/nn_descent_int8.cu src/neighbors/nn_descent_uint8.cu src/neighbors/reachability.cu @@ -458,12 +456,14 @@ if(BUILD_SHARED_LIBS) src/neighbors/refine/detail/refine_host_int8_t_float.cpp src/neighbors/refine/detail/refine_host_uint8_t_float.cpp src/neighbors/sample_filter.cu + src/neighbors/sparse_brute_force.cu src/neighbors/vamana_build_float.cu src/neighbors/vamana_build_uint8.cu src/neighbors/vamana_build_int8.cu src/neighbors/vamana_serialize_float.cu src/neighbors/vamana_serialize_uint8.cu src/neighbors/vamana_serialize_int8.cu + src/preprocessing/quantize/scalar.cu src/selection/select_k_float_int64_t.cu src/selection/select_k_float_int32_t.cu src/selection/select_k_float_uint32_t.cu @@ -583,6 +583,7 @@ if(BUILD_SHARED_LIBS) if(BUILD_CAGRA_HNSWLIB) target_link_libraries(cuvs_objs PRIVATE hnswlib::hnswlib) + target_compile_definitions(cuvs PUBLIC CUVS_BUILD_CAGRA_HNSWLIB) target_compile_definitions(cuvs_objs PUBLIC CUVS_BUILD_CAGRA_HNSWLIB) endif() @@ -613,6 +614,9 @@ SECTIONS # This enables NVTX within the project with no option to disable it downstream. target_link_libraries(cuvs PUBLIC CUDA::nvtx3) target_compile_definitions(cuvs PUBLIC NVTX_ENABLED) + + target_link_libraries(cuvs-cagra-search PUBLIC CUDA::nvtx3) + target_compile_definitions(cuvs-cagra-search PUBLIC NVTX_ENABLED) else() # Allow enable NVTX downstream if not set here. This creates a new option at build/install time, # which is set by default to OFF, but can be enabled in the dependent project. @@ -771,7 +775,8 @@ endif() # ################################################################################################## # * build test executable ---------------------------------------------------- -if(BUILD_TESTS OR BUILD_C_TESTS) +if(BUILD_TESTS) + enable_testing() add_subdirectory(internal) add_subdirectory(test) endif() diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index c36e70ace..c161a68bc 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -90,21 +90,6 @@ if(CUVS_ANN_BENCH_USE_FAISS) include(cmake/thirdparty/get_faiss) endif() -# ################################################################################################## -# * Enable NVTX if available - -# Note: ANN_BENCH wrappers have extra NVTX code not related to raft::nvtx.They track gbench -# benchmark cases and iterations. This is to make limited NVTX available to all algos, not just -# raft/cuVS. 
-if(TARGET CUDA::nvtx3) - set(_CMAKE_REQUIRED_INCLUDES_ORIG ${CMAKE_REQUIRED_INCLUDES}) - get_target_property(CMAKE_REQUIRED_INCLUDES CUDA::nvtx3 INTERFACE_INCLUDE_DIRECTORIES) - unset(NVTX3_HEADERS_FOUND CACHE) - # Check the headers explicitly to make sure the cpu-only build succeeds - CHECK_INCLUDE_FILE_CXX(nvtx3/nvToolsExt.h NVTX3_HEADERS_FOUND) - set(CMAKE_REQUIRED_INCLUDES ${_CMAKE_REQUIRED_INCLUDES_ORIG}) -endif() - # ################################################################################################## # * Target function ------------------------------------------------------------- @@ -130,12 +115,9 @@ function(ConfigureAnnBench) add_dependencies(${BENCH_NAME} ANN_BENCH) else() add_executable(${BENCH_NAME} ${ConfigureAnnBench_PATH}) - target_compile_definitions( - ${BENCH_NAME} PRIVATE ANN_BENCH_BUILD_MAIN - $<$:ANN_BENCH_NVTX3_HEADERS_FOUND> - ) + target_compile_definitions(${BENCH_NAME} PRIVATE ANN_BENCH_BUILD_MAIN>) target_link_libraries( - ${BENCH_NAME} PRIVATE benchmark::benchmark $<$:CUDA::nvtx3> + ${BENCH_NAME} PRIVATE benchmark::benchmark $<$:CUDA::nvtx3> ) endif() @@ -243,9 +225,7 @@ if(CUVS_ANN_BENCH_USE_CUVS_CAGRA) endif() if(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) - ConfigureAnnBench( - NAME CUVS_CAGRA_HNSWLIB PATH src/cuvs/cuvs_cagra_hnswlib.cu LINKS cuvs hnswlib::hnswlib - ) + ConfigureAnnBench(NAME CUVS_CAGRA_HNSWLIB PATH src/cuvs/cuvs_cagra_hnswlib.cu LINKS cuvs) endif() if(CUVS_ANN_BENCH_USE_CUVS_MG) @@ -318,7 +298,7 @@ if(CUVS_ANN_BENCH_SINGLE_EXE) target_link_libraries( ANN_BENCH PRIVATE raft::raft nlohmann_json::nlohmann_json benchmark::benchmark dl fmt::fmt-header-only - spdlog::spdlog_header_only $<$:CUDA::nvtx3> + spdlog::spdlog_header_only $<$:CUDA::nvtx3> ) set_target_properties( ANN_BENCH @@ -336,7 +316,6 @@ if(CUVS_ANN_BENCH_SINGLE_EXE) ANN_BENCH PRIVATE $<$:ANN_BENCH_LINK_CUDART="libcudart.so.${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}.${CUDAToolkit_VERSION_PATCH}"> - $<$:ANN_BENCH_NVTX3_HEADERS_FOUND> ) target_link_options(ANN_BENCH PRIVATE -export-dynamic) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index db3e533e0..06e1e27af 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -119,7 +119,8 @@ template void bench_build(::benchmark::State& state, std::shared_ptr> dataset, configuration::index index, - bool force_overwrite) + bool force_overwrite, + bool no_lap_sync) { // NB: these two thread-local vars can be used within algo wrappers cuvs::bench::benchmark_thread_id = state.thread_index(); @@ -149,9 +150,22 @@ void bench_build(::benchmark::State& state, cuda_timer gpu_timer{algo}; { nvtx_case nvtx{state.name()}; + /* Note: GPU timing + + The GPU time is measured between construction and destruction of `cuda_lap` objects (`gpu_all` + and `gpu_lap` variables) and added to the `gpu_timer` object. + + We sync with the GPU (cudaEventSynchronize) either each iteration (lifetime of the `gpu_lap` + variable) or once per benchmark loop (lifetime of the `gpu_all` variable). The decision is + + controlled by the `no_lap_sync` argument. In either case, we need at least one sync throughout + the benchmark loop to make sure the GPU has finished its work before we measure the total run + time. 
+ */ + [[maybe_unused]] auto gpu_all = gpu_timer.lap(no_lap_sync); for (auto _ : state) { [[maybe_unused]] auto ntx_lap = nvtx.lap(); - [[maybe_unused]] auto gpu_lap = gpu_timer.lap(); + [[maybe_unused]] auto gpu_lap = gpu_timer.lap(!no_lap_sync); try { algo->build(base_set, index_size); } catch (const std::exception& e) { @@ -173,7 +187,8 @@ template void bench_search(::benchmark::State& state, configuration::index index, std::size_t search_param_ix, - std::shared_ptr> dataset) + std::shared_ptr> dataset, + bool no_lap_sync) { // NB: these two thread-local vars can be used within algo wrappers cuvs::bench::benchmark_thread_id = state.thread_index(); @@ -300,25 +315,29 @@ void bench_search(::benchmark::State& state, // Initialize with algo, so that the timer.lap() object can sync with algo::get_sync_stream() cuda_timer gpu_timer{a}; auto start = std::chrono::high_resolution_clock::now(); - for (auto _ : state) { - [[maybe_unused]] auto ntx_lap = nvtx.lap(); - [[maybe_unused]] auto gpu_lap = gpu_timer.lap(); - try { - a->search(query_set + batch_offset * dataset->dim(), - n_queries, - k, - neighbors_ptr + out_offset * k, - distances_ptr + out_offset * k); - } catch (const std::exception& e) { - state.SkipWithError("Benchmark loop: " + std::string(e.what())); - break; - } + { + /* See the note above: GPU timing */ + [[maybe_unused]] auto gpu_all = gpu_timer.lap(no_lap_sync); + for (auto _ : state) { + [[maybe_unused]] auto ntx_lap = nvtx.lap(); + [[maybe_unused]] auto gpu_lap = gpu_timer.lap(!no_lap_sync); + try { + a->search(query_set + batch_offset * dataset->dim(), + n_queries, + k, + neighbors_ptr + out_offset * k, + distances_ptr + out_offset * k); + } catch (const std::exception& e) { + state.SkipWithError("Benchmark loop: " + std::string(e.what())); + break; + } - // advance to the next batch - batch_offset = (batch_offset + queries_stride) % query_set_size; - out_offset = (out_offset + n_queries) % query_set_size; + // advance to the next batch + batch_offset = (batch_offset + queries_stride) % query_set_size; + out_offset = (out_offset + n_queries) % query_set_size; - queries_processed += n_queries; + queries_processed += n_queries; + } } auto end = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast>(end - start).count(); @@ -379,44 +398,51 @@ void bench_search(::benchmark::State& state, inline void printf_usage() { ::benchmark::PrintDefaultHelp(); - fprintf(stdout, - " [--build|--search] \n" - " [--force]\n" - " [--data_prefix=]\n" - " [--index_prefix=]\n" - " [--override_kv=]\n" - " [--mode=\n" - " [--threads=min[:max]]\n" - " .json\n" - "\n" - "Note the non-standard benchmark parameters:\n" - " --build: build mode, will build index\n" - " --search: search mode, will search using the built index\n" - " one and only one of --build and --search should be specified\n" - " --force: force overwriting existing index files\n" - " --data_prefix=:" - " prepend to dataset file paths specified in the .json (default = " - "'data/').\n" - " --index_prefix=:" - " prepend to index file paths specified in the .json (default = " - "'index/').\n" - " --override_kv=:" - " override a build/search key one or more times multiplying the number of configurations;" - " you can use this parameter multiple times to get the Cartesian product of benchmark" - " configs.\n" - " --mode=" - " run the benchmarks in latency (accumulate times spent in each batch) or " - " throughput (pipeline batches and measure end-to-end) mode\n" - " --threads=min[:max] specify the number threads 
to use for throughput benchmark." - " Power of 2 values between 'min' and 'max' will be used. If only 'min' is specified," - " then a single test is run with 'min' threads. By default min=1, max=.\n"); + fprintf( + stdout, + " [--build|--search] \n" + " [--force]\n" + " [--data_prefix=]\n" + " [--index_prefix=]\n" + " [--override_kv=]\n" + " [--mode=\n" + " [--threads=min[:max]]\n" + " [--no-lap-sync]\n" + " .json\n" + "\n" + "Note the non-standard benchmark parameters:\n" + " --build: build mode, will build index\n" + " --search: search mode, will search using the built index\n" + " one and only one of --build and --search should be specified\n" + " --force: force overwriting existing index files\n" + " --data_prefix=:" + " prepend to dataset file paths specified in the .json (default = " + "'data/').\n" + " --index_prefix=:" + " prepend to index file paths specified in the .json (default = " + "'index/').\n" + " --override_kv=:" + " override a build/search key one or more times multiplying the number of configurations;" + " you can use this parameter multiple times to get the Cartesian product of benchmark" + " configs.\n" + " --mode=" + " run the benchmarks in latency (accumulate times spent in each batch) or " + " throughput (pipeline batches and measure end-to-end) mode\n" + " --threads=min[:max] specify the number threads to use for throughput benchmark." + " Power of 2 values between 'min' and 'max' will be used. If only 'min' is specified," + " then a single test is run with 'min' threads. By default min=1, max=.\n" + " --no-lap-sync disable CUDA event synchronization between benchmark iterations. If a GPU" + " algorithm has no sync with CPU, this can make the GPU processing significantly lag behind the" + " CPU scheduling. Then this also hides the scheduling latencies and thus improves the measured" + " throughput (QPS). 
Note there's a sync at the end of the benchmark loop in any case.\n"); } template void register_build(std::shared_ptr> dataset, std::vector indices, - bool force_overwrite) + bool force_overwrite, + bool no_lap_sync) { for (auto index : indices) { auto suf = static_cast(index.build_param["override_suffix"]); @@ -425,7 +451,7 @@ void register_build(std::shared_ptr> dataset, std::replace(file_suf.begin(), file_suf.end(), '/', '-'); index.file += file_suf; auto* b = ::benchmark::RegisterBenchmark( - index.name + suf, bench_build, dataset, index, force_overwrite); + index.name + suf, bench_build, dataset, index, force_overwrite, no_lap_sync); b->Unit(benchmark::kSecond); b->MeasureProcessCPUTime(); b->UseRealTime(); @@ -436,14 +462,16 @@ template void register_search(std::shared_ptr> dataset, std::vector indices, Mode metric_objective, - const std::vector& threads) + const std::vector& threads, + bool no_lap_sync) { for (auto index : indices) { for (std::size_t i = 0; i < index.search_params.size(); i++) { auto suf = static_cast(index.search_params[i]["override_suffix"]); index.search_params[i].erase("override_suffix"); - auto* b = ::benchmark::RegisterBenchmark(index.name + suf, bench_search, index, i, dataset) + auto* b = ::benchmark::RegisterBenchmark( + index.name + suf, bench_search, index, i, dataset, no_lap_sync) ->Unit(benchmark::kMillisecond) /** * The following are important for getting accuracy QPS measurements on both CPU @@ -470,7 +498,8 @@ void dispatch_benchmark(std::string cmdline, std::string index_prefix, kv_series override_kv, Mode metric_objective, - const std::vector& threads) + const std::vector& threads, + bool no_lap_sync) { ::benchmark::AddCustomContext("command_line", cmdline); for (auto [key, value] : host_info()) { @@ -514,7 +543,7 @@ void dispatch_benchmark(std::string cmdline, more_indices.push_back(modified_index); } } - register_build(dataset, more_indices, force_overwrite); + register_build(dataset, more_indices, force_overwrite, no_lap_sync); } else if (search_mode) { if (file_exists(query_file)) { log_info("Using the query file '%s'", query_file.c_str()); @@ -543,7 +572,7 @@ void dispatch_benchmark(std::string cmdline, index.search_params = apply_overrides(index.search_params, override_kv); index.file = combine_path(index_prefix, index.file); } - register_search(dataset, indices, metric_objective, threads); + register_search(dataset, indices, metric_objective, threads, no_lap_sync); } } @@ -571,6 +600,7 @@ inline auto run_main(int argc, char** argv) -> int bool force_overwrite = false; bool build_mode = false; bool search_mode = false; + bool no_lap_sync = false; std::string data_prefix = "data"; std::string index_prefix = "index"; std::string new_override_kv = ""; @@ -604,6 +634,7 @@ inline auto run_main(int argc, char** argv) -> int if (parse_bool_flag(argv[i], "--force", force_overwrite) || parse_bool_flag(argv[i], "--build", build_mode) || parse_bool_flag(argv[i], "--search", search_mode) || + parse_bool_flag(argv[i], "--no-lap-sync", no_lap_sync) || parse_string_flag(argv[i], "--data_prefix", data_prefix) || parse_string_flag(argv[i], "--index_prefix", index_prefix) || parse_string_flag(argv[i], "--mode", mode) || @@ -686,7 +717,8 @@ inline auto run_main(int argc, char** argv) -> int index_prefix, override_kv, metric_objective, - threads); + threads, + no_lap_sync); } else if (dtype == "half") { dispatch_benchmark(cmdline, conf, @@ -697,7 +729,8 @@ inline auto run_main(int argc, char** argv) -> int index_prefix, override_kv, metric_objective, - threads); 
+ threads, + no_lap_sync); } else if (dtype == "uint8") { dispatch_benchmark(cmdline, conf, @@ -708,7 +741,8 @@ inline auto run_main(int argc, char** argv) -> int index_prefix, override_kv, metric_objective, - threads); + threads, + no_lap_sync); } else if (dtype == "int8") { dispatch_benchmark(cmdline, conf, @@ -719,7 +753,8 @@ inline auto run_main(int argc, char** argv) -> int index_prefix, override_kv, metric_objective, - threads); + threads, + no_lap_sync); } else { log_error("datatype '%s' is not supported", dtype.c_str()); return -1; diff --git a/cpp/bench/ann/src/common/util.hpp b/cpp/bench/ann/src/common/util.hpp index c3db2bb4b..dbde74ccc 100644 --- a/cpp/bench/ann/src/common/util.hpp +++ b/cpp/bench/ann/src/common/util.hpp @@ -18,7 +18,8 @@ #include "ann_types.hpp" #include "cuda_stub.hpp" // cuda-related utils -#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND +#if __has_include() +#define ANN_BENCH_NVTX3_HEADERS_FOUND #include #endif diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h index 57d5b1910..7617bfa66 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h @@ -56,6 +56,26 @@ extern template class cuvs::bench::cuvs_cagra; #include "cuvs_mg_cagra_wrapper.h" #endif +template +void parse_dynamic_batching_params(const nlohmann::json& conf, ParamT& param) +{ + if (!conf.value("dynamic_batching", false)) { return; } + param.dynamic_batching = true; + if (conf.contains("dynamic_batching_max_batch_size")) { + param.dynamic_batching_max_batch_size = conf.at("dynamic_batching_max_batch_size"); + } + param.dynamic_batching_conservative_dispatch = + conf.value("dynamic_batching_conservative_dispatch", false); + if (conf.contains("dynamic_batching_dispatch_timeout_ms")) { + param.dynamic_batching_dispatch_timeout_ms = conf.at("dynamic_batching_dispatch_timeout_ms"); + } + if (conf.contains("dynamic_batching_n_queues")) { + param.dynamic_batching_n_queues = conf.at("dynamic_batching_n_queues"); + } + param.dynamic_batching_k = + uint32_t(uint32_t(conf.at("k")) * float(conf.value("refine_ratio", 1.0f))); +} + #if defined(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT) || defined(CUVS_ANN_BENCH_USE_CUVS_MG) template void parse_build_param(const nlohmann::json& conf, @@ -138,6 +158,9 @@ void parse_search_param(const nlohmann::json& conf, param.refine_ratio = conf.at("refine_ratio"); if (param.refine_ratio < 1.0f) { throw std::runtime_error("refine_ratio should be >= 1.0"); } } + + // enable dynamic batching + parse_dynamic_batching_params(conf, param); } #endif @@ -291,5 +314,8 @@ void parse_search_param(const nlohmann::json& conf, } // Same ratio as in IVF-PQ param.refine_ratio = conf.value("refine_ratio", 1.0f); + + // enable dynamic batching + parse_dynamic_batching_params(conf, param); } #endif diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu index 558ba01e0..e45a3bd5a 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu @@ -24,12 +24,35 @@ namespace cuvs::bench { +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::cuvs_cagra_hnswlib::build_param& param) +{ + if (conf.contains("hierarchy")) { + if (conf.at("hierarchy") == "none") { + param.hnsw_index_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::NONE; + } else if (conf.at("hierarchy") == "cpu") { + param.hnsw_index_params.hierarchy = 
cuvs::neighbors::hnsw::HnswHierarchy::CPU; + } else { + THROW("Invalid value for hierarchy: %s", conf.at("hierarchy").get().c_str()); + } + } + if (conf.contains("ef_construction")) { + param.hnsw_index_params.ef_construction = conf.at("ef_construction"); + } + if (conf.contains("num_threads")) { + param.hnsw_index_params.num_threads = conf.at("num_threads"); + } +} + template void parse_search_param(const nlohmann::json& conf, typename cuvs::bench::cuvs_cagra_hnswlib::search_param& param) { - param.ef = conf.at("ef"); - if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } + param.hnsw_search_param.ef = conf.at("ef"); + if (conf.contains("num_threads")) { + param.hnsw_search_param.num_threads = conf.at("num_threads"); + } } template @@ -43,9 +66,10 @@ auto create_algo(const std::string& algo_name, if constexpr (std::is_same_v or std::is_same_v) { if (algo_name == "raft_cagra_hnswlib" || algo_name == "cuvs_cagra_hnswlib") { - typename cuvs::bench::cuvs_cagra_hnswlib::build_param param; - parse_build_param(conf, param); - a = std::make_unique>(metric, dim, param); + typename cuvs::bench::cuvs_cagra_hnswlib::build_param bparam; + ::parse_build_param(conf, bparam.cagra_build_param); + parse_build_param(conf, bparam); + a = std::make_unique>(metric, dim, bparam); } } diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h index 875fe0bba..e4169f6f8 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h @@ -15,8 +15,8 @@ */ #pragma once -#include "../hnswlib/hnswlib_wrapper.h" #include "cuvs_cagra_wrapper.h" +#include #include @@ -26,14 +26,20 @@ template class cuvs_cagra_hnswlib : public algo, public algo_gpu { public: using search_param_base = typename algo::search_param; - using build_param = typename cuvs_cagra::build_param; - using search_param = typename hnsw_lib::search_param; + + struct build_param { + typename cuvs_cagra::build_param cagra_build_param; + cuvs::neighbors::hnsw::index_params hnsw_index_params; + }; + + struct search_param : public search_param_base { + cuvs::neighbors::hnsw::search_params hnsw_search_param; + }; cuvs_cagra_hnswlib(Metric metric, int dim, const build_param& param, int concurrent_searches = 1) : algo(metric, dim), - cagra_build_{metric, dim, param, concurrent_searches}, - // hnsw_lib param values don't matter since we don't build with hnsw_lib - hnswlib_search_{metric, dim, typename hnsw_lib::build_param{50, 100}} + build_param_{param}, + cagra_build_{metric, dim, param.cagra_build_param, concurrent_searches} { } @@ -69,40 +75,67 @@ class cuvs_cagra_hnswlib : public algo, public algo_gpu { } private: + raft::resources handle_{}; + build_param build_param_; + search_param search_param_; cuvs_cagra cagra_build_; - hnsw_lib hnswlib_search_; + std::shared_ptr> hnsw_index_; }; template void cuvs_cagra_hnswlib::build(const T* dataset, size_t nrow) { cagra_build_.build(dataset, nrow); + auto* cagra_index = cagra_build_.get_index(); + auto host_dataset_view = raft::make_host_matrix_view(dataset, nrow, this->dim_); + auto opt_dataset_view = + std::optional>(std::move(host_dataset_view)); + hnsw_index_ = cuvs::neighbors::hnsw::from_cagra( + handle_, build_param_.hnsw_index_params, *cagra_index, opt_dataset_view); } template void cuvs_cagra_hnswlib::set_search_param(const search_param_base& param_) { - hnswlib_search_.set_search_param(param_); + search_param_ = dynamic_cast(param_); } template void 
cuvs_cagra_hnswlib::save(const std::string& file) const { - cagra_build_.save_to_hnswlib(file); + cuvs::neighbors::hnsw::serialize(handle_, file, *(hnsw_index_.get())); } template void cuvs_cagra_hnswlib::load(const std::string& file) { - hnswlib_search_.load(file); - hnswlib_search_.set_base_layer_only(); + cuvs::neighbors::hnsw::index* idx = nullptr; + cuvs::neighbors::hnsw::deserialize(handle_, + build_param_.hnsw_index_params, + file, + this->dim_, + parse_metric_type(this->metric_), + &idx); + hnsw_index_ = std::shared_ptr>(idx); } template void cuvs_cagra_hnswlib::search( const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const { - hnswlib_search_.search(queries, batch_size, k, neighbors, distances); + // Only Latency mode is supported for now + auto queries_view = + raft::make_host_matrix_view(queries, batch_size, this->dim_); + auto neighbors_view = raft::make_host_matrix_view( + reinterpret_cast(neighbors), batch_size, k); + auto distances_view = raft::make_host_matrix_view(distances, batch_size, k); + + cuvs::neighbors::hnsw::search(handle_, + search_param_.hnsw_search_param, + *(hnsw_index_.get()), + queries_view, + neighbors_view, + distances_view); } } // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h index b2ba35eee..8c9cb2d4f 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -63,6 +64,13 @@ class cuvs_cagra : public algo, public algo_gpu { AllocatorType graph_mem = AllocatorType::kDevice; AllocatorType dataset_mem = AllocatorType::kDevice; [[nodiscard]] auto needs_dataset() const -> bool override { return true; } + /* Dynamic batching */ + bool dynamic_batching = false; + int64_t dynamic_batching_k; + int64_t dynamic_batching_max_batch_size = 4; + double dynamic_batching_dispatch_timeout_ms = 0.01; + size_t dynamic_batching_n_queues = 8; + bool dynamic_batching_conservative_dispatch = false; }; struct build_param { @@ -154,6 +162,8 @@ class cuvs_cagra : public algo, public algo_gpu { void save_to_hnswlib(const std::string& file) const; std::unique_ptr> copy() override; + auto get_index() const -> const cuvs::neighbors::cagra::index* { return index_.get(); } + private: // handle_ must go first to make sure it dies last and all memory allocated in pool configured_raft_resources handle_{}; @@ -171,6 +181,12 @@ class cuvs_cagra : public algo, public algo_gpu { std::shared_ptr> dataset_; std::shared_ptr> input_dataset_v_; + std::shared_ptr> dynamic_batcher_; + cuvs::neighbors::dynamic_batching::search_params dynamic_batcher_sp_{}; + int64_t dynamic_batching_max_batch_size_; + size_t dynamic_batching_n_queues_; + bool dynamic_batching_conservative_dispatch_; + inline rmm::device_async_resource_ref get_mr(AllocatorType mem_type) { switch (mem_type) { @@ -214,26 +230,33 @@ inline auto allocator_to_string(AllocatorType mem_type) -> std::string template void cuvs_cagra::set_search_param(const search_param_base& param) { - auto sp = dynamic_cast(param); - search_params_ = sp.p; - refine_ratio_ = sp.refine_ratio; + auto sp = dynamic_cast(param); + bool needs_dynamic_batcher_update = + (dynamic_batching_max_batch_size_ != sp.dynamic_batching_max_batch_size) || + (dynamic_batching_n_queues_ != sp.dynamic_batching_n_queues) || + (dynamic_batching_conservative_dispatch_ != sp.dynamic_batching_conservative_dispatch); + 
dynamic_batching_max_batch_size_ = sp.dynamic_batching_max_batch_size; + dynamic_batching_n_queues_ = sp.dynamic_batching_n_queues; + dynamic_batching_conservative_dispatch_ = sp.dynamic_batching_conservative_dispatch; + search_params_ = sp.p; + refine_ratio_ = sp.refine_ratio; if (sp.graph_mem != graph_mem_) { // Move graph to correct memory space graph_mem_ = sp.graph_mem; RAFT_LOG_DEBUG("moving graph to new memory space: %s", allocator_to_string(graph_mem_).c_str()); // We create a new graph and copy to it from existing graph - auto mr = get_mr(graph_mem_); - auto new_graph = raft::make_device_mdarray( + auto mr = get_mr(graph_mem_); + *graph_ = raft::make_device_mdarray( handle_, mr, raft::make_extents(index_->graph().extent(0), index_->graph_degree())); - raft::copy(new_graph.data_handle(), + raft::copy(graph_->data_handle(), index_->graph().data_handle(), index_->graph().size(), raft::resource::get_cuda_stream(handle_)); - index_->update_graph(handle_, make_const_mdspan(new_graph.view())); - // update_graph() only stores a view in the index. We need to keep the graph object alive. - *graph_ = std::move(new_graph); + // NB: update_graph() only stores a view in the index. We need to keep the graph object alive. + index_->update_graph(handle_, make_const_mdspan(graph_->view())); + needs_dynamic_batcher_update = true; } if (sp.dataset_mem != dataset_mem_ || need_dataset_update_) { @@ -254,7 +277,26 @@ void cuvs_cagra::set_search_param(const search_param_base& param) dataset_->data_handle(), dataset_->extent(0), this->dim_, dataset_->extent(1)); index_->update_dataset(handle_, dataset_view); - need_dataset_update_ = false; + need_dataset_update_ = false; + needs_dynamic_batcher_update = true; + } + + // dynamic batching + if (sp.dynamic_batching) { + if (!dynamic_batcher_ || needs_dynamic_batcher_update) { + dynamic_batcher_ = std::make_shared>( + handle_, + cuvs::neighbors::dynamic_batching::index_params{{}, + sp.dynamic_batching_k, + sp.dynamic_batching_max_batch_size, + sp.dynamic_batching_n_queues, + sp.dynamic_batching_conservative_dispatch}, + *index_, + search_params_); + } + dynamic_batcher_sp_.dispatch_timeout_ms = sp.dynamic_batching_dispatch_timeout_ms; + } else { + if (dynamic_batcher_) { dynamic_batcher_.reset(); } } } @@ -304,7 +346,7 @@ void cuvs_cagra::load(const std::string& file) template std::unique_ptr> cuvs_cagra::copy() { - return std::make_unique>(*this); // use copy constructor + return std::make_unique>(std::cref(*this)); // use copy constructor } template @@ -328,8 +370,17 @@ void cuvs_cagra::search_base(const T* queries, raft::make_device_matrix_view(neighbors_idx_t, batch_size, k); auto distances_view = raft::make_device_matrix_view(distances, batch_size, k); - cuvs::neighbors::cagra::search( - handle_, search_params_, *index_, queries_view, neighbors_view, distances_view); + if (dynamic_batcher_) { + cuvs::neighbors::dynamic_batching::search(handle_, + dynamic_batcher_sp_, + *dynamic_batcher_, + queries_view, + neighbors_view, + distances_view); + } else { + cuvs::neighbors::cagra::search( + handle_, search_params_, *index_, queries_view, neighbors_view, distances_view); + } if constexpr (sizeof(IdxT) != sizeof(algo_base::index_type)) { if (raft::get_device_for_address(neighbors) < 0 && @@ -365,11 +416,23 @@ void cuvs_cagra::search( const raft::resources& res = handle_; auto mem_type = raft::get_device_for_address(neighbors) >= 0 ? MemoryType::kDevice : MemoryType::kHostPinned; - auto& tmp_buf = get_tmp_buffer_from_global_pool( - ((disable_refinement ? 
0 : (sizeof(float) + sizeof(algo_base::index_type))) + - (kNeedsIoMapping ? sizeof(IdxT) : 0)) * - batch_size * k0); - auto* candidates_ptr = reinterpret_cast(tmp_buf.data(mem_type)); + + // If dynamic batching is used and there's no sync between benchmark laps, multiple sequential + // requests can group together. The data is copied asynchronously, and if the same intermediate + // buffer is used for multiple requests, they can override each other's data. Hence, we need to + // allocate as much space as required by the maximum number of sequential requests. + auto max_dyn_grouping = dynamic_batcher_ ? raft::div_rounding_up_safe( + dynamic_batching_max_batch_size_, batch_size) * + dynamic_batching_n_queues_ + : 1; + auto tmp_buf_size = ((disable_refinement ? 0 : (sizeof(float) + sizeof(algo_base::index_type))) + + (kNeedsIoMapping ? sizeof(IdxT) : 0)) * + batch_size * k0; + auto& tmp_buf = get_tmp_buffer_from_global_pool(tmp_buf_size * max_dyn_grouping); + thread_local static int64_t group_id = 0; + auto* candidates_ptr = reinterpret_cast( + reinterpret_cast(tmp_buf.data(mem_type)) + tmp_buf_size * group_id); + group_id = (group_id + 1) % max_dyn_grouping; auto* candidate_dists_ptr = reinterpret_cast(candidates_ptr + (disable_refinement ? 0 : batch_size * k0)); auto* neighbors_idx_t = diff --git a/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h index 4c8a91f23..dac766669 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h +++ b/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h @@ -19,7 +19,9 @@ #include "cuvs_ann_bench_utils.h" #include +#include #include + #include #include #include @@ -46,6 +48,13 @@ class cuvs_ivf_pq : public algo, public algo_gpu { cuvs::neighbors::ivf_pq::search_params pq_param; float refine_ratio = 1.0f; [[nodiscard]] auto needs_dataset() const -> bool override { return refine_ratio > 1.0f; } + /* Dynamic batching */ + bool dynamic_batching = false; + int64_t dynamic_batching_k; + int64_t dynamic_batching_max_batch_size = 128; + double dynamic_batching_dispatch_timeout_ms = 0.01; + size_t dynamic_batching_n_queues = 3; + bool dynamic_batching_conservative_dispatch = true; }; using build_param = cuvs::neighbors::ivf_pq::index_params; @@ -98,6 +107,9 @@ class cuvs_ivf_pq : public algo, public algo_gpu { int dimension_; float refine_ratio_ = 1.0; raft::device_matrix_view dataset_; + + std::shared_ptr> dynamic_batcher_; + cuvs::neighbors::dynamic_batching::search_params dynamic_batcher_sp_{}; }; template @@ -138,6 +150,21 @@ void cuvs_ivf_pq::set_search_param(const search_param_base& param) search_params_ = sp.pq_param; refine_ratio_ = sp.refine_ratio; assert(search_params_.n_probes <= index_params_.n_lists); + + if (sp.dynamic_batching) { + dynamic_batcher_ = std::make_shared>( + handle_, + cuvs::neighbors::dynamic_batching::index_params{{}, + sp.dynamic_batching_k, + sp.dynamic_batching_max_batch_size, + sp.dynamic_batching_n_queues, + sp.dynamic_batching_conservative_dispatch}, + *index_, + search_params_); + dynamic_batcher_sp_.dispatch_timeout_ms = sp.dynamic_batching_dispatch_timeout_ms; + } else { + dynamic_batcher_.reset(); + } } template @@ -168,8 +195,17 @@ void cuvs_ivf_pq::search_base( raft::make_device_matrix_view(neighbors_idx_t, batch_size, k); auto distances_view = raft::make_device_matrix_view(distances, batch_size, k); - cuvs::neighbors::ivf_pq::search( - handle_, search_params_, *index_, queries_view, neighbors_view, distances_view); + if (dynamic_batcher_) { + 
cuvs::neighbors::dynamic_batching::search(handle_, + dynamic_batcher_sp_, + *dynamic_batcher_, + queries_view, + neighbors_view, + distances_view); + } else { + cuvs::neighbors::ivf_pq::search( + handle_, search_params_, *index_, queries_view, neighbors_view, distances_view); + } if constexpr (sizeof(IdxT) != sizeof(algo_base::index_type)) { raft::linalg::unaryOp(neighbors, diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp index 755c7c8d6..6e219d2a7 100644 --- a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp +++ b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp @@ -33,7 +33,7 @@ void parse_build_param(const nlohmann::json& conf, { param.ef_construction = conf.at("efConstruction"); param.m = conf.at("M"); - if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } + if (conf.contains("num_threads")) { param.num_threads = conf.at("num_threads"); } } template @@ -41,7 +41,7 @@ void parse_search_param(const nlohmann::json& conf, typename cuvs::bench::hnsw_lib::search_param& param) { param.ef = conf.at("ef"); - if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } + if (conf.contains("num_threads")) { param.num_threads = conf.at("num_threads"); } } template class Algo> diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake index 74da25660..3e91d9995 100644 --- a/cpp/cmake/modules/ConfigureCUDA.cmake +++ b/cpp/cmake/modules/ConfigureCUDA.cmake @@ -22,8 +22,12 @@ endif() # Be very strict when compiling with GCC as host compiler (and thus more lenient when compiling with # clang) if(CMAKE_COMPILER_IS_GNUCXX) - list(APPEND CUVS_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations) - list(APPEND CUVS_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations) + list(APPEND CUVS_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations + -Wno-reorder + ) + list(APPEND CUVS_CUDA_FLAGS + -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations,-Wno-reorder + ) # set warnings as errors if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2.0) diff --git a/cpp/cmake/patches/cutlass/build-export.patch b/cpp/cmake/patches/cutlass/build-export.patch new file mode 100644 index 000000000..a6423e9c0 --- /dev/null +++ b/cpp/cmake/patches/cutlass/build-export.patch @@ -0,0 +1,27 @@ +From e0a9597946257a01ae8444200f836ee51d5597ba Mon Sep 17 00:00:00 2001 +From: Kyle Edwards +Date: Wed, 20 Nov 2024 16:37:38 -0500 +Subject: [PATCH] Remove erroneous include directories + +These directories are left over from when CuTe was a separate +CMake project. Remove them. 
+--- + CMakeLists.txt | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 7419bdf5e..545384d82 100755 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -665,8 +665,6 @@ target_include_directories( + $ + $ + $ +- $ +- $ + ) + + # Mark CTK headers as system to supress warnings from them +-- +2.34.1 + diff --git a/cpp/cmake/patches/cutlass_override.json b/cpp/cmake/patches/cutlass_override.json new file mode 100644 index 000000000..7bf818987 --- /dev/null +++ b/cpp/cmake/patches/cutlass_override.json @@ -0,0 +1,16 @@ +{ + "packages" : { + "cutlass" : { + "version": "3.5.1", + "git_url": "https://github.com/NVIDIA/cutlass.git", + "git_tag": "v${version}", + "patches" : [ + { + "file" : "${current_json_dir}/cutlass/build-export.patch", + "issue" : "Fix build directory export", + "fixed_in" : "" + } + ] + } + } +} diff --git a/cpp/cmake/patches/hnswlib.diff b/cpp/cmake/patches/hnswlib.diff index e7f89a8cc..f20c27d91 100644 --- a/cpp/cmake/patches/hnswlib.diff +++ b/cpp/cmake/patches/hnswlib.diff @@ -1,188 +1,159 @@ +diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h +index bef0017..0ee7931 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h -@@ -3,6 +3,7 @@ - #include "visited_list_pool.h" - #include "hnswlib.h" - #include -+#include - #include - #include - #include -@@ -16,6 +17,8 @@ namespace hnswlib { - template - class HierarchicalNSW : public AlgorithmInterface { - public: -+ bool base_layer_only{false}; -+ int num_seeds=32; - static const tableint max_update_element_locks = 65536; - HierarchicalNSW(SpaceInterface *s) { - } -@@ -56,7 +59,7 @@ namespace hnswlib { - visited_list_pool_ = new VisitedListPool(1, max_elements); - - //initializations for special treatment of the first node -- enterpoint_node_ = -1; -+ enterpoint_node_ = std::numeric_limits::max(); - maxlevel_ = -1; - - linkLists_ = (char **) malloc(sizeof(void *) * max_elements_); -@@ -527,7 +530,7 @@ namespace hnswlib { - tableint *datal = (tableint *) (data + 1); - for (int i = 0; i < size; i++) { - tableint cand = datal[i]; -- if (cand < 0 || cand > max_elements_) -+ if (cand > max_elements_) - throw std::runtime_error("cand error"); - dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); - -@@ -1067,7 +1070,7 @@ namespace hnswlib { - tableint *datal = (tableint *) (data + 1); - for (int i = 0; i < size; i++) { - tableint cand = datal[i]; -- if (cand < 0 || cand > max_elements_) -+ if (cand > max_elements_) - throw std::runtime_error("cand error"); - dist_t d = fstdistfunc_(data_point, getDataByInternalId(cand), dist_func_param_); - if (d < curdist) { -@@ -1119,28 +1122,41 @@ namespace hnswlib { - tableint currObj = enterpoint_node_; - dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_); - -- for (int level = maxlevel_; level > 0; level--) { -- bool changed = true; -- while (changed) { -- changed = false; -- unsigned int *data; -+ if (base_layer_only) { -+ // You can increase the number of seeds when testing large-scale dataset, num_seeds = 48 for 100M-scale -+ for (int i = 0; i < num_seeds; i++) { -+ tableint obj = i * (max_elements_ / num_seeds); -+ dist_t dist = fstdistfunc_(query_data, getDataByInternalId(obj), dist_func_param_); -+ if (dist < curdist) { -+ curdist = dist; -+ currObj = obj; -+ } +@@ -16,6 +16,9 @@ typedef unsigned int linklistsizeint; + template + class HierarchicalNSW : public AlgorithmInterface { + public: ++ bool base_layer_only = false; ++ int num_seeds = 32; ++ bool base_layer_init = 
true; + static const tableint MAX_LABEL_OPERATION_LOCKS = 65536; + static const unsigned char DELETE_MARK = 0x01; + +@@ -1098,7 +1101,7 @@ class HierarchicalNSW : public AlgorithmInterface { + + std::unique_lock lock_el(link_list_locks_[cur_c]); + int curlevel = getRandomLevel(mult_); +- if (level > 0) ++ if (level > -1) + curlevel = level; + + element_levels_[cur_c] = curlevel; +@@ -1116,6 +1119,9 @@ class HierarchicalNSW : public AlgorithmInterface { + memcpy(getExternalLabeLp(cur_c), &label, sizeof(labeltype)); + memcpy(getDataByInternalId(cur_c), data_point, data_size_); + ++ if (!base_layer_init && curlevel == 0) ++ return cur_c; ++ + if (curlevel) { + linkLists_[cur_c] = (char *) malloc(size_links_per_element_ * curlevel + 1); + if (linkLists_[cur_c] == nullptr) +@@ -1138,7 +1144,7 @@ class HierarchicalNSW : public AlgorithmInterface { + tableint *datal = (tableint *) (data + 1); + for (int i = 0; i < size; i++) { + tableint cand = datal[i]; +- if (cand < 0 || cand > max_elements_) ++ if (static_cast(cand) < 0 || cand > max_elements_) + throw std::runtime_error("cand error"); + dist_t d = fstdistfunc_(data_point, getDataByInternalId(cand), dist_func_param_); + if (d < curdist) { +@@ -1188,28 +1194,41 @@ class HierarchicalNSW : public AlgorithmInterface { + tableint currObj = enterpoint_node_; + dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_); + +- for (int level = maxlevel_; level > 0; level--) { +- bool changed = true; +- while (changed) { +- changed = false; +- unsigned int *data; ++ if (base_layer_only) { ++ // You can increase the number of seeds when testing large-scale dataset, num_seeds = 48 for 100M-scale ++ for (int i = 0; i < num_seeds; i++) { ++ tableint obj = i * (max_elements_ / num_seeds); ++ dist_t dist = fstdistfunc_(query_data, getDataByInternalId(obj), dist_func_param_); ++ if (dist < curdist) { ++ curdist = dist; ++ currObj = obj; + } + } -+ else{ -+ for (int level = maxlevel_; level > 0; level--) { -+ bool changed = true; -+ while (changed) { -+ changed = false; -+ unsigned int *data; - -- data = (unsigned int *) get_linklist(currObj, level); -- int size = getListCount(data); -- metric_hops++; -- metric_distance_computations+=size; -+ data = (unsigned int *) get_linklist(currObj, level); -+ int size = getListCount(data); -+ metric_hops++; -+ metric_distance_computations+=size; - -- tableint *datal = (tableint *) (data + 1); -- for (int i = 0; i < size; i++) { -- tableint cand = datal[i]; -- if (cand < 0 || cand > max_elements_) -- throw std::runtime_error("cand error"); -- dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); -+ tableint *datal = (tableint *) (data + 1); -+ for (int i = 0; i < size; i++) { -+ tableint cand = datal[i]; -+ if (cand > max_elements_) -+ throw std::runtime_error("cand error"); -+ dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); - -- if (d < curdist) { -- curdist = d; -- currObj = cand; -- changed = true; -+ if (d < curdist) { -+ curdist = d; -+ currObj = cand; -+ changed = true; -+ } - } ++ } ++ else { ++ for (int level = maxlevel_; level > 0; level--) { ++ bool changed = true; ++ while (changed) { ++ changed = false; ++ unsigned int *data; + +- data = (unsigned int *) get_linklist(currObj, level); +- int size = getListCount(data); +- metric_hops++; +- metric_distance_computations+=size; ++ data = (unsigned int *) get_linklist(currObj, level); ++ int size = getListCount(data); ++ metric_hops++; ++ 
metric_distance_computations+=size; ++ ++ tableint *datal = (tableint *) (data + 1); ++ for (int i = 0; i < size; i++) { ++ tableint cand = datal[i]; ++ if (static_cast(cand) < 0 || cand > max_elements_) ++ throw std::runtime_error("cand error"); ++ dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); + +- tableint *datal = (tableint *) (data + 1); +- for (int i = 0; i < size; i++) { +- tableint cand = datal[i]; +- if (cand < 0 || cand > max_elements_) +- throw std::runtime_error("cand error"); +- dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); +- +- if (d < curdist) { +- curdist = d; +- currObj = cand; +- changed = true; ++ if (d < curdist) { ++ curdist = d; ++ currObj = cand; ++ changed = true; ++ } } } + } diff --git a/hnswlib/space_l2.h b/hnswlib/space_l2.h -index 4413537..c3240f3 100644 +index 834d19f..0c0af26 100644 --- a/hnswlib/space_l2.h +++ b/hnswlib/space_l2.h -@@ -252,13 +252,14 @@ namespace hnswlib { - ~L2Space() {} - }; - -+ template - static int - L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const void *__restrict qty_ptr) { - - size_t qty = *((size_t *) qty_ptr); - int res = 0; -- unsigned char *a = (unsigned char *) pVect1; -- unsigned char *b = (unsigned char *) pVect2; -+ T *a = (T *) pVect1; -+ T *b = (T *) pVect2; - - qty = qty >> 2; - for (size_t i = 0; i < qty; i++) { -@@ -279,11 +280,12 @@ namespace hnswlib { - return (res); - } - -+ template - static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2, const void* __restrict qty_ptr) { - size_t qty = *((size_t*)qty_ptr); - int res = 0; -- unsigned char* a = (unsigned char*)pVect1; -- unsigned char* b = (unsigned char*)pVect2; -+ T* a = (T*)pVect1; -+ T* b = (T*)pVect2; - - for(size_t i = 0; i < qty; i++) - { -@@ -294,6 +296,7 @@ namespace hnswlib { - return (res); - } - -+ template - class L2SpaceI : public SpaceInterface { - - DISTFUNC fstdistfunc_; -@@ -302,10 +305,10 @@ namespace hnswlib { - public: - L2SpaceI(size_t dim) { - if(dim % 4 == 0) { -- fstdistfunc_ = L2SqrI4x; -+ fstdistfunc_ = L2SqrI4x; - } - else { -- fstdistfunc_ = L2SqrI; -+ fstdistfunc_ = L2SqrI; - } - dim_ = dim; - data_size_ = dim * sizeof(unsigned char); -diff --git a/hnswlib/visited_list_pool.h b/hnswlib/visited_list_pool.h -index 5e1a4a5..4195ebd 100644 ---- a/hnswlib/visited_list_pool.h -+++ b/hnswlib/visited_list_pool.h -@@ -3,6 +3,7 @@ - #include - #include - #include -+#include - - namespace hnswlib { - typedef unsigned short int vl_type; -@@ -14,7 +15,7 @@ namespace hnswlib { - unsigned int numelements; - - VisitedList(int numelements1) { -- curV = -1; -+ curV = std::numeric_limits::max(); - numelements = numelements1; - mass = new vl_type[numelements]; +@@ -252,12 +252,13 @@ class L2Space : public SpaceInterface { + ~L2Space() {} + }; + ++template + static int + L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const void *__restrict qty_ptr) { + size_t qty = *((size_t *) qty_ptr); + int res = 0; +- unsigned char *a = (unsigned char *) pVect1; +- unsigned char *b = (unsigned char *) pVect2; ++ T *a = (T *) pVect1; ++ T *b = (T *) pVect2; + + qty = qty >> 2; + for (size_t i = 0; i < qty; i++) { +@@ -277,11 +278,12 @@ L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const voi + return (res); + } + ++template + static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2, const void* __restrict qty_ptr) { + size_t qty = *((size_t*)qty_ptr); + int res = 0; +- unsigned 
char* a = (unsigned char*)pVect1; +- unsigned char* b = (unsigned char*)pVect2; ++ T* a = (T*)pVect1; ++ T* b = (T*)pVect2; + + for (size_t i = 0; i < qty; i++) { + res += ((*a) - (*b)) * ((*a) - (*b)); +@@ -291,6 +293,7 @@ static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2, + return (res); + } + ++template + class L2SpaceI : public SpaceInterface { + DISTFUNC fstdistfunc_; + size_t data_size_; +@@ -299,9 +302,9 @@ class L2SpaceI : public SpaceInterface { + public: + L2SpaceI(size_t dim) { + if (dim % 4 == 0) { +- fstdistfunc_ = L2SqrI4x; ++ fstdistfunc_ = L2SqrI4x; + } else { +- fstdistfunc_ = L2SqrI; ++ fstdistfunc_ = L2SqrI; } --- -2.43.0 - + dim_ = dim; + data_size_ = dim * sizeof(unsigned char); diff --git a/cpp/cmake/patches/hnswlib_override.json b/cpp/cmake/patches/hnswlib_override.json index aef2da772..c50220e24 100644 --- a/cpp/cmake/patches/hnswlib_override.json +++ b/cpp/cmake/patches/hnswlib_override.json @@ -1,16 +1,16 @@ { - "packages" : { - "hnswlib" : { - "version": "0.6.2", - "git_url": "https://github.com/nmslib/hnswlib.git", - "git_tag": "v${version}", - "patches" : [ - { - "file" : "${current_json_dir}/hnswlib.diff", - "issue" : "Correct compilation issues", - "fixed_in" : "" - } - ] - } + "packages": { + "hnswlib": { + "version": "0.7.0", + "git_url": "https://github.com/nmslib/hnswlib.git", + "git_tag": "v${version}", + "patches": [ + { + "file": "${current_json_dir}/hnswlib.diff", + "issue": "Correct compilation issues", + "fixed_in": "" + } + ] } - } \ No newline at end of file + } +} \ No newline at end of file diff --git a/cpp/cmake/thirdparty/get_cutlass.cmake b/cpp/cmake/thirdparty/get_cutlass.cmake index 61065318b..71bd2d26c 100644 --- a/cpp/cmake/thirdparty/get_cutlass.cmake +++ b/cpp/cmake/thirdparty/get_cutlass.cmake @@ -13,10 +13,11 @@ # ============================================================================= function(find_and_configure_cutlass) - set(oneValueArgs VERSION REPOSITORY PINNED_TAG) + set(options) + set(oneValueArgs) + set(multiValueArgs) cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - # if(RAFT_ENABLE_DIST_DEPENDENCIES OR RAFT_COMPILE_LIBRARIES) set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library" @@ -34,13 +35,22 @@ function(find_and_configure_cutlass) set(CUDART_LIBRARY "${CUDA_cudart_static_LIBRARY}" CACHE FILEPATH "fixing cutlass cmake code" FORCE) endif() + include("${rapids-cmake-dir}/cpm/package_override.cmake") + rapids_cpm_package_override("${CMAKE_CURRENT_FUNCTION_LIST_DIR}/../patches/cutlass_override.json") + + include("${rapids-cmake-dir}/cpm/detail/package_details.cmake") + rapids_cpm_package_details(cutlass version repository tag shallow exclude) + + include("${rapids-cmake-dir}/cpm/detail/generate_patch_command.cmake") + rapids_cpm_generate_patch_command(cutlass ${version} patch_command) + rapids_cpm_find( - NvidiaCutlass ${PKG_VERSION} + NvidiaCutlass ${version} GLOBAL_TARGETS nvidia::cutlass::cutlass CPM_ARGS - GIT_REPOSITORY ${PKG_REPOSITORY} - GIT_TAG ${PKG_PINNED_TAG} - GIT_SHALLOW TRUE + GIT_REPOSITORY ${repository} + GIT_TAG ${tag} + GIT_SHALLOW ${shallow} ${patch_command} OPTIONS "CUDAToolkit_ROOT ${CUDAToolkit_LIBRARY_DIR}" ) @@ -56,7 +66,6 @@ function(find_and_configure_cutlass) NAMESPACE nvidia::cutlass:: ) endif() - # endif() # We generate the cutlass-config files when we built cutlass locally, so always do # `find_dependency` @@ -79,14 +88,4 @@ function(find_and_configure_cutlass) ) endfunction() -if(NOT 
RAFT_CUTLASS_GIT_TAG) - set(RAFT_CUTLASS_GIT_TAG v2.10.0) -endif() - -if(NOT RAFT_CUTLASS_GIT_REPOSITORY) - set(RAFT_CUTLASS_GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git) -endif() - -find_and_configure_cutlass( - VERSION 2.10.0 REPOSITORY ${RAFT_CUTLASS_GIT_REPOSITORY} PINNED_TAG ${RAFT_CUTLASS_GIT_TAG} -) +find_and_configure_cutlass() diff --git a/cpp/cmake/thirdparty/get_hnswlib.cmake b/cpp/cmake/thirdparty/get_hnswlib.cmake index 2e6c895e5..5b4d89aa2 100644 --- a/cpp/cmake/thirdparty/get_hnswlib.cmake +++ b/cpp/cmake/thirdparty/get_hnswlib.cmake @@ -15,6 +15,7 @@ #============================================================================= function(find_and_configure_hnswlib) + message(STATUS "Finding or building hnswlib") set(oneValueArgs) include(${rapids-cmake-dir}/cpm/package_override.cmake) diff --git a/cpp/include/cuvs/cluster/agglomerative.hpp b/cpp/include/cuvs/cluster/agglomerative.hpp index e1da04085..8f7e8675a 100644 --- a/cpp/include/cuvs/cluster/agglomerative.hpp +++ b/cpp/include/cuvs/cluster/agglomerative.hpp @@ -18,6 +18,7 @@ #include #include + #include #include diff --git a/cpp/include/cuvs/core/c_api.h b/cpp/include/cuvs/core/c_api.h index c8c8d3934..400d162ad 100644 --- a/cpp/include/cuvs/core/c_api.h +++ b/cpp/include/cuvs/core/c_api.h @@ -151,6 +151,22 @@ cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_percent, */ cuvsError_t cuvsRMMMemoryResourceReset(); +/** + * @brief Allocates pinned memory on the host using RMM + * @param[out] ptr Pointer to allocated host memory + * @param[in] bytes Size in bytes to allocate + * @return cuvsError_t + */ +cuvsError_t cuvsRMMHostAlloc(void** ptr, size_t bytes); + +/** + * @brief Deallocates pinned memory on the host using RMM + * @param[in] ptr Pointer to allocated host memory to free + * @param[in] bytes Size in bytes to deallocate + * @return cuvsError_t + */ +cuvsError_t cuvsRMMHostFree(void* ptr, size_t bytes); + /** @} */ #ifdef __cplusplus diff --git a/cpp/include/cuvs/distance/distance.hpp b/cpp/include/cuvs/distance/distance.hpp index def72641e..42c574e58 100644 --- a/cpp/include/cuvs/distance/distance.hpp +++ b/cpp/include/cuvs/distance/distance.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -331,6 +332,86 @@ void pairwise_distance( cuvs::distance::DistanceType metric, float metric_arg = 2.0f); +/** + * @brief Compute sparse pairwise distances between x and y, using the provided + * input configuration and distance function. + * + * @code{.cpp} + * #include + * #include + * #include + * + * int x_n_rows = 100000; + * int y_n_rows = 50000; + * int n_cols = 10000; + * + * raft::device_resources handle; + * auto x = raft::make_device_csr_matrix(handle, x_n_rows, n_cols); + * auto y = raft::make_device_csr_matrix(handle, y_n_rows, n_cols); + * + * ... + * // populate data + * ... 
+ * + * auto out = raft::make_device_matrix(handle, x_nrows, y_nrows); + * auto metric = cuvs::distance::DistanceType::L2Expanded; + * raft::sparse::distance::pairwise_distance(handle, x.view(), y.view(), out, metric); + * @endcode + * + * @param[in] handle raft::resources + * @param[in] x raft::device_csr_matrix_view + * @param[in] y raft::device_csr_matrix_view + * @param[out] dist raft::device_matrix_view dense matrix + * @param[in] metric distance metric to use + * @param[in] metric_arg metric argument (used for Minkowski distance) + */ +void pairwise_distance(raft::resources const& handle, + raft::device_csr_matrix_view x, + raft::device_csr_matrix_view y, + raft::device_matrix_view dist, + cuvs::distance::DistanceType metric, + float metric_arg = 2.0f); + +/** + * @brief Compute sparse pairwise distances between x and y, using the provided + * input configuration and distance function. + * + * @code{.cpp} + * #include + * #include + * #include + * + * int x_n_rows = 100000; + * int y_n_rows = 50000; + * int n_cols = 10000; + * + * raft::device_resources handle; + * auto x = raft::make_device_csr_matrix(handle, x_n_rows, n_cols); + * auto y = raft::make_device_csr_matrix(handle, y_n_rows, n_cols); + * + * ... + * // populate data + * ... + * + * auto out = raft::make_device_matrix(handle, x_nrows, y_nrows); + * auto metric = cuvs::distance::DistanceType::L2Expanded; + * raft::sparse::distance::pairwise_distance(handle, x.view(), y.view(), out, metric); + * @endcode + * + * @param[in] handle raft::resources + * @param[in] x raft::device_csr_matrix_view + * @param[in] y raft::device_csr_matrix_view + * @param[out] dist raft::device_matrix_view dense matrix + * @param[in] metric distance metric to use + * @param[in] metric_arg metric argument (used for Minkowski distance) + */ +void pairwise_distance(raft::resources const& handle, + raft::device_csr_matrix_view x, + raft::device_csr_matrix_view y, + raft::device_matrix_view dist, + cuvs::distance::DistanceType metric, + float metric_arg = 2.0f); + /** @} */ // end group pairwise_distance_runtime }; // namespace cuvs::distance diff --git a/cpp/include/cuvs/distance/grammian.hpp b/cpp/include/cuvs/distance/grammian.hpp new file mode 100644 index 000000000..0c904d493 --- /dev/null +++ b/cpp/include/cuvs/distance/grammian.hpp @@ -0,0 +1,665 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace cuvs::distance::kernels { + +template +using dense_input_matrix_view_t = raft::device_matrix_view; +template +using dense_output_matrix_view_t = raft::device_matrix_view; +template +using csr_input_matrix_view_t = raft::device_csr_matrix_view; + +/** + * Base class for general Gram matrices + * A Gram matrix is the Hermitian matrix of inner probucts G_ik = + * Here, the inner product is evaluated for all elements from vectors sets X1, + * and X2. 
+ * + * To be more precise, on exit the output buffer will store: + * - if is_row_major == true: out[j+k*n1] = , + * - if is_row_major == false: out[j*n2 + k] = , + * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector + * from the x2 set. + */ +template +class GramMatrixBase { + protected: + cublasHandle_t cublas_handle; + bool legacy_interface; + + public: + GramMatrixBase() : legacy_interface(false){}; + [[deprecated]] GramMatrixBase(cublasHandle_t cublas_handle) + : cublas_handle(cublas_handle), legacy_interface(true){}; + + virtual ~GramMatrixBase(){}; + + /** Convenience function to evaluate the Gram matrix for two vector sets. + * Vector sets are provided in Matrix format + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ + void operator()(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1 = nullptr, + math_t* norm_x2 = nullptr); + + /** Convenience function to evaluate the Gram matrix for two vector sets. + * Vector sets are provided in Matrix format + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ + void operator()(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1 = nullptr, + math_t* norm_x2 = nullptr); + + /** Convenience function to evaluate the Gram matrix for two vector sets. + * Vector sets are provided in Matrix format + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ + void operator()(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1 = nullptr, + math_t* norm_x2 = nullptr); + + // unfortunately, 'evaluate' cannot be templatized as it needs to be virtual + + /** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + virtual void evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate the Gram matrix for two vector sets using simple dot product. 
+ * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + virtual void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + virtual void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ + [[deprecated]] virtual void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out); + + /** Convenience function to evaluate the Gram matrix for two vector sets. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ + [[deprecated]] void operator()(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1 = 0, + int ld2 = 0, + int ld_out = 0); + + protected: + /** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. 
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ + [[deprecated]] void linear(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out); + + protected: + bool get_is_row_major(dense_output_matrix_view_t matrix); + bool get_is_row_major(dense_input_matrix_view_t matrix); + bool get_is_col_major(dense_output_matrix_view_t matrix); + bool get_is_col_major(dense_input_matrix_view_t matrix); + + /** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ + void linear(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out); + + /** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ + void linear(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out); + + /** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ + void linear(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out); +}; + +template +class KernelFactory { + public: + static GramMatrixBase* create(KernelParams params); + [[deprecated]] static GramMatrixBase* create(KernelParams params, cublasHandle_t handle); +}; + +/** + * Create a kernel matrix using polynomial kernel function. + */ +template +class PolynomialKernel : public GramMatrixBase { + exp_t exponent; + math_t gain; + math_t offset; + + void applyKernel( + math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream); + + public: + /** + * Constructs a polynomial kernel object. 
+ * It evaluates the kernel matrix using the following formula: + * K_ij = (gain* + offset)^exponent + * + * @tparam math_t floating point type + * @tparam exp_t type of exponent + * @param exponent + * @param gain + * @param offset + */ + PolynomialKernel(exp_t exponent, math_t gain, math_t offset) + : GramMatrixBase(), exponent(exponent), gain(gain), offset(offset){}; + + [[deprecated]] PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t handle) + : GramMatrixBase(handle), exponent(exponent), gain(gain), offset(offset){}; + + /** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate the Gram matrix using the legacy interface. 
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ + [[deprecated]] void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out); +}; + +/** + * Create a kernel matrix using tanh kernel function. + */ +template +class TanhKernel : public GramMatrixBase { + math_t gain, offset; + + void applyKernel( + math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream); + + public: + /** + * Constructs a tanh kernel object. + * It evaluates the kernel matrix using the following formula: + * K_ij = tanh(gain* + offset) + * + * @tparam math_t floating point type + * @param gain + * @param offset + */ + TanhKernel(math_t gain, math_t offset) : GramMatrixBase(), gain(gain), offset(offset) {} + + [[deprecated]] TanhKernel(math_t gain, math_t offset, cublasHandle_t handle) + : GramMatrixBase(handle), gain(gain), offset(offset){}; + + /** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. 
+ */ + void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate the Gram matrix using the legacy interface. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ + [[deprecated]] void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out); +}; + +/** + * Create a kernel matrix using RBF kernel function. + */ +template +class RBFKernel : public GramMatrixBase { + math_t gain; + + void applyKernel(math_t* inout, + int ld, + int rows, + int cols, + math_t* norm_x1, + math_t* norm_x2, + bool is_row_major, + cudaStream_t stream); + + public: + /** + * Constructs a RBF kernel object. + * It evaluates the kernel matrix using the following formula: + * K_ij = exp(-gain*|x1_i- x2_k|^2) + * + * @tparam math_t floating point type + * @param gain + */ + RBFKernel(math_t gain) : GramMatrixBase(), gain(gain){}; + + [[deprecated]] RBFKernel(math_t gain, cublasHandle_t handle) + : GramMatrixBase(handle), gain(gain){}; + + void matrixRowNormL2(raft::resources const& handle, + dense_input_matrix_view_t matrix, + math_t* target); + + void matrixRowNormL2(raft::resources const& handle, + csr_input_matrix_view_t matrix, + math_t* target); + + /** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ + void evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. 
+ */ + void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ + void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate the Gram matrix using the legacy interface. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ + [[deprecated]] void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out); +}; +}; // end namespace cuvs::distance::kernels diff --git a/cpp/include/cuvs/embed/spectral.hpp b/cpp/include/cuvs/embed/spectral.hpp new file mode 100644 index 000000000..1a8fed96a --- /dev/null +++ b/cpp/include/cuvs/embed/spectral.hpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace cuvs::embed::spectral { + +/** + * Given a COO formatted (symmetric) knn graph, this function computes the spectral embeddings + * (lowest n_components eigenvectors), using Lanczos min cut algorithm. Please note that this + * algorithm does not compute a full laplacian eigenmap, as the laplacian eigenmap would embed each + * connected component. Laplacian eigenmaps can be built from this algorithm by running it on the + * vectors for each connected component. 
+ + * @param[in] handle + * @param[in] knn_graph KNN Graph + * @param[in] n_components the number of components to project into + * @param[out] out output array for embedding (size n*n_comonents) + * @param[in] seed + */ +void fit(const raft::resources& handle, + raft::device_coo_matrix_view knn_graph, + int n_components, + raft::device_matrix_view out, + unsigned long long seed = 0L); +}; // namespace cuvs::embed::spectral diff --git a/cpp/include/cuvs/neighbors/brute_force.h b/cpp/include/cuvs/neighbors/brute_force.h index c9e172f62..33b92f11b 100644 --- a/cpp/include/cuvs/neighbors/brute_force.h +++ b/cpp/include/cuvs/neighbors/brute_force.h @@ -166,6 +166,66 @@ cuvsError_t cuvsBruteForceSearch(cuvsResources_t res, * @} */ +/** + * @defgroup bruteforce_c_serialize BRUTEFORCE C-API serialize functions + * @{ + */ +/** + * Save the index to file. + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.c} + * #include + * + * // Create cuvsResources_t + * cuvsResources_t res; + * cuvsError_t res_create_status = cuvsResourcesCreate(&res); + * + * // create an index with `cuvsBruteforceBuild` + * cuvsBruteForceSerialize(res, "/path/to/index", index); + * @endcode + * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] filename the file name for saving the index + * @param[in] index BRUTEFORCE index + * + */ +cuvsError_t cuvsBruteForceSerialize(cuvsResources_t res, + const char* filename, + cuvsBruteForceIndex_t index); + +/** + * Load index from file. + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.c} + * #include + * + * // Create cuvsResources_t + * cuvsResources_t res; + * cuvsError_t res_create_status = cuvsResourcesCreate(&res); + * + * // Deserialize an index previously built with `cuvsBruteforceBuild` + * cuvsBruteForceIndex_t index; + * cuvsBruteForceIndexCreate(&index); + * cuvsBruteForceDeserialize(res, "/path/to/index", index); + * @endcode + * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] filename the name of the file that stores the index + * @param[out] index BRUTEFORCE index loaded disk + */ +cuvsError_t cuvsBruteForceDeserialize(cuvsResources_t res, + const char* filename, + cuvsBruteForceIndex_t index); + +/** + * @} + */ #ifdef __cplusplus } #endif diff --git a/cpp/include/cuvs/neighbors/brute_force.hpp b/cpp/include/cuvs/neighbors/brute_force.hpp index 428fa592a..d040e03db 100644 --- a/cpp/include/cuvs/neighbors/brute_force.hpp +++ b/cpp/include/cuvs/neighbors/brute_force.hpp @@ -18,6 +18,7 @@ #include "common.hpp" #include +#include #include #include #include @@ -47,6 +48,14 @@ struct index : cuvs::neighbors::index { index& operator=(index&&) = default; ~index() = default; + /** + * @brief Construct an empty index. + * + * Constructs an empty index. This index will either need to be trained with `build` + * or loaded from a saved copy with `deserialize` + */ + index(raft::resources const& handle); + /** Construct a brute force index from dataset * * Constructs a brute force index from a dataset. This lets us precompute norms for @@ -375,4 +384,342 @@ void search(raft::resources const& handle, * @} */ +/** + * @defgroup sparse_bruteforce_cpp_index Sparse Brute Force index + * @{ + */ +/** + * @brief Sparse Brute Force index. 
+ * + * @tparam T Data element type + * @tparam IdxT Index element type + */ +template +struct sparse_index { + public: + sparse_index(const sparse_index&) = delete; + sparse_index(sparse_index&&) = default; + sparse_index& operator=(const sparse_index&) = delete; + sparse_index& operator=(sparse_index&&) = default; + ~sparse_index() = default; + + /** Construct a sparse brute force sparse_index from dataset */ + sparse_index(raft::resources const& res, + raft::device_csr_matrix_view dataset, + cuvs::distance::DistanceType metric, + T metric_arg); + + /** Distance metric used for retrieval */ + cuvs::distance::DistanceType metric() const noexcept { return metric_; } + + /** Metric argument */ + T metric_arg() const noexcept { return metric_arg_; } + + raft::device_csr_matrix_view dataset() const noexcept + { + return dataset_; + } + + private: + raft::device_csr_matrix_view dataset_; + cuvs::distance::DistanceType metric_; + T metric_arg_; +}; +/** + * @} + */ + +/** + * @defgroup sparse_bruteforce_cpp_index_build Sparse Brute Force index build + * @{ + */ + +/* + * @brief Build the Sparse index from the dataset + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // create and fill the index from a CSR dataset + * auto index = brute_force::build(handle, dataset, metric); + * @endcode + * + * @param[in] handle + * @param[in] dataset A sparse CSR matrix in device memory to search against + * @param[in] metric cuvs::distance::DistanceType + * @param[in] metric_arg metric argument + * + * @return the constructed Sparse brute-force index + */ +auto build(raft::resources const& handle, + raft::device_csr_matrix_view dataset, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded, + float metric_arg = 0) -> cuvs::neighbors::brute_force::sparse_index; +/** + * @} + */ + +/** + * @defgroup sparse_bruteforce_cpp_index_search Sparse Brute Force index search + * @{ + */ +struct sparse_search_params { + int batch_size_index = 2 << 14; + int batch_size_query = 2 << 14; +}; + +/* + * @brief Search the sparse bruteforce index for nearest neighbors + * + * @param[in] handle + * @param[in] index Sparse brute-force constructed index + * @param[in] queries a sparse CSR matrix on the device to query + * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset + * [n_queries, k] + * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k] + */ +void search(raft::resources const& handle, + const sparse_search_params& params, + const sparse_index& index, + raft::device_csr_matrix_view dataset, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); +/** + * @} + */ + +/** + * @defgroup bruteforce_cpp_index_serialize Bruteforce index serialize functions + * @{ + */ +/** + * Save the index to file. + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. 
+ * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string filename("/path/to/index"); + * // create an index with `auto index = brute_force::build(...);` + * cuvs::neighbors::brute_force::serialize(handle, filename, index); + * @endcode + * + * @tparam T data element type + * + * @param[in] handle the raft handle + * @param[in] filename the file name for saving the index + * @param[in] index brute force index + * @param[in] include_dataset whether to include the dataset in the serialized + * output + */ +void serialize(raft::resources const& handle, + const std::string& filename, + const cuvs::neighbors::brute_force::index& index, + bool include_dataset = true); +/** + * Save the index to file. + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string filename("/path/to/index"); + * // create an index with `auto index = brute_force::build(...);` + * cuvs::neighbors::brute_force::serialize(handle, filename, index); + * @endcode + * + * @tparam T data element type + * + * @param[in] handle the raft handle + * @param[in] filename the file name for saving the index + * @param[in] index brute force index + * @param[in] include_dataset whether to include the dataset in the serialized + * output + * + */ +void serialize(raft::resources const& handle, + const std::string& filename, + const cuvs::neighbors::brute_force::index& index, + bool include_dataset = true); + +/** + * Write the index to an output stream + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create an output stream + * std::ostream os(std::cout.rdbuf()); + * // create an index with `auto index = cuvs::neighbors::brute_force::build(...);` + * cuvs::neighbors::brute_force::serialize(handle, os, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] os output stream + * @param[in] index brute force index + * @param[in] include_dataset Whether or not to write out the dataset to the file. + */ +void serialize(raft::resources const& handle, + std::ostream& os, + const cuvs::neighbors::brute_force::index& index, + bool include_dataset = true); + +/** + * Write the index to an output stream + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create an output stream + * std::ostream os(std::cout.rdbuf()); + * // create an index with `auto index = cuvs::neighbors::brute_force::build(...);` + * cuvs::neighbors::brute_force::serialize(handle, os, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] os output stream + * @param[in] index brute force index + * @param[in] include_dataset Whether or not to write out the dataset to the file. + */ +void serialize(raft::resources const& handle, + std::ostream& os, + const cuvs::neighbors::brute_force::index& index, + bool include_dataset = true); + +/** + * Load index from file. 
+ * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string filename("/path/to/index"); + * using T = half; // data element type + * brute_force::index index(handle); + * cuvs::neighbors::brute_force::deserialize(handle, filename, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] filename the name of the file that stores the index + * @param[out] index brute force index + * + */ +void deserialize(raft::resources const& handle, + const std::string& filename, + cuvs::neighbors::brute_force::index* index); +/** + * Load index from file. + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string filename("/path/to/index"); + * using T = float; // data element type + * brute_force::index index(handle); + * cuvs::neighbors::brute_force::deserialize(handle, filename, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] filename the name of the file that stores the index + * @param[out] index brute force index + * + */ +void deserialize(raft::resources const& handle, + const std::string& filename, + cuvs::neighbors::brute_force::index* index); +/** + * Load index from input stream + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create an input stream + * std::istream is(std::cin.rdbuf()); + * using T = half; // data element type + * brute_force::index index(handle); + * cuvs::neighbors::brute_force::deserialize(handle, is, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] is input stream + * @param[out] index brute force index + * + */ +void deserialize(raft::resources const& handle, + std::istream& is, + cuvs::neighbors::brute_force::index* index); +/** + * Load index from input stream + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. 
+ * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create an input stream + * std::istream is(std::cin.rdbuf()); + * using T = float; // data element type + * brute_force::index index(handle); + * cuvs::neighbors::brute_force::deserialize(handle, is, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] is input stream + * @param[out] index brute force index + * + */ +void deserialize(raft::resources const& handle, + std::istream& is, + cuvs::neighbors::brute_force::index* index); +/** + * @} + */ + } // namespace cuvs::neighbors::brute_force diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index e48050756..a4684ce26 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -272,6 +272,10 @@ static_assert(std::is_aggregate_v); */ template struct index : cuvs::neighbors::index { + using index_params_type = cagra::index_params; + using search_params_type = cagra::search_params; + using index_type = IdxT; + using value_type = T; static_assert(!raft::is_narrowing_v, "IdxT must be able to represent all values of uint32_t"); @@ -363,7 +367,7 @@ struct index : cuvs::neighbors::index { * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * In the above example, we have passed a host dataset to build. The returned index will own a * device copy of the dataset and the knn_graph. In contrast, if we pass the dataset as a @@ -530,7 +534,7 @@ struct index : cuvs::neighbors::index { * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -567,7 +571,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -604,7 +608,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -640,7 +644,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -676,7 +680,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = 
raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -713,7 +717,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -750,7 +754,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -787,7 +791,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res diff --git a/cpp/include/cuvs/neighbors/dynamic_batching.hpp b/cpp/include/cuvs/neighbors/dynamic_batching.hpp new file mode 100644 index 000000000..410800357 --- /dev/null +++ b/cpp/include/cuvs/neighbors/dynamic_batching.hpp @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuvs::neighbors::dynamic_batching { + +namespace detail { +template +class batch_runner; +} + +/** + * @defgroup dynamic_batching_cpp_index_params Dynamic Batching index parameters + * @{ + */ +struct index_params : cuvs::neighbors::index_params { + /** The number of neighbors to search is fixed at construction time. */ + int64_t k; + /** Maximum size of the batch to submit to the upstream index. */ + int64_t max_batch_size = 100; + /** + * The number of independent request queues. + * + * Each queue is associated with a unique CUDA stream and IO device buffers. If the number of + * concurrent requests is high, using multiple queues allows to fill-in data and prepare the batch + * while the other queue is busy. Moreover, the queues are submitted concurrently; this allows to + * better utilize the GPU by hiding the kernel launch latencies, which helps to improve the + * throughput. 
+ */ + size_t n_queues = 3; + /** + * By default (`conservative_dispatch = false`) the first CPU thread to commit a query to a batch + * dispatches the upstream search function as soon as possible (before the batch is full). In that + * case, it does not know the final batch size at the time of calling the upstream search and thus + * runs the upstream search with the maximum batch size every time, even if only one valid query + * is present in the batch. This reduces the latency at the cost of wasted GPU resources. + * + * The alternative behavaior (`conservative_dispatch = true`) is more conservative: the dispatcher + * thread starts the kernel that gathers input queries, but waits till the batch is full or the + * waiting time is exceeded. Only then it acquires the actual batch size and launches the upstream + * search. As a result, less GPU resources are wasted at the cost of exposing upstream search + * latency. + * + * *Rule of Thumb*: + * for a large `max_batch_size` set `conservative_dispatch = true`, otherwise keep it disabled. + */ + bool conservative_dispatch = false; +}; +/** @} */ + +/** + * @defgroup dynamic_batching_cpp_search_params Dynamic Batching search parameters + * @{ + */ +struct search_params : cuvs::neighbors::search_params { + /** + * How long a request can stay in the queue (milliseconds). + * Note, this only affects the dispatch time and does not reflect full request latency; + * the latter depends on the upstream search parameters and the batch size. + */ + double dispatch_timeout_ms = 1.0; +}; +/** @} */ + +/** + * @defgroup dynamic_batching_cpp_index Dynamic Batching index type + * @{ + */ + +/** + * @brief Lightweight dynamic batching index wrapper + * + * @tparam T data type + * @tparam IdxT index type + * + * One lightweight dynamic batching index manages a single index and a single search parameter set. + * This structure should be shared among multiple users via copy semantics: access to the + * underlying implementation is managed via a shared pointer, and concurrent search among the + * participants is thread-safe. + * + * __Usage example__ + * @code{.cpp} + * using namespace cuvs::neighbors; + * // When creating a dynamic batching index, k parameter has to be passed explicitly. + * // The first empty braces default-initialize the parent `neighbors::index_params` (unused). + * dynamic_batching::index_params dynb_index_params{{}, k}; + * // Construct the index by wrapping the upstream index and search parameters. + * dynamic_batching::index index{ + * res, dynb_index_params, upstream_index, upstream_search_params + * }; + * // Use default search parameters + * dynamic_batching::search_params search_params; + * // Search K nearest neighbours + * auto neighbors = raft::make_device_matrix(res, n_queries, k); + * auto distances = raft::make_device_matrix(res, n_queries, k); + * dynamic_batching::search( + * res, search_params, index, queries, neighbors.view(), distances.view() + * ); + * @endcode + * + * + * __Priority queues__ + * + * The dynamic batching index has a limited support for prioritizing individual requests. + * There's only one pool of queues in the batcher and no functionality to prioritize one bach over + * the other. The `search_params::dispatch_timeout_ms` parameters passed in each request are + * aggregated internally and the batch is dispatched no later than any of the timeouts is exceeded. + * In this logic, a high-priority request can never be processed earlier than any lower-priority + * requests submitted earlier. 
+ * + * However, dynamic batching indexes are lightweight and do not contain any global or static state. + * This means it's easy to combine multiple batchers. + * As an example, you can construct one batching index per priority class: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // Large batch size (128), couple queues (2), + * // enabled conservative dispatch - all for better throughput + * dynamic_batching::index_params low_priority_params{{}, k, 128, 2, true}; + * // Small batch size (16), more queues (4), + * // disabled conservative dispatch - to minimize latency with reasonable throughput + * dynamic_batching::index_params high_priority_params{{}, k, 16, 4, false}; + * // Construct the indexes by wrapping the upstream index and search parameters. + * dynamic_batching::index low_priority_index{ + * res, low_priority_params, upstream_index, upstream_search_params + * }; + * dynamic_batching::index high_priority_index{ + * res, high_priority_params, upstream_index, upstream_search_params + * }; + * // Define a combined search function with priority selection + * double high_priority_threshold_ms = 0.1; + * auto search_function = + * [low_priority_index, high_priority_index, high_priority_threshold_ms]( + * raft::resources const &res, + * dynamic_batching::search_params search_params, + * raft::device_matrix_view queries, + * raft::device_matrix_view neighbors, + * raft::device_matrix_view distances) { + * dynamic_batching::search( + * res, + * search_params, + * search_params.dispatch_timeout_ms < high_priority_threshold_ms + * ? high_priority_index : low_priority_index, + * queries, + * neighbors, + * distances + * ); + * }; + * @endcode + */ +template +struct index : cuvs::neighbors::index { + std::shared_ptr> runner; + + /** + * @brief Construct a dynamic batching index by wrapping the upstream index. + * + * @tparam Upstream the upstream index type + * + * @param[in] res raft resources + * @param[in] params dynamic batching parameters + * @param[in] upstream_index the original index to perform the search + * (the reference must be alive for the lifetime of the dynamic batching index) + * @param[in] upstream_params the original index search parameters for all queries in a batch + * (the parameters are captured by value for the lifetime of the dynamic batching index) + * @param[in] sample_filter + * filtering function, if any, must be the same for all requests in a batch + * (the pointer must be alive for the lifetime of the dynamic batching index) + */ + template + index(const raft::resources& res, + const cuvs::neighbors::dynamic_batching::index_params& params, + const Upstream& upstream_index, + const typename Upstream::search_params_type& upstream_params, + const cuvs::neighbors::filtering::base_filter* sample_filter = nullptr); +}; +/** @} */ + +/** + * + * @defgroup dynamic_batching_cpp_search Dynamic Batching search + * + * @{ + */ + +/** + * @brief Search ANN using a dynamic batching index. + * + * The search parameters of the upstream index and the optional filtering function are configured at + * the dynamic batching index construction time. + * + * Like with many other indexes, the dynamic batching search has the stream-ordered semantics: the + * host function may return the control before the results are ready. Synchronize with the main CUDA + * stream in the given resource object to wait for arrival of the search results. 
+ * + * Dynamic batching search is thread-safe: call the search function with copies of the same index in + * multiple threads to increase the occupancy of the batches. + * + * @param[in] res + * @param[in] params query-specific batching parameters, such as the maximum waiting time + * @param[in] index a dynamic batching index + * @param[in] queries a device matrix view to a row-major matrix + * [n_queries, dim] + * @param[out] neighbors a device matrix view to the indices of the neighbors in the source dataset + * [n_queries, k] + * @param[out] distances a device matrix view to the distances to the selected neighbors + * [n_queries, k] + * + */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @copydoc search */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @copydoc search */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @copydoc search */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @copydoc search */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @copydoc search */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @copydoc search */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @copydoc search */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @} */ + +} // namespace cuvs::neighbors::dynamic_batching diff --git a/cpp/include/cuvs/neighbors/hnsw.h b/cpp/include/cuvs/neighbors/hnsw.h index 0495c574a..b7eda54b8 100644 --- a/cpp/include/cuvs/neighbors/hnsw.h +++ b/cpp/include/cuvs/neighbors/hnsw.h @@ -16,6 +16,8 @@ #pragma once +#include "cagra.h" + #include #include #include @@ -27,32 +29,51 @@ extern "C" { #endif /** - * @defgroup hnsw_c_search_params C API for hnswlib wrapper search params + * @defgroup hnsw_c_index_params C API for HNSW index params * @{ */ -struct cuvsHnswSearchParams { - int32_t ef; - int32_t numThreads; +/** + * @brief Hierarchy for HNSW index 
when converting from CAGRA index + * + * NOTE: When the value is `NONE`, the HNSW index is built as a base-layer-only index. + */ +enum cuvsHnswHierarchy { + /* Flat hierarchy, search is base-layer only */ + NONE, + /* Full hierarchy is built using the CPU */ + CPU }; -typedef struct cuvsHnswSearchParams* cuvsHnswSearchParams_t; +struct cuvsHnswIndexParams { + /* hierarchy of the hnsw index */ + cuvsHnswHierarchy hierarchy; + /** Size of the candidate list during hierarchy construction when hierarchy is `CPU`*/ + int ef_construction; + /** Number of host threads to use to construct hierarchy when hierarchy is `CPU` + NOTE: Constructing the hierarchy when converting from a CAGRA graph is highly sensitive + to parallelism, and increasing the number of threads can reduce the quality of the index. + */ + int num_threads; +}; + +typedef struct cuvsHnswIndexParams* cuvsHnswIndexParams_t; /** - * @brief Allocate HNSW search params, and populate with default values + * @brief Allocate HNSW Index params, and populate with default values * - * @param[in] params cuvsHnswSearchParams_t to allocate + * @param[in] params cuvsHnswIndexParams_t to allocate * @return cuvsError_t */ -cuvsError_t cuvsHnswSearchParamsCreate(cuvsHnswSearchParams_t* params); +cuvsError_t cuvsHnswIndexParamsCreate(cuvsHnswIndexParams_t* params); /** - * @brief De-allocate HNSW search params + * @brief De-allocate HNSW Index params * - * @param[in] params cuvsHnswSearchParams_t to de-allocate + * @param[in] params * @return cuvsError_t */ -cuvsError_t cuvsHnswSearchParamsDestroy(cuvsHnswSearchParams_t params); +cuvsError_t cuvsHnswIndexParamsDestroy(cuvsHnswIndexParams_t params); /** * @} @@ -90,6 +111,184 @@ cuvsError_t cuvsHnswIndexCreate(cuvsHnswIndex_t* index); */ cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index); +/** + * @} + */ + +/** + * @defgroup hnsw_c_extend_params Parameters for extending HNSW index + * @{ + */ + +struct cuvsHnswExtendParams { + /** Number of CPU threads used to extend additional vectors */ + int num_threads; +}; + +typedef struct cuvsHnswExtendParams* cuvsHnswExtendParams_t; + +/** + * @brief Allocate HNSW extend params, and populate with default values + * + * @param[in] params cuvsHnswExtendParams_t to allocate + * @return cuvsError_t + */ +cuvsError_t cuvsHnswExtendParamsCreate(cuvsHnswExtendParams_t* params); + +/** + * @brief De-allocate HNSW extend params + * + * @param[in] params cuvsHnswExtendParams_t to de-allocate + * @return cuvsError_t + */ + +cuvsError_t cuvsHnswExtendParamsDestroy(cuvsHnswExtendParams_t params); + +/** + * @} + */ + +/** + * @defgroup hnsw_c_index_load Load CAGRA index as hnswlib index + * @{ + */ + +/** + * @brief Convert a CAGRA Index to an HNSW index. + * NOTE: When hierarchy is: + * 1. `NONE`: This method uses the filesystem to write the CAGRA index in + * `/tmp/.bin` before reading it as an hnswlib index, then deleting the temporary + * file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS, as + * the format is not compatible with the original hnswlib. + * 2. `CPU`: The returned index is mutable and can be extended with additional vectors. The + * serialized index is also compatible with the original hnswlib library. 
+ * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] params cuvsHnswIndexParams_t used to load Hnsw index + * @param[in] cagra_index cuvsCagraIndex_t to convert to HNSW index + * @param[out] hnsw_index cuvsHnswIndex_t to return the HNSW index + * + * @return cuvsError_t + * + * @code{.c} + * #include + * #include + * #include + * + * // Create cuvsResources_t + * cuvsResources_t res; + * cuvsError_t res_create_status = cuvsResourcesCreate(&res); + * + * // create a CAGRA index with `cuvsCagraBuild` + * + * // Convert the CAGRA index to an HNSW index + * cuvsHnswIndex_t hnsw_index; + * cuvsHnswIndexCreate(&hnsw_index); + * cuvsHnswIndexParams_t hnsw_params; + * cuvsHnswIndexParamsCreate(&hnsw_params); + * cuvsHnswFromCagra(res, hnsw_params, cagra_index, hnsw_index); + * + * // de-allocate `hnsw_params`, `hnsw_index` and `res` + * cuvsError_t hnsw_params_destroy_status = cuvsHnswIndexParamsDestroy(hnsw_params); + * cuvsError_t hnsw_index_destroy_status = cuvsHnswIndexDestroy(hnsw_index); + * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res); + * @endcode + */ +cuvsError_t cuvsHnswFromCagra(cuvsResources_t res, + cuvsHnswIndexParams_t params, + cuvsCagraIndex_t cagra_index, + cuvsHnswIndex_t hnsw_index); + +/** + * @} + */ + +/** + * @defgroup hnsw_c_index_extend Extend HNSW index with additional vectors + * @{ + */ + +/** + * @brief Add new vectors to an HNSW index + * NOTE: The HNSW index can only be extended when the hierarchy is `CPU` + * when converting from a CAGRA index. + + * @param[in] res cuvsResources_t opaque C handle + * @param[in] params cuvsHnswExtendParams_t used to extend Hnsw index + * @param[in] additional_dataset DLManagedTensor* additional dataset to extend the index + * @param[inout] index cuvsHnswIndex_t to extend + * + * @return cuvsError_t + * + * @code{.c} + * #include + * #include + * #include + * + * // Create cuvsResources_t + * cuvsResources_t res; + * cuvsError_t res_create_status = cuvsResourcesCreate(&res); + * + * // create an index with `cuvsCagraBuild` + * + * // Convert the CAGRA index to an HNSW index + * cuvsHnswIndex_t hnsw_index; + * cuvsHnswIndexCreate(&hnsw_index); + * cuvsHnswIndexParams_t hnsw_params; + * cuvsHnswIndexParamsCreate(&hnsw_params); + * cuvsHnswFromCagra(res, hnsw_params, cagra_index, hnsw_index); + * + * // Extend the HNSW index with additional vectors + * DLManagedTensor additional_dataset; + * cuvsHnswExtendParams_t extend_params; + * cuvsHnswExtendParamsCreate(&extend_params); + * cuvsHnswExtend(res, extend_params, additional_dataset, hnsw_index); + * + * // de-allocate `hnsw_params`, `hnsw_index`, `extend_params` and `res` + * cuvsError_t hnsw_params_destroy_status = cuvsHnswIndexParamsDestroy(hnsw_params); + * cuvsError_t hnsw_index_destroy_status = cuvsHnswIndexDestroy(hnsw_index); + * cuvsError_t extend_params_destroy_status = cuvsHnswExtendParamsDestroy(extend_params); + * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res); + * @endcode + */ + +cuvsError_t cuvsHnswExtend(cuvsResources_t res, + cuvsHnswExtendParams_t params, + DLManagedTensor* additional_dataset, + cuvsHnswIndex_t index); + +/** + * @} + */ + +/** + * @defgroup hnsw_c_search_params C API for hnswlib wrapper search params + * @{ + */ + +struct cuvsHnswSearchParams { + int32_t ef; + int32_t num_threads; +}; + +typedef struct cuvsHnswSearchParams* cuvsHnswSearchParams_t; + +/** + * @brief Allocate HNSW search params, and populate with default values + * + * @param[in] params cuvsHnswSearchParams_t to allocate + * @return 
cuvsError_t + */ +cuvsError_t cuvsHnswSearchParamsCreate(cuvsHnswSearchParams_t* params); + +/** + * @brief De-allocate HNSW search params + * + * @param[in] params cuvsHnswSearchParams_t to de-allocate + * @return cuvsError_t + */ +cuvsError_t cuvsHnswSearchParamsDestroy(cuvsHnswSearchParams_t params); + /** * @} */ @@ -111,8 +310,8 @@ cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index); * c. `kDLDataType.code == kDLUInt` and `kDLDataType.bits = 8` * 2. `neighbors`: `kDLDataType.code == kDLUInt` and `kDLDataType.bits = 64` * 3. `distances`: `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 32` - * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS, - * as the format is not compatible with the original hnswlib. + * NOTE: When hierarchy is `NONE`, the HNSW index can only be searched by the hnswlib wrapper in + * cuVS, as the format is not compatible with the original hnswlib. * * @code {.c} * #include @@ -131,7 +330,7 @@ cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index); * cuvsHnswSearchParams_t params; * cuvsError_t params_create_status = cuvsHnswSearchParamsCreate(¶ms); * - * // Search the `index` built using `cuvsHnswBuild` + * // Search the `index` built using `cuvsHnswFromCagra` * cuvsError_t search_status = cuvsHnswSearch(res, params, index, &queries, &neighbors, * &distances); * @@ -142,7 +341,7 @@ cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index); * * @param[in] res cuvsResources_t opaque C handle * @param[in] params cuvsHnswSearchParams_t used to search Hnsw index - * @param[in] index cuvsHnswIndex which has been returned by `cuvsHnswBuild` + * @param[in] index cuvsHnswIndex which has been returned by `cuvsHnswFromCagra` * @param[in] queries DLManagedTensor* queries dataset to search * @param[out] neighbors DLManagedTensor* output `k` neighbors for queries * @param[out] distances DLManagedTensor* output `k` distances for queries @@ -163,9 +362,50 @@ cuvsError_t cuvsHnswSearch(cuvsResources_t res, * @{ */ +/** + * @brief Serialize a CAGRA index to a file as an hnswlib index + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the + * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. + * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib + * library. 
+ * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] filename the name of the file to save the index + * @param[in] index cuvsHnswIndex_t to serialize + * @return cuvsError_t + * + * @code{.c} + * #include + * #include + * #include + * + * // Create cuvsResources_t + * cuvsResources_t res; + * cuvsError_t res_create_status = cuvsResourcesCreate(&res); + * + * // create an index with `cuvsCagraBuild` + * + * // Convert the CAGRA index to an HNSW index + * cuvsHnswIndex_t hnsw_index; + * cuvsHnswIndexCreate(&hnsw_index); + * cuvsHnswIndexParams_t hnsw_params; + * cuvsHnswIndexParamsCreate(&hnsw_params); + * cuvsHnswFromCagra(res, hnsw_params, cagra_index, hnsw_index); + * + * // Serialize the HNSW index + * cuvsHnswSerialize(res, "/path/to/index", hnsw_index); + * + * // de-allocate `hnsw_params`, `hnsw_index` and `res` + * cuvsError_t hnsw_params_destroy_status = cuvsHnswIndexParamsDestroy(hnsw_params); + * cuvsError_t hnsw_index_destroy_status = cuvsHnswIndexDestroy(hnsw_index); + * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res); + * @endcode + */ +cuvsError_t cuvsHnswSerialize(cuvsResources_t res, const char* filename, cuvsHnswIndex_t index); + /** * Load hnswlib index from file which was serialized from a HNSW index. - * NOTE: The loaded hnswlib index is immutable, and only be read by the + * NOTE: When hierarchy is `NONE`, the loaded hnswlib index is immutable, and only be read by the * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. * Experimental, both the API and the serialization format are subject to change. * @@ -185,17 +425,22 @@ cuvsError_t cuvsHnswSearch(cuvsResources_t res, * // The index should have the same dtype as the one used to build CAGRA the index * cuvsHnswIndex_t hnsw_index; * cuvsHnswIndexCreate(&hnsw_index); + * cuvsHnsWIndexParams_t hnsw_params; + * cuvsHnswIndexParamsCreate(&hnsw_params); + * hnsw_params->hierarchy = NONE; * hnsw_index->dtype = index->dtype; - * cuvsCagraDeserialize(res, "/path/to/index", hnsw_index); + * cuvsHnswDeserialize(res, hnsw_params, "/path/to/index", dim, metric hnsw_index); * @endcode * * @param[in] res cuvsResources_t opaque C handle + * @param[in] params cuvsHnswIndexParams_t used to load Hnsw index * @param[in] filename the name of the file that stores the index * @param[in] dim the dimension of the vectors in the index * @param[in] metric the distance metric used to build the index * @param[out] index HNSW index loaded disk */ cuvsError_t cuvsHnswDeserialize(cuvsResources_t res, + cuvsHnswIndexParams_t params, const char* filename, int dim, cuvsDistanceType metric, diff --git a/cpp/include/cuvs/neighbors/hnsw.hpp b/cpp/include/cuvs/neighbors/hnsw.hpp index d5abd6d55..f0b433d8e 100644 --- a/cpp/include/cuvs/neighbors/hnsw.hpp +++ b/cpp/include/cuvs/neighbors/hnsw.hpp @@ -34,14 +34,30 @@ namespace cuvs::neighbors::hnsw { /** - * @defgroup hnsw_cpp_search_params Build CAGRA index and search with hnswlib + * @defgroup hnsw_cpp_index_params hnswlib index wrapper params * @{ */ -struct search_params : cuvs::neighbors::search_params { - int ef; // size of the candidate list - int num_threads = 0; // number of host threads to use for concurrent searches. Value of 0 - // automatically maximizes parallelism +/** + * @brief Hierarchy for HNSW index when converting from CAGRA index + * + * NOTE: When the value is `NONE`, the HNSW index is built as a base-layer-only index. 
+ */ +enum class HnswHierarchy { + NONE, // base-layer-only index + CPU // full index with CPU-built hierarchy +}; + +struct index_params : cuvs::neighbors::index_params { + /** Hierarchy build type for HNSW index when converting from CAGRA index */ + HnswHierarchy hierarchy = HnswHierarchy::NONE; + /** Size of the candidate list during hierarchy construction when hierarchy is `CPU`*/ + int ef_construction = 200; + /** Number of host threads to use to construct hierarchy when hierarchy is `CPU` + NOTE: Constructing the hierarchy when converting from a CAGRA graph is highly sensitive + to parallelism, and increasing the number of threads can reduce the quality of the index. + */ + int num_threads = 2; }; /**@}*/ @@ -62,8 +78,12 @@ struct index : cuvs::neighbors::index { * * @param[in] dim dimensions of the training dataset * @param[in] metric distance metric to search. Supported metrics ("L2Expanded", "InnerProduct") + * @param[in] hierarchy hierarchy used for upper HNSW layers */ - index(int dim, cuvs::distance::DistanceType metric) : dim_{dim}, metric_{metric} {} + index(int dim, cuvs::distance::DistanceType metric, HnswHierarchy hierarchy = HnswHierarchy::NONE) + : dim_{dim}, metric_{metric}, hierarchy_{hierarchy} + { + } virtual ~index() {} @@ -76,6 +96,8 @@ struct index : cuvs::neighbors::index { auto metric() const -> cuvs::distance::DistanceType { return metric_; } + auto hierarchy() const -> HnswHierarchy { return hierarchy_; } + /** @brief Set ef for search */ @@ -84,24 +106,41 @@ struct index : cuvs::neighbors::index { private: int dim_; cuvs::distance::DistanceType metric_; + HnswHierarchy hierarchy_; }; /**@}*/ +/** + * @defgroup hnsw_cpp_extend_params HNSW index extend parameters + * @{ + */ + +struct extend_params { + /** Number of host threads to use to add additional vectors to the index. + Value of 0 automatically maximizes parallelism. */ + int num_threads = 0; +}; + /** * @defgroup hnsw_cpp_index_load Load CAGRA index as hnswlib index * @{ */ /** - * @brief Construct an immutable hnswlib base-layer-only index from a CAGRA index - * NOTE: This method uses the filesystem to write the CAGRA index in `/tmp/.bin` - * before reading it as an hnswlib index, then deleting the temporary file. The returned index - * is immutable and can only be searched by the hnswlib wrapper in cuVS, as the format is not - * compatible with the original hnswlib. + * @brief Construct an hnswlib index from a CAGRA index + * NOTE: When `hnsw::index_params.hierarchy` is: + * 1. `NONE`: This method uses the filesystem to write the CAGRA index in + * `/tmp/.bin` before reading it as an hnswlib index, then deleting the temporary + * file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS, as + * the format is not compatible with the original hnswlib. + * 2. `CPU`: The returned index is mutable and can be extended with additional vectors. The + * serialized index is also compatible with the original hnswlib library. 
* * @param[in] res raft resources + * @param[in] params hnsw index parameters * @param[in] cagra_index cagra index + * @param[in] dataset optional dataset to avoid extra memory copy when hierarchy is `CPU` * * Usage example: * @code{.cpp} @@ -110,24 +149,34 @@ struct index : cuvs::neighbors::index { * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // Load CAGRA index as base-layer-only hnswlib index - * auto hnsw_index = hnsw::from_cagra(res, index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); * @endcode */ std::unique_ptr> from_cagra( - raft::resources const& res, const cuvs::neighbors::cagra::index& cagra_index); + raft::resources const& res, + const index_params& params, + const cuvs::neighbors::cagra::index& cagra_index, + std::optional> dataset = + std::nullopt); /** - * @brief Construct an immutable hnswlib base-layer-only index from a CAGRA index - * NOTE: This method uses the filesystem to write the CAGRA index in `/tmp/.bin` - * before reading it as an hnswlib index, then deleting the temporary file. The returned index - * is immutable and can only be searched by the hnswlib wrapper in cuVS, as the format is not - * compatible with the original hnswlib. + * @brief Construct an hnswlib index from a CAGRA index + * NOTE: When `hnsw::index_params.hierarchy` is: + * 1. `NONE`: This method uses the filesystem to write the CAGRA index in + * `/tmp/.bin` before reading it as an hnswlib index, then deleting the temporary + * file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS, as + * the format is not compatible with the original hnswlib. + * 2. `CPU`: The returned index is mutable and can be extended with additional vectors. The + * serialized index is also compatible with the original hnswlib library. * * @param[in] res raft resources + * @param[in] params hnsw index parameters * @param[in] cagra_index cagra index + * @param[in] dataset optional dataset to avoid extra memory copy when hierarchy is `CPU` * * Usage example: * @code{.cpp} @@ -136,24 +185,34 @@ std::unique_ptr> from_cagra( * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // Load CAGRA index as base-layer-only hnswlib index - * auto hnsw_index = hnsw::from_cagra(res, index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); * @endcode */ std::unique_ptr> from_cagra( - raft::resources const& res, const cuvs::neighbors::cagra::index& cagra_index); + raft::resources const& res, + const index_params& params, + const cuvs::neighbors::cagra::index& cagra_index, + std::optional> dataset = + std::nullopt); /** - * @brief Construct an immutable hnswlib base-layer-only index from a CAGRA index - * NOTE: This method uses the filesystem to write the CAGRA index in `/tmp/.bin` - * before reading it as an hnswlib index, then deleting the temporary file. The returned index - * is immutable and can only be searched by the hnswlib wrapper in cuVS, as the format is not - * compatible with the original hnswlib. 
+ * @brief Construct an hnswlib index from a CAGRA index + * NOTE: When `hnsw::index_params.hierarchy` is: + * 1. `NONE`: This method uses the filesystem to write the CAGRA index in + * `/tmp/.bin` before reading it as an hnswlib index, then deleting the temporary + * file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS, as + * the format is not compatible with the original hnswlib. + * 2. `CPU`: The returned index is mutable and can be extended with additional vectors. The + * serialized index is also compatible with the original hnswlib library. * * @param[in] res raft resources + * @param[in] params hnsw index parameters * @param[in] cagra_index cagra index + * @param[in] dataset optional dataset to avoid extra memory copy when hierarchy is `CPU` * * Usage example: * @code{.cpp} @@ -162,14 +221,138 @@ std::unique_ptr> from_cagra( * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // Load CAGRA index as base-layer-only hnswlib index - * auto hnsw_index = hnsw::from_cagra(res, index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); * @endcode */ std::unique_ptr> from_cagra( - raft::resources const& res, const cuvs::neighbors::cagra::index& cagra_index); + raft::resources const& res, + const index_params& params, + const cuvs::neighbors::cagra::index& cagra_index, + std::optional> dataset = + std::nullopt); + +/**@}*/ + +/** + * @defgroup hnsw_cpp_index_extend Extend HNSW index with additional vectors + * @{ + */ + +/** + * @brief Add new vectors to an HNSW index + * NOTE: The HNSW index can only be extended when the `hnsw::index_params.hierarchy` is `CPU` + * when converting from a CAGRA index. + * + * @param[in] res raft resources + * @param[in] params configure the extend + * @param[in] additional_dataset a host matrix view to a row-major matrix [n_rows, index->dim()] + * @param[inout] idx HNSW index to extend + * + * Usage example: + * @code{.cpp} + * // Build a CAGRA index + * using namespace cuvs::neighbors; + * cagra::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = cagra::build(res, index_params, dataset); + * + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * hnsw_params.hierarchy = hnsw::HnswHierarchy::CPU; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * + * // Extend the HNSW index with additional vectors + * auto additional_dataset = raft::make_host_matrix(res, add_size, index->dim()); + * hnsw::extend_params extend_params; + * hnsw::extend(res, extend_params, additional_dataset, *hnsw_index.get()); + */ +void extend(raft::resources const& res, + const extend_params& params, + raft::host_matrix_view additional_dataset, + index& idx); + +/** + * @brief Add new vectors to an HNSW index + * NOTE: The HNSW index can only be extended when the `hnsw::index_params.hierarchy` is `CPU` + * when converting from a CAGRA index. 
+ * + * @param[in] res raft resources + * @param[in] params configure the extend + * @param[in] additional_dataset a host matrix view to a row-major matrix [n_rows, index->dim()] + * @param[inout] idx HNSW index to extend + * + * Usage example: + * @code{.cpp} + * // Build a CAGRA index + * using namespace cuvs::neighbors; + * cagra::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = cagra::build(res, index_params, dataset); + * + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * hnsw_params.hierarchy = hnsw::HnswHierarchy::CPU; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * + * // Extend the HNSW index with additional vectors + * auto additional_dataset = raft::make_host_matrix(res, add_size, index->dim()); + * hnsw::extend_params extend_params; + * hnsw::extend(res, extend_params, additional_dataset, *hnsw_index.get()); + */ +void extend(raft::resources const& res, + const extend_params& params, + raft::host_matrix_view additional_dataset, + index& idx); + +/** + * @brief Add new vectors to an HNSW index + * NOTE: The HNSW index can only be extended when the `hnsw::index_params.hierarchy` is `CPU` + * when converting from a CAGRA index. + * + * @param[in] res raft resources + * @param[in] params configure the extend + * @param[in] additional_dataset a host matrix view to a row-major matrix [n_rows, index->dim()] + * @param[inout] idx HNSW index to extend + * + * Usage example: + * @code{.cpp} + * // Build a CAGRA index + * using namespace cuvs::neighbors; + * cagra::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = cagra::build(res, index_params, dataset); + * + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * hnsw_params.hierarchy = hnsw::HnswHierarchy::CPU; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * + * // Extend the HNSW index with additional vectors + * auto additional_dataset = raft::make_host_matrix(res, add_size, index->dim()); + * hnsw::extend_params extend_params; + * hnsw::extend(res, extend_params, additional_dataset, *hnsw_index.get()); + */ +void extend(raft::resources const& res, + const extend_params& params, + raft::host_matrix_view additional_dataset, + index& idx); + +/**@} */ + +/** + * @defgroup hnsw_cpp_search_params Build CAGRA index and search with hnswlib + * @{ + */ + +struct search_params : cuvs::neighbors::search_params { + int ef; // size of the candidate list + int num_threads = 0; // number of host threads to use for concurrent searches. Value of 0 + // automatically maximizes parallelism +}; /**@}*/ @@ -181,9 +364,9 @@ std::unique_ptr> from_cagra( */ /** - * @brief Search hnswlib base-layer-only index constructed from a CAGRA index - * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS, - * as the format is not compatible with the original hnswlib. + * @brief Search HNSW index constructed from a CAGRA index + * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS when the hierarchy is + * `NONE`, as the format is not compatible with the original hnswlib. 
* * @param[in] res raft resources * @param[in] params configure the search @@ -201,10 +384,11 @@ std::unique_ptr> from_cagra( * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // Load CAGRA index as a base-layer HNSW index using the filesystem - * auto hnsw_index = hnsw::from_cagra(res, index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); * * // Search K nearest neighbors as an hnswlib index * // using host threads for concurrency @@ -224,9 +408,9 @@ void search(raft::resources const& res, raft::host_matrix_view distances); /** - * @brief Search hnswlib base-layer-only index constructed from a CAGRA index - * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS, - * as the format is not compatible with the original hnswlib. + * @brief Search HNSWindex constructed from a CAGRA index + * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS when the hierarchy is + * `NONE`, as the format is not compatible with the original hnswlib. * * @param[in] res raft resources * @param[in] params configure the search @@ -244,10 +428,11 @@ void search(raft::resources const& res, * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // Load CAGRA index as a base-layer HNSW index using the filesystem - * auto hnsw_index = hnsw::from_cagra(res, index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); * * // Search K nearest neighbors as an hnswlib index * // using host threads for concurrency @@ -267,9 +452,9 @@ void search(raft::resources const& res, raft::host_matrix_view distances); /** - * @brief Search hnswlib base-layer-only index constructed from a CAGRA index - * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS, - * as the format is not compatible with the original hnswlib. + * @brief Search HNSW index constructed from a CAGRA index + * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS when the hierarchy is + * `NONE`, as the format is not compatible with the original hnswlib. 
* * @param[in] res raft resources * @param[in] params configure the search @@ -287,10 +472,11 @@ void search(raft::resources const& res, * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // Load CAGRA index as a base-layer HNSW index using the filesystem - * auto hnsw_index = hnsw::from_cagra(res, index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); * * // Search K nearest neighbors as an hnswlib index * // using host threads for concurrency @@ -312,16 +498,106 @@ void search(raft::resources const& res, /**@}*/ /** - * @defgroup hnsw_cpp_index_deserialize Deserialize CAGRA index as hnswlib index + * @defgroup hnsw_cpp_index_serialize Deserialize CAGRA index as hnswlib index * @{ */ +/** + * @brief Serialize a CAGRA index to a file as an hnswlib index + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the + * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. + * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib + * library. + * + * @param[in] res raft resources + * @param[in] filename path to the file to save the serialized CAGRA index + * @param[in] idx cagra index + * + * Usage example: + * @code{.cpp} + * // Build a CAGRA index + * using namespace cuvs::neighbors; + * // use default index parameters + * cagra::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = cagra::build(res, index_params, dataset); + * + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * // Save the index + * hnsw::serialize(res, "index.bin", index); + * @endcode + */ +void serialize(raft::resources const& res, const std::string& filename, const index& idx); + +/** + * @brief Serialize a CAGRA index to a file as an hnswlib index + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the + * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. + * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib + * library. 
+ * + * @param[in] res raft resources + * @param[in] filename path to the file to save the serialized CAGRA index + * @param[in] idx cagra index + * + * Usage example: + * @code{.cpp} + * // Build a CAGRA index + * using namespace cuvs::neighbors; + * // use default index parameters + * cagra::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = cagra::build(res, index_params, dataset); + * + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * // Save the index + * hnsw::serialize(res, "index.bin", index); + * @endcode + */ +void serialize(raft::resources const& res, const std::string& filename, const index& idx); + +/** + * @brief Serialize a CAGRA index to a file as an hnswlib index + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the + * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. + * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib + * library. + * + * @param[in] res raft resources + * @param[in] filename path to the file to save the serialized CAGRA index + * @param[in] idx cagra index + * + * Usage example: + * @code{.cpp} + * // Build a CAGRA index + * using namespace cuvs::neighbors; + * // use default index parameters + * cagra::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = cagra::build(res, index_params, dataset); + * + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * // Save the index + * hnsw::serialize(res, "index.bin", index); + * @endcode + */ +void serialize(raft::resources const& res, const std::string& filename, const index& idx); + /** * @brief De-serialize a CAGRA index saved to a file as an hnswlib index - * NOTE: The loaded hnswlib index is immutable, and only be read by the + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. + * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib + * library. * * @param[in] res raft resources + * @param[in] params hnsw index parameters * @param[in] filename path to the file containing the serialized CAGRA index * @param[in] dim dimensions of the training dataset * @param[in] metric distance metric to search. 
Supported metrics ("L2Expanded", "InnerProduct") @@ -334,19 +610,23 @@ void search(raft::resources const& res, * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // save a CAGRA index to a file - * cagra::serialize(res, index, "index.bin"); - * // De-serialize a CAGRA index as a base-layer HNSW index using the filesystem - * index* hnsw_index = nullptr; - * hnsw::deserialize(res, "index.bin", index->dim(), index->metric(), &hnsw_index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * // save HNSW index to a file + * hnsw::serialize(res, "index.bin", hnsw_index); + * // De-serialize the HNSW index + * index* hnsw_index = nullptr; + * hnsw::deserialize(res, hnsw_params, "index.bin", index->dim(), index->metric(), &hnsw_index); * * // Delete index after use * delete hnsw_index; * @endcode */ void deserialize(raft::resources const& res, + const index_params& params, const std::string& filename, int dim, cuvs::distance::DistanceType metric, @@ -354,10 +634,13 @@ void deserialize(raft::resources const& res, /** * @brief De-serialize a CAGRA index saved to a file as an hnswlib index - * NOTE: The loaded hnswlib index is immutable, and only be read by the + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. + * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib + * library. * * @param[in] res raft resources + * @param[in] params hnsw index parameters * @param[in] filename path to the file containing the serialized CAGRA index * @param[in] dim dimensions of the training dataset * @param[in] metric distance metric to search. 
Supported metrics ("L2Expanded", "InnerProduct") @@ -370,19 +653,23 @@ void deserialize(raft::resources const& res, * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // save a CAGRA index to a file - * cagra::serialize(res, index, "index.bin"); - * // De-serialize a CAGRA index as a base-layer HNSW index using the filesystem - * index* hnsw_index = nullptr; - * hnsw::deserialize(res, "index.bin", index->dim(), index->metric(), &hnsw_index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * // save HNSW index to a file + * hnsw::serialize(res, "index.bin", hnsw_index); + * // De-serialize the HNSW index + * index* hnsw_index = nullptr; + * hnsw::deserialize(res, hnsw_params, "index.bin", index->dim(), index->metric(), &hnsw_index); * * // Delete index after use * delete hnsw_index; * @endcode */ void deserialize(raft::resources const& res, + const index_params& params, const std::string& filename, int dim, cuvs::distance::DistanceType metric, @@ -390,10 +677,13 @@ void deserialize(raft::resources const& res, /** * @brief De-serialize a CAGRA index saved to a file as an hnswlib index - * NOTE: The loaded hnswlib index is immutable, and only be read by the + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. + * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib + * library. * * @param[in] res raft resources + * @param[in] params hnsw index parameters * @param[in] filename path to the file containing the serialized CAGRA index * @param[in] dim dimensions of the training dataset * @param[in] metric distance metric to search. 
Supported metrics ("L2Expanded", "InnerProduct") @@ -406,19 +696,23 @@ void deserialize(raft::resources const& res, * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // save a CAGRA index to a file - * cagra::serialize(res, index, "index.bin"); - * // De-serialize a CAGRA index as a base-layer HNSW index using the filesystem - * index* hnsw_index = nullptr; - * hnsw::deserialize(res, "index.bin", index->dim(), index->metric(), &hnsw_index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * // save HNSW index to a file + * hnsw::serialize(res, "index.bin", hnsw_index); + * // De-serialize the HNSW index + * index* hnsw_index = nullptr; + * hnsw::deserialize(res, hnsw_params, "index.bin", index->dim(), index->metric(), &hnsw_index); * * // Delete index after use * delete hnsw_index; * @endcode */ void deserialize(raft::resources const& res, + const index_params& params, const std::string& filename, int dim, cuvs::distance::DistanceType metric, diff --git a/cpp/include/cuvs/neighbors/ivf_flat.hpp b/cpp/include/cuvs/neighbors/ivf_flat.hpp index 7f852d635..e017946d9 100644 --- a/cpp/include/cuvs/neighbors/ivf_flat.hpp +++ b/cpp/include/cuvs/neighbors/ivf_flat.hpp @@ -138,6 +138,10 @@ using list_data = ivf::list; */ template struct index : cuvs::neighbors::index { + using index_params_type = ivf_flat::index_params; + using search_params_type = ivf_flat::search_params; + using index_type = IdxT; + using value_type = T; static_assert(!raft::is_narrowing_v, "IdxT must be able to represent all values of uint32_t"); diff --git a/cpp/include/cuvs/neighbors/ivf_pq.hpp b/cpp/include/cuvs/neighbors/ivf_pq.hpp index ae543c9e9..d85753b7f 100644 --- a/cpp/include/cuvs/neighbors/ivf_pq.hpp +++ b/cpp/include/cuvs/neighbors/ivf_pq.hpp @@ -319,6 +319,9 @@ using list_data = ivf::list; */ template struct index : cuvs::neighbors::index { + using index_params_type = ivf_pq::index_params; + using search_params_type = ivf_pq::search_params; + using index_type = IdxT; static_assert(!raft::is_narrowing_v, "IdxT must be able to represent all values of uint32_t"); diff --git a/cpp/include/cuvs/neighbors/nn_descent.hpp b/cpp/include/cuvs/neighbors/nn_descent.hpp index 347ccf889..9cd8192b5 100644 --- a/cpp/include/cuvs/neighbors/nn_descent.hpp +++ b/cpp/include/cuvs/neighbors/nn_descent.hpp @@ -55,15 +55,16 @@ struct index_params : cuvs::neighbors::index_params { size_t intermediate_graph_degree = 128; // Degree of input graph for pruning. size_t max_iterations = 20; // Number of nn-descent iterations. float termination_threshold = 0.0001; // Termination threshold of nn-descent. 
+ bool return_distances = true; // return distances if true + size_t n_clusters = 1; // defaults to not using any batching /** @brief Construct NN descent parameters for a specific kNN graph degree * * @param graph_degree output graph degree + * @param metric distance metric to use */ - index_params(size_t graph_degree = 64) - : graph_degree(graph_degree), intermediate_graph_degree(1.5 * graph_degree) - { - } + index_params(size_t graph_degree = 64, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded); }; /** @@ -100,14 +101,25 @@ struct index : cuvs::neighbors::index { * @param res raft::resources is an object mangaging resources * @param n_rows number of rows in knn-graph * @param n_cols number of cols in knn-graph + * @param return_distances whether to return distances + * @param metric distance metric to use */ - index(raft::resources const& res, int64_t n_rows, int64_t n_cols) + index(raft::resources const& res, + int64_t n_rows, + int64_t n_cols, + bool return_distances = false, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded) : cuvs::neighbors::index(), res_{res}, - metric_{cuvs::distance::DistanceType::L2Expanded}, + metric_{metric}, graph_{raft::make_host_matrix(n_rows, n_cols)}, - graph_view_{graph_.view()} + graph_view_{graph_.view()}, + return_distances_{return_distances} { + if (return_distances) { + distances_ = raft::make_device_matrix(res_, n_rows, n_cols); + distances_view_ = distances_.value().view(); + } } /** @@ -119,14 +131,22 @@ struct index : cuvs::neighbors::index { * * @param res raft::resources is an object mangaging resources * @param graph_view raft::host_matrix_view for storing knn-graph + * @param distances_view optional raft::device_matrix_view for storing + * distances + * @param metric distance metric to use */ index(raft::resources const& res, - raft::host_matrix_view graph_view) + raft::host_matrix_view graph_view, + std::optional> distances_view = + std::nullopt, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded) : cuvs::neighbors::index(), res_{res}, - metric_{cuvs::distance::DistanceType::L2Expanded}, + metric_{metric}, graph_{raft::make_host_matrix(0, 0)}, - graph_view_{graph_view} + graph_view_{graph_view}, + distances_view_{distances_view}, + return_distances_{distances_view.has_value()} { } @@ -155,6 +175,13 @@ struct index : cuvs::neighbors::index { return graph_view_; } + /** neighborhood graph distances [size, graph-degree] */ + [[nodiscard]] inline auto distances() noexcept + -> std::optional> + { + return distances_view_; + } + // Don't allow copying the index for performance reasons (try avoiding copying data) index(const index&) = delete; index(index&&) = default; @@ -166,8 +193,11 @@ struct index : cuvs::neighbors::index { raft::resources const& res_; cuvs::distance::DistanceType metric_; raft::host_matrix graph_; // graph to return for non-int IdxT + std::optional> distances_; raft::host_matrix_view graph_view_; // view of graph for user provided matrix + std::optional> distances_view_; + bool return_distances_; }; /** @} */ @@ -200,12 +230,15 @@ struct index : cuvs::neighbors::index { * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located * in device memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - 
raft::device_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::device_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in host memory @@ -232,12 +265,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located * in host memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::host_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::host_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in device memory @@ -262,12 +298,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located * in device memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::device_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::device_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in host memory @@ -294,12 +333,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located * in host memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::host_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::host_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in device memory @@ -324,12 +366,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located * in device memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::device_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::device_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in host memory @@ -356,12 +401,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located * in host memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::host_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::host_matrix_view dataset, + std::optional> graph = + std::nullopt) -> 
cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in device memory @@ -386,14 +434,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located * in device memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::device_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; - -/** @} */ + raft::device_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in host memory @@ -420,12 +469,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located * in host memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::host_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::host_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Test if we have enough GPU memory to run NN descent algorithm. diff --git a/cpp/include/cuvs/preprocessing/quantize/scalar.hpp b/cpp/include/cuvs/preprocessing/quantize/scalar.hpp new file mode 100644 index 000000000..49b4bb7a6 --- /dev/null +++ b/cpp/include/cuvs/preprocessing/quantize/scalar.hpp @@ -0,0 +1,489 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace cuvs::preprocessing::quantize::scalar { + +/** + * @defgroup scalar Scalar quantizer utilities + * @{ + */ + +/** + * @brief quantizer parameters. + */ +struct params { + /* + * specifies how many outliers at top & bottom will be ignored + * needs to be within range of (0, 1] + */ + float quantile = 0.99; +}; + +/** + * @brief Defines and stores scalar for quantisation upon training + * + * The quantization is performed by a linear mapping of an interval in the + * float data type to the full range of the quantized int type. + * + * @tparam T data element type + * + */ +template +struct quantizer { + T min_; + T max_; +}; + +/** + * @brief Initializes a scalar quantizer to be used later for quantizing the dataset. + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); + * @endcode + * + * @param[in] res raft resource + * @param[in] params configure scalar quantizer, e.g. 
quantile + * @param[in] dataset a row-major matrix view on device + * + * @return quantizer + */ +quantizer train(raft::resources const& res, + const params params, + raft::device_matrix_view dataset); + +/** + * @brief Initializes a scalar quantizer to be used later for quantizing the dataset. + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); + * @endcode + * + * @param[in] res raft resource + * @param[in] params configure scalar quantizer, e.g. quantile + * @param[in] dataset a row-major matrix view on host + * + * @return quantizer + */ +quantizer train(raft::resources const& res, + const params params, + raft::host_matrix_view dataset); + +/** + * @brief Applies quantization transform to given dataset + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); auto quantized_dataset = raft::make_device_matrix(handle, samples, + * features); cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on device + * @param[out] out a row-major matrix view on device + * + */ +void transform(raft::resources const& res, + const quantizer& quantizer, + raft::device_matrix_view dataset, + raft::device_matrix_view out); + +/** + * @brief Applies quantization transform to given dataset + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); auto quantized_dataset = raft::make_host_matrix(samples, features); + * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on host + * @param[out] out a row-major matrix view on host + * + */ +void transform(raft::resources const& res, + const quantizer& quantizer, + raft::host_matrix_view dataset, + raft::host_matrix_view out); + +/** + * @brief Perform inverse quantization step on previously quantized dataset + * + * Note that depending on the chosen data types train dataset the conversion is + * not lossless. 
+ * + * Usage example: + * @code{.cpp} + * auto quantized_dataset = raft::make_device_matrix(handle, samples, features); + * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); auto dataset_revert = raft::make_device_matrix(handle, samples, features); + * cuvs::preprocessing::quantize::scalar::inverse_transform(handle, quantizer, + * dataset_revert.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on device + * @param[out] out a row-major matrix view on device + * + */ +void inverse_transform(raft::resources const& res, + const quantizer& quantizer, + raft::device_matrix_view dataset, + raft::device_matrix_view out); + +/** + * @brief Perform inverse quantization step on previously quantized dataset + * + * Note that depending on the chosen data types train dataset the conversion is + * not lossless. + * + * Usage example: + * @code{.cpp} + * auto quantized_dataset = raft::make_host_matrix(samples, features); + * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); auto dataset_revert = raft::make_host_matrix(samples, + * features); cuvs::preprocessing::quantize::scalar::inverse_transform(handle, quantizer, + * dataset_revert.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on host + * @param[out] out a row-major matrix view on host + * + */ +void inverse_transform(raft::resources const& res, + const quantizer& quantizer, + raft::host_matrix_view dataset, + raft::host_matrix_view out); + +/** + * @brief Initializes a scalar quantizer to be used later for quantizing the dataset. + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); + * @endcode + * + * @param[in] res raft resource + * @param[in] params configure scalar quantizer, e.g. quantile + * @param[in] dataset a row-major matrix view on device + * + * @return quantizer + */ +quantizer train(raft::resources const& res, + const params params, + raft::device_matrix_view dataset); + +/** + * @brief Initializes a scalar quantizer to be used later for quantizing the dataset. + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); + * @endcode + * + * @param[in] res raft resource + * @param[in] params configure scalar quantizer, e.g. 
quantile + * @param[in] dataset a row-major matrix view on host + * + * @return quantizer + */ +quantizer train(raft::resources const& res, + const params params, + raft::host_matrix_view dataset); + +/** + * @brief Applies quantization transform to given dataset + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); auto quantized_dataset = raft::make_device_matrix(handle, samples, + * features); cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on device + * @param[out] out a row-major matrix view on device + * + */ +void transform(raft::resources const& res, + const quantizer& quantizer, + raft::device_matrix_view dataset, + raft::device_matrix_view out); + +/** + * @brief Applies quantization transform to given dataset + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); auto quantized_dataset = raft::make_host_matrix(samples, features); + * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on host + * @param[out] out a row-major matrix view on host + * + */ +void transform(raft::resources const& res, + const quantizer& quantizer, + raft::host_matrix_view dataset, + raft::host_matrix_view out); + +/** + * @brief Perform inverse quantization step on previously quantized dataset + * + * Note that depending on the chosen data types train dataset the conversion is + * not lossless. + * + * Usage example: + * @code{.cpp} + * auto quantized_dataset = raft::make_device_matrix(handle, samples, features); + * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); auto dataset_revert = raft::make_device_matrix(handle, + * samples, features); cuvs::preprocessing::quantize::scalar::inverse_transform(handle, quantizer, + * dataset_revert.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on device + * @param[out] out a row-major matrix view on device + * + */ +void inverse_transform(raft::resources const& res, + const quantizer& quantizer, + raft::device_matrix_view dataset, + raft::device_matrix_view out); + +/** + * @brief Perform inverse quantization step on previously quantized dataset + * + * Note that depending on the chosen data types train dataset the conversion is + * not lossless. 
+ * + * Usage example: + * @code{.cpp} + * auto quantized_dataset = raft::make_host_matrix(samples, features); + * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); auto dataset_revert = raft::make_host_matrix(samples, + * features); cuvs::preprocessing::quantize::scalar::inverse_transform(handle, quantizer, + * dataset_revert.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on host + * @param[out] out a row-major matrix view on host + * + */ +void inverse_transform(raft::resources const& res, + const quantizer& quantizer, + raft::host_matrix_view dataset, + raft::host_matrix_view out); + +/** + * @brief Initializes a scalar quantizer to be used later for quantizing the dataset. + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); + * @endcode + * + * @param[in] res raft resource + * @param[in] params configure scalar quantizer, e.g. quantile + * @param[in] dataset a row-major matrix view on device + * + * @return quantizer + */ +quantizer train(raft::resources const& res, + const params params, + raft::device_matrix_view dataset); + +/** + * @brief Initializes a scalar quantizer to be used later for quantizing the dataset. + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); + * @endcode + * + * @param[in] res raft resource + * @param[in] params configure scalar quantizer, e.g. 
quantile + * @param[in] dataset a row-major matrix view on host + * + * @return quantizer + */ +quantizer train(raft::resources const& res, + const params params, + raft::host_matrix_view dataset); + +/** + * @brief Applies quantization transform to given dataset + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); auto quantized_dataset = raft::make_device_matrix(handle, samples, + * features); cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on device + * @param[out] out a row-major matrix view on device + * + */ +void transform(raft::resources const& res, + const quantizer& quantizer, + raft::device_matrix_view dataset, + raft::device_matrix_view out); + +/** + * @brief Applies quantization transform to given dataset + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); auto quantized_dataset = raft::make_host_matrix(samples, features); + * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on host + * @param[out] out a row-major matrix view on host + * + */ +void transform(raft::resources const& res, + const quantizer& quantizer, + raft::host_matrix_view dataset, + raft::host_matrix_view out); + +/** + * @brief Perform inverse quantization step on previously quantized dataset + * + * Note that depending on the chosen data types train dataset the conversion is + * not lossless. + * + * Usage example: + * @code{.cpp} + * auto quantized_dataset = raft::make_device_matrix(handle, samples, features); + * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); auto dataset_revert = raft::make_device_matrix(handle, + * samples, features); cuvs::preprocessing::quantize::scalar::inverse_transform(handle, quantizer, + * dataset_revert.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on device + * @param[out] out a row-major matrix view on device + * + */ +void inverse_transform(raft::resources const& res, + const quantizer& quantizer, + raft::device_matrix_view dataset, + raft::device_matrix_view out); + +/** + * @brief Perform inverse quantization step on previously quantized dataset + * + * Note that depending on the chosen data types train dataset the conversion is + * not lossless. 
+ * + * Usage example: + * @code{.cpp} + * auto quantized_dataset = raft::make_host_matrix(samples, features); + * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); auto dataset_revert = raft::make_host_matrix(samples, + * features); cuvs::preprocessing::quantize::scalar::inverse_transform(handle, quantizer, + * dataset_revert.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on host + * @param[out] out a row-major matrix view on host + * + */ +void inverse_transform(raft::resources const& res, + const quantizer& quantizer, + raft::host_matrix_view dataset, + raft::host_matrix_view out); + +/** @} */ // end of group scalar + +} // namespace cuvs::preprocessing::quantize::scalar diff --git a/cpp/src/cluster/detail/kmeans.cuh b/cpp/src/cluster/detail/kmeans.cuh index 9b673bca3..3d054f0fd 100644 --- a/cpp/src/cluster/detail/kmeans.cuh +++ b/cpp/src/cluster/detail/kmeans.cuh @@ -15,12 +15,12 @@ */ #pragma once +#include "../../core/nvtx.hpp" #include "kmeans_common.cuh" #include #include -#include #include #include #include @@ -71,7 +71,7 @@ void initRandom(raft::resources const& handle, raft::device_matrix_view X, raft::device_matrix_view centroids) { - raft::common::nvtx::range fun_scope("initRandom"); + raft::common::nvtx::range fun_scope("initRandom"); auto n_clusters = params.n_clusters; cuvs::cluster::kmeans::detail::shuffleAndGather( handle, X, centroids, n_clusters, params.rng_state.seed); @@ -98,7 +98,7 @@ void kmeansPlusPlus(raft::resources const& handle, raft::device_matrix_view centroidsRawData, rmm::device_uvector& workspace) { - raft::common::nvtx::range fun_scope("kmeansPlusPlus"); + raft::common::nvtx::range fun_scope("kmeansPlusPlus"); cudaStream_t stream = raft::resource::get_cuda_stream(handle); auto n_samples = X.extent(0); auto n_features = X.extent(1); @@ -372,7 +372,7 @@ void kmeans_fit_main(raft::resources const& handle, raft::host_scalar_view n_iter, rmm::device_uvector& workspace) { - raft::common::nvtx::range fun_scope("kmeans_fit_main"); + raft::common::nvtx::range fun_scope("kmeans_fit_main"); raft::logger::get(RAFT_NAME).set_level(params.verbosity); cudaStream_t stream = raft::resource::get_cuda_stream(handle); auto n_samples = X.extent(0); @@ -590,7 +590,7 @@ void initScalableKMeansPlusPlus(raft::resources const& handle, raft::device_matrix_view centroidsRawData, rmm::device_uvector& workspace) { - raft::common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "initScalableKMeansPlusPlus"); cudaStream_t stream = raft::resource::get_cuda_stream(handle); auto n_samples = X.extent(0); @@ -841,7 +841,7 @@ void kmeans_fit(raft::resources const& handle, raft::host_scalar_view inertia, raft::host_scalar_view n_iter) { - raft::common::nvtx::range fun_scope("kmeans_fit"); + raft::common::nvtx::range fun_scope("kmeans_fit"); auto n_samples = X.extent(0); auto n_features = X.extent(1); auto n_clusters = pams.n_clusters; @@ -1009,7 +1009,7 @@ void kmeans_predict(raft::resources const& handle, bool normalize_weight, raft::host_scalar_view inertia) { - raft::common::nvtx::range fun_scope("kmeans_predict"); + raft::common::nvtx::range fun_scope("kmeans_predict"); auto n_samples = X.extent(0); auto n_features = X.extent(1); cudaStream_t stream = raft::resource::get_cuda_stream(handle); @@ -1153,7 +1153,7 @@ void kmeans_fit_predict(raft::resources const& handle, raft::host_scalar_view inertia, 
raft::host_scalar_view n_iter) { - raft::common::nvtx::range fun_scope("kmeans_fit_predict"); + raft::common::nvtx::range fun_scope("kmeans_fit_predict"); if (!centroids.has_value()) { auto n_features = X.extent(1); auto centroids_matrix = @@ -1217,7 +1217,7 @@ void kmeans_transform(raft::resources const& handle, raft::device_matrix_view centroids, raft::device_matrix_view X_new) { - raft::common::nvtx::range fun_scope("kmeans_transform"); + raft::common::nvtx::range fun_scope("kmeans_transform"); raft::logger::get(RAFT_NAME).set_level(pams.verbosity); cudaStream_t stream = raft::resource::get_cuda_stream(handle); auto n_samples = X.extent(0); diff --git a/cpp/src/cluster/detail/kmeans_balanced.cuh b/cpp/src/cluster/detail/kmeans_balanced.cuh index 34bb22e85..3f1ad2334 100644 --- a/cpp/src/cluster/detail/kmeans_balanced.cuh +++ b/cpp/src/cluster/detail/kmeans_balanced.cuh @@ -20,10 +20,10 @@ #include "kmeans_common.cuh" #include +#include "../../core/nvtx.hpp" #include "../../distance/distance.cuh" #include -#include #include #include #include @@ -378,7 +378,7 @@ void compute_norm(const raft::resources& handle, FinOpT norm_fin_op, std::optional mr = std::nullopt) { - raft::common::nvtx::range fun_scope("compute_norm"); + raft::common::nvtx::range fun_scope("compute_norm"); auto stream = raft::resource::get_cuda_stream(handle); rmm::device_uvector mapped_dataset( 0, stream, mr.value_or(raft::resource::get_workspace_resource(handle))); @@ -434,7 +434,7 @@ void predict(const raft::resources& handle, const MathT* dataset_norm = nullptr) { auto stream = raft::resource::get_cuda_stream(handle); - raft::common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "predict(%zu, %u)", static_cast(n_rows), n_clusters); auto mem_res = mr.value_or(raft::resource::get_workspace_resource(handle)); auto [max_minibatch_size, _mem_per_row] = @@ -603,7 +603,7 @@ auto adjust_centers(MathT* centers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref device_memory) -> bool { - raft::common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "adjust_centers(%zu, %u)", static_cast(n_rows), n_clusters); if (n_clusters == 0) { return false; } constexpr static std::array kPrimes{29, 71, 113, 173, 229, 281, 349, 409, 463, 541, @@ -1036,7 +1036,7 @@ void build_hierarchical(const raft::resources& handle, auto stream = raft::resource::get_cuda_stream(handle); using LabelT = uint32_t; - raft::common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "build_hierarchical(%zu, %u)", static_cast(n_rows), n_clusters); IdxT n_mesoclusters = std::min(n_clusters, static_cast(std::sqrt(n_clusters) + 0.5)); diff --git a/cpp/src/core/c_api.cpp b/cpp/src/core/c_api.cpp index cfbeed2d5..4333bff0c 100644 --- a/cpp/src/core/c_api.cpp +++ b/cpp/src/core/c_api.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include extern "C" cuvsError_t cuvsResourcesCreate(cuvsResources_t* res) @@ -130,6 +131,21 @@ extern "C" cuvsError_t cuvsRMMMemoryResourceReset() }); } +thread_local std::unique_ptr pinned_mr; + +extern "C" cuvsError_t cuvsRMMHostAlloc(void** ptr, size_t bytes) +{ + return cuvs::core::translate_exceptions([=] { + if (pinned_mr == nullptr) { pinned_mr = std::make_unique(); } + *ptr = pinned_mr->allocate(bytes); + }); +} + +extern "C" cuvsError_t cuvsRMMHostFree(void* ptr, size_t bytes) +{ + return cuvs::core::translate_exceptions([=] { pinned_mr->deallocate(ptr, bytes); }); +} + thread_local std::string last_error_text = ""; extern "C" const char* cuvsGetLastErrorText() diff --git 
a/cpp/src/distance/detail/kernels/gram_matrix.cu b/cpp/src/distance/detail/kernels/gram_matrix.cu new file mode 100644 index 000000000..0e4f3e639 --- /dev/null +++ b/cpp/src/distance/detail/kernels/gram_matrix.cu @@ -0,0 +1,481 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../../distance.cuh" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::distance::kernels { + +/** + * Base class for general Gram matrices + * A Gram matrix is the Hermitian matrix of inner probucts G_ik = + * Here, the inner product is evaluated for all elements from vectors sets X1, + * and X2. + * + * To be more precise, on exit the output buffer will store: + * - if is_row_major == true: out[j+k*n1] = , + * - if is_row_major == false: out[j*n2 + k] = , + * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector + * from the x2 set. + */ + +/** Convenience function to evaluate the Gram matrix for two vector sets. + * Vector sets are provided in Matrix format + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ +template +void GramMatrixBase::operator()(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + evaluate(handle, x1, x2, out, norm_x1, norm_x2); +} + +/** Convenience function to evaluate the Gram matrix for two vector sets. + * Vector sets are provided in Matrix format + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ +template +void GramMatrixBase::operator()(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + evaluate(handle, x1, x2, out, norm_x1, norm_x2); +} + +/** Convenience function to evaluate the Gram matrix for two vector sets. + * Vector sets are provided in Matrix format + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. 
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ +template +void GramMatrixBase::operator()(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + evaluate(handle, x1, x2, out, norm_x1, norm_x2); +} + +// unfortunately, 'evaluate' cannot be templatized as it needs to be virtual + +/** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void GramMatrixBase::evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + linear(handle, x1, x2, out); +} +/** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void GramMatrixBase::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + linear(handle, x1, x2, out); +} +/** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void GramMatrixBase::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + linear(handle, x1, x2, out); +} + +/** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ +template +[[deprecated]] void GramMatrixBase::evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); +} + +/** Convenience function to evaluate the Gram matrix for two vector sets. 
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ +template +[[deprecated]] void GramMatrixBase::operator()(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + ASSERT(legacy_interface, "Legacy interface can only be used with legacy ctor."); + if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; } + if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } + if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } + evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); +} + +/** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ +template +[[deprecated]] void GramMatrixBase::linear(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + math_t alpha = 1.0; + math_t beta = 0.0; + if (is_row_major) { + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + n2, + n1, + n_cols, + &alpha, + x2, + ld2, + x1, + ld1, + &beta, + out, + ld_out, + stream)); + } else { + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, + CUBLAS_OP_N, + CUBLAS_OP_T, + n1, + n2, + n_cols, + &alpha, + x1, + ld1, + x2, + ld2, + &beta, + out, + ld_out, + stream)); + } +} + +template +bool GramMatrixBase::get_is_row_major(dense_output_matrix_view_t matrix) +{ + return (matrix.stride(1) == 1); +} +template +bool GramMatrixBase::get_is_row_major(dense_input_matrix_view_t matrix) +{ + return (matrix.stride(1) == 1); +} + +template +bool GramMatrixBase::get_is_col_major(dense_output_matrix_view_t matrix) +{ + return (matrix.stride(0) == 1); +} + +template +bool GramMatrixBase::get_is_col_major(dense_input_matrix_view_t matrix) +{ + return (matrix.stride(0) == 1); +} + +/** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. 
+ * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ +template +void GramMatrixBase::linear(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out) +{ + // check is_row_major consistency + bool is_row_major = get_is_row_major(x1) && get_is_row_major(x2) && get_is_row_major(out); + bool is_col_major = get_is_col_major(x1) && get_is_col_major(x2) && get_is_col_major(out); + ASSERT(is_row_major || is_col_major, + "GramMatrix leading dimensions for x1, x2 and out do not match"); + + // check dimensions + int n1 = out.extent(0); + int n2 = out.extent(1); + int n_cols = x1.extent(1); + ASSERT(x1.extent(0) == n1, "GramMatrix input matrix dimensions for x1 and out do not match"); + ASSERT(x2.extent(0) == n2, "GramMatrix input matrix dimensions for x2 and out do not match"); + ASSERT(x2.extent(1) == n_cols, "GramMatrix input matrix dimensions for x1 and x2 do not match"); + + // extract major stride + int ld1 = is_row_major ? x1.stride(0) : x1.stride(1); + int ld2 = is_row_major ? x2.stride(0) : x2.stride(1); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + + math_t alpha = 1.0; + math_t beta = 0.0; + if (is_row_major) { + // #TODO: Use mdspan-based API when stride-capable + // https://github.com/rapidsai/raft/issues/875 + raft::linalg::gemm(handle, + true, + false, + n2, + n1, + n_cols, + &alpha, + x2.data_handle(), + ld2, + x1.data_handle(), + ld1, + &beta, + out.data_handle(), + ld_out, + raft::resource::get_cuda_stream(handle)); + } else { + // #TODO: Use mdspan-based API when stride-capable + // https://github.com/rapidsai/raft/issues/875 + raft::linalg::gemm(handle, + false, + true, + n1, + n2, + n_cols, + &alpha, + x1.data_handle(), + ld1, + x2.data_handle(), + ld2, + &beta, + out.data_handle(), + ld_out, + raft::resource::get_cuda_stream(handle)); + } +} + +/** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. 
+ * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ +template +void GramMatrixBase::linear(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out) +{ + // check is_row_major consistency + bool is_row_major = get_is_row_major(x2) && get_is_row_major(out); + bool is_col_major = get_is_col_major(x2) && get_is_col_major(out); + ASSERT(is_row_major || is_col_major, "GramMatrix leading dimensions for x2 and out do not match"); + + // check dimensions + auto x1_structure = x1.structure_view(); + ASSERT(x1_structure.get_n_rows() == out.extent(0), + "GramMatrix input matrix dimensions for x1 and out do not match"); + ASSERT(x2.extent(0) == out.extent(1), + "GramMatrix input matrix dimensions for x2 and out do not match"); + ASSERT(x2.extent(1) == x1_structure.get_n_cols(), + "GramMatrix input matrix dimensions for x1 and x2 do not match"); + + math_t alpha = 1.0; + math_t beta = 0.0; + + raft::sparse::linalg::spmm(handle, false, true, &alpha, x1, x2, &beta, out); +} + +/** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ +template +void GramMatrixBase::linear(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out) +{ + // check layout consistency (w.r.t. strides a matrix might be both row & col major) + bool is_row_major_nopad = get_is_row_major(out) && out.stride(0) == out.extent(1); + bool is_col_major_nopad = get_is_col_major(out) && out.stride(1) == out.extent(0); + + ASSERT(is_row_major_nopad || is_col_major_nopad, + "Sparse linear Kernel distance does not support ld_out parameter"); + + // switch a,b based on is_row_major + if (is_col_major_nopad) { + auto out_row_major = raft::make_device_matrix_view( + out.data_handle(), out.extent(1), out.extent(0)); + + cuvs::distance::pairwise_distance( + handle, x2, x1, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0); + } else { + auto out_row_major = raft::make_device_matrix_view( + out.data_handle(), out.extent(0), out.extent(1)); + cuvs::distance::pairwise_distance( + handle, x1, x2, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0); + } +} + +template class GramMatrixBase; +template class GramMatrixBase; + +}; // namespace cuvs::distance::kernels diff --git a/cpp/src/distance/detail/kernels/gram_matrix.cuh b/cpp/src/distance/detail/kernels/gram_matrix.cuh deleted file mode 100644 index d435fb4d1..000000000 --- a/cpp/src/distance/detail/kernels/gram_matrix.cuh +++ /dev/null @@ -1,488 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "../../distance.cuh" -#include -#include -#include -#include -// #include -#include -#include -#include -#include - -namespace cuvs::distance::kernels::detail { - -template -using dense_input_matrix_view_t = raft::device_matrix_view; -template -using dense_output_matrix_view_t = raft::device_matrix_view; -template -using csr_input_matrix_view_t = raft::device_csr_matrix_view; - -/** - * Base class for general Gram matrices - * A Gram matrix is the Hermitian matrix of inner probucts G_ik = - * Here, the inner product is evaluated for all elements from vectors sets X1, - * and X2. - * - * To be more precise, on exit the output buffer will store: - * - if is_row_major == true: out[j+k*n1] = , - * - if is_row_major == false: out[j*n2 + k] = , - * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector - * from the x2 set. - */ -template -class GramMatrixBase { - protected: - cublasHandle_t cublas_handle; - bool legacy_interface; - - public: - GramMatrixBase() : legacy_interface(false){}; - [[deprecated]] GramMatrixBase(cublasHandle_t cublas_handle) - : cublas_handle(cublas_handle), legacy_interface(true){}; - - virtual ~GramMatrixBase(){}; - - /** Convenience function to evaluate the Gram matrix for two vector sets. - * Vector sets are provided in Matrix format - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void operator()(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1 = nullptr, - math_t* norm_x2 = nullptr) - { - evaluate(handle, x1, x2, out, norm_x1, norm_x2); - } - - /** Convenience function to evaluate the Gram matrix for two vector sets. - * Vector sets are provided in Matrix format - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void operator()(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1 = nullptr, - math_t* norm_x2 = nullptr) - { - evaluate(handle, x1, x2, out, norm_x1, norm_x2); - } - - /** Convenience function to evaluate the Gram matrix for two vector sets. 
- * Vector sets are provided in Matrix format - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void operator()(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1 = nullptr, - math_t* norm_x2 = nullptr) - { - evaluate(handle, x1, x2, out, norm_x1, norm_x2); - } - - // unfortunately, 'evaluate' cannot be templatized as it needs to be virtual - - /** Evaluate the Gram matrix for two vector sets using simple dot product. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - virtual void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - linear(handle, x1, x2, out); - } - /** Evaluate the Gram matrix for two vector sets using simple dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - virtual void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - linear(handle, x1, x2, out); - } - /** Evaluate the Gram matrix for two vector sets using simple dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - virtual void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - linear(handle, x1, x2, out); - } - - /** Evaluate the Gram matrix for two vector sets using simple dot product. 
- * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] virtual void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - } - - /** Convenience function to evaluate the Gram matrix for two vector sets. - * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 - * @param ld2 leading dimension of x2 - * @param ld_out leading dimension of out - */ - [[deprecated]] void operator()(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1 = 0, - int ld2 = 0, - int ld_out = 0) - { - ASSERT(legacy_interface, "Legacy interface can only be used with legacy ctor."); - if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; } - if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } - if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } - evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - } - - protected: - /** Calculates the Gram matrix using simple dot product between vector sets. - * - * out = x1 * x2 - * - * Can be used as a building block for more complex kernel functions. 
- * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 - * @param ld2 leading dimension of x2 - * @param ld_out leading dimension of out - */ - [[deprecated]] void linear(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - math_t alpha = 1.0; - math_t beta = 0.0; - if (is_row_major) { - // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - n2, - n1, - n_cols, - &alpha, - x2, - ld2, - x1, - ld1, - &beta, - out, - ld_out, - stream)); - } else { - // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_T, - n1, - n2, - n_cols, - &alpha, - x1, - ld1, - x2, - ld2, - &beta, - out, - ld_out, - stream)); - } - } - - protected: - bool get_is_row_major(dense_output_matrix_view_t matrix) - { - return (matrix.stride(1) == 1); - } - - bool get_is_row_major(dense_input_matrix_view_t matrix) - { - return (matrix.stride(1) == 1); - } - - bool get_is_col_major(dense_output_matrix_view_t matrix) - { - return (matrix.stride(0) == 1); - } - - bool get_is_col_major(dense_input_matrix_view_t matrix) - { - return (matrix.stride(0) == 1); - } - - /** Calculates the Gram matrix using simple dot product between vector sets. - * - * out = x1 * x2 - * - * Can be used as a building block for more complex kernel functions. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - */ - void linear(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out) - { - // check is_row_major consistency - bool is_row_major = get_is_row_major(x1) && get_is_row_major(x2) && get_is_row_major(out); - bool is_col_major = get_is_col_major(x1) && get_is_col_major(x2) && get_is_col_major(out); - ASSERT(is_row_major || is_col_major, - "GramMatrix leading dimensions for x1, x2 and out do not match"); - - // check dimensions - int n1 = out.extent(0); - int n2 = out.extent(1); - int n_cols = x1.extent(1); - ASSERT(x1.extent(0) == n1, "GramMatrix input matrix dimensions for x1 and out do not match"); - ASSERT(x2.extent(0) == n2, "GramMatrix input matrix dimensions for x2 and out do not match"); - ASSERT(x2.extent(1) == n_cols, "GramMatrix input matrix dimensions for x1 and x2 do not match"); - - // extract major stride - int ld1 = is_row_major ? x1.stride(0) : x1.stride(1); - int ld2 = is_row_major ? x2.stride(0) : x2.stride(1); - int ld_out = is_row_major ? 
out.stride(0) : out.stride(1); - - math_t alpha = 1.0; - math_t beta = 0.0; - if (is_row_major) { - // #TODO: Use mdspan-based API when stride-capable - // https://github.com/rapidsai/raft/issues/875 - raft::linalg::gemm(handle, - true, - false, - n2, - n1, - n_cols, - &alpha, - x2.data_handle(), - ld2, - x1.data_handle(), - ld1, - &beta, - out.data_handle(), - ld_out, - resource::get_cuda_stream(handle)); - } else { - // #TODO: Use mdspan-based API when stride-capable - // https://github.com/rapidsai/raft/issues/875 - raft::linalg::gemm(handle, - false, - true, - n1, - n2, - n_cols, - &alpha, - x1.data_handle(), - ld1, - x2.data_handle(), - ld2, - &beta, - out.data_handle(), - ld_out, - resource::get_cuda_stream(handle)); - } - } - - /** Calculates the Gram matrix using simple dot product between vector sets. - * - * out = x1 * x2 - * - * Can be used as a building block for more complex kernel functions. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - */ - void linear(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out) - { - // check is_row_major consistency - bool is_row_major = get_is_row_major(x2) && get_is_row_major(out); - bool is_col_major = get_is_col_major(x2) && get_is_col_major(out); - ASSERT(is_row_major || is_col_major, - "GramMatrix leading dimensions for x2 and out do not match"); - - // check dimensions - auto x1_structure = x1.structure_view(); - ASSERT(x1_structure.get_n_rows() == out.extent(0), - "GramMatrix input matrix dimensions for x1 and out do not match"); - ASSERT(x2.extent(0) == out.extent(1), - "GramMatrix input matrix dimensions for x2 and out do not match"); - ASSERT(x2.extent(1) == x1_structure.get_n_cols(), - "GramMatrix input matrix dimensions for x1 and x2 do not match"); - - math_t alpha = 1.0; - math_t beta = 0.0; - - raft::sparse::linalg::spmm(handle, false, true, &alpha, x1, x2, &beta, out); - } - - /** Calculates the Gram matrix using simple dot product between vector sets. - * - * out = x1 * x2 - * - * Can be used as a building block for more complex kernel functions. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - */ - void linear(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out) - { - // check layout consistency (w.r.t. 
strides a matrix might be both row & col major) - bool is_row_major_nopad = get_is_row_major(out) && out.stride(0) == out.extent(1); - bool is_col_major_nopad = get_is_col_major(out) && out.stride(1) == out.extent(0); - - ASSERT(is_row_major_nopad || is_col_major_nopad, - "Sparse linear Kernel distance does not support ld_out parameter"); - - // switch a,b based on is_row_major - if (is_col_major_nopad) { - auto out_row_major = raft::make_device_matrix_view( - out.data_handle(), out.extent(1), out.extent(0)); - raft::sparse::distance::pairwise_distance( - handle, x2, x1, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0); - } else { - auto out_row_major = raft::make_device_matrix_view( - out.data_handle(), out.extent(0), out.extent(1)); - raft::sparse::distance::pairwise_distance( - handle, x1, x2, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0); - } - } -}; - -}; // end namespace cuvs::distance::kernels::detail diff --git a/cpp/src/distance/detail/kernels/kernel_factory.cu b/cpp/src/distance/detail/kernels/kernel_factory.cu new file mode 100644 index 000000000..25f9e9b84 --- /dev/null +++ b/cpp/src/distance/detail/kernels/kernel_factory.cu @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace cuvs::distance::kernels { + +template +GramMatrixBase* KernelFactory::create(KernelParams params) +{ + GramMatrixBase* res; + // KernelParams is not templated, we convert the parameters to math_t here: + math_t coef0 = params.coef0; + math_t gamma = params.gamma; + switch (params.kernel) { + case LINEAR: res = new GramMatrixBase(); break; + case POLYNOMIAL: res = new PolynomialKernel(params.degree, gamma, coef0); break; + case TANH: res = new TanhKernel(gamma, coef0); break; + case RBF: res = new RBFKernel(gamma); break; + default: throw raft::exception("Kernel not implemented"); + } + return res; +} + +template +[[deprecated]] GramMatrixBase* KernelFactory::create(KernelParams params, + cublasHandle_t handle) +{ + GramMatrixBase* res; + // KernelParams is not templated, we convert the parameters to math_t here: + math_t coef0 = params.coef0; + math_t gamma = params.gamma; + switch (params.kernel) { + case LINEAR: res = new GramMatrixBase(handle); break; + case POLYNOMIAL: + res = new PolynomialKernel(params.degree, gamma, coef0, handle); + break; + case TANH: res = new TanhKernel(gamma, coef0, handle); break; + case RBF: res = new RBFKernel(gamma, handle); break; + default: throw raft::exception("Kernel not implemented"); + } + return res; +} + +template class KernelFactory; +template class KernelFactory; + +}; // end namespace cuvs::distance::kernels diff --git a/cpp/src/distance/detail/kernels/kernel_factory.cuh b/cpp/src/distance/detail/kernels/kernel_factory.cuh deleted file mode 100644 index 5c50a95a3..000000000 --- a/cpp/src/distance/detail/kernels/kernel_factory.cuh +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "gram_matrix.cuh" -#include "kernel_matrices.cuh" - -#include -#include - -namespace cuvs::distance::kernels::detail { - -template -class KernelFactory { - public: - static GramMatrixBase* create(KernelParams params) - { - GramMatrixBase* res; - // KernelParams is not templated, we convert the parameters to math_t here: - math_t coef0 = params.coef0; - math_t gamma = params.gamma; - switch (params.kernel) { - case LINEAR: res = new GramMatrixBase(); break; - case POLYNOMIAL: res = new PolynomialKernel(params.degree, gamma, coef0); break; - case TANH: res = new TanhKernel(gamma, coef0); break; - case RBF: res = new RBFKernel(gamma); break; - default: throw raft::exception("Kernel not implemented"); - } - return res; - } - - [[deprecated]] static GramMatrixBase* create(KernelParams params, cublasHandle_t handle) - { - GramMatrixBase* res; - // KernelParams is not templated, we convert the parameters to math_t here: - math_t coef0 = params.coef0; - math_t gamma = params.gamma; - switch (params.kernel) { - case LINEAR: res = new GramMatrixBase(handle); break; - case POLYNOMIAL: - res = new PolynomialKernel(params.degree, gamma, coef0, handle); - break; - case TANH: res = new TanhKernel(gamma, coef0, handle); break; - case RBF: res = new RBFKernel(gamma, handle); break; - default: throw raft::exception("Kernel not implemented"); - } - return res; - } -}; - -}; // end namespace cuvs::distance::kernels::detail diff --git a/cpp/src/distance/detail/kernels/kernel_matrices.cu b/cpp/src/distance/detail/kernels/kernel_matrices.cu new file mode 100644 index 000000000..526ca106f --- /dev/null +++ b/cpp/src/distance/detail/kernels/kernel_matrices.cu @@ -0,0 +1,726 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../../../distance/distance.cuh" +#include + +#include "rbf_fin_op.cuh" +#include +#include +#include +#include +#include + +namespace cuvs::distance::kernels { + +/** Epiloge function for polynomial kernel without padding. 
+ * Calculates output = (gain*in + offset)^exponent + * @param inout device vector in column major format, size [len] + * @param len array length + * @param exponent + * @param gain + * @param offset + */ +template +RAFT_KERNEL polynomial_kernel_nopad( + math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset) +{ + for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; + tid += blockDim.x * gridDim.x) { + inout[tid] = pow(gain * inout[tid] + offset, exponent); + } +} + +/** Epiloge function for polynomial kernel with padding. + * Calculates output = (gain*input + offset)^exponent + * @param inout device vector in column major format, size [ld * cols] + * @param ld leading dimension of the inout buffer + * @param rows number of rows (rows <= ld) + * @param cols number of columns + * @param exponent + * @param gain + * @param offset + */ +template +RAFT_KERNEL polynomial_kernel( + math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset) +{ + for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; + tidy += blockDim.y * gridDim.y) + for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; + tidx += blockDim.x * gridDim.x) { + inout[tidx + tidy * ld] = pow(gain * inout[tidx + tidy * ld] + offset, exponent); + } +} + +/** Epiloge function for tanh kernel without padding. + * Calculates output = tanh(gain*input + offset) + * @param inout device vector, size [len] + * @param len length of the input vector + * @param gain + * @param offset + */ +template +RAFT_KERNEL tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset) +{ + for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; + tid += blockDim.x * gridDim.x) { + inout[tid] = tanh(gain * inout[tid] + offset); + } +} + +/** Epiloge function for tanh kernel without padding. + * Calculates output = tanh(gain*input + offset) + * @param inout device vector in column major format, size [ld * cols] + * @param ld leading dimension of the inout buffer + * @param rows number of rows (rows <= ld) + * @param cols number of columns + * @param gain + * @param offset + */ +template +RAFT_KERNEL tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset) +{ + for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; + tidy += blockDim.y * gridDim.y) + for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; + tidx += blockDim.x * gridDim.x) { + inout[tidx + tidy * ld] = tanh(gain * inout[tidx + tidy * ld] + offset); + } +} + +/** Epiloge function for rbf kernel using expansion. 
+ * + * Calculates output_ij = exp(-gain * (norm_x_i + norm_y_j - 2*input_ij)); + * + * Intended usage + * - input is the product of two matrices X and Y input_ij = sum_k X_ik * Y_jk + * - norm_x_i = l2_norm(x_i), where x_i is the i-th row of matrix X + * - norm_y_j = l2_norm(y_j), where y_j is the j-th row of matrix Y + * + * @param inout device vector in column major format, size [ld * cols] + * @param ld leading dimension of the inout buffer + * @param rows number of rows (rows <= ld) + * @param cols number of columns + * @param norm_x l2-norm of X's rows + * @param norm_y l2-norm of Y's rows + * @param gain + */ +template +RAFT_KERNEL rbf_kernel_expanded( + math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain) +{ + for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; + tidy += blockDim.y * gridDim.y) { + math_t norm_y_val = norm_y[tidy]; + for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; + tidx += blockDim.x * gridDim.x) { + inout[tidx + tidy * ld] = + exp(-1.0 * gain * (norm_x[tidx] + norm_y_val - inout[tidx + tidy * ld] * 2)); + } + } +} + +std::tuple generateLaunchConfig2dElementwiseOp(int n1, int n2) +{ + dim3 block_shape = dim3(32, 4); + const int num_blocks_x = raft::ceildiv(n1, 32); + const int num_blocks_y = std::min(raft::ceildiv(n2, 32), (1 << 16) - 1); + dim3 grid_shape = dim3(num_blocks_x, num_blocks_y); + return std::make_tuple(grid_shape, block_shape); +} + +/** + * Create a kernel matrix using polynomial kernel function. + */ +template +void PolynomialKernel::applyKernel( + math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) +{ + const int n_minor = is_row_major ? cols : rows; + if (ld == n_minor) { + polynomial_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( + inout, rows * cols, exponent, gain, offset); + } else { + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); + polynomial_kernel<<>>( + inout, ld, n1, n2, exponent, gain, offset); + } + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void PolynomialKernel::evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. 
+ * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void PolynomialKernel::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void PolynomialKernel::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate the Gram matrix using the legacy interface. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ +template +[[deprecated]] void PolynomialKernel::evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + ASSERT(GramMatrixBase::legacy_interface, + "Legacy interface can only be used with legacy ctor."); + GramMatrixBase::linear( + x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + applyKernel(out, ld_out, n1, n2, is_row_major, stream); +} + +/** + * Create a kernel matrix using tanh kernel function. + */ +template +void TanhKernel::applyKernel( + math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) +{ + const int n_minor = is_row_major ? cols : rows; + if (ld == n_minor) { + tanh_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( + inout, rows * cols, gain, offset); + } else { + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? 
rows : cols; + auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); + tanh_kernel<<>>(inout, ld, n1, n2, gain, offset); + } + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void TanhKernel::evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void TanhKernel::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void TanhKernel::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate the Gram matrix using the legacy interface. 
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ +template +[[deprecated]] void TanhKernel::evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + ASSERT(GramMatrixBase::legacy_interface, + "Legacy interface can only be used with legacy ctor."); + GramMatrixBase::linear( + x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + applyKernel(out, ld_out, n1, n2, is_row_major, stream); +} + +/** + * Create a kernel matrix using RBF kernel function. + */ +template +void RBFKernel::applyKernel(math_t* inout, + int ld, + int rows, + int cols, + math_t* norm_x1, + math_t* norm_x2, + bool is_row_major, + cudaStream_t stream) +{ + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + math_t* norm_n1 = is_row_major ? norm_x2 : norm_x1; + math_t* norm_n2 = is_row_major ? norm_x1 : norm_x2; + auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); + rbf_kernel_expanded<<>>( + inout, ld, n1, n2, norm_n1, norm_n2, gain); +} + +template +void RBFKernel::matrixRowNormL2(raft::resources const& handle, + dense_input_matrix_view_t matrix, + math_t* target) +{ + bool is_row_major = GramMatrixBase::get_is_row_major(matrix); + int minor = is_row_major ? matrix.extent(1) : matrix.extent(0); + int ld = is_row_major ? matrix.stride(0) : matrix.stride(1); + ASSERT(ld == minor, "RBF Kernel lazy rowNorm compute does not support ld parameter"); + raft::linalg::rowNorm(target, + matrix.data_handle(), + matrix.extent(1), + matrix.extent(0), + raft::linalg::NormType::L2Norm, + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +template +void RBFKernel::matrixRowNormL2(raft::resources const& handle, + csr_input_matrix_view_t matrix, + math_t* target) +{ + auto matrix_structure = matrix.structure_view(); + raft::sparse::linalg::rowNormCsr(handle, + matrix_structure.get_indptr().data(), + matrix.get_elements().data(), + matrix_structure.get_nnz(), + matrix_structure.get_n_rows(), + target, + raft::linalg::NormType::L2Norm); +} + +/** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. 
+ */ +template +void RBFKernel::evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + // lazy compute norms if not given + rmm::device_uvector tmp_norm_x1(0, stream); + rmm::device_uvector tmp_norm_x2(0, stream); + if (norm_x1 == nullptr) { + tmp_norm_x1.reserve(x1.extent(0), stream); + norm_x1 = tmp_norm_x1.data(); + matrixRowNormL2(handle, x1, norm_x1); + } + if (norm_x2 == nullptr) { + tmp_norm_x2.reserve(x2.extent(0), stream); + norm_x2 = tmp_norm_x2.data(); + matrixRowNormL2(handle, x2, norm_x2); + } + + // compute L2expanded + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + norm_x1, + norm_x2, + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ +template +void RBFKernel::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + + // lazy compute norms if not given + rmm::device_uvector tmp_norm_x1(0, stream); + rmm::device_uvector tmp_norm_x2(0, stream); + if (norm_x1 == nullptr) { + tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); + norm_x1 = tmp_norm_x1.data(); + matrixRowNormL2(handle, x1, norm_x1); + } + if (norm_x2 == nullptr) { + tmp_norm_x2.reserve(x2.extent(0), stream); + norm_x2 = tmp_norm_x2.data(); + matrixRowNormL2(handle, x2, norm_x2); + } + + // compute L2expanded + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + norm_x1, + norm_x2, + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. 
+ */ +template +void RBFKernel::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + + // lazy compute norms if not given + rmm::device_uvector tmp_norm_x1(0, stream); + rmm::device_uvector tmp_norm_x2(0, stream); + if (norm_x1 == nullptr) { + tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); + norm_x1 = tmp_norm_x1.data(); + matrixRowNormL2(handle, x1, norm_x1); + } + if (norm_x2 == nullptr) { + tmp_norm_x2.reserve(x2.structure_view().get_n_rows(), stream); + norm_x2 = tmp_norm_x2.data(); + matrixRowNormL2(handle, x2, norm_x2); + } + + // compute L2expanded + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + norm_x1, + norm_x2, + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate the Gram matrix using the legacy interface. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ +template +[[deprecated]] void RBFKernel::evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + ASSERT(GramMatrixBase::legacy_interface, + "Legacy interface can only be used with legacy ctor."); + int minor1 = is_row_major ? n_cols : n1; + int minor2 = is_row_major ? n_cols : n2; + int minor_out = is_row_major ? n2 : n1; + ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter"); + ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter"); + ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter"); + + math_t gain = this->gain; + using index_t = int64_t; + + rbf_fin_op fin_op{gain}; + + raft::resources handle; + raft::resource::set_cuda_stream(handle, stream); + + cuvs::distance::distance(handle, + const_cast(x1), + const_cast(x2), + out, + n1, + n2, + n_cols, + NULL, + 0, + fin_op, + is_row_major); +} + +template class PolynomialKernel; +template class PolynomialKernel; +template class TanhKernel; +template class TanhKernel; +template class RBFKernel; +template class RBFKernel; + +}; // end namespace cuvs::distance::kernels diff --git a/cpp/src/distance/detail/kernels/kernel_matrices.cuh b/cpp/src/distance/detail/kernels/kernel_matrices.cuh deleted file mode 100644 index bff5bda92..000000000 --- a/cpp/src/distance/detail/kernels/kernel_matrices.cuh +++ /dev/null @@ -1,777 +0,0 @@ -/* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "gram_matrix.cuh" - -#include "../detail/kernels/rbf_fin_op.cuh" -#include -#include -#include -#include -#include - -namespace cuvs::distance::kernels::detail { - -/** Epiloge function for polynomial kernel without padding. - * Calculates output = (gain*in + offset)^exponent - * @param inout device vector in column major format, size [len] - * @param len array length - * @param exponent - * @param gain - * @param offset - */ -template -RAFT_KERNEL polynomial_kernel_nopad( - math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; - tid += blockDim.x * gridDim.x) { - inout[tid] = pow(gain * inout[tid] + offset, exponent); - } -} - -/** Epiloge function for polynomial kernel with padding. - * Calculates output = (gain*input + offset)^exponent - * @param inout device vector in column major format, size [ld * cols] - * @param ld leading dimension of the inout buffer - * @param rows number of rows (rows <= ld) - * @param cols number of columns - * @param exponent - * @param gain - * @param offset - */ -template -RAFT_KERNEL polynomial_kernel( - math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset) -{ - for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; - tidy += blockDim.y * gridDim.y) - for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; - tidx += blockDim.x * gridDim.x) { - inout[tidx + tidy * ld] = pow(gain * inout[tidx + tidy * ld] + offset, exponent); - } -} - -/** Epiloge function for tanh kernel without padding. - * Calculates output = tanh(gain*input + offset) - * @param inout device vector, size [len] - * @param len length of the input vector - * @param gain - * @param offset - */ -template -RAFT_KERNEL tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; - tid += blockDim.x * gridDim.x) { - inout[tid] = tanh(gain * inout[tid] + offset); - } -} - -/** Epiloge function for tanh kernel without padding. - * Calculates output = tanh(gain*input + offset) - * @param inout device vector in column major format, size [ld * cols] - * @param ld leading dimension of the inout buffer - * @param rows number of rows (rows <= ld) - * @param cols number of columns - * @param gain - * @param offset - */ -template -RAFT_KERNEL tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset) -{ - for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; - tidy += blockDim.y * gridDim.y) - for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; - tidx += blockDim.x * gridDim.x) { - inout[tidx + tidy * ld] = tanh(gain * inout[tidx + tidy * ld] + offset); - } -} - -/** Epiloge function for rbf kernel using expansion. 
- * - * Calculates output_ij = exp(-gain * (norm_x_i + norm_y_j - 2*input_ij)); - * - * Intended usage - * - input is the product of two matrices X and Y input_ij = sum_k X_ik * Y_jk - * - norm_x_i = l2_norm(x_i), where x_i is the i-th row of matrix X - * - norm_y_j = l2_norm(y_j), where y_j is the j-th row of matrix Y - * - * @param inout device vector in column major format, size [ld * cols] - * @param ld leading dimension of the inout buffer - * @param rows number of rows (rows <= ld) - * @param cols number of columns - * @param norm_x l2-norm of X's rows - * @param norm_y l2-norm of Y's rows - * @param gain - */ -template -RAFT_KERNEL rbf_kernel_expanded( - math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain) -{ - for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; - tidy += blockDim.y * gridDim.y) { - math_t norm_y_val = norm_y[tidy]; - for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; - tidx += blockDim.x * gridDim.x) { - inout[tidx + tidy * ld] = - exp(-1.0 * gain * (norm_x[tidx] + norm_y_val - inout[tidx + tidy * ld] * 2)); - } - } -} - -namespace { -std::tuple generateLaunchConfig2dElementwiseOp(int n1, int n2) -{ - dim3 block_shape = dim3(32, 4); - const int num_blocks_x = raft::ceildiv(n1, 32); - const int num_blocks_y = std::min(raft::ceildiv(n2, 32), (1 << 16) - 1); - dim3 grid_shape = dim3(num_blocks_x, num_blocks_y); - return std::make_tuple(grid_shape, block_shape); -} -} // namespace - -/** - * Create a kernel matrix using polynomial kernel function. - */ -template -class PolynomialKernel : public GramMatrixBase { - exp_t exponent; - math_t gain; - math_t offset; - - void applyKernel( - math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) - { - const int n_minor = is_row_major ? cols : rows; - if (ld == n_minor) { - polynomial_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( - inout, rows * cols, exponent, gain, offset); - } else { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); - polynomial_kernel<<>>( - inout, ld, n1, n2, exponent, gain, offset); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } - - public: - /** - * Constructs a polynomial kernel object. - * It evaluates the kernel matrix using the following formula: - * K_ij = (gain* + offset)^exponent - * - * @tparam math_t floating point type - * @tparam exp_t type of exponent - * @param exponent - * @param gain - * @param offset - */ - PolynomialKernel(exp_t exponent, math_t gain, math_t offset) - : GramMatrixBase(), exponent(exponent), gain(gain), offset(offset) - { - } - - [[deprecated]] PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t handle) - : GramMatrixBase(handle), exponent(exponent), gain(gain), offset(offset) - { - } - - /** Evaluate kernel matrix using polynomial kernel. - * - * output[i,k] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. 
- */ - void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using polynomial kernel. - * - * output[i,k] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using polynomial kernel. - * - * output[i,k] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate the Gram matrix using the legacy interface. 
- * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - ASSERT(GramMatrixBase::legacy_interface, - "Legacy interface can only be used with legacy ctor."); - GramMatrixBase::linear( - x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - applyKernel(out, ld_out, n1, n2, is_row_major, stream); - } -}; - -/** - * Create a kernel matrix using tanh kernel function. - */ -template -class TanhKernel : public GramMatrixBase { - math_t gain, offset; - - void applyKernel( - math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) - { - const int n_minor = is_row_major ? cols : rows; - if (ld == n_minor) { - tanh_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( - inout, rows * cols, gain, offset); - } else { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); - tanh_kernel<<>>(inout, ld, n1, n2, gain, offset); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } - - public: - /** - * Constructs a tanh kernel object. - * It evaluates the kernel matrix using the following formula: - * K_ij = tanh(gain* + offset) - * - * @tparam math_t floating point type - * @param gain - * @param offset - */ - TanhKernel(math_t gain, math_t offset) : GramMatrixBase(), gain(gain), offset(offset) {} - - [[deprecated]] TanhKernel(math_t gain, math_t offset, cublasHandle_t handle) - : GramMatrixBase(handle), gain(gain), offset(offset) - { - } - - /** Evaluate kernel matrix using tanh kernel. - * - * output_[i + k*n1] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using tanh kernel. - * - * output_[i + k*n1] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. 
- * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using tanh kernel. - * - * output_[i + k*n1] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate the Gram matrix using the legacy interface. - * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - ASSERT(GramMatrixBase::legacy_interface, - "Legacy interface can only be used with legacy ctor."); - GramMatrixBase::linear( - x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - applyKernel(out, ld_out, n1, n2, is_row_major, stream); - } -}; - -/** - * Create a kernel matrix using RBF kernel function. - */ -template -class RBFKernel : public GramMatrixBase { - math_t gain; - - void applyKernel(math_t* inout, - int ld, - int rows, - int cols, - math_t* norm_x1, - math_t* norm_x2, - bool is_row_major, - cudaStream_t stream) - { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - math_t* norm_n1 = is_row_major ? norm_x2 : norm_x1; - math_t* norm_n2 = is_row_major ? 
norm_x1 : norm_x2; - auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); - rbf_kernel_expanded<<>>( - inout, ld, n1, n2, norm_n1, norm_n2, gain); - } - - public: - /** - * Constructs a RBF kernel object. - * It evaluates the kernel matrix using the following formula: - * K_ij = exp(-gain*|x1_i- x2_k|^2) - * - * @tparam math_t floating point type - * @param gain - */ - RBFKernel(math_t gain) : GramMatrixBase(), gain(gain) {} - - [[deprecated]] RBFKernel(math_t gain, cublasHandle_t handle) - : GramMatrixBase(handle), gain(gain) - { - } - - void matrixRowNormL2(raft::resources const& handle, - dense_input_matrix_view_t matrix, - math_t* target) - { - bool is_row_major = GramMatrixBase::get_is_row_major(matrix); - int minor = is_row_major ? matrix.extent(1) : matrix.extent(0); - int ld = is_row_major ? matrix.stride(0) : matrix.stride(1); - ASSERT(ld == minor, "RBF Kernel lazy rowNorm compute does not support ld parameter"); - raft::linalg::rowNorm(target, - matrix.data_handle(), - matrix.extent(1), - matrix.extent(0), - raft::linalg::NormType::L2Norm, - is_row_major, - resource::get_cuda_stream(handle)); - } - - void matrixRowNormL2(raft::resources const& handle, - csr_input_matrix_view_t matrix, - math_t* target) - { - auto matrix_structure = matrix.structure_view(); - raft::sparse::linalg::rowNormCsr(handle, - matrix_structure.get_indptr().data(), - matrix.get_elements().data(), - matrix_structure.get_nnz(), - matrix_structure.get_n_rows(), - target, - raft::linalg::NormType::L2Norm); - } - - /** Evaluate kernel matrix using RBF kernel. - * - * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and | | euclidean distance. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - cudaStream_t stream = resource::get_cuda_stream(handle); - // lazy compute norms if not given - rmm::device_uvector tmp_norm_x1(0, stream); - rmm::device_uvector tmp_norm_x2(0, stream); - if (norm_x1 == nullptr) { - tmp_norm_x1.reserve(x1.extent(0), stream); - norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(handle, x1, norm_x1); - } - if (norm_x2 == nullptr) { - tmp_norm_x2.reserve(x2.extent(0), stream); - norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(handle, x2, norm_x2); - } - - // compute L2expanded - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - norm_x1, - norm_x2, - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using RBF kernel. - * - * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and | | euclidean distance. 
- * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - cudaStream_t stream = resource::get_cuda_stream(handle); - - // lazy compute norms if not given - rmm::device_uvector tmp_norm_x1(0, stream); - rmm::device_uvector tmp_norm_x2(0, stream); - if (norm_x1 == nullptr) { - tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); - norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(handle, x1, norm_x1); - } - if (norm_x2 == nullptr) { - tmp_norm_x2.reserve(x2.extent(0), stream); - norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(handle, x2, norm_x2); - } - - // compute L2expanded - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - norm_x1, - norm_x2, - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using RBF kernel. - * - * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and | | euclidean distance. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - cudaStream_t stream = resource::get_cuda_stream(handle); - - // lazy compute norms if not given - rmm::device_uvector tmp_norm_x1(0, stream); - rmm::device_uvector tmp_norm_x2(0, stream); - if (norm_x1 == nullptr) { - tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); - norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(handle, x1, norm_x1); - } - if (norm_x2 == nullptr) { - tmp_norm_x2.reserve(x2.structure_view().get_n_rows(), stream); - norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(handle, x2, norm_x2); - } - - // compute L2expanded - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - norm_x1, - norm_x2, - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate the Gram matrix using the legacy interface. 
- * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - ASSERT(GramMatrixBase::legacy_interface, - "Legacy interface can only be used with legacy ctor."); - int minor1 = is_row_major ? n_cols : n1; - int minor2 = is_row_major ? n_cols : n2; - int minor_out = is_row_major ? n2 : n1; - ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter"); - ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter"); - ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter"); - - math_t gain = this->gain; - using index_t = int64_t; - - rbf_fin_op fin_op{gain}; - - raft::resources handle; - resource::set_cuda_stream(handle, stream); - - cuvs::distance::distance(handle, - const_cast(x1), - const_cast(x2), - out, - n1, - n2, - n_cols, - NULL, - 0, - fin_op, - is_row_major); - } -}; - -}; // end namespace cuvs::distance::kernels::detail diff --git a/cpp/src/distance/detail/kernels/rbf_fin_op.cuh b/cpp/src/distance/detail/kernels/rbf_fin_op.cuh index 73588baea..53022368d 100644 --- a/cpp/src/distance/detail/kernels/rbf_fin_op.cuh +++ b/cpp/src/distance/detail/kernels/rbf_fin_op.cuh @@ -28,7 +28,7 @@ #include // raft::exp #include // HD -namespace cuvs::distance::kernels::detail { +namespace cuvs::distance::kernels { /** @brief: Final op for Gram matrix with RBF kernel. 
* @@ -48,4 +48,4 @@ struct rbf_fin_op { } }; // struct rbf_fin_op -} // namespace cuvs::distance::kernels::detail +} // namespace cuvs::distance::kernels diff --git a/cpp/src/distance/detail/pairwise_distance_epilogue_elementwise.h b/cpp/src/distance/detail/pairwise_distance_epilogue_elementwise.h index f9955334d..f4a7feaba 100644 --- a/cpp/src/distance/detail/pairwise_distance_epilogue_elementwise.h +++ b/cpp/src/distance/detail/pairwise_distance_epilogue_elementwise.h @@ -61,6 +61,7 @@ class PairwiseDistanceEpilogueElementwise { using ElementT = ElementT_; static int const kElementsPerAccess = ElementsPerAccess; static int const kCount = kElementsPerAccess; + static bool const kIsSingleSource = true; using DistanceOp = DistanceOp_; using FinalOp = FinalOp_; diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh b/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh index edfd7cf5f..49497ab3a 100644 --- a/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh +++ b/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh @@ -118,9 +118,7 @@ instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default( instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default( cuvs::distance::detail::ops::russel_rao_distance_op, int); instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo( - cuvs::distance::detail::ops::l2_unexp_distance_op, - int64_t, - cuvs::distance::kernels::detail::rbf_fin_op); + cuvs::distance::detail::ops::l2_unexp_distance_op, int64_t, cuvs::distance::kernels::rbf_fin_op); instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default( cuvs::distance::detail::ops::l2_exp_distance_op, int64_t); diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu index 3c8f25109..a2e12b6df 100644 --- a/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu +++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu @@ -50,7 +50,7 @@ instantiate_raft_distance_detail_pairwise_matrix_dispatch( float, float, float, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); instantiate_raft_distance_detail_pairwise_matrix_dispatch( @@ -58,7 +58,7 @@ instantiate_raft_distance_detail_pairwise_matrix_dispatch( double, double, double, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); instantiate_raft_distance_detail_pairwise_matrix_dispatch( @@ -66,7 +66,7 @@ instantiate_raft_distance_detail_pairwise_matrix_dispatch( half, float, float, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); #undef instantiate_raft_distance_detail_pairwise_matrix_dispatch diff --git a/cpp/src/distance/detail/sparse/bin_distance.cuh b/cpp/src/distance/detail/sparse/bin_distance.cuh new file mode 100644 index 000000000..1a63a8eb9 --- /dev/null +++ b/cpp/src/distance/detail/sparse/bin_distance.cuh @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "common.hpp" +#include "ip_distance.cuh" + +#include +#include +#include +#include + +#include + +#include + +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { +// @TODO: Move this into sparse prims (coo_norm) +template +RAFT_KERNEL compute_binary_row_norm_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ + value_idx i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < nnz) { + // We do conditional here only because it's + // possible there could be some stray zeros in + // the sparse structure and removing them would be + // more expensive. + atomicAdd(&out[coo_rows[i]], data[i] == 1.0); + } +} + +template +RAFT_KERNEL compute_binary_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_norms, + const value_t* __restrict__ R_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func) +{ + std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; + + if (i >= n_rows || j >= n_cols) return; + + value_t q_norm = Q_norms[i]; + value_t r_norm = R_norms[j]; + value_t dot = C[(size_t)i * n_cols + j]; + C[(size_t)i * n_cols + j] = expansion_func(dot, q_norm, r_norm); +} + +template +void compute_binary(value_t* C, + const value_t* Q_norms, + const value_t* R_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func, + cudaStream_t stream) +{ + int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); + compute_binary_warp_kernel<<>>( + C, Q_norms, R_norms, n_rows, n_cols, expansion_func); +} + +template +void compute_bin_distance(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, + cudaStream_t stream, + expansion_f expansion_func) +{ + rmm::device_uvector Q_norms(m, stream); + rmm::device_uvector R_norms(n, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); + RAFT_CUDA_TRY(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); + + compute_binary_row_norm_kernel<<>>( + Q_norms.data(), Q_coo_rows, Q_data, Q_nnz); + compute_binary_row_norm_kernel<<>>( + R_norms.data(), R_coo_rows, R_data, R_nnz); + + compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func, stream); +} + +/** + * Jaccard distance using the expanded form: + * 1 - (sum(x_k * y_k) / ((sum(x_k) + sum(y_k)) - sum(x_k * y_k)) + */ +template +class jaccard_expanded_distances_t : public distances_t { + public: + explicit jaccard_expanded_distances_t(const distances_config_t& config) + : config_(&config), + workspace(0, raft::resource::get_cuda_stream(config.handle)), + ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); + + rmm::device_uvector search_coo_rows( + config_->a_nnz, raft::resource::get_cuda_stream(config_->handle)); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + compute_bin_distance(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + 
config_->a_nrows, + config_->b_nrows, + raft::resource::get_cuda_stream(config_->handle), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t q_r_union = q_norm + r_norm; + value_t denom = q_r_union - dot; + + value_t jacc = ((denom != 0) * dot) / ((denom == 0) + denom); + + // flip the similarity when both rows are 0 + bool both_empty = q_r_union == 0; + return 1 - ((!both_empty * jacc) + both_empty); + }); + } + + ~jaccard_expanded_distances_t() = default; + + private: + const distances_config_t* config_; + rmm::device_uvector workspace; + ip_distances_t ip_dists; +}; + +/** + * Dice distance using the expanded form: + * 1 - ((2 * sum(x_k * y_k)) / (sum(x_k) + sum(y_k))) + */ +template +class dice_expanded_distances_t : public distances_t { + public: + explicit dice_expanded_distances_t(const distances_config_t& config) + : config_(&config), + workspace(0, raft::resource::get_cuda_stream(config.handle)), + ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); + + rmm::device_uvector search_coo_rows( + config_->a_nnz, raft::resource::get_cuda_stream(config_->handle)); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + compute_bin_distance(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + raft::resource::get_cuda_stream(config_->handle), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t q_r_union = q_norm + r_norm; + value_t dice = (2 * dot) / q_r_union; + bool both_empty = q_r_union == 0; + return 1 - ((!both_empty * dice) + both_empty); + }); + } + + ~dice_expanded_distances_t() = default; + + private: + const distances_config_t* config_; + rmm::device_uvector workspace; + ip_distances_t ip_dists; +}; + +} // END namespace sparse +} // END namespace detail +} // END namespace distance +} // END namespace cuvs diff --git a/cpp/src/distance/detail/sparse/common.hpp b/cpp/src/distance/detail/sparse/common.hpp new file mode 100644 index 000000000..803dabe56 --- /dev/null +++ b/cpp/src/distance/detail/sparse/common.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +struct distances_config_t { + distances_config_t(raft::resources const& handle_) : handle(handle_) {} + + // left side + value_idx a_nrows; + value_idx a_ncols; + value_idx a_nnz; + value_idx* a_indptr; + value_idx* a_indices; + value_t* a_data; + + // right side + value_idx b_nrows; + value_idx b_ncols; + value_idx b_nnz; + value_idx* b_indptr; + value_idx* b_indices; + value_t* b_data; + + raft::resources const& handle; +}; + +template +class distances_t { + public: + virtual void compute(value_t* out) {} + virtual ~distances_t() = default; +}; + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv.cuh b/cpp/src/distance/detail/sparse/coo_spmv.cuh new file mode 100644 index 000000000..181b531f7 --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv.cuh @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "common.hpp" +#include "coo_spmv_strategies/dense_smem_strategy.cuh" +#include "coo_spmv_strategies/hash_strategy.cuh" + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +inline void balanced_coo_pairwise_generalized_spmv( + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + strategy_t strategy, + int chunk_size = 500000) +{ + uint64_t n = (uint64_t)sizeof(value_t) * (uint64_t)config_.a_nrows * (uint64_t)config_.b_nrows; + RAFT_CUDA_TRY(cudaMemsetAsync(out_dists, 0, n, raft::resource::get_cuda_stream(config_.handle))); + + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); +}; + +/** + * Performs generalized sparse-matrix-sparse-matrix multiplication via a + * sparse-matrix-sparse-vector layout `out=A*B` where generalized product() + * and sum() operations can be used in place of the standard sum and product: + * + * out_ij = sum_k(product(A_ik, B_ik)) The sum goes through values of + * k=0..n_cols-1 where B_kj is nonzero. + * + * The product and sum operations shall form a semiring algebra with the + * following properties: + * 1. {+, 0} is a commutative sum reduction monoid with identity element 0 + * 2. {*, 1} is a product monoid with identity element 1 + * 3. Multiplication by 0 annihilates x. e.g. product(x, 0) = 0 + * + * Each vector of A is loaded into shared memory in dense form and the + * non-zeros of B load balanced across the threads of each block. 
+ * @tparam value_idx index type + * @tparam value_t value type + * @tparam threads_per_block block size + * @tparam product_f semiring product() function + * @tparam accum_f semiring sum() function + * @tparam write_f atomic semiring sum() function + * @param[out] out_dists dense array of out distances of size m * n in row-major + * format. + * @param[in] config_ distance config object + * @param[in] coo_rows_b coo row array for B + * @param[in] product_func semiring product() function + * @param[in] accum_func semiring sum() function + * @param[in] write_func atomic semiring sum() function + * @param[in] chunk_size number of nonzeros of B to process for each row of A + * this value was found through profiling and represents a reasonable + * setting for both large and small densities + */ +template +inline void balanced_coo_pairwise_generalized_spmv( + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size = 500000) +{ + uint64_t n = (uint64_t)sizeof(value_t) * (uint64_t)config_.a_nrows * (uint64_t)config_.b_nrows; + RAFT_CUDA_TRY(cudaMemsetAsync(out_dists, 0, n, raft::resource::get_cuda_stream(config_.handle))); + + int max_cols = max_cols_per_block(); + + if (max_cols > config_.a_ncols) { + dense_smem_strategy strategy(config_); + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); + } else { + hash_strategy strategy(config_); + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); + } +}; + +template +inline void balanced_coo_pairwise_generalized_spmv_rev( + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + strategy_t strategy, + int chunk_size = 500000) +{ + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); +}; + +/** + * Used for computing distances where the reduction (e.g. product()) function + * requires an implicit union (product(x, 0) = x) to capture the difference A-B. + * This is necessary in some applications because the standard semiring algebra + * endowed with the default multiplication product monoid will only + * compute the intersection & B-A. + * + * This particular function is meant to accompany the function + * `balanced_coo_pairwise_generalized_spmv` and executes the product operation + * on only those columns that exist in B and not A. + * + * The product and sum operations shall enable the computation of a + * non-annihilating semiring algebra with the following properties: + * 1. {+, 0} is a commutative sum reduction monoid with identity element 0 + * 2. {*, 0} is a product monoid with identity element 0 + * 3. Multiplication by 0 does not annihilate x. e.g. product(x, 0) = x + * + * Manattan distance sum(abs(x_k-y_k)) is a great example of when this type of + * execution pattern is necessary. 
+ * + * @tparam value_idx index type + * @tparam value_t value type + * @tparam threads_per_block block size + * @tparam product_f semiring product() function + * @tparam accum_f semiring sum() function + * @tparam write_f atomic semiring sum() function + * @param[out] out_dists dense array of out distances of size m * n + * @param[in] config_ distance config object + * @param[in] coo_rows_a coo row array for A + * @param[in] product_func semiring product() function + * @param[in] accum_func semiring sum() function + * @param[in] write_func atomic semiring sum() function + * @param[in] chunk_size number of nonzeros of B to process for each row of A + * this value was found through profiling and represents a reasonable + * setting for both large and small densities + */ +template +inline void balanced_coo_pairwise_generalized_spmv_rev( + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size = 500000) +{ + // try dense first + int max_cols = max_cols_per_block(); + + if (max_cols > config_.b_ncols) { + dense_smem_strategy strategy(config_); + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); + } else { + hash_strategy strategy(config_); + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); + } +}; + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh b/cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh new file mode 100644 index 000000000..1f4b19af4 --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { +__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group) +{ + return __ffs(peer_group) - 1; +} + +/** + * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with + * sparse-matrix-sparse-vector multiplication layout (SPMV). + * This is intended to be scheduled n_chunks_b times for each row of a. + * The steps are as follows: + * + * 1. Load row from A into dense vector in shared memory. + * This can be further chunked in the future if necessary to support larger + * column sizes. + * 2. Threads of block all step through chunks of B in parallel. + * When a new row is encountered in row_indices_b, a segmented + * reduction is performed across the warps and then across the + * block and the final value written out to host memory. 
+ * + * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf + * + * @tparam value_idx index type + * @tparam value_t value type + * @tparam tpb threads per block configured on launch + * @tparam rev if this is true, the reduce/accumulate functions are only + * executed when A[col] == 0.0. when executed before/after !rev + * and A & B are reversed, this allows the full symmetric difference + * and intersection to be computed. + * @tparam kv_t data type stored in shared mem cache + * @tparam product_f reduce function type (semiring product() function). + * accepts two arguments of value_t and returns a value_t + * @tparam accum_f accumulation function type (semiring sum() function). + * accepts two arguments of value_t and returns a value_t + * @tparam write_f function to write value out. this should be mathematically + * equivalent to the accumulate function but implemented as + * an atomic operation on global memory. Accepts two arguments + * of value_t* and value_t and updates the value given by the + * pointer. + * @param[in] indptrA column pointer array for A + * @param[in] indicesA column indices array for A + * @param[in] dataA data array for A + * @param[in] rowsB coo row array for B + * @param[in] indicesB column indices array for B + * @param[in] dataB data array for B + * @param[in] m number of rows in A + * @param[in] n number of rows in B + * @param[in] dim number of features + * @param[in] nnz_b number of nonzeros in B + * @param[out] out array of size m*n + * @param[in] n_blocks_per_row number of blocks of B per row of A + * @param[in] chunk_size number of nnz for B to use for each row of A + * @param[in] buffer_size amount of smem to use for each row of A + * @param[in] product_func semiring product() function + * @param[in] accum_func semiring sum() function + * @param[in] write_func atomic semiring sum() function + */ +template +RAFT_KERNEL balanced_coo_generalized_spmv_kernel(strategy_t strategy, + indptr_it indptrA, + value_idx* indicesA, + value_t* dataA, + value_idx nnz_a, + value_idx* rowsB, + value_idx* indicesB, + value_t* dataB, + value_idx m, + value_idx n, + int dim, + value_idx nnz_b, + value_t* out, + int n_blocks_per_row, + int chunk_size, + value_idx b_ncols, + product_f product_func, + accum_f accum_func, + write_f write_func) +{ + typedef cub::WarpReduce warp_reduce; + + value_idx cur_row_a = indptrA.get_row_idx(n_blocks_per_row); + value_idx cur_chunk_offset = blockIdx.x % n_blocks_per_row; + + // chunk starting offset + value_idx ind_offset = cur_chunk_offset * chunk_size * tpb; + // how many total cols will be processed by this block (should be <= chunk_size * n_threads) + value_idx active_chunk_size = min(chunk_size * tpb, nnz_b - ind_offset); + + int tid = threadIdx.x; + int warp_id = tid / raft::warp_size(); + + // compute id relative to current warp + unsigned int lane_id = tid & (raft::warp_size() - 1); + value_idx ind = ind_offset + threadIdx.x; + + extern __shared__ char smem[]; + + typename strategy_t::smem_type A = (typename strategy_t::smem_type)(smem); + typename warp_reduce::TempStorage* temp_storage = (typename warp_reduce::TempStorage*)(A + dim); + + auto inserter = strategy.init_insert(A, dim); + + __syncthreads(); + + value_idx start_offset_a, stop_offset_a; + bool first_a_chunk, last_a_chunk; + indptrA.get_row_offsets( + cur_row_a, start_offset_a, stop_offset_a, n_blocks_per_row, first_a_chunk, last_a_chunk); + + // Convert current row vector in A to dense + for (int i = tid; i <= (stop_offset_a - 
start_offset_a); i += blockDim.x) { + strategy.insert(inserter, indicesA[start_offset_a + i], dataA[start_offset_a + i]); + } + + __syncthreads(); + + auto finder = strategy.init_find(A, dim); + + if (cur_row_a > m || cur_chunk_offset > n_blocks_per_row) return; + if (ind >= nnz_b) return; + + value_idx start_index_a = 0, stop_index_a = b_ncols - 1; + indptrA.get_indices_boundary(indicesA, + cur_row_a, + start_offset_a, + stop_offset_a, + start_index_a, + stop_index_a, + first_a_chunk, + last_a_chunk); + + value_idx cur_row_b = -1; + value_t c = 0.0; + + auto warp_red = warp_reduce(*(temp_storage + warp_id)); + + if (tid < active_chunk_size) { + cur_row_b = rowsB[ind]; + + auto index_b = indicesB[ind]; + auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); + + if (in_bounds) { + value_t a_col = strategy.find(finder, index_b); + if (!rev || a_col == 0.0) { c = product_func(a_col, dataB[ind]); } + } + } + + // loop through chunks in parallel, reducing when a new row is + // encountered by each thread + for (int i = tid; i < active_chunk_size; i += blockDim.x) { + value_idx ind_next = ind + blockDim.x; + value_idx next_row_b = -1; + + if (i + blockDim.x < active_chunk_size) next_row_b = rowsB[ind_next]; + + bool diff_rows = next_row_b != cur_row_b; + + if (__any_sync(0xffffffff, diff_rows)) { + // grab the threads currently participating in loops. + // because any other threads should have returned already. + unsigned int peer_group = __match_any_sync(0xffffffff, cur_row_b); + bool is_leader = get_lowest_peer(peer_group) == lane_id; + value_t v = warp_red.HeadSegmentedReduce(c, is_leader, accum_func); + + // thread with lowest lane id among peers writes out + if (is_leader && v != 0.0) { + // this conditional should be uniform, since rev is constant + size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b : (size_t)cur_row_b * m + cur_row_a; + write_func(out + idx, v); + } + + c = 0.0; + } + + if (next_row_b != -1) { + ind = ind_next; + + auto index_b = indicesB[ind]; + auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); + if (in_bounds) { + value_t a_col = strategy.find(finder, index_b); + + if (!rev || a_col == 0.0) { c = accum_func(c, product_func(a_col, dataB[ind])); } + } + + cur_row_b = next_row_b; + } + } +} + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv_strategies/base_strategy.cuh b/cpp/src/distance/detail/sparse/coo_spmv_strategies/base_strategy.cuh new file mode 100644 index 000000000..457b25eea --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv_strategies/base_strategy.cuh @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "../common.hpp" +#include "../coo_spmv_kernel.cuh" +#include "../utils.cuh" +#include "coo_mask_row_iterators.cuh" + +#include + +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +class coo_spmv_strategy { + public: + coo_spmv_strategy(const distances_config_t& config_) : config(config_) + { + smem = raft::getSharedMemPerBlock(); + } + + template + void _dispatch_base(strategy_t& strategy, + int smem_dim, + indptr_it& a_indptr, + value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size, + int n_blocks, + int n_blocks_per_row) + { + RAFT_CUDA_TRY(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); + + balanced_coo_generalized_spmv_kernel + <<>>(strategy, + a_indptr, + config.a_indices, + config.a_data, + config.a_nnz, + coo_rows_b, + config.b_indices, + config.b_data, + config.a_nrows, + config.b_nrows, + smem_dim, + config.b_nnz, + out_dists, + n_blocks_per_row, + chunk_size, + config.b_ncols, + product_func, + accum_func, + write_func); + } + + template + void _dispatch_base_rev(strategy_t& strategy, + int smem_dim, + indptr_it& b_indptr, + value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size, + int n_blocks, + int n_blocks_per_row) + { + RAFT_CUDA_TRY(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); + + balanced_coo_generalized_spmv_kernel + <<>>(strategy, + b_indptr, + config.b_indices, + config.b_data, + config.b_nnz, + coo_rows_a, + config.a_indices, + config.a_data, + config.b_nrows, + config.a_nrows, + smem_dim, + config.a_nnz, + out_dists, + n_blocks_per_row, + chunk_size, + config.a_ncols, + product_func, + accum_func, + write_func); + } + + protected: + int smem; + const distances_config_t& config; +}; + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv_strategies/coo_mask_row_iterators.cuh b/cpp/src/distance/detail/sparse/coo_spmv_strategies/coo_mask_row_iterators.cuh new file mode 100644 index 000000000..a9040e1d8 --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv_strategies/coo_mask_row_iterators.cuh @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "../common.hpp" +#include "../utils.cuh" + +#include // raft::ceildiv + +#include + +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +class mask_row_it { + public: + mask_row_it(const value_idx* full_indptr_, + const value_idx& n_rows_, + value_idx* mask_row_idx_ = NULL) + : full_indptr(full_indptr_), mask_row_idx(mask_row_idx_), n_rows(n_rows_) + { + } + + __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b) + { + if (mask_row_idx != NULL) { + return mask_row_idx[blockIdx.x / n_blocks_nnz_b]; + } else { + return blockIdx.x / n_blocks_nnz_b; + } + } + + __device__ inline void get_row_offsets(const value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + const value_idx& n_blocks_nnz_b, + bool& first_a_chunk, + bool& last_a_chunk) + { + start_offset = full_indptr[row_idx]; + stop_offset = full_indptr[row_idx + 1] - 1; + } + + __device__ constexpr inline void get_indices_boundary(const value_idx* indices, + value_idx& indices_len, + value_idx& start_offset, + value_idx& stop_offset, + value_idx& start_index, + value_idx& stop_index, + bool& first_a_chunk, + bool& last_a_chunk) + { + // do nothing; + } + + __device__ constexpr inline bool check_indices_bounds(value_idx& start_index_a, + value_idx& stop_index_a, + value_idx& index_b) + { + return true; + } + + const value_idx *full_indptr, &n_rows; + value_idx* mask_row_idx; +}; + +template +RAFT_KERNEL fill_chunk_indices_kernel(value_idx* n_chunks_per_row, + value_idx* chunk_indices, + value_idx n_rows) +{ + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < n_rows) { + auto start = n_chunks_per_row[tid]; + auto end = n_chunks_per_row[tid + 1]; + +#pragma unroll + for (int i = start; i < end; i++) { + chunk_indices[i] = tid; + } + } +} + +template +class chunked_mask_row_it : public mask_row_it { + public: + chunked_mask_row_it(const value_idx* full_indptr_, + const value_idx& n_rows_, + value_idx* mask_row_idx_, + int row_chunk_size_, + const value_idx* n_chunks_per_row_, + const value_idx* chunk_indices_, + const cudaStream_t stream_) + : mask_row_it(full_indptr_, n_rows_, mask_row_idx_), + row_chunk_size(row_chunk_size_), + n_chunks_per_row(n_chunks_per_row_), + chunk_indices(chunk_indices_), + stream(stream_) + { + } + + static void init(const value_idx* indptr, + const value_idx* mask_row_idx, + const value_idx& n_rows, + const int row_chunk_size, + rmm::device_uvector& n_chunks_per_row, + rmm::device_uvector& chunk_indices, + cudaStream_t stream) + { + auto policy = rmm::exec_policy(stream); + + constexpr value_idx first_element = 0; + n_chunks_per_row.set_element_async(0, first_element, stream); + n_chunks_per_row_functor chunk_functor(indptr, row_chunk_size); + thrust::transform( + policy, mask_row_idx, mask_row_idx + n_rows, n_chunks_per_row.begin() + 1, chunk_functor); + + thrust::inclusive_scan( + policy, n_chunks_per_row.begin() + 1, n_chunks_per_row.end(), n_chunks_per_row.begin() + 1); + + raft::update_host(&total_row_blocks, n_chunks_per_row.data() + n_rows, 1, stream); + + fill_chunk_indices(n_rows, n_chunks_per_row, chunk_indices, stream); + } + + __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b) + { + return this->mask_row_idx[chunk_indices[blockIdx.x / n_blocks_nnz_b]]; + } + + __device__ inline void get_row_offsets(const value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + const int& n_blocks_nnz_b, + bool& first_a_chunk, + bool& last_a_chunk) + 
{ + auto chunk_index = blockIdx.x / n_blocks_nnz_b; + auto chunk_val = chunk_indices[chunk_index]; + auto prev_n_chunks = n_chunks_per_row[chunk_val]; + auto relative_chunk = chunk_index - prev_n_chunks; + first_a_chunk = relative_chunk == 0; + + start_offset = this->full_indptr[row_idx] + relative_chunk * row_chunk_size; + stop_offset = start_offset + row_chunk_size; + + auto final_stop_offset = this->full_indptr[row_idx + 1]; + + last_a_chunk = stop_offset >= final_stop_offset; + stop_offset = last_a_chunk ? final_stop_offset - 1 : stop_offset - 1; + } + + __device__ inline void get_indices_boundary(const value_idx* indices, + value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + value_idx& start_index, + value_idx& stop_index, + bool& first_a_chunk, + bool& last_a_chunk) + { + start_index = first_a_chunk ? start_index : indices[start_offset - 1] + 1; + stop_index = last_a_chunk ? stop_index : indices[stop_offset]; + } + + __device__ inline bool check_indices_bounds(value_idx& start_index_a, + value_idx& stop_index_a, + value_idx& index_b) + { + return (index_b >= start_index_a && index_b <= stop_index_a); + } + + inline static value_idx total_row_blocks = 0; + const cudaStream_t stream; + const value_idx *n_chunks_per_row, *chunk_indices; + value_idx row_chunk_size; + + struct n_chunks_per_row_functor { + public: + n_chunks_per_row_functor(const value_idx* indptr_, value_idx row_chunk_size_) + : indptr(indptr_), row_chunk_size(row_chunk_size_) + { + } + + __host__ __device__ value_idx operator()(const value_idx& i) + { + auto degree = indptr[i + 1] - indptr[i]; + return raft::ceildiv(degree, (value_idx)row_chunk_size); + } + + const value_idx* indptr; + value_idx row_chunk_size; + }; + + private: + static void fill_chunk_indices(const value_idx& n_rows, + rmm::device_uvector& n_chunks_per_row, + rmm::device_uvector& chunk_indices, + cudaStream_t stream) + { + auto n_threads = std::min(n_rows, 256); + auto n_blocks = raft::ceildiv(n_rows, (value_idx)n_threads); + + chunk_indices.resize(total_row_blocks, stream); + + fill_chunk_indices_kernel + <<>>(n_chunks_per_row.data(), chunk_indices.data(), n_rows); + } +}; + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv_strategies/dense_smem_strategy.cuh b/cpp/src/distance/detail/sparse/coo_spmv_strategies/dense_smem_strategy.cuh new file mode 100644 index 000000000..baa913a6c --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv_strategies/dense_smem_strategy.cuh @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "base_strategy.cuh" + +#include // raft::ceildiv + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +class dense_smem_strategy : public coo_spmv_strategy { + public: + using smem_type = value_t*; + using insert_type = smem_type; + using find_type = smem_type; + + dense_smem_strategy(const distances_config_t& config_) + : coo_spmv_strategy(config_) + { + } + + inline static int smem_per_block(int n_cols) + { + return (n_cols * sizeof(value_t)) + ((1024 / raft::warp_size()) * sizeof(value_t)); + } + + template + void dispatch(value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * 1024); + auto n_blocks = this->config.a_nrows * n_blocks_per_row; + + mask_row_it a_indptr(this->config.a_indptr, this->config.a_nrows); + + this->_dispatch_base(*this, + this->config.b_ncols, + a_indptr, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + n_blocks, + n_blocks_per_row); + } + + template + void dispatch_rev(value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * 1024); + auto n_blocks = this->config.b_nrows * n_blocks_per_row; + + mask_row_it b_indptr(this->config.b_indptr, this->config.b_nrows); + + this->_dispatch_base_rev(*this, + this->config.a_ncols, + b_indptr, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_blocks, + n_blocks_per_row); + } + + __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size) + { + for (int k = threadIdx.x; k < cache_size; k += blockDim.x) { + cache[k] = 0.0; + } + return cache; + } + + __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value) + { + cache[key] = value; + } + + __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size) + { + return cache; + } + + __device__ inline value_t find(find_type cache, const value_idx& key) { return cache[key]; } +}; + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv_strategies/hash_strategy.cuh b/cpp/src/distance/detail/sparse/coo_spmv_strategies/hash_strategy.cuh new file mode 100644 index 000000000..cf212076b --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv_strategies/hash_strategy.cuh @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "base_strategy.cuh" + +#include +#include + +#include +#include +#include + +// this is needed by cuco as key, value must be bitwise comparable. 
+// compilers don't declare float/double as bitwise comparable +// but that is too strict +// for example, the following is true (or 0): +// float a = 5; +// float b = 5; +// memcmp(&a, &b, sizeof(float)); +CUCO_DECLARE_BITWISE_COMPARABLE(float); +CUCO_DECLARE_BITWISE_COMPARABLE(double); + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +class hash_strategy : public coo_spmv_strategy { + public: + using insert_type = typename cuco::legacy:: + static_map::device_mutable_view; + using smem_type = typename insert_type::slot_type*; + using find_type = + typename cuco::legacy::static_map::device_view; + + hash_strategy(const distances_config_t& config_, + float capacity_threshold_ = 0.5, + int map_size_ = get_map_size()) + : coo_spmv_strategy(config_), + capacity_threshold(capacity_threshold_), + map_size(map_size_) + { + } + + void chunking_needed(const value_idx* indptr, + const value_idx n_rows, + rmm::device_uvector& mask_indptr, + std::tuple& n_rows_divided, + cudaStream_t stream) + { + auto policy = raft::resource::get_thrust_policy(this->config.handle); + + auto less = thrust::copy_if(policy, + thrust::make_counting_iterator(value_idx(0)), + thrust::make_counting_iterator(n_rows), + mask_indptr.data(), + fits_in_hash_table(indptr, 0, capacity_threshold * map_size)); + std::get<0>(n_rows_divided) = less - mask_indptr.data(); + + auto more = thrust::copy_if( + policy, + thrust::make_counting_iterator(value_idx(0)), + thrust::make_counting_iterator(n_rows), + less, + fits_in_hash_table( + indptr, capacity_threshold * map_size, std::numeric_limits::max())); + std::get<1>(n_rows_divided) = more - less; + } + + template + void dispatch(value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * tpb); + rmm::device_uvector mask_indptr( + this->config.a_nrows, raft::resource::get_cuda_stream(this->config.handle)); + std::tuple n_rows_divided; + + chunking_needed(this->config.a_indptr, + this->config.a_nrows, + mask_indptr, + n_rows_divided, + raft::resource::get_cuda_stream(this->config.handle)); + + auto less_rows = std::get<0>(n_rows_divided); + if (less_rows > 0) { + mask_row_it less(this->config.a_indptr, less_rows, mask_indptr.data()); + + auto n_less_blocks = less_rows * n_blocks_per_row; + this->_dispatch_base(*this, + map_size, + less, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + n_less_blocks, + n_blocks_per_row); + } + + auto more_rows = std::get<1>(n_rows_divided); + if (more_rows > 0) { + rmm::device_uvector n_chunks_per_row( + more_rows + 1, raft::resource::get_cuda_stream(this->config.handle)); + rmm::device_uvector chunk_indices( + 0, raft::resource::get_cuda_stream(this->config.handle)); + chunked_mask_row_it::init(this->config.a_indptr, + mask_indptr.data() + less_rows, + more_rows, + capacity_threshold * map_size, + n_chunks_per_row, + chunk_indices, + raft::resource::get_cuda_stream(this->config.handle)); + + chunked_mask_row_it more(this->config.a_indptr, + more_rows, + mask_indptr.data() + less_rows, + capacity_threshold * map_size, + n_chunks_per_row.data(), + chunk_indices.data(), + raft::resource::get_cuda_stream(this->config.handle)); + + auto n_more_blocks = more.total_row_blocks * n_blocks_per_row; + this->_dispatch_base(*this, + map_size, + more, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + 
n_more_blocks, + n_blocks_per_row); + } + } + + template + void dispatch_rev(value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * tpb); + rmm::device_uvector mask_indptr( + this->config.b_nrows, raft::resource::get_cuda_stream(this->config.handle)); + std::tuple n_rows_divided; + + chunking_needed(this->config.b_indptr, + this->config.b_nrows, + mask_indptr, + n_rows_divided, + raft::resource::get_cuda_stream(this->config.handle)); + + auto less_rows = std::get<0>(n_rows_divided); + if (less_rows > 0) { + mask_row_it less(this->config.b_indptr, less_rows, mask_indptr.data()); + + auto n_less_blocks = less_rows * n_blocks_per_row; + this->_dispatch_base_rev(*this, + map_size, + less, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_less_blocks, + n_blocks_per_row); + } + + auto more_rows = std::get<1>(n_rows_divided); + if (more_rows > 0) { + rmm::device_uvector n_chunks_per_row( + more_rows + 1, raft::resource::get_cuda_stream(this->config.handle)); + rmm::device_uvector chunk_indices( + 0, raft::resource::get_cuda_stream(this->config.handle)); + chunked_mask_row_it::init(this->config.b_indptr, + mask_indptr.data() + less_rows, + more_rows, + capacity_threshold * map_size, + n_chunks_per_row, + chunk_indices, + raft::resource::get_cuda_stream(this->config.handle)); + + chunked_mask_row_it more(this->config.b_indptr, + more_rows, + mask_indptr.data() + less_rows, + capacity_threshold * map_size, + n_chunks_per_row.data(), + chunk_indices.data(), + raft::resource::get_cuda_stream(this->config.handle)); + + auto n_more_blocks = more.total_row_blocks * n_blocks_per_row; + this->_dispatch_base_rev(*this, + map_size, + more, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_more_blocks, + n_blocks_per_row); + } + } + + __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size) + { + return insert_type::make_from_uninitialized_slots(cooperative_groups::this_thread_block(), + cache, + cache_size, + cuco::empty_key{value_idx{-1}}, + cuco::empty_value{value_t{0}}); + } + + __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value) + { + auto success = cache.insert(cuco::pair(key, value)); + } + + __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size) + { + return find_type( + cache, cache_size, cuco::empty_key{value_idx{-1}}, cuco::empty_value{value_t{0}}); + } + + __device__ inline value_t find(find_type cache, const value_idx& key) + { + auto a_pair = cache.find(key); + + value_t a_col = 0.0; + if (a_pair != cache.end()) { a_col = a_pair->second; } + return a_col; + } + + struct fits_in_hash_table { + public: + fits_in_hash_table(const value_idx* indptr_, value_idx degree_l_, value_idx degree_r_) + : indptr(indptr_), degree_l(degree_l_), degree_r(degree_r_) + { + } + + __host__ __device__ bool operator()(const value_idx& i) + { + auto degree = indptr[i + 1] - indptr[i]; + + return degree >= degree_l && degree < degree_r; + } + + private: + const value_idx* indptr; + const value_idx degree_l, degree_r; + }; + + inline static int get_map_size() + { + return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) / + sizeof(typename insert_type::slot_type); + } + + private: + float capacity_threshold; + int map_size; +}; + +} // namespace sparse +} // 
namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/ip_distance.cuh b/cpp/src/distance/detail/sparse/ip_distance.cuh new file mode 100644 index 000000000..3a11d4e99 --- /dev/null +++ b/cpp/src/distance/detail/sparse/ip_distance.cuh @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "common.hpp" +#include "coo_spmv.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +class ip_distances_t : public distances_t { + public: + /** + * Computes simple sparse inner product distances as sum(x_y * y_k) + * @param[in] config specifies inputs, outputs, and sizes + */ + ip_distances_t(const distances_config_t& config) + : config_(&config), coo_rows_b(config.b_nnz, raft::resource::get_cuda_stream(config.handle)) + { + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows_b.data(), + config_->b_nnz, + raft::resource::get_cuda_stream(config_->handle)); + } + + /** + * Performs pairwise distance computation and computes output distances + * @param out_distances dense output matrix (size a_nrows * b_nrows) + */ + void compute(value_t* out_distances) + { + /** + * Compute pairwise distances and return dense matrix in row-major format + */ + balanced_coo_pairwise_generalized_spmv(out_distances, + *config_, + coo_rows_b.data(), + raft::mul_op(), + raft::add_op(), + raft::atomic_add_op()); + } + + value_idx* b_rows_coo() { return coo_rows_b.data(); } + + value_t* b_data_coo() { return config_->b_data; } + + private: + const distances_config_t* config_; + rmm::device_uvector coo_rows_b; +}; + +} // END namespace sparse +} // END namespace detail +} // END namespace distance +} // END namespace cuvs diff --git a/cpp/src/distance/detail/sparse/l2_distance.cuh b/cpp/src/distance/detail/sparse/l2_distance.cuh new file mode 100644 index 000000000..40e7070fc --- /dev/null +++ b/cpp/src/distance/detail/sparse/l2_distance.cuh @@ -0,0 +1,502 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "common.hpp" +#include "ip_distance.cuh" +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +// @TODO: Move this into sparse prims (coo_norm) +template +RAFT_KERNEL compute_row_norm_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ + value_idx i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i] * data[i]); } +} + +template +RAFT_KERNEL compute_row_sum_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ + value_idx i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i]); } +} + +template +RAFT_KERNEL compute_euclidean_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_sq_norms, + const value_t* __restrict__ R_sq_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func) +{ + std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; + + if (i >= n_rows || j >= n_cols) return; + + value_t dot = C[(size_t)i * n_cols + j]; + + // e.g. Euclidean expansion func = -2.0 * dot + q_norm + r_norm + value_t val = expansion_func(dot, Q_sq_norms[i], R_sq_norms[j]); + + // correct for small instabilities + C[(size_t)i * n_cols + j] = val * (fabs(val) >= 0.0001); +} + +template +RAFT_KERNEL compute_correlation_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_sq_norms, + const value_t* __restrict__ R_sq_norms, + const value_t* __restrict__ Q_norms, + const value_t* __restrict__ R_norms, + value_idx n_rows, + value_idx n_cols, + value_idx n) +{ + std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; + + if (i >= n_rows || j >= n_cols) return; + + value_t dot = C[(size_t)i * n_cols + j]; + value_t Q_l1 = Q_norms[i]; + value_t R_l1 = R_norms[j]; + + value_t Q_l2 = Q_sq_norms[i]; + value_t R_l2 = R_sq_norms[j]; + + value_t numer = n * dot - (Q_l1 * R_l1); + value_t Q_denom = n * Q_l2 - (Q_l1 * Q_l1); + value_t R_denom = n * R_l2 - (R_l1 * R_l1); + + value_t val = 1 - (numer / raft::sqrt(Q_denom * R_denom)); + + // correct for small instabilities + C[(size_t)i * n_cols + j] = val * (fabs(val) >= 0.0001); +} + +template +void compute_euclidean(value_t* C, + const value_t* Q_sq_norms, + const value_t* R_sq_norms, + value_idx n_rows, + value_idx n_cols, + cudaStream_t stream, + expansion_f expansion_func) +{ + int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); + compute_euclidean_warp_kernel<<>>( + C, Q_sq_norms, R_sq_norms, n_rows, n_cols, expansion_func); +} + +template +void compute_l2(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, + cudaStream_t stream, + expansion_f expansion_func) +{ + rmm::device_uvector Q_sq_norms(m, stream); + rmm::device_uvector R_sq_norms(n, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); + RAFT_CUDA_TRY(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); + + compute_row_norm_kernel<<>>( + Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz); + compute_row_norm_kernel<<>>( 
+ R_sq_norms.data(), R_coo_rows, R_data, R_nnz); + + compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream, expansion_func); +} + +template +void compute_correlation(value_t* C, + const value_t* Q_sq_norms, + const value_t* R_sq_norms, + const value_t* Q_norms, + const value_t* R_norms, + value_idx n_rows, + value_idx n_cols, + value_idx n, + cudaStream_t stream) +{ + int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); + compute_correlation_warp_kernel<<>>( + C, Q_sq_norms, R_sq_norms, Q_norms, R_norms, n_rows, n_cols, n); +} + +template +void compute_corr(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, + value_idx n_cols, + cudaStream_t stream) +{ + // sum_sq for std dev + rmm::device_uvector Q_sq_norms(m, stream); + rmm::device_uvector R_sq_norms(n, stream); + + // sum for mean + rmm::device_uvector Q_norms(m, stream); + rmm::device_uvector R_norms(n, stream); + + RAFT_CUDA_TRY(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); + RAFT_CUDA_TRY(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); + + RAFT_CUDA_TRY(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); + RAFT_CUDA_TRY(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); + + compute_row_norm_kernel<<>>( + Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz); + compute_row_norm_kernel<<>>( + R_sq_norms.data(), R_coo_rows, R_data, R_nnz); + + compute_row_sum_kernel<<>>( + Q_norms.data(), Q_coo_rows, Q_data, Q_nnz); + compute_row_sum_kernel<<>>( + R_norms.data(), R_coo_rows, R_data, R_nnz); + + compute_correlation(out, + Q_sq_norms.data(), + R_sq_norms.data(), + Q_norms.data(), + R_norms.data(), + m, + n, + n_cols, + stream); +} + +/** + * L2 distance using the expanded form: sum(x_k)^2 + sum(y_k)^2 - 2 * sum(x_k * y_k) + * The expanded form is more efficient for sparse data. + */ +template +class l2_expanded_distances_t : public distances_t { + public: + explicit l2_expanded_distances_t(const distances_config_t& config) + : config_(&config), ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); + + rmm::device_uvector search_coo_rows( + config_->a_nnz, raft::resource::get_cuda_stream(config_->handle)); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + compute_l2(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + raft::resource::get_cuda_stream(config_->handle), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + return -2 * dot + q_norm + r_norm; + }); + } + + ~l2_expanded_distances_t() = default; + + protected: + const distances_config_t* config_; + ip_distances_t ip_dists; +}; + +/** + * L2 sqrt distance performing the sqrt operation after the distance computation + * The expanded form is more efficient for sparse data. 
+ */ +template +class l2_sqrt_expanded_distances_t : public l2_expanded_distances_t { + public: + explicit l2_sqrt_expanded_distances_t(const distances_config_t& config) + : l2_expanded_distances_t(config) + { + } + + void compute(value_t* out_dists) override + { + l2_expanded_distances_t::compute(out_dists); + // Sqrt Post-processing + raft::linalg::unaryOp( + out_dists, + out_dists, + this->config_->a_nrows * this->config_->b_nrows, + [] __device__(value_t input) { + int neg = input < 0 ? -1 : 1; + return raft::sqrt(abs(input) * neg); + }, + raft::resource::get_cuda_stream(this->config_->handle)); + } + + ~l2_sqrt_expanded_distances_t() = default; +}; + +template +class correlation_expanded_distances_t : public distances_t { + public: + explicit correlation_expanded_distances_t(const distances_config_t& config) + : config_(&config), ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); + + rmm::device_uvector search_coo_rows( + config_->a_nnz, raft::resource::get_cuda_stream(config_->handle)); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + compute_corr(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->b_ncols, + raft::resource::get_cuda_stream(config_->handle)); + } + + ~correlation_expanded_distances_t() = default; + + protected: + const distances_config_t* config_; + ip_distances_t ip_dists; +}; + +/** + * Cosine distance using the expanded form: 1 - ( sum(x_k * y_k) / (sqrt(sum(x_k)^2) * + * sqrt(sum(y_k)^2))) The expanded form is more efficient for sparse data. + */ +template +class cosine_expanded_distances_t : public distances_t { + public: + explicit cosine_expanded_distances_t(const distances_config_t& config) + : config_(&config), + workspace(0, raft::resource::get_cuda_stream(config.handle)), + ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); + + rmm::device_uvector search_coo_rows( + config_->a_nnz, raft::resource::get_cuda_stream(config_->handle)); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + compute_l2(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + raft::resource::get_cuda_stream(config_->handle), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t norms = raft::sqrt(q_norm) * raft::sqrt(r_norm); + // deal with potential for 0 in denominator by forcing 0/1 instead + value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms); + + // flip the similarity when both rows are 0 + bool both_empty = (q_norm == 0) && (r_norm == 0); + return 1 - ((!both_empty * cos) + both_empty); + }); + } + + ~cosine_expanded_distances_t() = default; + + private: + const distances_config_t* config_; + rmm::device_uvector workspace; + ip_distances_t ip_dists; +}; + +/** + * Hellinger distance using the expanded form: sqrt(1 - sum(sqrt(x_k) * sqrt(y_k))) + * The expanded form is more efficient for sparse data. 
+ * + * This distance computation modifies A and B by computing a sqrt + * and then performing a `pow(x, 2)` to convert it back. Because of this, + * it is possible that the values in A and B might differ slightly + * after this is invoked. + */ +template +class hellinger_expanded_distances_t : public distances_t { + public: + explicit hellinger_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, raft::resource::get_cuda_stream(config.handle)) + { + } + + void compute(value_t* out_dists) + { + rmm::device_uvector coo_rows(std::max(config_->b_nnz, config_->a_nnz), + raft::resource::get_cuda_stream(config_->handle)); + + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + balanced_coo_pairwise_generalized_spmv( + out_dists, + *config_, + coo_rows.data(), + [] __device__(value_t a, value_t b) { return raft::sqrt(a) * raft::sqrt(b); }, + raft::add_op(), + raft::atomic_add_op()); + + raft::linalg::unaryOp( + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, + [=] __device__(value_t input) { + // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative + bool rectifier = (1 - input) > 0; + return raft::sqrt(rectifier * (1 - input)); + }, + raft::resource::get_cuda_stream(config_->handle)); + } + + ~hellinger_expanded_distances_t() = default; + + private: + const distances_config_t* config_; + rmm::device_uvector workspace; +}; + +template +class russelrao_expanded_distances_t : public distances_t { + public: + explicit russelrao_expanded_distances_t(const distances_config_t& config) + : config_(&config), + workspace(0, raft::resource::get_cuda_stream(config.handle)), + ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_t n_cols = config_->a_ncols; + value_t n_cols_inv = 1.0 / n_cols; + raft::linalg::unaryOp( + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, + [=] __device__(value_t input) { return (n_cols - input) * n_cols_inv; }, + raft::resource::get_cuda_stream(config_->handle)); + + auto exec_policy = rmm::exec_policy(raft::resource::get_cuda_stream(config_->handle)); + auto diags = thrust::counting_iterator(0); + value_idx b_nrows = config_->b_nrows; + thrust::for_each(exec_policy, diags, diags + config_->a_nrows, [=] __device__(value_idx input) { + out_dists[input * b_nrows + input] = 0.0; + }); + } + + ~russelrao_expanded_distances_t() = default; + + private: + const distances_config_t* config_; + rmm::device_uvector workspace; + ip_distances_t ip_dists; +}; + +} // END namespace sparse +} // END namespace detail +} // END namespace distance +} // END namespace cuvs diff --git a/cpp/src/distance/detail/sparse/lp_distance.cuh b/cpp/src/distance/detail/sparse/lp_distance.cuh new file mode 100644 index 000000000..18e7b04e4 --- /dev/null +++ b/cpp/src/distance/detail/sparse/lp_distance.cuh @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "common.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +void unexpanded_lp_distances(value_t* out_dists, + const distances_config_t* config_, + product_f product_func, + accum_f accum_func, + write_f write_func) +{ + rmm::device_uvector coo_rows(std::max(config_->b_nnz, config_->a_nnz), + raft::resource::get_cuda_stream(config_->handle)); + + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + balanced_coo_pairwise_generalized_spmv( + out_dists, *config_, coo_rows.data(), product_func, accum_func, write_func); + + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + balanced_coo_pairwise_generalized_spmv_rev( + out_dists, *config_, coo_rows.data(), product_func, accum_func, write_func); +} + +/** + * Computes L1 distances for sparse input. This does not have + * an equivalent expanded form, so it is only executed in + * an unexpanded form. + * @tparam value_idx + * @tparam value_t + */ +template +class l1_unexpanded_distances_t : public distances_t { + public: + l1_unexpanded_distances_t(const distances_config_t& config) : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, config_, raft::absdiff_op(), raft::add_op(), raft::atomic_add_op()); + } + + private: + const distances_config_t* config_; +}; + +template +class l2_unexpanded_distances_t : public distances_t { + public: + l2_unexpanded_distances_t(const distances_config_t& config) : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, config_, raft::sqdiff_op(), raft::add_op(), raft::atomic_add_op()); + } + + protected: + const distances_config_t* config_; +}; + +template +class l2_sqrt_unexpanded_distances_t : public l2_unexpanded_distances_t { + public: + l2_sqrt_unexpanded_distances_t(const distances_config_t& config) + : l2_unexpanded_distances_t(config) + { + } + + void compute(value_t* out_dists) + { + l2_unexpanded_distances_t::compute(out_dists); + + uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows; + // Sqrt Post-processing + raft::linalg::unaryOp( + out_dists, + out_dists, + n, + [] __device__(value_t input) { + int neg = input < 0 ? 
-1 : 1; + return raft::sqrt(abs(input) * neg); + }, + raft::resource::get_cuda_stream(this->config_->handle)); + } +}; + +template +class linf_unexpanded_distances_t : public distances_t { + public: + explicit linf_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, config_, raft::absdiff_op(), raft::max_op(), raft::atomic_max_op()); + } + + private: + const distances_config_t* config_; +}; + +template +class canberra_unexpanded_distances_t : public distances_t { + public: + explicit canberra_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, + config_, + [] __device__(value_t a, value_t b) { + value_t d = fabs(a) + fabs(b); + + // deal with potential for 0 in denominator by + // forcing 1/0 instead + return ((d != 0) * fabs(a - b)) / (d + (d == 0)); + }, + raft::add_op(), + raft::atomic_add_op()); + } + + private: + const distances_config_t* config_; +}; + +template +class lp_unexpanded_distances_t : public distances_t { + public: + explicit lp_unexpanded_distances_t(const distances_config_t& config, + value_t p_) + : config_(&config), p(p_) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, + config_, + raft::compose_op(raft::pow_const_op(p), raft::sub_op()), + raft::add_op(), + raft::atomic_add_op()); + + uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows; + value_t one_over_p = value_t{1} / p; + raft::linalg::unaryOp(out_dists, + out_dists, + n, + raft::pow_const_op(one_over_p), + raft::resource::get_cuda_stream(config_->handle)); + } + + private: + const distances_config_t* config_; + value_t p; +}; + +template +class hamming_unexpanded_distances_t : public distances_t { + public: + explicit hamming_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, config_, raft::notequal_op(), raft::add_op(), raft::atomic_add_op()); + + uint64_t n = (uint64_t)config_->a_nrows * (uint64_t)config_->b_nrows; + value_t n_cols = 1.0 / config_->a_ncols; + raft::linalg::unaryOp(out_dists, + out_dists, + n, + raft::mul_const_op(n_cols), + raft::resource::get_cuda_stream(config_->handle)); + } + + private: + const distances_config_t* config_; +}; + +template +class jensen_shannon_unexpanded_distances_t : public distances_t { + public: + explicit jensen_shannon_unexpanded_distances_t( + const distances_config_t& config) + : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, + config_, + [] __device__(value_t a, value_t b) { + value_t m = 0.5f * (a + b); + bool a_zero = a == 0; + bool b_zero = b == 0; + + value_t x = (!a_zero * m) / (a_zero + a); + value_t y = (!b_zero * m) / (b_zero + b); + + bool x_zero = x == 0; + bool y_zero = y == 0; + + return (-a * (!x_zero * log(x + x_zero))) + (-b * (!y_zero * log(y + y_zero))); + }, + raft::add_op(), + raft::atomic_add_op()); + + uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows; + raft::linalg::unaryOp( + out_dists, + out_dists, + n, + [=] __device__(value_t input) { return raft::sqrt(0.5 * input); }, + raft::resource::get_cuda_stream(config_->handle)); + } + + private: + const distances_config_t* config_; +}; + +template +class kl_divergence_unexpanded_distances_t : public 
distances_t { + public: + explicit kl_divergence_unexpanded_distances_t( + const distances_config_t& config) + : config_(&config) + { + } + + void compute(value_t* out_dists) + { + rmm::device_uvector coo_rows(std::max(config_->b_nnz, config_->a_nnz), + raft::resource::get_cuda_stream(config_->handle)); + + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + balanced_coo_pairwise_generalized_spmv( + out_dists, + *config_, + coo_rows.data(), + [] __device__(value_t a, value_t b) { return a * log(a / b); }, + raft::add_op(), + raft::atomic_add_op()); + + uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows; + raft::linalg::unaryOp(out_dists, + out_dists, + n, + raft::mul_const_op(0.5), + raft::resource::get_cuda_stream(config_->handle)); + } + + private: + const distances_config_t* config_; +}; + +} // END namespace sparse +} // END namespace detail +} // END namespace distance +} // END namespace cuvs diff --git a/cpp/src/distance/detail/sparse/utils.cuh b/cpp/src/distance/detail/sparse/utils.cuh new file mode 100644 index 000000000..dc7ae6df6 --- /dev/null +++ b/cpp/src/distance/detail/sparse/utils.cuh @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +/** + * Computes the maximum number of columns that can be stored + * in shared memory in dense form with the given block size + * and precision. 
+ * @return the maximum number of columns that can be stored in smem + */ +template +inline int max_cols_per_block() +{ + // max cols = (total smem available - cub reduction smem) + return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) / + sizeof(value_t); +} + +template +RAFT_KERNEL faster_dot_on_csr_kernel(dot_t* __restrict__ dot, + const value_idx* __restrict__ indptr, + const value_idx* __restrict__ cols, + const value_t* __restrict__ A, + const value_t* __restrict__ B, + const value_idx nnz, + const value_idx n_rows, + const value_idx dim) +{ + auto vec_id = threadIdx.x; + auto lane_id = threadIdx.x & 0x1f; + + extern __shared__ char smem[]; + value_t* s_A = (value_t*)smem; + value_idx cur_row = -1; + + for (int row = blockIdx.x; row < n_rows; row += gridDim.x) { + for (int dot_id = blockIdx.y + indptr[row]; dot_id < indptr[row + 1]; dot_id += gridDim.y) { + if (dot_id >= nnz) { return; } + const value_idx col = cols[dot_id] * dim; + const value_t* __restrict__ B_col = B + col; + + if (threadIdx.x == 0) { dot[dot_id] = 0.0; } + __syncthreads(); + + if (cur_row != row) { + for (value_idx k = vec_id; k < dim; k += blockDim.x) { + s_A[k] = A[row * dim + k]; + } + cur_row = row; + } + + dot_t l_dot_ = 0.0; + for (value_idx k = vec_id; k < dim; k += blockDim.x) { + asm("prefetch.global.L2 [%0];" ::"l"(B_col + k + blockDim.x)); + if constexpr ((std::is_same_v && std::is_same_v)) { + l_dot_ += __half2float(s_A[k]) * __half2float(__ldcg(B_col + k)); + } else { + l_dot_ += s_A[k] * __ldcg(B_col + k); + } + } + + typedef cub::WarpReduce WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage; + dot_t warp_sum = WarpReduce(temp_storage).Sum(l_dot_); + + if (lane_id == 0) { atomicAdd_block(dot + dot_id, warp_sum); } + } + } +} + +template +void faster_dot_on_csr(raft::resources const& handle, + dot_t* dot, + const value_idx nnz, + const value_idx* indptr, + const value_idx* cols, + const value_t* A, + const value_t* B, + const value_idx n_rows, + const value_idx dim) +{ + if (nnz == 0 || n_rows == 0) return; + + auto stream = raft::resource::get_cuda_stream(handle); + + constexpr value_idx MAX_ROW_PER_ITER = 500; + int dev_id, sm_count, blocks_per_sm; + + const int smem_size = dim * sizeof(value_t); + cudaGetDevice(&dev_id); + cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id); + + if (dim < 128) { + constexpr int tpb = 64; + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); + auto block_x = std::min(n_rows, MAX_ROW_PER_ITER); + auto block_y = + (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x; + dim3 blocks(block_x, block_y, 1); + + faster_dot_on_csr_kernel + <<>>(dot, indptr, cols, A, B, nnz, n_rows, dim); + + } else if (dim < 256) { + constexpr int tpb = 128; + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); + auto block_x = std::min(n_rows, MAX_ROW_PER_ITER); + auto block_y = + (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x; + dim3 blocks(block_x, block_y, 1); + + faster_dot_on_csr_kernel + <<>>(dot, indptr, cols, A, B, nnz, n_rows, dim); + } else if (dim < 512) { + constexpr int tpb = 256; + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); + auto block_x = std::min(n_rows, MAX_ROW_PER_ITER); + auto block_y = + (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / 
block_x; + dim3 blocks(block_x, block_y, 1); + + faster_dot_on_csr_kernel + <<>>(dot, indptr, cols, A, B, nnz, n_rows, dim); + } else { + constexpr int tpb = 512; + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); + auto block_x = std::min(n_rows, MAX_ROW_PER_ITER); + auto block_y = + (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x; + dim3 blocks(block_x, block_y, 1); + + faster_dot_on_csr_kernel + <<>>(dot, indptr, cols, A, B, nnz, n_rows, dim); + } + + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/distance-ext.cuh b/cpp/src/distance/distance-ext.cuh index e623f76ba..a692a62a3 100644 --- a/cpp/src/distance/distance-ext.cuh +++ b/cpp/src/distance/distance-ext.cuh @@ -273,13 +273,13 @@ instantiate_cuvs_distance_distance_extra(cuvs::distance::DistanceType::L2Unexpan float, float, float, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); instantiate_cuvs_distance_distance_extra(cuvs::distance::DistanceType::L2Unexpanded, double, double, double, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); #undef instantiate_cuvs_distance_distance_extra diff --git a/cpp/src/distance/distance.cu b/cpp/src/distance/distance.cu index c1d39f360..47e72460f 100644 --- a/cpp/src/distance/distance.cu +++ b/cpp/src/distance/distance.cu @@ -139,13 +139,13 @@ instantiate_cuvs_distance_distance_extra(cuvs::distance::DistanceType::L2Unexpan float, float, float, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); instantiate_cuvs_distance_distance_extra(cuvs::distance::DistanceType::L2Unexpanded, double, double, double, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); #undef instantiate_cuvs_distance_distance_extra diff --git a/cpp/src/distance/sparse_distance.cu b/cpp/src/distance/sparse_distance.cu new file mode 100644 index 000000000..338c4e908 --- /dev/null +++ b/cpp/src/distance/sparse_distance.cu @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include "sparse_distance.cuh" + +namespace cuvs { +namespace distance { + +template +void pairwise_distance( + raft::resources const& handle, + raft::device_csr_matrix_view x, + raft::device_csr_matrix_view y, + raft::device_matrix_view dist, + cuvs::distance::DistanceType metric, + float metric_arg = 2.0f) +{ + auto x_structure = x.structure_view(); + auto y_structure = y.structure_view(); + + RAFT_EXPECTS(x_structure.get_n_cols() == y_structure.get_n_cols(), + "Number of columns must be equal"); + + RAFT_EXPECTS(dist.extent(0) == x_structure.get_n_rows(), + "Number of rows in output must be equal to " + "number of rows in X"); + RAFT_EXPECTS(dist.extent(1) == y_structure.get_n_rows(), + "Number of columns in output must be equal to " + "number of rows in Y"); + + detail::sparse::distances_config_t input_config(handle); + input_config.a_nrows = x_structure.get_n_rows(); + input_config.a_ncols = x_structure.get_n_cols(); + input_config.a_nnz = x_structure.get_nnz(); + input_config.a_indptr = const_cast(x_structure.get_indptr().data()); + input_config.a_indices = const_cast(x_structure.get_indices().data()); + input_config.a_data = const_cast(x.get_elements().data()); + + input_config.b_nrows = y_structure.get_n_rows(); + input_config.b_ncols = y_structure.get_n_cols(); + input_config.b_nnz = y_structure.get_nnz(); + input_config.b_indptr = const_cast(y_structure.get_indptr().data()); + input_config.b_indices = const_cast(y_structure.get_indices().data()); + input_config.b_data = const_cast(y.get_elements().data()); + + pairwiseDistance(dist.data_handle(), input_config, metric, metric_arg); +} + +void pairwise_distance(raft::resources const& handle, + raft::device_csr_matrix_view x, + raft::device_csr_matrix_view y, + raft::device_matrix_view dist, + cuvs::distance::DistanceType metric, + float metric_arg) +{ + pairwise_distance(handle, x, y, dist, metric, metric_arg); +} + +void pairwise_distance(raft::resources const& handle, + raft::device_csr_matrix_view x, + raft::device_csr_matrix_view y, + raft::device_matrix_view dist, + cuvs::distance::DistanceType metric, + float metric_arg) +{ + pairwise_distance(handle, x, y, dist, metric, metric_arg); +} +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/sparse_distance.cuh b/cpp/src/distance/sparse_distance.cuh new file mode 100644 index 000000000..0d6dc0e6f --- /dev/null +++ b/cpp/src/distance/sparse_distance.cuh @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/sparse/bin_distance.cuh" +#include "detail/sparse/common.hpp" +#include "detail/sparse/ip_distance.cuh" +#include "detail/sparse/l2_distance.cuh" +#include "detail/sparse/lp_distance.cuh" + +#include + +#include + +#include + +namespace cuvs { +namespace distance { +/** + * Compute pairwise distances between A and B, using the provided + * input configuration and distance function. 
+ * + * @tparam value_idx index type + * @tparam value_t value type + * @param[out] out dense output array (size A.nrows * B.nrows) + * @param[in] input_config input argument configuration + * @param[in] metric distance metric to use + * @param[in] metric_arg metric argument (used for Minkowski distance) + */ +template +void pairwiseDistance(value_t* out, + detail::sparse::distances_config_t input_config, + cuvs::distance::DistanceType metric, + float metric_arg) +{ + switch (metric) { + case cuvs::distance::DistanceType::L2Expanded: + detail::sparse::l2_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::L2SqrtExpanded: + detail::sparse::l2_sqrt_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::InnerProduct: + detail::sparse::ip_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::L2Unexpanded: + detail::sparse::l2_unexpanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::L2SqrtUnexpanded: + detail::sparse::l2_sqrt_unexpanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::L1: + detail::sparse::l1_unexpanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::LpUnexpanded: + detail::sparse::lp_unexpanded_distances_t(input_config, metric_arg) + .compute(out); + break; + case cuvs::distance::DistanceType::Linf: + detail::sparse::linf_unexpanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::Canberra: + detail::sparse::canberra_unexpanded_distances_t(input_config) + .compute(out); + break; + case cuvs::distance::DistanceType::JaccardExpanded: + detail::sparse::jaccard_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::CosineExpanded: + detail::sparse::cosine_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::HellingerExpanded: + detail::sparse::hellinger_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::DiceExpanded: + detail::sparse::dice_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::CorrelationExpanded: + detail::sparse::correlation_expanded_distances_t(input_config) + .compute(out); + break; + case cuvs::distance::DistanceType::RusselRaoExpanded: + detail::sparse::russelrao_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::HammingUnexpanded: + detail::sparse::hamming_unexpanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::JensenShannon: + detail::sparse::jensen_shannon_unexpanded_distances_t(input_config) + .compute(out); + break; + case cuvs::distance::DistanceType::KLDivergence: + detail::sparse::kl_divergence_unexpanded_distances_t(input_config) + .compute(out); + break; + + default: THROW("Unsupported distance: %d", metric); + } +} +}; // namespace distance +}; // namespace cuvs diff --git a/cpp/src/embed/spectral.cu b/cpp/src/embed/spectral.cu new file mode 100644 index 000000000..c3d4e3fc7 --- /dev/null +++ b/cpp/src/embed/spectral.cu @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../sparse/cluster/detail/spectral.cuh" +#include +#include +#include + +namespace cuvs::embed::spectral { + +/** + * Given a COO formatted (symmetric) knn graph, this function computes the spectral embeddings + * (lowest n_components eigenvectors), using Lanczos min cut algorithm. + * @param rows source vertices of knn graph (size nnz) + * @param cols destination vertices of knn graph (size nnz) + * @param vals edge weights connecting vertices of knn graph (size nnz) + * @param nnz size of rows/cols/vals + * @param n number of samples in X + * @param n_neighbors the number of neighbors to query for knn graph construction + * @param n_components the number of components to project the X into + * @param out output array for embedding (size n*n_comonents) + */ +void fit(const raft::resources& handle, + raft::device_coo_matrix_view knn_graph, + int n_components, + raft::device_matrix_view out, + unsigned long long seed) +{ + cuvs::sparse::cluster::spectral::detail::fit_embedding( + handle, + knn_graph.structure_view().get_rows().data(), + knn_graph.structure_view().get_cols().data(), + knn_graph.get_elements().data(), + knn_graph.structure_view().get_nnz(), + knn_graph.structure_view().get_n_rows(), + n_components, + out.data_handle(), + seed); +} +}; // namespace cuvs::embed::spectral diff --git a/cpp/src/neighbors/brute_force.cu b/cpp/src/neighbors/brute_force.cu index b0f87e9ac..d534676e3 100644 --- a/cpp/src/neighbors/brute_force.cu +++ b/cpp/src/neighbors/brute_force.cu @@ -21,6 +21,21 @@ #include namespace cuvs::neighbors::brute_force { + +template +index::index(raft::resources const& res) + // this constructor is just for a temporary index, for use in the deserialization + // api. all the parameters here will get replaced with loaded values - that aren't + // necessarily known ahead of time before deserialization. + // TODO: do we even need a handle here - could just construct one? 
+ : cuvs::neighbors::index(), + metric_(cuvs::distance::DistanceType::L2Expanded), + dataset_(raft::make_device_matrix(res, 0, 0)), + norms_(std::nullopt), + metric_arg_(0) +{ +} + template index::index(raft::resources const& res, raft::host_matrix_view dataset, diff --git a/cpp/src/neighbors/brute_force_c.cpp b/cpp/src/neighbors/brute_force_c.cpp index eda79aa31..f1a8c995d 100644 --- a/cpp/src/neighbors/brute_force_c.cpp +++ b/cpp/src/neighbors/brute_force_c.cpp @@ -17,10 +17,12 @@ #include #include +#include #include #include #include +#include #include #include @@ -91,6 +93,22 @@ void _search(cuvsResources_t res, } } +template +void _serialize(cuvsResources_t res, const char* filename, cuvsBruteForceIndex index) +{ + auto res_ptr = reinterpret_cast(res); + auto index_ptr = reinterpret_cast*>(index.addr); + cuvs::neighbors::brute_force::serialize(*res_ptr, std::string(filename), *index_ptr); +} + +template +void* _deserialize(cuvsResources_t res, const char* filename) +{ + auto res_ptr = reinterpret_cast(res); + auto index = new cuvs::neighbors::brute_force::index(*res_ptr); + cuvs::neighbors::brute_force::deserialize(*res_ptr, std::string(filename), index); + return index; +} } // namespace extern "C" cuvsError_t cuvsBruteForceIndexCreate(cuvsBruteForceIndex_t* index) @@ -129,7 +147,7 @@ extern "C" cuvsError_t cuvsBruteForceBuild(cuvsResources_t res, if (dataset.dtype.code == kDLFloat && dataset.dtype.bits == 32) { index->addr = reinterpret_cast(_build(res, dataset_tensor, metric, metric_arg)); - index->dtype.code = kDLFloat; + index->dtype = dataset.dtype; } else { RAFT_FAIL("Unsupported dataset DLtensor dtype: %d and bits: %d", dataset.dtype.code, @@ -174,3 +192,38 @@ extern "C" cuvsError_t cuvsBruteForceSearch(cuvsResources_t res, } }); } + +extern "C" cuvsError_t cuvsBruteForceDeserialize(cuvsResources_t res, + const char* filename, + cuvsBruteForceIndex_t index) +{ + return cuvs::core::translate_exceptions([=] { + // read the numpy dtype from the beginning of the file + std::ifstream is(filename, std::ios::in | std::ios::binary); + if (!is) { RAFT_FAIL("Cannot open file %s", filename); } + char dtype_string[4]; + is.read(dtype_string, 4); + auto dtype = raft::detail::numpy_serializer::parse_descr(std::string(dtype_string, 4)); + + index->dtype.bits = dtype.itemsize * 8; + if (dtype.kind == 'f' && dtype.itemsize == 4) { + index->dtype.code = kDLFloat; + index->addr = reinterpret_cast(_deserialize(res, filename)); + } else { + RAFT_FAIL("Unsupported index dtype: %d and bits: %d", index->dtype.code, index->dtype.bits); + } + }); +} + +extern "C" cuvsError_t cuvsBruteForceSerialize(cuvsResources_t res, + const char* filename, + cuvsBruteForceIndex_t index) +{ + return cuvs::core::translate_exceptions([=] { + if (index->dtype.code == kDLFloat && index->dtype.bits == 32) { + _serialize(res, filename, *index); + } else { + RAFT_FAIL("Unsupported index dtype: %d and bits: %d", index->dtype.code, index->dtype.bits); + } + }); +} \ No newline at end of file diff --git a/cpp/src/neighbors/brute_force_serialize.cu b/cpp/src/neighbors/brute_force_serialize.cu new file mode 100644 index 000000000..1b5b5111e --- /dev/null +++ b/cpp/src/neighbors/brute_force_serialize.cu @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include + +namespace cuvs::neighbors::brute_force { + +int constexpr serialization_version = 0; + +template +void serialize(raft::resources const& handle, + std::ostream& os, + const index& index, + bool include_dataset = true) +{ + RAFT_LOG_DEBUG( + "Saving brute force index, size %zu, dim %u", static_cast(index.size()), index.dim()); + + auto dtype_string = raft::detail::numpy_serializer::get_numpy_dtype().to_string(); + dtype_string.resize(4); + os << dtype_string; + + raft::serialize_scalar(handle, os, serialization_version); + raft::serialize_scalar(handle, os, index.size()); + raft::serialize_scalar(handle, os, index.dim()); + raft::serialize_scalar(handle, os, index.metric()); + raft::serialize_scalar(handle, os, index.metric_arg()); + raft::serialize_scalar(handle, os, include_dataset); + if (include_dataset) { raft::serialize_mdspan(handle, os, index.dataset()); } + auto has_norms = index.has_norms(); + raft::serialize_scalar(handle, os, has_norms); + if (has_norms) { raft::serialize_mdspan(handle, os, index.norms()); } + raft::resource::sync_stream(handle); +} + +void serialize(raft::resources const& handle, + const std::string& filename, + const index& index, + bool include_dataset) +{ + auto os = std::ofstream{filename, std::ios::out | std::ios::binary}; + RAFT_EXPECTS(os, "Cannot open file %s", filename.c_str()); + serialize(handle, os, index, include_dataset); +} + +void serialize(raft::resources const& handle, + const std::string& filename, + const index& index, + bool include_dataset) +{ + auto os = std::ofstream{filename, std::ios::out | std::ios::binary}; + RAFT_EXPECTS(os, "Cannot open file %s", filename.c_str()); + serialize(handle, os, index, include_dataset); +} + +void serialize(raft::resources const& handle, + std::ostream& os, + const index& index, + bool include_dataset) +{ + serialize(handle, os, index, include_dataset); +} + +void serialize(raft::resources const& handle, + std::ostream& os, + const index& index, + bool include_dataset) +{ + serialize(handle, os, index, include_dataset); +} + +template +auto deserialize(raft::resources const& handle, std::istream& is) +{ + auto dtype_string = std::array{}; + is.read(dtype_string.data(), 4); + + auto ver = raft::deserialize_scalar(handle, is); + if (ver != serialization_version) { + RAFT_FAIL("serialization version mismatch, expected %d, got %d ", serialization_version, ver); + } + std::int64_t rows = raft::deserialize_scalar(handle, is); + std::int64_t dim = raft::deserialize_scalar(handle, is); + auto metric = raft::deserialize_scalar(handle, is); + auto metric_arg = raft::deserialize_scalar(handle, is); + + auto dataset_storage = raft::make_host_matrix(std::int64_t{}, std::int64_t{}); + auto include_dataset = raft::deserialize_scalar(handle, is); + if (include_dataset) { + dataset_storage = raft::make_host_matrix(rows, dim); + raft::deserialize_mdspan(handle, is, dataset_storage.view()); + } + + auto has_norms = raft::deserialize_scalar(handle, is); + auto norms_storage = has_norms ? 
std::optional{raft::make_host_vector(rows)} + : std::optional>{}; + // TODO(wphicks): Use mdbuffer here when available + auto norms_storage_dev = + has_norms ? std::optional{raft::make_device_vector(handle, rows)} + : std::optional>{}; + if (has_norms) { + raft::deserialize_mdspan(handle, is, norms_storage->view()); + raft::copy(handle, norms_storage_dev->view(), norms_storage->view()); + } + + auto result = index(handle, + raft::make_const_mdspan(dataset_storage.view()), + std::move(norms_storage_dev), + metric, + metric_arg); + raft::resource::sync_stream(handle); + + return result; +} + +void deserialize(raft::resources const& handle, + const std::string& filename, + cuvs::neighbors::brute_force::index* index) +{ + auto is = std::ifstream{filename, std::ios::in | std::ios::binary}; + RAFT_EXPECTS(is, "Cannot open file %s", filename.c_str()); + + *index = deserialize(handle, is); +} + +void deserialize(raft::resources const& handle, + const std::string& filename, + cuvs::neighbors::brute_force::index* index) +{ + auto is = std::ifstream{filename, std::ios::in | std::ios::binary}; + RAFT_EXPECTS(is, "Cannot open file %s", filename.c_str()); + + *index = deserialize(handle, is); +} + +void deserialize(raft::resources const& handle, + std::istream& is, + cuvs::neighbors::brute_force::index* index) +{ + *index = deserialize(handle, is); +} + +void deserialize(raft::resources const& handle, + std::istream& is, + cuvs::neighbors::brute_force::index* index) +{ + *index = deserialize(handle, is); +} + +} // namespace cuvs::neighbors::brute_force diff --git a/cpp/src/neighbors/cagra_c.cpp b/cpp/src/neighbors/cagra_c.cpp index 6985ff094..326a89665 100644 --- a/cpp/src/neighbors/cagra_c.cpp +++ b/cpp/src/neighbors/cagra_c.cpp @@ -29,6 +29,8 @@ #include #include +#include + namespace { template diff --git a/cpp/src/neighbors/detail/ann_utils.cuh b/cpp/src/neighbors/detail/ann_utils.cuh index 29f790ec5..652d41c85 100644 --- a/cpp/src/neighbors/detail/ann_utils.cuh +++ b/cpp/src/neighbors/detail/ann_utils.cuh @@ -63,14 +63,9 @@ struct pointer_residency_count { auto [on_device, on_host] = pointer_residency_count::run(ptrs...); cudaPointerAttributes attr; RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, ptr)); - switch (attr.type) { - case cudaMemoryTypeUnregistered: return std::make_tuple(on_device, on_host + 1); - case cudaMemoryTypeHost: - return std::make_tuple(on_device + int(attr.devicePointer == ptr), on_host + 1); - case cudaMemoryTypeDevice: return std::make_tuple(on_device + 1, on_host); - case cudaMemoryTypeManaged: return std::make_tuple(on_device + 1, on_host + 1); - default: return std::make_tuple(on_device, on_host); - } + if (attr.devicePointer || attr.type == cudaMemoryTypeDevice) { ++on_device; } + if (attr.hostPointer || attr.type == cudaMemoryTypeUnregistered) { ++on_host; } + return std::make_tuple(on_device, on_host); } }; diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index e5495dc3e..b7fec724b 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include "../../../core/nvtx.hpp" #include "../../vpq_dataset.cuh" #include "graph_core.cuh" #include @@ -32,8 +33,7 @@ #include #include -// TODO: Fixme- this needs to be migrated -#include "../../nn_descent.cuh" +#include // TODO: This shouldn't be calling spatial/knn APIs #include "../ann_utils.cuh" @@ -130,7 +130,7 @@ void build_knn_graph( "Currently only L2Expanded or InnerProduct 
metric are supported"); uint32_t node_degree = knn_graph.extent(1); - raft::common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "cagra::build_graph(%zu, %zu, %u)", size_t(dataset.extent(0)), size_t(dataset.extent(1)), @@ -355,8 +355,8 @@ void build_knn_graph( raft::host_matrix_view knn_graph, cuvs::neighbors::nn_descent::index_params build_params) { - auto nn_descent_idx = cuvs::neighbors::nn_descent::index(res, knn_graph); - cuvs::neighbors::nn_descent::build(res, build_params, dataset, nn_descent_idx); + std::optional> graph_view = knn_graph; + auto nn_descent_idx = cuvs::neighbors::nn_descent::build(res, build_params, dataset, graph_view); using internal_IdxT = typename std::make_unsigned::type; using g_accessor = typename decltype(nn_descent_idx.graph())::accessor_type; @@ -436,11 +436,11 @@ index build( auto knn_build_params = params.graph_build_params; if (std::holds_alternative(params.graph_build_params)) { // Heuristic to decide default build algo and its params. - if (params.metric == cuvs::distance::DistanceType::L2Expanded && - cuvs::neighbors::nn_descent::has_enough_device_memory( + if (cuvs::neighbors::nn_descent::has_enough_device_memory( res, dataset.extents(), sizeof(IdxT))) { RAFT_LOG_DEBUG("NN descent solver"); - knn_build_params = cagra::graph_build_params::nn_descent_params(intermediate_degree); + knn_build_params = + cagra::graph_build_params::nn_descent_params(intermediate_degree, params.metric); } else { RAFT_LOG_DEBUG("Selecting IVF-PQ solver"); knn_build_params = cagra::graph_build_params::ivf_pq_params(dataset.extents(), params.metric); @@ -453,9 +453,6 @@ index build( std::get(knn_build_params); build_knn_graph(res, dataset, knn_graph->view(), ivf_pq_params); } else { - RAFT_EXPECTS( - params.metric == cuvs::distance::DistanceType::L2Expanded, - "L2Expanded is the only distance metrics supported for CAGRA build with nn_descent"); auto nn_descent_params = std::get(knn_build_params); @@ -466,10 +463,12 @@ index build( "nn-descent graph_degree.", nn_descent_params.graph_degree, intermediate_degree); - nn_descent_params = cagra::graph_build_params::nn_descent_params(intermediate_degree); + nn_descent_params = + cagra::graph_build_params::nn_descent_params(intermediate_degree, params.metric); } // Use nn-descent to build CAGRA knn graph + nn_descent_params.return_distances = false; build_knn_graph(res, dataset, knn_graph->view(), nn_descent_params); } diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index 4c15b8e14..5778d85a6 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -16,6 +16,7 @@ #pragma once +#include "../../../core/nvtx.hpp" #include "factory.cuh" #include "sample_filter_utils.cuh" #include "search_plan.cuh" @@ -23,7 +24,6 @@ #include #include -#include #include #include @@ -66,7 +66,7 @@ void search_main_core(raft::resources const& res, params.max_queries = std::min(queries.extent(0), deviceProp.maxGridSize[1]); } - raft::common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "cagra::search(max_queries = %u, k = %u, dim = %zu)", params.max_queries, topk, @@ -151,7 +151,7 @@ void search_main(raft::resources const& res, if (auto* strided_dset = dynamic_cast*>(&index.data()); strided_dset != nullptr) { // Search using a plain (strided) row-major dataset - auto& desc = dataset_descriptor_init_with_cache( + auto desc = dataset_descriptor_init_with_cache( res, params, *strided_dset, index.metric()); 
search_main_core( res, params, desc, graph_internal, queries, neighbors, distances, sample_filter); @@ -161,7 +161,7 @@ void search_main(raft::resources const& res, RAFT_FAIL("FP32 VPQ dataset support is coming soon"); } else if (auto* vpq_dset = dynamic_cast*>(&index.data()); vpq_dset != nullptr) { - auto& desc = dataset_descriptor_init_with_cache( + auto desc = dataset_descriptor_init_with_cache( res, params, *vpq_dset, index.metric()); search_main_core( res, params, desc, graph_internal, queries, neighbors, distances, sample_filter); diff --git a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh index a077c098f..0f6cf852f 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh @@ -21,10 +21,10 @@ #include #include #include -#include #include #include +#include "../../../core/nvtx.hpp" #include "../dataset_serialize.hpp" #include @@ -54,7 +54,7 @@ void serialize(raft::resources const& res, const index& index_, bool include_dataset) { - raft::common::nvtx::range fun_scope("cagra::serialize"); + raft::common::nvtx::range fun_scope("cagra::serialize"); RAFT_LOG_DEBUG( "Saving CAGRA index, size %zu, dim %u", static_cast(index_.size()), index_.dim()); @@ -104,7 +104,7 @@ void serialize_to_hnswlib(raft::resources const& res, { // static_assert(std::is_same_v or std::is_same_v, // "An hnswlib index can only be trained with int32 or uint32 IdxT"); - raft::common::nvtx::range fun_scope("cagra::serialize"); + raft::common::nvtx::range fun_scope("cagra::serialize"); RAFT_LOG_DEBUG("Saving CAGRA index to hnswlib format, size %zu, dim %u", static_cast(index_.size()), index_.dim()); @@ -226,7 +226,7 @@ void serialize_to_hnswlib(raft::resources const& res, template void deserialize(raft::resources const& res, std::istream& is, index* index_) { - raft::common::nvtx::range fun_scope("cagra::deserialize"); + raft::common::nvtx::range fun_scope("cagra::deserialize"); char dtype_string[4]; is.read(dtype_string, 4); diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index 297eb1f55..7eb798459 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -31,8 +31,10 @@ #include #include +#include #include #include +#include #include #include @@ -232,52 +234,77 @@ struct alignas(device::LOAD_128BIT_T) dataset_descriptor_base_t { */ template struct dataset_descriptor_host { - using dev_descriptor_t = dataset_descriptor_base_t; - using dd_ptr_t = std::shared_ptr; - using init_f = - std::tuple, size_t>; + using dev_descriptor_t = dataset_descriptor_base_t; uint32_t smem_ws_size_in_bytes = 0; uint32_t team_size = 0; + struct state { + using ready_t = std::tuple; + using init_f = + std::tuple, size_t>; + + std::mutex mutex; + std::atomic ready; // Not sure if std::holds_alternative is thread-safe + std::variant value; + + template + state(InitF init, size_t size) : ready{false}, value{std::make_tuple(init, size)} + { + } + + ~state() noexcept + { + if (std::holds_alternative(value)) { + auto& [ptr, stream] = std::get(value); + RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(ptr, stream)); + } + } + + void eval(rmm::cuda_stream_view stream) + { + std::lock_guard lock(mutex); + if (std::holds_alternative(value)) { + auto& [fun, size] = std::get(value); + dev_descriptor_t* ptr = nullptr; + RAFT_CUDA_TRY(cudaMallocAsync(&ptr, size, stream)); + fun(ptr, stream); + value = 
std::make_tuple(ptr, stream); + ready.store(true, std::memory_order_release); + } + } + + auto get(rmm::cuda_stream_view stream) -> dev_descriptor_t* + { + if (!ready.load(std::memory_order_acquire)) { eval(stream); } + return std::get<0>(std::get(value)); + } + }; + template dataset_descriptor_host(const DescriptorImpl& dd_host, InitF init) - : value_{std::make_tuple(init, sizeof(DescriptorImpl))}, + : value_{std::make_shared(init, sizeof(DescriptorImpl))}, smem_ws_size_in_bytes{dd_host.smem_ws_size_in_bytes()}, team_size{dd_host.team_size()} { } + dataset_descriptor_host() = default; + /** * Return the device pointer, possibly evaluating it in the given thread. */ [[nodiscard]] auto dev_ptr(rmm::cuda_stream_view stream) const -> const dev_descriptor_t* { - if (std::holds_alternative(value_)) { value_ = eval(std::get(value_), stream); } - return std::get(value_).get(); + return value_->get(stream); } + [[nodiscard]] auto dev_ptr(rmm::cuda_stream_view stream) -> dev_descriptor_t* { - if (std::holds_alternative(value_)) { value_ = eval(std::get(value_), stream); } - return std::get(value_).get(); + return value_->get(stream); } private: - mutable std::variant value_; - - static auto eval(init_f init, rmm::cuda_stream_view stream) -> dd_ptr_t - { - using raft::RAFT_NAME; - auto& [fun, size] = init; - dd_ptr_t dev_ptr{ - [stream, s = size]() { - dev_descriptor_t* p; - RAFT_CUDA_TRY(cudaMallocAsync(&p, s, stream)); - return p; - }(), - [stream](dev_descriptor_t* p) { RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(p, stream)); }}; - fun(dev_ptr.get(), stream); - return dev_ptr; - } + mutable std::shared_ptr value_; }; /** diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index b7cb9c42d..7ec3d4d9e 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -120,7 +120,7 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( for (uint32_t i = threadIdx.x >> team_size_bits; i < max_i; i += (blockDim.x >> team_size_bits)) { const bool valid_i = (i < num_pickup); - IndexT best_index_team_local; + IndexT best_index_team_local = raft::upper_bound(); DistanceT best_norm2_team_local = raft::upper_bound(); for (uint32_t j = 0; j < num_distilation; j++) { // Select a node randomly and compute the distance to it @@ -145,7 +145,8 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( const unsigned lane_id = threadIdx.x & ((1u << team_size_bits) - 1u); if (valid_i && lane_id == 0) { - if (hashmap::insert(visited_hash_ptr, hash_bitlen, best_index_team_local)) { + if (best_index_team_local != raft::upper_bound() && + hashmap::insert(visited_hash_ptr, hash_bitlen, best_index_team_local)) { result_distances_ptr[i] = best_norm2_team_local; result_indices_ptr[i] = best_index_team_local; } else { diff --git a/cpp/src/neighbors/detail/cagra/factory.cuh b/cpp/src/neighbors/detail/cagra/factory.cuh index abc907da5..e6e7ff64f 100644 --- a/cpp/src/neighbors/detail/cagra/factory.cuh +++ b/cpp/src/neighbors/detail/cagra/factory.cuh @@ -135,11 +135,9 @@ template struct store { /** Number of descriptors to cache. 
*/ static constexpr size_t kDefaultSize = 100; - raft::cache::lru, - std::shared_ptr>> - value{kDefaultSize}; + raft::cache:: + lru, dataset_descriptor_host> + value{kDefaultSize}; }; } // namespace descriptor_cache @@ -159,20 +157,18 @@ auto dataset_descriptor_init_with_cache(const raft::resources& res, const cagra::search_params& params, const DatasetT& dataset, cuvs::distance::DistanceType metric) - -> const dataset_descriptor_host& + -> dataset_descriptor_host { - using desc_t = dataset_descriptor_host; - auto key = descriptor_cache::make_key(params, dataset, metric); + auto key = descriptor_cache::make_key(params, dataset, metric); auto& cache = raft::resource::get_custom_resource>(res) ->value; - std::shared_ptr desc{nullptr}; + dataset_descriptor_host desc; if (!cache.get(key, &desc)) { - desc = std::make_shared( - std::move(dataset_descriptor_init(params, dataset, metric))); + desc = dataset_descriptor_init(params, dataset, metric); cache.set(key, desc); } - return *desc; + return desc; } }; // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 43bf1ba2b..daeac82b9 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -156,6 +156,7 @@ __global__ void kern_prune(const IdxT* const knn_graph, // [graph_chunk_size, g // count number of detours (A->D->B) for (uint32_t kAD = 0; kAD < graph_degree - 1; kAD++) { const uint64_t iD = knn_graph[kAD + (graph_degree * iA)]; + if (iD >= graph_size) { continue; } for (uint32_t kDB = threadIdx.x; kDB < graph_degree; kDB += blockDim.x) { const uint64_t iB_candidate = knn_graph[kDB + ((uint64_t)graph_degree * iD)]; for (uint32_t kAB = kAD + 1; kAB < graph_degree; kAB++) { @@ -1076,11 +1077,11 @@ void optimize( "Each input array is expected to have the same number of rows"); RAFT_EXPECTS(new_graph.extent(1) <= knn_graph.extent(1), "output graph cannot have more columns than input graph"); - const uint32_t input_graph_degree = knn_graph.extent(1); - const uint32_t output_graph_degree = new_graph.extent(1); + const uint64_t input_graph_degree = knn_graph.extent(1); + const uint64_t output_graph_degree = new_graph.extent(1); + const uint64_t graph_size = new_graph.extent(0); auto input_graph_ptr = knn_graph.data_handle(); auto output_graph_ptr = new_graph.data_handle(); - const IdxT graph_size = new_graph.extent(0); // MST optimization auto mst_graph_num_edges = raft::make_host_vector(graph_size); @@ -1148,7 +1149,7 @@ void optimize( constexpr int MAX_DEGREE = 1024; if (input_graph_degree > MAX_DEGREE) { RAFT_FAIL( - "The degree of input knn graph is too large (%u). " + "The degree of input knn graph is too large (%zu). 
" "It must be equal to or smaller than %d.", input_graph_degree, 1024); @@ -1217,11 +1218,12 @@ void optimize( assert(next_num_detour != std::numeric_limits::max()); num_detour = next_num_detour; } - RAFT_EXPECTS(pk == output_graph_degree, - "Couldn't find the output_graph_degree (%u) smallest detourable count nodes for " - "node %lu in the rank-based node reranking process", - output_graph_degree, - static_cast(i)); + RAFT_EXPECTS( + pk == output_graph_degree, + "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for " + "node %lu in the rank-based node reranking process", + output_graph_degree, + i); } const double time_prune_end = cur_time(); @@ -1317,7 +1319,7 @@ void optimize( uint32_t kf = 0; uint32_t k = mst_graph_num_edges_ptr[i]; - const uint64_t num_protected_edges = max(k, output_graph_degree / 2); + const auto num_protected_edges = std::max(k, output_graph_degree / 2); assert(num_protected_edges <= output_graph_degree); if (num_protected_edges == output_graph_degree) continue; @@ -1342,7 +1344,7 @@ void optimize( assert(kf <= output_graph_degree); // Replace some edges of the output graph with edges of the reverse graph. - uint32_t kr = std::min(rev_graph_count.data_handle()[i], output_graph_degree); + auto kr = std::min(rev_graph_count.data_handle()[i], output_graph_degree); while (kr) { kr -= 1; if (my_rev_graph[kr] < graph_size) { diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh index 0003f2495..ecfd856f1 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh @@ -93,10 +93,10 @@ struct search : public search_plan_impl intermediate_indices; - rmm::device_uvector intermediate_distances; + lightweight_uvector intermediate_indices; + lightweight_uvector intermediate_distances; size_t topk_workspace_size; - rmm::device_uvector topk_workspace; + lightweight_uvector topk_workspace; search(raft::resources const& res, search_params params, @@ -105,9 +105,9 @@ struct search : public search_plan_impl<<<1, 1, 0, cuda_stream>>>(host_ptr, dev_ptr); } +template +auto get_value(const T* const dev_ptr, cudaStream_t stream) -> T +{ + T value; + RAFT_CUDA_TRY(cudaMemcpyAsync(&value, dev_ptr, sizeof(value), cudaMemcpyDefault, stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + return value; +} + // MAX_DATASET_DIM : must equal to or greater than dataset_dim template RAFT_KERNEL random_pickup_kernel( @@ -609,18 +618,18 @@ struct search : search_plan_impl { using base_type::num_seeds; size_t result_buffer_allocation_size; - rmm::device_uvector result_indices; // results_indices_buffer - rmm::device_uvector result_distances; // result_distances_buffer - rmm::device_uvector parent_node_list; - rmm::device_uvector topk_hint; - rmm::device_scalar terminate_flag; // dev_terminate_flag, host_terminate_flag.; - rmm::device_uvector topk_workspace; + lightweight_uvector result_indices; // results_indices_buffer + lightweight_uvector result_distances; // result_distances_buffer + lightweight_uvector parent_node_list; + lightweight_uvector topk_hint; + lightweight_uvector terminate_flag; // dev_terminate_flag, host_terminate_flag.; + lightweight_uvector topk_workspace; // temporary storage for _find_topk - rmm::device_uvector input_keys_storage; - rmm::device_uvector output_keys_storage; - rmm::device_uvector input_values_storage; - rmm::device_uvector output_values_storage; + lightweight_uvector input_keys_storage; + lightweight_uvector 
output_keys_storage; + lightweight_uvector input_values_storage; + lightweight_uvector output_values_storage; search(raft::resources const& res, search_params params, @@ -629,16 +638,16 @@ struct search : search_plan_impl { int64_t graph_degree, uint32_t topk) : base_type(res, params, dataset_desc, dim, graph_degree, topk), - result_indices(0, raft::resource::get_cuda_stream(res)), - result_distances(0, raft::resource::get_cuda_stream(res)), - parent_node_list(0, raft::resource::get_cuda_stream(res)), - topk_hint(0, raft::resource::get_cuda_stream(res)), - topk_workspace(0, raft::resource::get_cuda_stream(res)), - terminate_flag(raft::resource::get_cuda_stream(res)), - input_keys_storage(0, raft::resource::get_cuda_stream(res)), - output_keys_storage(0, raft::resource::get_cuda_stream(res)), - input_values_storage(0, raft::resource::get_cuda_stream(res)), - output_values_storage(0, raft::resource::get_cuda_stream(res)) + result_indices(res), + result_distances(res), + parent_node_list(res), + topk_hint(res), + topk_workspace(res), + terminate_flag(res), + input_keys_storage(res), + output_keys_storage(res), + input_values_storage(res), + output_values_storage(res) { set_params(res); } @@ -662,7 +671,7 @@ struct search : search_plan_impl { itopk_size, max_queries, result_buffer_size, utils::get_cuda_data_type()); RAFT_LOG_DEBUG("# topk_workspace_size: %lu", topk_workspace_size); topk_workspace.resize(topk_workspace_size, raft::resource::get_cuda_stream(res)); - + terminate_flag.resize(1, raft::resource::get_cuda_stream(res)); hashmap.resize(hashmap_size, raft::resource::get_cuda_stream(res)); } @@ -847,7 +856,7 @@ struct search : search_plan_impl { stream); // termination (2) - if (iter + 1 >= min_iterations && terminate_flag.value(stream)) { + if (iter + 1 >= min_iterations && get_value(terminate_flag.data(), stream)) { iter++; break; } diff --git a/cpp/src/neighbors/detail/cagra/search_plan.cuh b/cpp/src/neighbors/detail/cagra/search_plan.cuh index f23b96631..99254aa50 100644 --- a/cpp/src/neighbors/detail/cagra/search_plan.cuh +++ b/cpp/src/neighbors/detail/cagra/search_plan.cuh @@ -151,7 +151,7 @@ struct search_plan_impl : public search_plan_impl_base { lightweight_uvector hashmap; lightweight_uvector num_executed_iterations; // device or managed? lightweight_uvector dev_seed; - const dataset_descriptor_host& dataset_desc; + dataset_descriptor_host dataset_desc; search_plan_impl(raft::resources const& res, search_params params, diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh index 2bed19009..fa71dbaf9 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh @@ -129,17 +129,27 @@ struct search : search_plan_impl { (sizeof(INDEX_T) + sizeof(DISTANCE_T)) * result_buffer_size_32 + sizeof(INDEX_T) * hashmap::get_size(small_hash_bitlen) + sizeof(INDEX_T) * search_width + sizeof(std::uint32_t) * topk_ws_size + sizeof(std::uint32_t); - smem_size = base_smem_size; + + std::uint32_t additional_smem_size = 0; if (num_itopk_candidates > 256) { // Tentatively calculate the required share memory size when radix // sort based topk is used, assuming the block size is the maximum. 
if (itopk_size <= 256) { - smem_size += topk_by_radix_sort<256, INDEX_T>::smem_size * sizeof(std::uint32_t); + additional_smem_size += topk_by_radix_sort<256, INDEX_T>::smem_size * sizeof(std::uint32_t); } else { - smem_size += topk_by_radix_sort<512, INDEX_T>::smem_size * sizeof(std::uint32_t); + additional_smem_size += topk_by_radix_sort<512, INDEX_T>::smem_size * sizeof(std::uint32_t); } } + if (!std::is_same_v) { + // For filtering postprocess + using scan_op_t = cub::WarpScan; + additional_smem_size = + std::max(additional_smem_size, sizeof(scan_op_t::TempStorage)); + } + + smem_size = base_smem_size + additional_smem_size; + uint32_t block_size = thread_block_size; if (block_size == 0) { block_size = min_block_size; diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 79cb6bc10..678ed0cb4 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -111,7 +111,7 @@ RAFT_DEVICE_INLINE_FUNCTION void pickup_next_parents(std::uint32_t* const termin } template -RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_1st( +RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_and_full( float* candidate_distances, // [num_candidates] IdxT* candidate_indices, // [num_candidates] const std::uint32_t num_candidates, @@ -215,7 +215,7 @@ RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_1st( } template -RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_2nd( +RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_and_merge( float* itopk_distances, // [num_itopk] IdxT* itopk_indices, // [num_itopk] const std::uint32_t num_itopk, @@ -424,7 +424,7 @@ RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_2nd( template -RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort( +RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_and_merge( float* itopk_distances, // [num_itopk] IdxT* itopk_indices, // [num_itopk] const std::uint32_t num_itopk, @@ -437,20 +437,62 @@ RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort( const unsigned MULTI_WARPS_2) { // The results in candidate_distances/indices are sorted by bitonic sort. - topk_by_bitonic_sort_1st( + topk_by_bitonic_sort_and_full( candidate_distances, candidate_indices, num_candidates, num_itopk, MULTI_WARPS_1); // The results sorted above are merged with the internal intermediate top-k // results so far using bitonic merge. - topk_by_bitonic_sort_2nd(itopk_distances, - itopk_indices, - num_itopk, - candidate_distances, - candidate_indices, - num_candidates, - work_buf, - first, - MULTI_WARPS_2); + topk_by_bitonic_sort_and_merge(itopk_distances, + itopk_indices, + num_itopk, + candidate_distances, + candidate_indices, + num_candidates, + work_buf, + first, + MULTI_WARPS_2); +} + +// This function move the invalid index element to the end of the itopk list. +// Require : array_length % 32 == 0 && The invalid entry is only one. 
+template +RAFT_DEVICE_INLINE_FUNCTION void move_invalid_to_end_of_list(IdxT* const index_array, + float* const distance_array, + const std::uint32_t array_length) +{ + constexpr std::uint32_t warp_size = 32; + constexpr std::uint32_t invalid_index = utils::get_max_value(); + const std::uint32_t lane_id = threadIdx.x % warp_size; + + if (threadIdx.x >= warp_size) { return; } + + bool found_invalid = false; + if (array_length % warp_size == 0) { + for (std::uint32_t i = lane_id; i < array_length; i += warp_size) { + const auto index = index_array[i]; + const auto distance = distance_array[i]; + + if (found_invalid) { + index_array[i - 1] = index; + distance_array[i - 1] = distance; + } else { + // Check if the index is invalid + const auto I_found_invalid = (index == invalid_index); + const auto who_has_invalid = raft::ballot(I_found_invalid); + // if a value that is loaded by a smaller lane id thread, shift the array + if (who_has_invalid << (warp_size - lane_id)) { + index_array[i - 1] = index; + distance_array[i - 1] = distance; + } + + found_invalid = who_has_invalid; + } + } + } + if (lane_id == 0) { + index_array[array_length - 1] = invalid_index; + distance_array[array_length - 1] = utils::get_max_value(); + } } template @@ -589,10 +631,10 @@ __device__ void search_core( // sort if constexpr (TOPK_BY_BITONIC_SORT) { // [Notice] - // It is good to use multiple warps in topk_by_bitonic_sort() when + // It is good to use multiple warps in topk_by_bitonic_sort_and_merge() when // batch size is small (short-latency), but it might not be always good // when batch size is large (high-throughput). - // topk_by_bitonic_sort() consists of two operations: + // topk_by_bitonic_sort_and_merge() consists of two operations: // if MAX_CANDIDATES is greater than 128, the first operation uses two warps; // if MAX_ITOPK is greater than 256, the second operation used two warps. const unsigned multi_warps_1 = ((blockDim.x >= 64) && (MAX_CANDIDATES > 128)) ? 1 : 0; @@ -601,9 +643,9 @@ __device__ void search_core( // reset small-hash table. if ((iter + 1) % small_hash_reset_interval == 0) { // Depending on the block size and the number of warps used in - // topk_by_bitonic_sort(), determine which warps are used to reset + // topk_by_bitonic_sort_and_merge(), determine which warps are used to reset // the small hash and whether they are performed in overlap with - // topk_by_bitonic_sort(). + // topk_by_bitonic_sort_and_merge(). 
_CLK_START(); unsigned hash_start_tid; if (blockDim.x == 32) { @@ -627,28 +669,28 @@ __device__ void search_core( // topk with bitonic sort _CLK_START(); - if (std::is_same::value || - *filter_flag == 0) { - topk_by_bitonic_sort(result_distances_buffer, - result_indices_buffer, - internal_topk, - result_distances_buffer + internal_topk, - result_indices_buffer + internal_topk, - search_width * graph_degree, - topk_ws, - (iter == 0), - multi_warps_1, - multi_warps_2); - __syncthreads(); - } else { - topk_by_bitonic_sort_1st( - result_distances_buffer, - result_indices_buffer, - internal_topk + search_width * graph_degree, - internal_topk, - false); + if (!(std::is_same::value || + *filter_flag == 0)) { + // Move the filtered out index to the end of the itopk list + for (unsigned i = 0; i < search_width; i++) { + move_invalid_to_end_of_list( + result_indices_buffer, result_distances_buffer, internal_topk); + } + if (threadIdx.x == 0) { *terminate_flag = 0; } } + topk_by_bitonic_sort_and_merge( + result_distances_buffer, + result_indices_buffer, + internal_topk, + result_distances_buffer + internal_topk, + result_indices_buffer + internal_topk, + search_width * graph_degree, + topk_ws, + (iter == 0), + multi_warps_1, + multi_warps_2); + __syncthreads(); _CLK_REC(clk_topk); } else { _CLK_START(); @@ -755,12 +797,66 @@ __device__ void search_core( } __syncthreads(); - topk_by_bitonic_sort_1st( - result_distances_buffer, - result_indices_buffer, - internal_topk + search_width * graph_degree, - top_k, - false); + // Move invalid index items to the end of the buffer without sorting the entire buffer + using scan_op_t = cub::WarpScan; + auto& temp_storage = *reinterpret_cast(smem_work_ptr); + + constexpr std::uint32_t warp_size = 32; + if (threadIdx.x < warp_size) { + std::uint32_t num_found_valid = 0; + for (std::uint32_t buffer_offset = 0; buffer_offset < internal_topk; + buffer_offset += warp_size) { + // Calculate the new buffer index + const auto src_position = buffer_offset + threadIdx.x; + const std::uint32_t is_valid_index = + (result_indices_buffer[src_position] & (~index_msb_1_mask)) == invalid_index ? 0 : 1; + std::uint32_t new_position; + scan_op_t(temp_storage).InclusiveSum(is_valid_index, new_position); + if (is_valid_index) { + const auto dst_position = num_found_valid + (new_position - 1); + result_indices_buffer[dst_position] = result_indices_buffer[src_position]; + result_distances_buffer[dst_position] = result_distances_buffer[src_position]; + } + + // Calculate the largest valid position within a warp and bcast it for the next iteration + num_found_valid += new_position; + for (std::uint32_t offset = (warp_size >> 1); offset > 0; offset >>= 1) { + const auto v = raft::shfl_xor(num_found_valid, offset); + if ((threadIdx.x & offset) == 0) { num_found_valid = v; } + } + + // If the enough number of items are found, do early termination + if (num_found_valid >= top_k) { break; } + } + + if (num_found_valid < top_k) { + // Fill the remaining buffer with invalid values so that `topk_by_bitonic_sort_and_merge` is + // usable in the next step + for (std::uint32_t i = num_found_valid + threadIdx.x; i < internal_topk; i += warp_size) { + result_indices_buffer[i] = invalid_index; + result_distances_buffer[i] = utils::get_max_value(); + } + } + } + + // If the sufficient number of valid indexes are not in the internal topk, pick up from the + // candidate list. 
+ if (top_k > internal_topk || result_indices_buffer[top_k - 1] == invalid_index) { + __syncthreads(); + const unsigned multi_warps_1 = ((blockDim.x >= 64) && (MAX_CANDIDATES > 128)) ? 1 : 0; + const unsigned multi_warps_2 = ((blockDim.x >= 64) && (MAX_ITOPK > 256)) ? 1 : 0; + topk_by_bitonic_sort_and_merge( + result_distances_buffer, + result_indices_buffer, + internal_topk, + result_distances_buffer + internal_topk, + result_indices_buffer + internal_topk, + search_width * graph_degree, + topk_ws, + (iter == 0), + multi_warps_1, + multi_warps_2); + } __syncthreads(); } diff --git a/cpp/src/neighbors/detail/dynamic_batching.cuh b/cpp/src/neighbors/detail/dynamic_batching.cuh new file mode 100644 index 000000000..5c6b1654e --- /dev/null +++ b/cpp/src/neighbors/detail/dynamic_batching.cuh @@ -0,0 +1,1197 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../sample_filter.cuh" + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifndef CUVS_SYSTEM_LITTLE_ENDIAN +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define CUVS_SYSTEM_LITTLE_ENDIAN 0 +#else +#define CUVS_SYSTEM_LITTLE_ENDIAN 1 +#endif +#endif + +namespace cuvs::neighbors::dynamic_batching::detail { + +using raft::RAFT_NAME; // TODO: a workaround for RAFT_LOG_XXX macros + +/** + * A helper to make the requester threads more cooperative when busy-spinning. + * It is used in the wait loops across this file to reduce the CPU usage. + * + * Ideally, we should be using atomics notify/wait feature, but that is not always possible + * (e.g. waiting on multiple things or waiting on GPU volatile stores). 
+ */ +struct local_waiter { + static constexpr inline int64_t kNonSleepIterations = 10; + + explicit local_waiter(std::chrono::nanoseconds base_sleep_time, + int64_t start_iteration = 0) noexcept + : base_sleep_time_{base_sleep_time}, iteration_{start_iteration} + { + } + + inline void wait() noexcept + { + if (iteration_ < 2) { + // Don't wait for the first few iterations: + // maybe there's a weak CAS op in the loop, or something else that could return quickly + } else if (iteration_ < kNonSleepIterations) { + std::this_thread::yield(); + } else { + auto k = iteration_ + 1 - kNonSleepIterations; + std::this_thread::sleep_for(base_sleep_time_ * k); + } + ++iteration_; + } + + inline void reset(int64_t start_iteration = 0) noexcept { iteration_ = start_iteration; } + + private: + std::chrono::nanoseconds base_sleep_time_; + int64_t iteration_; +}; + +class cuda_event { + public: + cuda_event(cuda_event&&) = default; + cuda_event& operator=(cuda_event&&) = default; + ~cuda_event() = default; + cuda_event(cuda_event const&) = delete; // Copying disallowed: one event one owner + cuda_event& operator=(cuda_event&) = delete; + + cuda_event() + : event_{[]() { + cudaEvent_t* e = new cudaEvent_t; + RAFT_CUDA_TRY(cudaEventCreateWithFlags(e, cudaEventDisableTiming)); + return e; + }(), + [](cudaEvent_t* e) { + RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(*e)); + delete e; + }} + { + } + + cudaEvent_t value() const { return *event_; } + + private: + std::unique_ptr> event_; +}; + +template +struct get_accessor_type_t { + using type = typename MdSpanOrArray::accessor_type; +}; + +template +struct get_accessor_type_t> { + using mdarray_type = raft::mdarray; + using view_type = typename mdarray_type::view_type; + using type = typename view_type::accessor_type; +}; + +template +using get_accessor_type = typename get_accessor_type_t::type; + +template +constexpr inline auto slice_3d(typename Source3DT::index_type i, + const Source3DT& source3d, + typename Source3DT::index_type n_rows = 0) +{ + using element_type = typename Source3DT::element_type; + using index_type = typename Source3DT::index_type; + using layout_type = typename Source3DT::layout_type; + using accessor_type = get_accessor_type; + auto extent2d = + raft::make_extents(n_rows == 0 ? 
source3d.extent(1) : n_rows, source3d.extent(2)); + auto stride = uint64_t(source3d.extent(1)) * uint64_t(source3d.extent(2)); + return raft::mdspan{ + const_cast(source3d.data_handle()) + stride * i, extent2d}; +} + +template +constexpr inline auto slice_2d(typename Source2DT::index_type i, const Source2DT& source2d) +{ + using element_type = typename Source2DT::element_type; + using index_type = typename Source2DT::index_type; + using layout_type = typename Source2DT::layout_type; + using accessor_type = get_accessor_type; + auto extent1d = raft::make_extents(source2d.extent(1)); + auto stride = uint64_t(extent1d.extent(0)); + return raft::mdspan{ + const_cast(source2d.data_handle()) + stride * i, extent1d}; +} + +// --------------------------------------------- + +constexpr size_t kCacheLineBytes = 64; + +template +using upstream_search_type_const = void(raft::resources const&, + typename Upstream::search_params_type const&, + Upstream const&, + raft::device_matrix_view, + raft::device_matrix_view, + raft::device_matrix_view, + const cuvs::neighbors::filtering::base_filter&); + +template +using upstream_search_type = void(raft::resources const&, + typename Upstream::search_params_type const&, + Upstream&, + raft::device_matrix_view, + raft::device_matrix_view, + raft::device_matrix_view, + const cuvs::neighbors::filtering::base_filter&); + +template +using function_search_type = void(raft::resources const&, + raft::device_matrix_view, + raft::device_matrix_view, + raft::device_matrix_view); + +/** + * State of the batch token slot. + * + * In a nutshell, there are only two batch slot states that matter: empty or full. + * Initially, all slots are empty. The host threads can commit (i.e. subscribe) to a batch slot even + * if it's empty (when they know it will be filled-in at some point in future). With this logic, we + * smooth out the bottleneck that occurs when many threads try to submit their work using a single + * atomic counter (the batch queue head). + * + * Once a GPU IO buffer is available, its owner returns the buffer to the queue by marking a slot as + * full. By that time, it may be partially or fully committed (i.e. several host threads are + * committed to submit a certain number of queries). + * + * If we had an infinite buffer, these two states would suffice. However, we have a finite ring + * buffer, so the used-up slots must be emptied again, so that they are usable in the following + * rounds through the ring buffer. + * + * The slot state depends not only on the value stored in it, but on the accessing thread as well + * (see `batch_queue_t::batch_status` below). The accessing thread may be ahead or behind the others + * (as defined by the sequential order id below). Depending on the accessor state, it may view the + * slot as being emptied/filled in the future, current, or previous rounds. This affects the + * decision whether the slot can be used and whether the thread has the right to advance tail or + * head counters of the batch queue. + * + */ +enum struct slot_state : int32_t { + /** The slot is empty, cleared-up in this round (hence the head should be past it). */ + kEmptyPast = 1025, + /** The slot is empty, cleared-up in previous round. */ + kEmpty = 1024, + /** The slot is empty, cleared-up two round ago and cannot be used yet (due to be filled). */ + kEmptyBusy = 1023, + /** The current thread has been sleeping for too long and is way behind the others. */ + kFullPast = 1, + /** The slot is full, filled-in in this round. 
*/ + kFull = 0, + /** This state is considered full, filled-in in previous round. */ + kFullBusy = -1 + /** The rest of the values are impossible states indicating an error in the algo. */ +}; + +/** + * Identifies the batch and its job-commit state. + * Should be in the pinned memory for fast shared access on CPU and GPU side. + * + * The batch token packs the IO buffer address (id) and a number of committed queries in a single + * 64-bit atomic. This is to allow conflict-free atomic updates of both values. + * + */ +struct batch_token { + uint64_t value = 0; + + constexpr inline batch_token() {} + explicit constexpr inline batch_token(uint32_t buffer_id) { id() = buffer_id; } + + /** + * Sequential id of the batch in the array of batches. + * + * The `id` field, in practice, stores not only the IO buffer address, but also an extra + * sequential "round" id. The latter identifies how many rounds through the batch ring buffer has + * already been done (computed from the the `seq_order_id` counter in the batch queue) and is used + * by `batch_queue_t::batch_status` below to compute the `slot_state`. This is to avoid the ABA + * atomic updates problem when using the ring buffer. + * + * There cannot be more IO buffers than the size of the ring buffer. The size of the ring buffer + * is always a power-of-two. Hence the IO buffer address needs only `log2(Size)` bits, and the + * rest is used for the ring buffer round id (see `batch_queue_t::make_seq_batch_id`). + * + */ + RAFT_INLINE_FUNCTION auto id() noexcept -> uint32_t& + { + return *(reinterpret_cast(&value) + kOffsetOfId); + } + /** + * How many queries are promised by the participating CPU threads (requesters). + * + * The CPU threads atomically increment this counter until its size reaches `max_batch_size`. + * + * Any (CPU or GPU thread) may atomically write to the highest byte of this value, which indicates + * that no one can commit to this batch anymore (e.g. the wait timeout is exceeded). + * Hence, the actual number of committed queries is `size_committed % 0x00ffffff`. + * + * The gather kernel cannot finish while `size_committed < max_batch_size`. + * + * NB: we use the trick of writing to the highest byte to allow GPU write atomically to the pinned + * host memory. This way, we don't need to use device RMW atomics on host memory, which are not + * available on a broad class of GPUs. If not this workaround, we could simply do atomic add/or + * with value 0x01000000. + */ + RAFT_INLINE_FUNCTION auto size_committed() noexcept -> uint32_t& + { + return *(reinterpret_cast(&value) + kOffsetOfSC); + } + + private: + /** Offset of the `id()` value in the token if it's interpreted as uint32_t[2]. */ + static constexpr inline uint32_t kOffsetOfId = CUVS_SYSTEM_LITTLE_ENDIAN; + /** Offset of the `size_committed()` value in the token if it's interpreted as uint32_t[2]. */ + static constexpr inline uint32_t kOffsetOfSC = 1 - kOffsetOfId; +}; +static_assert(sizeof(batch_token) == sizeof(uint64_t)); +static_assert(cuda::std::atomic::is_always_lock_free); + +/** + * The batch queue consists of several ring buffers and two counters determining where are the head + * and the tail of the queue in those buffers. + * + * There is an internal sequentially consistent order in the queue, defined by `seq_order_id` + * counter. The head and tail members define where the participants should look for full and + * empty slots in the queue respectively. + * + * The slots in the queue have their own states (see `slot_state` above). 
The states are updated + * concurrently in many threads, so the head and tail counters do not always accurately represent + * the actual compound state of the queue. + * + * `.head()` is where a host thread starts looking for a batch token. All slots earlier than + * returned by this method are not usable anymore (they batches are either "fully committed", + * dispatched, or emptied earlier). If a host thread determines that the current slot is not usable + * anymore, it increments the counter by calling `.pop()`. + * + * The tail is where a host thread reserves an empty slot to be filled-in by a GPU worker thread + * once it releases the owned IO buffer. There's no `.tail()` method, but `.push()` method returns + * the tail position (before advancing it). `.push()` blocks the host thread until it knows the slot + * isn't used by any other threads anymore (i.e. cleaned-up from the previous round). + * + * There's no strict relation between the head and the tail. + * Normally there is a single batch in the ring buffer being partially filled. It is followed by + * contiguous list of empty idle batches and reserved empty slots. The head and the tail loosely + * correspond to the beginning and the end of this sequence. + * + * Sometimes, the head can go further than the tail. This means all batches are busy and there are + * more threads committed to the slots that are not populated with the batches (and not even + * reserved for filling-in yet). + * + * + */ +template +struct batch_queue_t { + static constexpr uint32_t kSize = Size; + static constexpr uint32_t kMinElemSize = sizeof(uint32_t); + static_assert(cuda::std::atomic::is_always_lock_free, + "The value type must be lock-free."); + static_assert(cuda::std::atomic::is_always_lock_free, + "The value type must be lock-free."); + static_assert(cuda::std::atomic::is_always_lock_free, + "The value type must be lock-free."); + static_assert(raft::is_a_power_of_two(kSize), "The size must be a power-of-two for efficiency."); + + static constexpr auto kMemOrder = cuda::std::memory_order_relaxed; + + /** Type-safe synonym for the internal head & tail counters. */ + struct seq_order_id { + uint32_t value; + }; + + explicit batch_queue_t(const raft::resources& res, bool use_batch_sizes) noexcept + : tokens_{raft::make_pinned_vector, + uint32_t>(res, kSize)}, + rem_time_us_{ + raft::make_pinned_vector, uint32_t>( + res, kSize)}, + dispatch_sequence_id_(kSize), + batch_sizes_{ + use_batch_sizes + ? std::make_optional( + raft::make_pinned_vector, uint32_t>( + res, kSize)) + : std::nullopt} + { + tail_.store(0, kMemOrder); + head_.store(0, kMemOrder); + auto past_seq_id = seq_order_id{static_cast(-1)}; + for (uint32_t i = 0; i < kSize; i++) { + rem_time_us_(i).store(std::numeric_limits::max(), kMemOrder); + if (batch_sizes_.has_value()) { batch_sizes_.value()(i).store(0, kMemOrder); } + dispatch_sequence_id_[i].store(past_seq_id.value, kMemOrder); + tokens_(i).store(make_empty_token(past_seq_id), kMemOrder); + } + } + + /** + * Advance the tail position, ensure the slot is empty, and return the reference to the new slot. + * The calling side is responsible for filling-in the slot with an actual value at a later time. + * + * Conceptually, this method reserves a ring buffer slot on the host side, so that the GPU worker + * thread can return the IO buffer (filling the token slot) asynchronously. 
+ */ + inline auto push() -> seq_order_id + { + seq_order_id seq_id{tail_.fetch_add(1, kMemOrder)}; + auto& loc = token(seq_id); + auto ss = batch_status(loc.load(kMemOrder), seq_id); + /* [Note: very small waiting time] + + Only a few (dispatcher) threads are going to call this function at the same time as opposed to + potentially any number of threads waiting on new batches to arrive. + This is a performance-critical code path. + + Hence the small base sleep time. + */ + local_waiter till_empty{std::chrono::nanoseconds{1000}}; + while (ss == slot_state::kFull || ss == slot_state::kFullBusy || ss == slot_state::kEmptyBusy) { + // Wait till the slot becomes empty (doesn't matter future or past). + // The batch id is only ever updated in the scatter/gather kernels, which are the only source + // of truth whether a batch buffer is currently used by the GPU. + till_empty.wait(); + ss = batch_status(loc.load(kMemOrder), seq_id); + } + return seq_id; + } + + /** + * Return the offset of the given w.r.t. the tail of the queue. + * Negative value means the given slot is in the body of the queue and should be dispatched soon. + * Positive value means the given slot is ahead of the queue and should wait longer. + * + * That is the lower the value the higher the priority. + */ + [[nodiscard]] inline auto niceness(seq_order_id id) const noexcept -> int32_t + { + return static_cast(id.value - tail_.load(kMemOrder)); + } + + /** Get the reference to the first element in the queue. */ + inline auto head() noexcept -> seq_order_id + { + auto h = head_.load(kMemOrder); + // The head cannot go ahead of the tail by more than the queue buffer size. + // If the head is ahead by not more than kSize elements though, everything is fine; + // the slots too far ahead are protected by busy tokens. + local_waiter for_tail(std::chrono::nanoseconds{100000}); + while (static_cast(h - tail_.load(kMemOrder)) >= static_cast(kSize)) { + for_tail.wait(); + h = head_.load(kMemOrder); + } + return seq_order_id{h}; + } + + /** Batch commit state and IO buffer id (see `batch_token`) */ + inline auto token(seq_order_id id) -> cuda::atomic& + { + return tokens_(cache_friendly_idx(id.value)); + } + + /** + * How much time has this batch left for waiting. + * It is an approximate value by design - to minimize the synchronization between CPU and GPU. + * + * The clocks on GPU and CPU may have different values, so the running kernel and the CPU thread + * have different ideas on how much time is left. Rather than trying to synchronize the clocks, we + * maintain independent timers and accept the uncertainty. + * + * Access pattern: CPU write-only (producer); GPU read-only (consumer). + */ + inline auto rem_time_us(seq_order_id id) -> cuda::atomic& + { + return rem_time_us_(cache_friendly_idx(id.value)); + } + + /** + * The actual batch size - the final number of committed queries. + * This is only used if `conservative_dispatch = true`. + */ + inline auto batch_size(seq_order_id id) noexcept + -> cuda::atomic* + { + if (batch_sizes_.has_value()) { return &batch_sizes_.value()(cache_friendly_idx(id.value)); } + return nullptr; + } + + /** + * This value is updated by the host thread after it submits the job completion event to indicate + * to other threads can wait on the event to get the results back. + * Other threads get the value from the batch queue and compare that value against this atomic. + * + * Access pattern: CPU-only; dispatching thread writes the id once, other threads wait on it. 
+ */ + inline auto dispatch_sequence_id(seq_order_id id) -> cuda::std::atomic& + { + return dispatch_sequence_id_[cache_friendly_idx(id.value)]; + } + + /** + * An `atomicMax` on the queue head in disguise. + * This makes the given batch slot and all prior slots unreachable (not possible to commit). + */ + inline void pop(seq_order_id id) noexcept + { + const auto desired = id.value + 1; + auto observed = id.value; + while (observed < desired && + !head_.compare_exchange_weak(observed, desired, kMemOrder, kMemOrder)) {} + } + + static constexpr inline auto batch_id(batch_token token) noexcept -> uint32_t + { + return token.id() & kCounterLocMask; + } + + /** + * Construct a token that is interpreted as having been emptied in the current round + * (the round is derived from seq_id). + * + * NB: "round" is the number of times the queue counters went over the whole ring buffer. + * It's used to avoid the ABA problem for atomic token updates. + */ + static constexpr inline auto make_empty_token(seq_order_id seq_id) noexcept -> batch_token + { + // Modify the seq_id to identify that the token slot is empty + auto empty_round = static_cast(slot_state::kEmptyPast) * kSize; + auto empty_round_id = seq_order_id{seq_id.value + empty_round}; + // Id of empty slot is ignored and can be anything + auto empty_id = kCounterLocMask; + return batch_token{make_seq_batch_id(empty_round_id, empty_id)}; + } + + /** + * Construct a sequential batch id by combining the current round and the real batch id. + * + * The "round" part gives a hint when the token slot was filled-in to avoid the ABA problem + * (see above). + */ + static constexpr inline auto make_seq_batch_id(seq_order_id seq_id, uint32_t batch_id) noexcept + -> uint32_t + { + return seq_round(seq_id) | batch_id; + } + + /** + * Get the state of the batch slot w.r.t. the given seq_order_id counter. + * This gives the information whether the slot is emptied/filled by another thread and whether + * that thread is ahead or behind the current thread. + * By introducing these future/past flavours of states we solve the ABA problem for atomic updates + * of the ring buffer slots. + */ + static inline auto batch_status(batch_token token, seq_order_id seq_id) -> slot_state + { + /* + The "round" part of the id is just a seq_id without the low bits. + Essentially, we comparing here seq_ids of two threads: the one that wrote to the slot in the + past and the one reads from it now. + + `kSize` determines the number of bits we use for the IO buffer id and for the round id. + */ + auto v = + static_cast(seq_round(token) - seq_round(seq_id)) / static_cast(kSize); + if (v < static_cast(slot_state::kFullBusy)) { RAFT_FAIL("Invalid batch state %d", v); } + if (v < static_cast(slot_state::kEmptyBusy)) { + return static_cast(std::min(v, static_cast(slot_state::kFullPast))); + } + return static_cast(std::min(v, static_cast(slot_state::kEmptyPast))); + } + + private: + alignas(kCacheLineBytes) cuda::std::atomic tail_{}; + alignas(kCacheLineBytes) cuda::std::atomic head_{}; + + alignas(kCacheLineBytes) + raft::pinned_vector, uint32_t> tokens_; + raft::pinned_vector, uint32_t> rem_time_us_; + std::vector> dispatch_sequence_id_; + std::optional, uint32_t>> + batch_sizes_; + + /* [Note: cache-friendly indexing] + To avoid false sharing, the queue pushes and pops values not sequentially, but with an + increment that is larger than the cache line size. + Hence we introduce the `kCounterIncrement > kCacheLineBytes`. 
+ However, to make sure all indices are used, we choose the increment to be coprime with the + buffer size. We also require that the buffer size is a power-of-two for two reasons: + 1) Fast modulus operation - reduces to binary `and` (with `kCounterLocMask`). + 2) Easy to ensure GCD(kCounterIncrement, kSize) == 1 by construction + (see the definition below). + */ + static constexpr uint32_t kElemsPerCacheLine = + raft::div_rounding_up_safe(kCacheLineBytes, kMinElemSize); + static constexpr uint32_t kCounterIncrement = raft::bound_by_power_of_two(kElemsPerCacheLine) + 1; + static constexpr uint32_t kCounterLocMask = kSize - 1; + // These props hold by design, but we add them here as a documentation and a sanity check. + static_assert( + kCounterIncrement * kMinElemSize >= kCacheLineBytes, + "The counter increment should be larger than the cache line size to avoid false sharing."); + static_assert( + std::gcd(kCounterIncrement, kSize) == 1, + "The counter increment and the size must be coprime to allow using all of the queue slots."); + /** Map the sequential index onto cache-friendly strided index. */ + static constexpr inline auto cache_friendly_idx(uint32_t source_idx) noexcept -> uint32_t + { + return (source_idx * kCounterIncrement) & kCounterLocMask; + } + + /** The "round": the number of times the queue counter went over the whole ring buffer. */ + static constexpr inline auto seq_round(seq_order_id id) noexcept -> uint32_t + { + return id.value & ~kCounterLocMask; + } + + /** The "round": the number of times the queue counter went over the whole ring buffer. */ + static constexpr inline auto seq_round(batch_token token) noexcept -> uint32_t + { + return token.id() & ~kCounterLocMask; + } +}; + +template +struct alignas(kCacheLineBytes) request_pointers { + /** + * A pointer to `dim` values of a single query (input). + * + * Serves as a synchronization point between the CPU thread (producer) and a GPU block in the + * `gather_inputs` kernel (consumer). + */ + cuda::atomic query{nullptr}; + /** A pointer to `k` nearest neighbors (output) */ + IdxT* neighbors{nullptr}; + /** A pointer to distances of `k` nearest neighbors (output) */ + float* distances{nullptr}; +}; + +/** + * Check the current timestamp at the moment of construction and repeatedly compare the elapsed time + * to the timeout value provided by the host (passed via an atomic). + * + * This is used in the gather inputs kernel to make it stop waiting for new queries in a batch + * once the deadline is reached. + */ +struct gpu_time_keeper { + /** + * @param[in] cpu_provided_remaining_time_us + * a pointer to a shared atomic, represent the remaining waiting time in microseconds. + * Note, the remaining time is updated atomically by each participating host thread in their + * "private coordinate systems". That's ok, we don't expect a single reference time for all host + * and device threads. + * We tolerate the errors coming from the time difference between the host thread writing their + * remaining waiting time and the GPU thread reading that value. + */ + RAFT_DEVICE_INLINE_FUNCTION explicit gpu_time_keeper( + cuda::atomic* cpu_provided_remaining_time_us) + : cpu_provided_remaining_time_us_{cpu_provided_remaining_time_us} + { + update_timestamp(); + } + + /** + * Check whether the deadline is not reached yet: + * 1) Compare the internal clock against the last-read deadline value + * 2) Read the deadline value from the host-visible atomic and check the internal clock again. 
+ */ + RAFT_DEVICE_INLINE_FUNCTION auto has_time() noexcept -> bool + { + if (timeout) { return false; } + update_local_remaining_time(); + if (local_remaining_time_us_ <= 0) { + timeout = true; + return false; + } + update_cpu_provided_remaining_time(); + if (local_remaining_time_us_ <= 0) { + timeout = true; + return false; + } + return true; + } + + private: + cuda::atomic* cpu_provided_remaining_time_us_; + uint64_t timestamp_ns_ = 0; + int32_t local_remaining_time_us_ = std::numeric_limits::max(); + bool timeout = false; + + RAFT_DEVICE_INLINE_FUNCTION void update_timestamp() noexcept + { + asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(timestamp_ns_)); + } + + RAFT_DEVICE_INLINE_FUNCTION void update_local_remaining_time() noexcept + { + auto prev_timestamp = timestamp_ns_; + update_timestamp(); + // subtract the time passed since the last check + // (assuming local time is updated every time timestamp is read) + local_remaining_time_us_ -= static_cast((timestamp_ns_ - prev_timestamp) / 1000ull); + } + + RAFT_DEVICE_INLINE_FUNCTION void update_cpu_provided_remaining_time() noexcept + { + local_remaining_time_us_ = + std::min(local_remaining_time_us_, + cpu_provided_remaining_time_us_->load(cuda::std::memory_order_relaxed)); + } +}; + +/** + * Copy the queries from the submitted pointers to the batch store, one query per block. + * Upon completion of this kernel, the submitted queries are all in the contiguous buffer + * `batch_queries`. + * + * Block size: (n, 1, 1) any number of threads copying a single row of data. + * Grid size: (max_batch_size, 1, 1) - one block per query + * + * Note, we view the incoming queries and the batch as going through multiple stages: + * 1) A host thread "commits" a query: it reserves a slot for the query in the batch and promises + * to fill-in the corresponding query pointer. + * 2) A host thread "submits" the query: it fills-in the pointer to the query data in the reserved + * slot. + * 3) This kernel copies the query data to the contiguous query buffer owned by the batch. + * + * The batch is "fully committed" when the number of committed queries reaches the maximum batch + * size (all slots are reserved). Committing, submitting, and copying of the queries is somewhat + * overlapped among multiple host and device threads. Only the copying happens in a CUDA stream in + * this kernel, and the upstream search is dispatched right after this kernel (in the same stream). + * + */ +template +RAFT_KERNEL gather_inputs( + raft::device_matrix_view batch_queries, + raft::pinned_vector_view, uint32_t> request_ptrs, + /* The remaining time may be updated on the host side: a thread with a tighter deadline may reduce + it (but not increase). */ + cuda::atomic* remaining_time_us, + /* The token contains the current number of queries committed and is cleared in this kernel. */ + cuda::atomic* batch_token_ptr, + /* The host-visible batch size counter (used in `conservative_dispatch`). */ + cuda::atomic* batch_size_out, + /** + * The token value considered empty depends on the round over the ring buffer + * (which is defined by the seq_order_id) + */ + batch_token empty_token_value, + /** + * The counter is used to find the last CTA to finish and to share the batch size with the + * scatter_inputs kernel. 
+ */ + cuda::atomic* kernel_progress_counter) +{ + const uint32_t query_id = blockIdx.x; + __shared__ const T* query_ptr; + + if (threadIdx.x == 0) { + query_ptr = nullptr; + + // NB: we have to read/write to `batch_token_ptr`, `bs_committed`, and `batch_fully_committed` + // using volatile assembly ops, because otherwise the compiler seems to fail to understand that + // this is the same location in memory. The order of reads in writes here is extremely + // important, as it involves multiple host and device threads (the host threads do RMW atomic + // increments on the commit counter). + volatile uint32_t* bs_committed = + reinterpret_cast(batch_token_ptr) + 1 - CUVS_SYSTEM_LITTLE_ENDIAN; + volatile uint8_t* batch_fully_committed = + reinterpret_cast(bs_committed) + (CUVS_SYSTEM_LITTLE_ENDIAN * 3); + + gpu_time_keeper runtime{remaining_time_us}; + bool committed = false; // if the query is committed, we have to wait for it to arrive + auto& request_query_ptr = request_ptrs(query_id).query; + while (true) { + query_ptr = request_query_ptr.load(cuda::std::memory_order_acquire); + if (query_ptr != nullptr) { + // The query is submitted to this block's slot; erase the pointer buffer for future use and + // exit the loop. + request_query_ptr.store(nullptr, cuda::std::memory_order_relaxed); + break; + } + // The query hasn't been submitted, but is already committed; other checks may be skipped + if (committed) { continue; } + // Check if the query is committed + uint32_t committed_count; + asm volatile("ld.volatile.global.u32 %0, [%1];" + : "=r"(committed_count) + : "l"(bs_committed) + : "memory"); + committed = (committed_count & 0x00ffffff) > query_id; + if (committed) { continue; } + // If the query is not committed, but the batch is past the deadline, we exit without copying + // the query + if (committed_count > 0x00ffffff) { break; } + // The query hasn't been submitted yet; check if we're past the deadline + if (runtime.has_time()) { continue; } + // Otherwise, let the others know time is out + // Set the highest byte of the commit counter to 1 (thus avoiding RMW atomic) + // This prevents any more CPU threads from committing to this batch. 
+ asm volatile("st.volatile.global.u8 [%0], %1;" + : + : "l"(batch_fully_committed), "r"(1) + : "memory"); + asm volatile("ld.volatile.global.u32 %0, [%1];" + : "=r"(committed_count) + : "l"(bs_committed) + : "memory"); + committed = (committed_count & 0x00ffffff) > query_id; + if (committed) { continue; } + break; + } + auto progress = kernel_progress_counter->fetch_add(1, cuda::std::memory_order_acq_rel) + 1; + if (progress >= gridDim.x) { + // read the last value of the committed count to know the batch size for sure + uint32_t committed_count; + asm volatile("ld.volatile.global.u32 %0, [%1];" + : "=r"(committed_count) + : "l"(bs_committed) + : "memory"); + committed_count &= 0x00ffffff; // Clear the timeout bit + if (batch_size_out != nullptr) { + // Inform the dispatcher about the final batch size if `conservative_dispatch` is enabled + batch_size_out->store(committed_count, cuda::std::memory_order_relaxed); + } + // store the batch size in the progress counter, so we can read it in the scatter kernel + kernel_progress_counter->store(committed_count, cuda::std::memory_order_relaxed); + // Clear the batch token slot, so it can be re-used by others + asm volatile("st.volatile.global.u64 [%0], %1;" + : + : "l"(reinterpret_cast(batch_token_ptr)), + "l"(reinterpret_cast(empty_token_value)) + : "memory"); + } + } + // The block waits till the leading thread gets the query pointer + cooperative_groups::this_thread_block().sync(); + auto query_ptr_local = query_ptr; + if (query_ptr_local == nullptr) { return; } + // block-wide copy input query + auto dim = batch_queries.extent(1); + for (uint32_t i = threadIdx.x; i < dim; i += blockDim.x) { + batch_queries(query_id, i) = query_ptr_local[i]; + } +} + +/** Copy the results of the search back to the requesters. */ +template +RAFT_KERNEL scatter_outputs( + raft::pinned_vector_view, uint32_t> request_ptrs, + raft::device_matrix_view batch_neighbors, + raft::device_matrix_view batch_distances, + cuda::atomic* kernel_progress_counter, + cuda::atomic* next_token, + uint32_t batch_id) +{ + __shared__ uint32_t batch_size; + if (threadIdx.x == 0 && threadIdx.y == 0) { + batch_size = kernel_progress_counter->exchange(0, cuda::std::memory_order_relaxed); + } + // Copy output + cooperative_groups::this_thread_block().sync(); + auto k = batch_neighbors.extent(1); + for (uint32_t i = threadIdx.y; i < batch_size; i += blockDim.y) { + auto* request_neighbors = request_ptrs(i).neighbors; + auto* request_distances = request_ptrs(i).distances; + for (uint32_t j = threadIdx.x; j < k; j += blockDim.x) { + request_neighbors[j] = batch_neighbors(i, j); + request_distances[j] = batch_distances(i, j); + } + } + // Clear the batch state after all threads copied the data, so the batch can be reused + cuda::atomic_thread_fence(cuda::std::memory_order_release, cuda::thread_scope_system); + cooperative_groups::this_thread_block().sync(); + if (threadIdx.x != 0 || threadIdx.y != 0) { return; } + reinterpret_cast*>( + &reinterpret_cast(next_token)->id()) + ->store(batch_id, cuda::std::memory_order_relaxed); +} + +/** + * Batch runner is shared among the users of the `dynamic_batching::index` (i.e. the index can be + * copied, but the copies hold shared pointers to a single batch runner). + * + * Constructor and destructor of this class do not need to be thread-safe, as their execution is + * guaranteed to happen in one thread by the holding shared pointer. + * + * The search function must be thread-safe. 
We only have to pay attention to the `mutable` members + * though, because the function is marked const. + */ +template +class batch_runner { + public: + constexpr static uint32_t kMaxNumQueues = 256; + + using batch_queue = batch_queue_t; + using seq_order_id = typename batch_queue::seq_order_id; + + // Save the parameters and the upstream batched search function to invoke + template + batch_runner(const raft::resources& res, + const dynamic_batching::index_params& params, + const Upstream& upstream_index, + const typename Upstream::search_params_type& upstream_params, + upstream_search_type_const* upstream_search, + const cuvs::neighbors::filtering::base_filter* sample_filter) + : res_{res}, + upstream_search_{[&upstream_index, upstream_search, upstream_params, sample_filter]( + raft::resources const& res, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) { + /* Note: passing sample_filter by pointer + + Ideally, dynamic batching would capture the filter by value. Unfortunately, one cannot use + the copy constructor of the `base_filter` (it would erase the actual filter type). + Therefore, we can only pass the filter by pointer or reference and require the user to keep + the filter alive for the lifetime of the dynamic batching index. + This, however, may lead to a segfault when the user doesn't provide the filter argument and + the argument is passed by reference: the lifetime of the none_sample_filter default argument + is limited to the search function call, so it is destroyed while the dynamic batching index + is still alive. + Hence the solution is to pass the filter by pointer and default it to nullptr. + */ + if (sample_filter == nullptr) { + using base_filter_type = cuvs::neighbors::filtering::base_filter; + const auto none_filter = cuvs::neighbors::filtering::none_sample_filter{}; + return upstream_search(res, + upstream_params, + upstream_index, + queries, + neighbors, + distances, + static_cast(none_filter)); + + } else { + return upstream_search( + res, upstream_params, upstream_index, queries, neighbors, distances, *sample_filter); + } + }}, + k_{uint32_t(params.k)}, + dim_{uint32_t(upstream_index.dim())}, + max_batch_size_{uint32_t(params.max_batch_size)}, + n_queues_{uint32_t(params.n_queues)}, + batch_queue_{res_, params.conservative_dispatch}, + completion_events_(n_queues_), + input_extents_{n_queues_, max_batch_size_, dim_}, + output_extents_{n_queues_, max_batch_size_, k_}, + queries_{raft::make_device_mdarray(res_, input_extents_)}, + neighbors_{raft::make_device_mdarray(res_, output_extents_)}, + distances_{raft::make_device_mdarray(res_, output_extents_)}, + kernel_progress_counters_{ + raft::make_device_vector>( + res_, n_queues_)}, + request_ptrs_{raft::make_pinned_matrix, uint32_t>( + res_, n_queues_, max_batch_size_)} + { + RAFT_CUDA_TRY(cudaMemsetAsync( + kernel_progress_counters_.data_handle(), + 0, + sizeof(*kernel_progress_counters_.data_handle()) * kernel_progress_counters_.size(), + raft::resource::get_cuda_stream(res_))); + // Make sure to initialize the atomic values in the batch_state structs. 
+ for (uint32_t i = 0; i < n_queues_; i++) { + auto seq_id = batch_queue_.push(); + batch_queue_.token(seq_id).store(batch_token{batch_queue::make_seq_batch_id(seq_id, i)}); + // Make sure to initialize query pointers, because they are used for synchronization + for (uint32_t j = 0; j < max_batch_size_; j++) { + new (&request_ptrs_(i, j)) request_pointers{}; + } + } + } + + // A workaround for algos, which have non-const `index` type in their arguments + template + batch_runner(const raft::resources& res, + const dynamic_batching::index_params& params, + const Upstream& upstream_index, + const typename Upstream::search_params_type& upstream_params, + upstream_search_type* upstream_search, + const cuvs::neighbors::filtering::base_filter* sample_filter) + : batch_runner{ + res, + params, + upstream_index, + upstream_params, + reinterpret_cast*>(upstream_search), + sample_filter} + { + } + + void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) const + { + uint32_t n_queries = queries.extent(0); + if (n_queries >= max_batch_size_) { + return upstream_search_(res, queries, neighbors, distances); + } + + if (neighbors.extent(1) != int64_t(k_)) { + // TODO: the check can be relaxed to `neighbors.extent(1) > int64_t(k_)`; + // this, however, would require an extra bounds check per-query in the scatter kernel. + RAFT_LOG_WARN( + "The requested number of neighbors (%zd) doesn't match the configured " + "dynamic_batching::index_params::k (%u); dynamic batching is disabled for the request.", + neighbors.extent(1), + k_); + return upstream_search_(res, queries, neighbors, distances); + } + + auto deadline = std::chrono::system_clock::now() + + std::chrono::nanoseconds(size_t(params.dispatch_timeout_ms * 1000000.0)); + + int64_t local_io_offset = 0; + batch_token batch_token_observed{0}; + local_waiter to_commit{std::chrono::nanoseconds(size_t(params.dispatch_timeout_ms * 3e5)), + local_waiter::kNonSleepIterations}; + while (true) { + const auto seq_id = batch_queue_.head(); + const auto commit_result = try_commit(seq_id, n_queries); + // The bool (busy or not) returned if no queries were committed: + if (std::holds_alternative(commit_result)) { + // Pause if the system is busy + // (otherwise the progress is guaranteed due to update of the head counter) + if (std::get(commit_result)) { to_commit.wait(); } + continue; // Try to get a new batch token + } + batch_token_observed = std::get(std::get<0>(commit_result)); + const auto queries_committed = std::get(std::get<0>(commit_result)); + const auto batch_offset = batch_token_observed.size_committed(); + auto& batch_token_ref = batch_queue_.token(seq_id); + auto& rem_time_us_ref = batch_queue_.rem_time_us(seq_id); + auto& dispatch_sequence_id_ref = batch_queue_.dispatch_sequence_id(seq_id); + auto* batch_size_ptr = batch_queue_.batch_size(seq_id); + // sleep for 1/10 of deadline time or more + // (if couldn't get the value in the first few iterations). + local_waiter till_full{std::chrono::nanoseconds(size_t(params.dispatch_timeout_ms * 1e5)), + batch_queue_.niceness(seq_id)}; + while (batch_queue::batch_status(batch_token_observed, seq_id) != slot_state::kFull) { + /* Note: waiting for batch IO buffers + The CPU threads can commit to the incoming batches in the queue in advance (this happens in + try_commit). 
+ In this loop, a thread waits for the batch IO buffer to be released by a running search on + the GPU side (scatter_outputs kernel). Hence, this loop is engaged only if all buffers are + currently used, which suggests that the GPU is busy (or there's not enough IO buffers). + This also means the current search is not likely to meet the deadline set by the user. + + The scatter kernel returns its buffer id into an acquired slot in the batch queue; in this + loop we wait for that id to arrive. + + Generally, we want to waste as little as possible CPU cycles here to let other threads wait + on dispatch_sequence_id_ref below more efficiently. At the same time, we shouldn't use + `.wait()` here, because `.notify_all()` would have to come from GPU. + */ + till_full.wait(); + batch_token_observed = batch_token_ref.load(cuda::std::memory_order_acquire); + } + // Whether this thread is responsible for dispatching the batch. + bool is_dispatcher = batch_offset == 0; + auto stream = raft::resource::get_cuda_stream(res); + auto batch_id = batch_queue::batch_id(batch_token_observed); + auto request_ptrs = slice_2d(batch_id, request_ptrs_); + + if (is_dispatcher) { + // Conservatively initialize the remaining time + // TODO (achirkin): this initialization may happen after the other requesters update the + // time and thus erase their deadlines. + rem_time_us_ref.store(static_cast(params.dispatch_timeout_ms * 1000), + cuda::std::memory_order_relaxed); + // run the gather kernel before submitting the data to reduce the latency + gather_inputs<<>>( + slice_3d(batch_id, queries_), + request_ptrs, + &rem_time_us_ref, + &batch_token_ref, + batch_size_ptr, + // This indicates the empty token slot, which can only be used in the following round + batch_queue::make_empty_token(seq_id), + kernel_progress_counters_.data_handle() + batch_id); + } + + // *** Set the pointers to queries, neighbors, distances - query-by-query + for (uint32_t i = 0; i < queries_committed; i++) { + const auto o = local_io_offset + i; + auto& ptrs = request_ptrs(batch_offset + i); + ptrs.neighbors = neighbors.data_handle() + o * k_; + ptrs.distances = distances.data_handle() + o * k_; + ptrs.query.store(queries.data_handle() + o * dim_, cuda::std::memory_order_release); + } + + // Submit estimated remaining time + { + auto rem_time_us = static_cast( + std::max(0, (deadline - std::chrono::system_clock::now()).count()) / 1000); + rem_time_us_ref.fetch_min(rem_time_us, cuda::std::memory_order_relaxed); + } + + if (is_dispatcher) { + uint32_t batch_size = max_batch_size_; + if (batch_size_ptr != nullptr) { + // Block until the real batch size is available if conservative dispatch is used. 
+ local_waiter for_dispatch{ + std::chrono::nanoseconds(size_t(params.dispatch_timeout_ms * 1e5))}; + batch_size = batch_size_ptr->load(cuda::std::memory_order_relaxed); + while (batch_size == 0) { + for_dispatch.wait(); + batch_size = batch_size_ptr->load(cuda::std::memory_order_relaxed); + } + batch_size_ptr->store(0, cuda::std::memory_order_relaxed); + } + auto batch_neighbors = slice_3d(batch_id, neighbors_, batch_size); + auto batch_distances = slice_3d(batch_id, distances_, batch_size); + upstream_search_( + res, slice_3d(batch_id, queries_, batch_size), batch_neighbors, batch_distances); + auto next_seq_id = batch_queue_.push(); + auto& next_token_ref = batch_queue_.token(next_seq_id); + // next_batch_token); + auto bs = dim3(128, 8, 1); + scatter_outputs + <<<1, bs, 0, stream>>>(request_ptrs, + batch_neighbors, + batch_distances, + kernel_progress_counters_.data_handle() + batch_id, + &next_token_ref, + batch_queue::make_seq_batch_id(next_seq_id, batch_id)); + RAFT_CUDA_TRY(cudaEventRecord(completion_events_[batch_id].value(), stream)); + dispatch_sequence_id_ref.store(seq_id.value, cuda::std::memory_order_release); + dispatch_sequence_id_ref.notify_all(); + + } else { + // Wait till the dispatch_sequence_id counter is updated, which means the event is recorded + auto dispatched_id_observed = + dispatch_sequence_id_ref.load(cuda::std::memory_order_acquire); + while (static_cast(seq_id.value - dispatched_id_observed) > 0) { + dispatch_sequence_id_ref.wait(dispatched_id_observed, cuda::std::memory_order_relaxed); + dispatched_id_observed = dispatch_sequence_id_ref.load(cuda::std::memory_order_acquire); + } + // Now we can safely record the event + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, completion_events_[batch_id].value())); + } + + n_queries -= queries_committed; + + if (n_queries == 0) { return; } + // If not all queries were committed, continue in the loop. + // TODO: it could potentially be more efficient to first commit everything and only then + // submit the work/wait for the event + local_io_offset += queries_committed; + to_commit.reset( + local_waiter::kNonSleepIterations); // reset the waiter for the next iteration. + } + } + + private: + raft::resources res_; // Sic! Store by value to copy the resource. + std::function> upstream_search_; + uint32_t k_; + uint32_t dim_; + uint32_t max_batch_size_; + uint32_t n_queues_; + + mutable batch_queue batch_queue_; + std::vector completion_events_; + + using batch_extents = raft::extent_3d; + batch_extents input_extents_; + batch_extents output_extents_; + + mutable raft::device_mdarray queries_; + mutable raft::device_mdarray neighbors_; + mutable raft::device_mdarray distances_; + mutable raft::device_vector> + kernel_progress_counters_; + + mutable raft::pinned_matrix, uint32_t, raft::row_major> request_ptrs_; + + /** + * Try to commit n_queries at most; returns the last observed batch_token (where `size_committed` + * represents offset at which new queries are committed if successful), the number of committed + * queries, or whether the ring buffer appears to be busy (on unsuccessful commit). + */ + auto try_commit(seq_order_id seq_id, uint32_t n_queries) const + -> std::variant, bool> + { + auto& batch_token_ref = batch_queue_.token(seq_id); + batch_token batch_token_observed = batch_token_ref.load(cuda::std::memory_order_relaxed); + batch_token batch_token_updated; + slot_state token_status; + do { + // The interpretation of the token status depends on the current seq_order_id and a similar + // counter in the token. 
This is to prevent conflicts when too many parallel requests wrap + // over the whole ring buffer (batch_queue_t). + token_status = batch_queue::batch_status(batch_token_observed, seq_id); + // Busy status means the current thread is a whole ring buffer ahead of the token. + // The thread should wait for the rest of the system. + if (token_status == slot_state::kFullBusy || token_status == slot_state::kEmptyBusy) { + return true; + } + // This branch checks if the token was recently filled or dispatched. + // This means the head counter of the ring buffer is slightly outdated. + if (token_status == slot_state::kEmptyPast || token_status == slot_state::kFullPast || + batch_token_observed.size_committed() >= max_batch_size_) { + batch_queue_.pop(seq_id); + return false; + } + batch_token_updated = batch_token_observed; + batch_token_updated.size_committed() = + std::min(batch_token_observed.size_committed() + n_queries, max_batch_size_); + } while (!batch_token_ref.compare_exchange_weak(batch_token_observed, + batch_token_updated, + cuda::std::memory_order_acq_rel, + cuda::std::memory_order_relaxed)); + if (batch_token_updated.size_committed() >= max_batch_size_) { + // The batch is already full, let's try to pop it from the queue + // (if nobody has done so already) + batch_queue_.pop(seq_id); + } + return std::make_tuple( + batch_token_observed, + batch_token_updated.size_committed() - batch_token_observed.size_committed()); + } +}; + +} // namespace cuvs::neighbors::dynamic_batching::detail diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp index ce1e03264..e129d23e8 100644 --- a/cpp/src/neighbors/detail/hnsw.hpp +++ b/cpp/src/neighbors/detail/hnsw.hpp @@ -22,9 +22,63 @@ #include #include #include +#include namespace cuvs::neighbors::hnsw::detail { +// Multithreaded executor +// The helper function is copied from the hnswlib repository +// as for some reason, adding vectors to the hnswlib index does not +// work well with omp parallel for +template +inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn) +{ + if (numThreads <= 0) { numThreads = std::thread::hardware_concurrency(); } + + if (numThreads == 1) { + for (size_t id = start; id < end; id++) { + fn(id, 0); + } + } else { + std::vector threads; + std::atomic current(start); + + // keep track of exceptions in threads + // https://stackoverflow.com/a/32428427/1713196 + std::exception_ptr lastException = nullptr; + std::mutex lastExceptMutex; + + for (size_t threadId = 0; threadId < numThreads; ++threadId) { + threads.push_back(std::thread([&, threadId] { + while (true) { + size_t id = current.fetch_add(1); + + if (id >= end) { break; } + + try { + fn(id, threadId); + } catch (...) { + std::unique_lock lastExcepLock(lastExceptMutex); + lastException = std::current_exception(); + /* + * This will work even when current is the largest value that + * size_t can fit, because fetch_add returns the previous value + * before the increment (what will result in overflow + * and produce 0 instead of current + 1). + */ + current = end; + break; + } + } + })); + } + for (auto& thread : threads) { + thread.join(); + } + if (lastException) { std::rethrow_exception(lastException); } + } +} + template struct hnsw_dist_t { using type = void; @@ -54,9 +108,10 @@ struct index_impl : index { * @param[in] filepath path to the index * @param[in] dim dimensions of the training dataset * @param[in] metric distance metric to search. 
Supported metrics ("L2Expanded", "InnerProduct") + * @param[in] hierarchy hierarchy used for upper HNSW layers */ - index_impl(const std::string& filepath, int dim, cuvs::distance::DistanceType metric) - : index{dim, metric} + index_impl(int dim, cuvs::distance::DistanceType metric, HnswHierarchy hierarchy) + : index{dim, metric, hierarchy} { if constexpr (std::is_same_v) { if (metric == cuvs::distance::DistanceType::L2Expanded) { @@ -71,11 +126,6 @@ struct index_impl : index { } RAFT_EXPECTS(space_ != nullptr, "Unsupported metric type was used"); - - appr_alg_ = std::make_unique::type>>( - space_.get(), filepath); - - appr_alg_->base_layer_only = true; } /** @@ -88,14 +138,32 @@ struct index_impl : index { */ void set_ef(int ef) const override { appr_alg_->ef_ = ef; } + /** + @brief Set index + */ + void set_index(std::unique_ptr::type>>&& index) + { + appr_alg_ = std::move(index); + } + + /** + @brief Get space + */ + auto get_space() const -> hnswlib::SpaceInterface::type>* + { + return space_.get(); + } + private: std::unique_ptr::type>> appr_alg_; std::unique_ptr::type>> space_; }; -template -std::unique_ptr> from_cagra(raft::resources const& res, - const cuvs::neighbors::cagra::index& cagra_index) +template +std::enable_if_t>> from_cagra( + raft::resources const& res, + const index_params& params, + const cuvs::neighbors::cagra::index& cagra_index) { std::random_device dev; std::mt19937 rng(dev()); @@ -103,13 +171,125 @@ std::unique_ptr> from_cagra(raft::resources const& res, auto uuid = std::to_string(dist(rng)); std::string filepath = "/tmp/" + uuid + ".bin"; cuvs::neighbors::cagra::serialize_to_hnswlib(res, filepath, cagra_index); + index* hnsw_index = nullptr; cuvs::neighbors::hnsw::deserialize( - res, filepath, cagra_index.dim(), cagra_index.metric(), &hnsw_index); + res, params, filepath, cagra_index.dim(), cagra_index.metric(), &hnsw_index); std::filesystem::remove(filepath); return std::unique_ptr>(hnsw_index); } +template +std::enable_if_t>> from_cagra( + raft::resources const& res, + const index_params& params, + const cuvs::neighbors::cagra::index& cagra_index, + std::optional> dataset) +{ + // auto host_dataset = raft::make_host_matrix(dataset.extent(0), dataset.extent(1)); + auto host_dataset = raft::make_host_matrix(0, 0); + raft::host_matrix_view host_dataset_view( + host_dataset.data_handle(), host_dataset.extent(0), host_dataset.extent(1)); + if (dataset.has_value()) { + host_dataset_view = dataset.value(); + } else { + // move dataset to host, remove padding + auto cagra_dataset = cagra_index.dataset(); + host_dataset = + raft::make_host_matrix(cagra_dataset.extent(0), cagra_dataset.extent(1)); + RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(), + sizeof(T) * host_dataset.extent(1), + cagra_dataset.data_handle(), + sizeof(T) * cagra_dataset.stride(0), + sizeof(T) * host_dataset.extent(1), + cagra_dataset.extent(0), + cudaMemcpyDefault, + raft::resource::get_cuda_stream(res))); + raft::resource::sync_stream(res); + host_dataset_view = host_dataset.view(); + } + // build upper layers of hnsw index + auto hnsw_index = + std::make_unique>(cagra_index.dim(), cagra_index.metric(), hierarchy); + auto appr_algo = std::make_unique::type>>( + hnsw_index->get_space(), + host_dataset_view.extent(0), + cagra_index.graph().extent(1) / 2, + params.ef_construction); + appr_algo->base_layer_init = false; // tell hnswlib to build upper layers only + ParallelFor(0, host_dataset_view.extent(0), params.num_threads, [&](size_t i, size_t threadId) { + 
appr_algo->addPoint((void*)(host_dataset_view.data_handle() + i * host_dataset_view.extent(1)), + i); + }); + appr_algo->base_layer_init = true; // reset to true to allow addition of new points + + // move cagra graph to host + auto graph = cagra_index.graph(); + auto host_graph = + raft::make_host_matrix(graph.extent(0), graph.extent(1)); + raft::copy(host_graph.data_handle(), + graph.data_handle(), + graph.size(), + raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + +// copy cagra graph to hnswlib base layer +#pragma omp parallel for + for (size_t i = 0; i < static_cast(host_graph.extent(0)); ++i) { + auto ll_i = appr_algo->get_linklist0(i); + appr_algo->setListCount(ll_i, host_graph.extent(1)); + auto* data = (uint32_t*)(ll_i + 1); + for (size_t j = 0; j < static_cast(host_graph.extent(1)); ++j) { + data[j] = host_graph(i, j); + } + } + + hnsw_index->set_index(std::move(appr_algo)); + return hnsw_index; +} + +template +std::unique_ptr> from_cagra( + raft::resources const& res, + const index_params& params, + const cuvs::neighbors::cagra::index& cagra_index, + std::optional> dataset) +{ + if (params.hierarchy == HnswHierarchy::NONE) { + return from_cagra(res, params, cagra_index); + } else if (params.hierarchy == HnswHierarchy::CPU) { + return from_cagra(res, params, cagra_index, dataset); + } + { + RAFT_FAIL("Unsupported hierarchy type"); + } +} + +template +void extend(raft::resources const& res, + const extend_params& params, + raft::host_matrix_view additional_dataset, + index& idx) +{ + auto* hnswlib_index = reinterpret_cast::type>*>( + const_cast(idx.get_index())); + auto current_element_count = hnswlib_index->getCurrentElementCount(); + auto new_element_count = additional_dataset.extent(0); + auto num_threads = params.num_threads == 0 ? 
std::thread::hardware_concurrency() + : static_cast(params.num_threads); + + hnswlib_index->resizeIndex(current_element_count + new_element_count); + ParallelFor(current_element_count, + current_element_count + new_element_count, + num_threads, + [&](size_t i, size_t threadId) { + hnswlib_index->addPoint( + (void*)(additional_dataset.data_handle() + + (i - current_element_count) * additional_dataset.extent(1)), + i); + }); +} + template void get_search_knn_results(hnswlib::HierarchicalNSW::type> const* idx, const T* query, @@ -171,14 +351,28 @@ void search(raft::resources const& res, } } +template +void serialize(raft::resources const& res, const std::string& filename, const index& idx) +{ + auto* hnswlib_index = reinterpret_cast::type>*>( + const_cast(idx.get_index())); + hnswlib_index->saveIndex(filename); +} + template void deserialize(raft::resources const& res, + const index_params& params, const std::string& filename, int dim, cuvs::distance::DistanceType metric, index** idx) { - *idx = new detail::index_impl(filename, dim, metric); + auto hnsw_index = std::make_unique>(dim, metric, params.hierarchy); + auto appr_algo = std::make_unique::type>>( + hnsw_index->get_space(), filename); + if (params.hierarchy == HnswHierarchy::NONE) { appr_algo->base_layer_only = true; } + hnsw_index->set_index(std::move(appr_algo)); + *idx = hnsw_index.release(); } } // namespace cuvs::neighbors::hnsw::detail diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh index 8c5767c50..c62a52540 100644 --- a/cpp/src/neighbors/detail/nn_descent.cuh +++ b/cpp/src/neighbors/detail/nn_descent.cuh @@ -16,42 +16,42 @@ #pragma once -#include - #include "ann_utils.cuh" #include "cagra/device_common.hpp" + +#include +#include + #include +#include #include #include +#include +#include +#include #include #include - +#include +#include #include // raft::util::arch::SM_* #include #include #include #include -#include +#include + #include -#include -#include -#include -#include -#include #include #include #include +#include #include #include namespace cuvs::neighbors::nn_descent::detail { -static const std::string RAFT_NAME = "raft"; -using pinned_memory_resource = thrust::universal_host_pinned_memory_resource; -template -using pinned_memory_allocator = thrust::mr::stateless_resource_allocator; using DistData_t = float; constexpr int DEGREE_ON_DEVICE{32}; @@ -216,6 +216,8 @@ struct BuildConfig { // If internal_node_degree == 0, the value of node_degree will be assigned to it size_t max_iterations{50}; float termination_threshold{0.0001}; + size_t output_graph_degree{32}; + cuvs::distance::DistanceType metric{cuvs::distance::DistanceType::L2Expanded}; }; template @@ -300,6 +302,7 @@ class BloomFilter { template struct GnndGraph { + raft::resources const& res; static constexpr int segment_size = 32; InternalID_t* h_graph; @@ -310,16 +313,17 @@ struct GnndGraph { raft::host_matrix h_dists; - thrust::host_vector> h_graph_new; - thrust::host_vector> h_list_sizes_new; + raft::pinned_matrix h_graph_new; + raft::pinned_vector h_list_sizes_new; - thrust::host_vector> h_graph_old; - thrust::host_vector> h_list_sizes_old; + raft::pinned_matrix h_graph_old; + raft::pinned_vector h_list_sizes_old; BloomFilter bloom_filter; GnndGraph(const GnndGraph&) = delete; GnndGraph& operator=(const GnndGraph&) = delete; - GnndGraph(const size_t nrow, + GnndGraph(raft::resources const& res, + const size_t nrow, const size_t node_degree, const size_t internal_node_degree, const size_t num_samples); @@ -344,9 
+348,14 @@ class GNND { GNND(const GNND&) = delete; GNND& operator=(const GNND&) = delete; - void build(Data_t* data, const Index_t nrow, Index_t* output_graph); + void build(Data_t* data, + const Index_t nrow, + Index_t* output_graph, + bool return_distances, + DistData_t* output_distances); ~GNND() = default; using ID_t = InternalID_t; + void reset(raft::resources const& res); private: void add_reverse_edges(Index_t* graph_ptr, @@ -371,15 +380,14 @@ class GNND { raft::device_matrix graph_buffer_; raft::device_matrix dists_buffer_; - // TODO: Investigate using RMM/RAFT types https://github.com/rapidsai/raft/issues/1827 - thrust::host_vector> graph_host_buffer_; - thrust::host_vector> dists_host_buffer_; + raft::pinned_matrix graph_host_buffer_; + raft::pinned_matrix dists_host_buffer_; raft::device_vector d_locks_; - thrust::host_vector> h_rev_graph_new_; - thrust::host_vector> h_graph_old_; - thrust::host_vector> h_rev_graph_old_; + raft::pinned_matrix h_rev_graph_new_; + raft::pinned_matrix h_graph_old_; + raft::pinned_matrix h_rev_graph_old_; // int2.x is the number of forward edges, int2.y is the number of reverse edges raft::device_vector d_list_sizes_new_; @@ -448,11 +456,13 @@ __device__ __forceinline__ void load_vec(Data_t* vec_buffer, // TODO: Replace with RAFT utilities https://github.com/rapidsai/raft/issues/1827 /** Calculate L2 norm, and cast data to __half */ template -RAFT_KERNEL preprocess_data_kernel(const Data_t* input_data, - __half* output_data, - int dim, - DistData_t* l2_norms, - size_t list_offset = 0) +RAFT_KERNEL preprocess_data_kernel( + const Data_t* input_data, + __half* output_data, + int dim, + DistData_t* l2_norms, + size_t list_offset = 0, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded) { extern __shared__ char buffer[]; __shared__ float l2_norm; @@ -462,26 +472,32 @@ RAFT_KERNEL preprocess_data_kernel(const Data_t* input_data, load_vec(s_vec, input_data + blockIdx.x * dim, dim, dim, threadIdx.x % raft::warp_size()); if (threadIdx.x == 0) { l2_norm = 0; } __syncthreads(); - int lane_id = threadIdx.x % raft::warp_size(); - for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) { - int idx = step * raft::warp_size() + lane_id; - float part_dist = 0; - if (idx < dim) { - part_dist = s_vec[idx]; - part_dist = part_dist * part_dist; - } - __syncwarp(); - for (int offset = raft::warp_size() >> 1; offset >= 1; offset >>= 1) { - part_dist += __shfl_down_sync(raft::warp_full_mask(), part_dist, offset); + + if (metric == cuvs::distance::DistanceType::L2Expanded || + metric == cuvs::distance::DistanceType::CosineExpanded) { + int lane_id = threadIdx.x % raft::warp_size(); + for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) { + int idx = step * raft::warp_size() + lane_id; + float part_dist = 0; + if (idx < dim) { + part_dist = s_vec[idx]; + part_dist = part_dist * part_dist; + } + __syncwarp(); + for (int offset = raft::warp_size() >> 1; offset >= 1; offset >>= 1) { + part_dist += __shfl_down_sync(raft::warp_full_mask(), part_dist, offset); + } + if (lane_id == 0) { l2_norm += part_dist; } + __syncwarp(); } - if (lane_id == 0) { l2_norm += part_dist; } - __syncwarp(); } for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) { int idx = step * raft::warp_size() + threadIdx.x; if (idx < dim) { - if (l2_norms == nullptr) { + if (metric == cuvs::distance::DistanceType::InnerProduct) { + output_data[list_id * dim + idx] = input_data[(size_t)blockIdx.x * dim + idx]; + } else if 
(metric == cuvs::distance::DistanceType::CosineExpanded) { output_data[list_id * dim + idx] = (float)input_data[(size_t)blockIdx.x * dim + idx] / sqrt(l2_norm); } else { @@ -709,7 +725,8 @@ __launch_bounds__(BLOCK_SIZE, 4) DistData_t* dists, int graph_width, int* locks, - DistData_t* l2_norms) + DistData_t* l2_norms, + cuvs::distance::DistanceType metric) { #if (__CUDA_ARCH__ >= 700) using namespace nvcuda; @@ -821,8 +838,10 @@ __launch_bounds__(BLOCK_SIZE, 4) for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) { if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_new_size && i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) { - if (l2_norms == nullptr) { + if (metric == cuvs::distance::DistanceType::InnerProduct) { s_distances[i] = -s_distances[i]; + } else if (metric == cuvs::distance::DistanceType::CosineExpanded) { + s_distances[i] = 1.0 - s_distances[i]; } else { s_distances[i] = l2_norms[new_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] + l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] - @@ -900,8 +919,10 @@ __launch_bounds__(BLOCK_SIZE, 4) for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) { if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_old_size && i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) { - if (l2_norms == nullptr) { + if (metric == cuvs::distance::DistanceType::InnerProduct) { s_distances[i] = -s_distances[i]; + } else if (metric == cuvs::distance::DistanceType::CosineExpanded) { + s_distances[i] = 1.0 - s_distances[i]; } else { s_distances[i] = l2_norms[old_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] + l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] - @@ -971,19 +992,21 @@ int insert_to_ordered_list(InternalID_t* list, } // namespace template -GnndGraph::GnndGraph(const size_t nrow, +GnndGraph::GnndGraph(raft::resources const& res, + const size_t nrow, const size_t node_degree, const size_t internal_node_degree, const size_t num_samples) - : nrow(nrow), + : res(res), + nrow(nrow), node_degree(node_degree), num_samples(num_samples), bloom_filter(nrow, internal_node_degree / segment_size, 3), h_dists{raft::make_host_matrix(nrow, node_degree)}, - h_graph_new(nrow * num_samples), - h_list_sizes_new(nrow), - h_graph_old(nrow * num_samples), - h_list_sizes_old{nrow} + h_graph_new{raft::make_pinned_matrix(res, nrow, num_samples)}, + h_list_sizes_new{raft::make_pinned_vector(res, nrow)}, + h_graph_old{raft::make_pinned_matrix(res, nrow, num_samples)}, + h_list_sizes_old{raft::make_pinned_vector(res, nrow)} { // node_degree must be a multiple of segment_size; assert(node_degree % segment_size == 0); @@ -1001,9 +1024,9 @@ void GnndGraph::sample_graph_new(InternalID_t* new_neighbors, { #pragma omp parallel for for (size_t i = 0; i < nrow; i++) { - auto list_new = h_graph_new.data() + i * num_samples; - h_list_sizes_new[i].x = 0; - h_list_sizes_new[i].y = 0; + auto list_new = h_graph_new.data_handle() + i * num_samples; + h_list_sizes_new.data_handle()[i].x = 0; + h_list_sizes_new.data_handle()[i].y = 0; for (size_t j = 0; j < width; j++) { auto new_neighb_id = new_neighbors[i * width + j].id(); @@ -1011,8 +1034,8 @@ void GnndGraph::sample_graph_new(InternalID_t* new_neighbors, if (bloom_filter.check(i, new_neighb_id)) { continue; } bloom_filter.add(i, new_neighb_id); new_neighbors[i * width + j].mark_old(); - list_new[h_list_sizes_new[i].x++] = new_neighb_id; - if (h_list_sizes_new[i].x == num_samples) break; + list_new[h_list_sizes_new.data_handle()[i].x++] = new_neighb_id; + if 
(h_list_sizes_new.data_handle()[i].x == num_samples) break; } } } @@ -1051,31 +1074,37 @@ void GnndGraph::sample_graph(bool sample_new) { #pragma omp parallel for for (size_t i = 0; i < nrow; i++) { - h_list_sizes_old[i].x = 0; - h_list_sizes_old[i].y = 0; - h_list_sizes_new[i].x = 0; - h_list_sizes_new[i].y = 0; + h_list_sizes_old.data_handle()[i].x = 0; + h_list_sizes_old.data_handle()[i].y = 0; + h_list_sizes_new.data_handle()[i].x = 0; + h_list_sizes_new.data_handle()[i].y = 0; auto list = h_graph + i * node_degree; - auto list_old = h_graph_old.data() + i * num_samples; - auto list_new = h_graph_new.data() + i * num_samples; + auto list_old = h_graph_old.data_handle() + i * num_samples; + auto list_new = h_graph_new.data_handle() + i * num_samples; for (int j = 0; j < segment_size; j++) { for (int k = 0; k < num_segments; k++) { auto neighbor = list[k * segment_size + j]; if ((size_t)neighbor.id() >= nrow) continue; if (!neighbor.is_new()) { - if (h_list_sizes_old[i].x < num_samples) { - list_old[h_list_sizes_old[i].x++] = neighbor.id(); + if (h_list_sizes_old.data_handle()[i].x < num_samples) { + list_old[h_list_sizes_old.data_handle()[i].x++] = neighbor.id(); } } else if (sample_new) { - if (h_list_sizes_new[i].x < num_samples) { + if (h_list_sizes_new.data_handle()[i].x < num_samples) { list[k * segment_size + j].mark_old(); - list_new[h_list_sizes_new[i].x++] = neighbor.id(); + list_new[h_list_sizes_new.data_handle()[i].x++] = neighbor.id(); } } - if (h_list_sizes_old[i].x == num_samples && h_list_sizes_new[i].x == num_samples) { break; } + if (h_list_sizes_old.data_handle()[i].x == num_samples && + h_list_sizes_new.data_handle()[i].x == num_samples) { + break; + } + } + if (h_list_sizes_old.data_handle()[i].x == num_samples && + h_list_sizes_new.data_handle()[i].x == num_samples) { + break; } - if (h_list_sizes_old[i].x == num_samples && h_list_sizes_new[i].x == num_samples) { break; } } } } @@ -1137,7 +1166,8 @@ template GNND::GNND(raft::resources const& res, const BuildConfig& build_config) : res(res), build_config_(build_config), - graph_(build_config.max_dataset_size, + graph_(res, + build_config.max_dataset_size, align32::roundUp(build_config.node_degree), align32::roundUp(build_config.internal_node_degree ? 
build_config.internal_node_degree : build_config.node_degree), @@ -1146,33 +1176,48 @@ GNND::GNND(raft::resources const& res, const BuildConfig& build ndim_(build_config.dataset_dim), d_data_{raft::make_device_matrix<__half, size_t, raft::row_major>( res, nrow_, build_config.dataset_dim)}, - l2_norms_{raft::make_device_vector(res, nrow_)}, + l2_norms_{raft::make_device_vector(res, 0)}, graph_buffer_{ raft::make_device_matrix(res, nrow_, DEGREE_ON_DEVICE)}, dists_buffer_{ raft::make_device_matrix(res, nrow_, DEGREE_ON_DEVICE)}, - graph_host_buffer_(nrow_ * DEGREE_ON_DEVICE), - dists_host_buffer_(nrow_ * DEGREE_ON_DEVICE), + graph_host_buffer_{ + raft::make_pinned_matrix(res, nrow_, DEGREE_ON_DEVICE)}, + dists_host_buffer_{ + raft::make_pinned_matrix(res, nrow_, DEGREE_ON_DEVICE)}, d_locks_{raft::make_device_vector(res, nrow_)}, - h_rev_graph_new_(nrow_ * NUM_SAMPLES), - h_graph_old_(nrow_ * NUM_SAMPLES), - h_rev_graph_old_(nrow_ * NUM_SAMPLES), + h_rev_graph_new_{ + raft::make_pinned_matrix(res, nrow_, NUM_SAMPLES)}, + h_graph_old_( + raft::make_pinned_matrix(res, nrow_, NUM_SAMPLES)), + h_rev_graph_old_{ + raft::make_pinned_matrix(res, nrow_, NUM_SAMPLES)}, d_list_sizes_new_{raft::make_device_vector(res, nrow_)}, d_list_sizes_old_{raft::make_device_vector(res, nrow_)} { static_assert(NUM_SAMPLES <= 32); - thrust::fill(thrust::device, - dists_buffer_.data_handle(), - dists_buffer_.data_handle() + dists_buffer_.size(), - std::numeric_limits::max()); - thrust::fill(thrust::device, - reinterpret_cast(graph_buffer_.data_handle()), - reinterpret_cast(graph_buffer_.data_handle()) + graph_buffer_.size(), - std::numeric_limits::max()); - thrust::fill(thrust::device, d_locks_.data_handle(), d_locks_.data_handle() + d_locks_.size(), 0); + raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); + auto graph_buffer_view = raft::make_device_matrix_view( + reinterpret_cast(graph_buffer_.data_handle()), nrow_, DEGREE_ON_DEVICE); + raft::matrix::fill(res, graph_buffer_view, std::numeric_limits::max()); + raft::matrix::fill(res, d_locks_.view(), 0); + + if (build_config.metric == cuvs::distance::DistanceType::L2Expanded) { + l2_norms_ = raft::make_device_vector(res, nrow_); + } }; +template +void GNND::reset(raft::resources const& res) +{ + raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); + auto graph_buffer_view = raft::make_device_matrix_view( + reinterpret_cast(graph_buffer_.data_handle()), nrow_, DEGREE_ON_DEVICE); + raft::matrix::fill(res, graph_buffer_view, std::numeric_limits::max()); + raft::matrix::fill(res, d_locks_.view(), 0); +} + template void GNND::add_reverse_edges(Index_t* graph_ptr, Index_t* h_rev_graph_ptr, @@ -1189,34 +1234,36 @@ void GNND::add_reverse_edges(Index_t* graph_ptr, template void GNND::local_join(cudaStream_t stream) { - thrust::fill(thrust::device.on(stream), - dists_buffer_.data_handle(), - dists_buffer_.data_handle() + dists_buffer_.size(), - std::numeric_limits::max()); - local_join_kernel<<>>( - thrust::raw_pointer_cast(graph_.h_graph_new.data()), - thrust::raw_pointer_cast(h_rev_graph_new_.data()), - d_list_sizes_new_.data_handle(), - thrust::raw_pointer_cast(h_graph_old_.data()), - thrust::raw_pointer_cast(h_rev_graph_old_.data()), - d_list_sizes_old_.data_handle(), - NUM_SAMPLES, - d_data_.data_handle(), - ndim_, - graph_buffer_.data_handle(), - dists_buffer_.data_handle(), - DEGREE_ON_DEVICE, - d_locks_.data_handle(), - l2_norms_.data_handle()); + raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); + 
local_join_kernel<<>>(graph_.h_graph_new.data_handle(), + h_rev_graph_new_.data_handle(), + d_list_sizes_new_.data_handle(), + h_graph_old_.data_handle(), + h_rev_graph_old_.data_handle(), + d_list_sizes_old_.data_handle(), + NUM_SAMPLES, + d_data_.data_handle(), + ndim_, + graph_buffer_.data_handle(), + dists_buffer_.data_handle(), + DEGREE_ON_DEVICE, + d_locks_.data_handle(), + l2_norms_.data_handle(), + build_config_.metric); } template -void GNND::build(Data_t* data, const Index_t nrow, Index_t* output_graph) +void GNND::build(Data_t* data, + const Index_t nrow, + Index_t* output_graph, + bool return_distances, + DistData_t* output_distances) { using input_t = typename std::remove_const::type; cudaStream_t stream = raft::resource::get_cuda_stream(res); nrow_ = nrow; + graph_.nrow = nrow; graph_.h_graph = (InternalID_t*)output_graph; cudaPointerAttributes data_ptr_attr; @@ -1226,24 +1273,19 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out cuvs::spatial::knn::detail::utils::batch_load_iterator vec_batches{ data, static_cast(nrow_), build_config_.dataset_dim, batch_size, stream}; for (auto const& batch : vec_batches) { - preprocess_data_kernel<<(raft::warp_size())) * - raft::warp_size(), - stream>>>(batch.data(), - d_data_.data_handle(), - build_config_.dataset_dim, - l2_norms_.data_handle(), - batch.offset()); + preprocess_data_kernel<<< + batch.size(), + raft::warp_size(), + sizeof(Data_t) * ceildiv(build_config_.dataset_dim, static_cast(raft::warp_size())) * + raft::warp_size(), + stream>>>(batch.data(), + d_data_.data_handle(), + build_config_.dataset_dim, + l2_norms_.data_handle(), + batch.offset(), + build_config_.metric); } - thrust::fill(thrust::device.on(stream), - (Index_t*)graph_buffer_.data_handle(), - (Index_t*)graph_buffer_.data_handle() + graph_buffer_.size(), - std::numeric_limits::max()); - graph_.clear(); graph_.init_random_graph(); graph_.sample_graph(true); @@ -1251,8 +1293,8 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out auto update_and_sample = [&](bool update_graph) { if (update_graph) { update_counter_ = 0; - graph_.update_graph(thrust::raw_pointer_cast(graph_host_buffer_.data()), - thrust::raw_pointer_cast(dists_host_buffer_.data()), + graph_.update_graph(graph_host_buffer_.data_handle(), + dists_host_buffer_.data_handle(), DEGREE_ON_DEVICE, update_counter_); if (update_counter_ < build_config_.termination_threshold * nrow_ * @@ -1265,15 +1307,15 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out for (size_t it = 0; it < build_config_.max_iterations; it++) { raft::copy(d_list_sizes_new_.data_handle(), - thrust::raw_pointer_cast(graph_.h_list_sizes_new.data()), + graph_.h_list_sizes_new.data_handle(), nrow_, raft::resource::get_cuda_stream(res)); - raft::copy(thrust::raw_pointer_cast(h_graph_old_.data()), - thrust::raw_pointer_cast(graph_.h_graph_old.data()), + raft::copy(h_graph_old_.data_handle(), + graph_.h_graph_old.data_handle(), nrow_ * NUM_SAMPLES, raft::resource::get_cuda_stream(res)); raft::copy(d_list_sizes_old_.data_handle(), - thrust::raw_pointer_cast(graph_.h_list_sizes_old.data()), + graph_.h_list_sizes_old.data_handle(), nrow_, raft::resource::get_cuda_stream(res)); raft::resource::sync_stream(res); @@ -1286,13 +1328,13 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out // contains some information for local_join. 
static_assert(DEGREE_ON_DEVICE * sizeof(*(dists_buffer_.data_handle())) >= NUM_SAMPLES * sizeof(*(graph_buffer_.data_handle()))); - add_reverse_edges(thrust::raw_pointer_cast(graph_.h_graph_new.data()), - thrust::raw_pointer_cast(h_rev_graph_new_.data()), + add_reverse_edges(graph_.h_graph_new.data_handle(), + h_rev_graph_new_.data_handle(), (Index_t*)dists_buffer_.data_handle(), d_list_sizes_new_.data_handle(), stream); - add_reverse_edges(thrust::raw_pointer_cast(h_graph_old_.data()), - thrust::raw_pointer_cast(h_rev_graph_old_.data()), + add_reverse_edges(h_graph_old_.data_handle(), + h_rev_graph_old_.data_handle(), (Index_t*)dists_buffer_.data_handle(), d_list_sizes_old_.data_handle(), stream); @@ -1316,21 +1358,21 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out update_and_sample_thread.join(); if (update_counter_ == -1) { break; } - raft::copy(thrust::raw_pointer_cast(graph_host_buffer_.data()), + raft::copy(graph_host_buffer_.data_handle(), graph_buffer_.data_handle(), nrow_ * DEGREE_ON_DEVICE, raft::resource::get_cuda_stream(res)); raft::resource::sync_stream(res); - raft::copy(thrust::raw_pointer_cast(dists_host_buffer_.data()), + raft::copy(dists_host_buffer_.data_handle(), dists_buffer_.data_handle(), nrow_ * DEGREE_ON_DEVICE, raft::resource::get_cuda_stream(res)); - graph_.sample_graph_new(thrust::raw_pointer_cast(graph_host_buffer_.data()), DEGREE_ON_DEVICE); + graph_.sample_graph_new(graph_host_buffer_.data_handle(), DEGREE_ON_DEVICE); } - graph_.update_graph(thrust::raw_pointer_cast(graph_host_buffer_.data()), - thrust::raw_pointer_cast(dists_host_buffer_.data()), + graph_.update_graph(graph_host_buffer_.data_handle(), + dists_host_buffer_.data_handle(), DEGREE_ON_DEVICE, update_counter_); raft::resource::sync_stream(res); @@ -1338,6 +1380,27 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out // Reuse graph_.h_dists as the buffer for shrink the lists in graph static_assert(sizeof(decltype(*(graph_.h_dists.data_handle()))) >= sizeof(Index_t)); + + if (return_distances) { + auto graph_d_dists = raft::make_device_matrix( + res, nrow_, build_config_.node_degree); + raft::copy(graph_d_dists.data_handle(), + graph_.h_dists.data_handle(), + nrow_ * build_config_.node_degree, + raft::resource::get_cuda_stream(res)); + + auto output_dist_view = raft::make_device_matrix_view( + output_distances, nrow_, build_config_.output_graph_degree); + + raft::matrix::slice_coordinates coords{static_cast(0), + static_cast(0), + static_cast(nrow_), + static_cast(build_config_.output_graph_degree)}; + raft::matrix::slice( + res, raft::make_const_mdspan(graph_d_dists.view()), output_dist_view, coords); + raft::resource::sync_stream(res); + } + Index_t* graph_shrink_buffer = (Index_t*)graph_.h_dists.data_handle(); #pragma omp parallel for @@ -1376,6 +1439,11 @@ void build(raft::resources const& res, RAFT_EXPECTS(dataset.extent(0) < std::numeric_limits::max() - 1, "The dataset size for GNND should be less than %d", std::numeric_limits::max() - 1); + auto allowed_metrics = params.metric == cuvs::distance::DistanceType::L2Expanded || + params.metric == cuvs::distance::DistanceType::CosineExpanded || + params.metric == cuvs::distance::DistanceType::InnerProduct; + RAFT_EXPECTS(allowed_metrics && idx.metric() == params.metric, + "The metric for NN Descent should be L2Expanded, CosineExpanded or InnerProduct"); size_t intermediate_degree = params.intermediate_graph_degree; size_t graph_degree = params.graph_degree; @@ -1410,10 +1478,25 @@ void build(raft::resources const& 
res, .node_degree = extended_graph_degree, .internal_node_degree = extended_intermediate_degree, .max_iterations = params.max_iterations, - .termination_threshold = params.termination_threshold}; + .termination_threshold = params.termination_threshold, + .output_graph_degree = params.graph_degree, + .metric = params.metric}; GNND nnd(res, build_config); - nnd.build(dataset.data_handle(), dataset.extent(0), int_graph.data_handle()); + + if (idx.distances().has_value() || !params.return_distances) { + nnd.build(dataset.data_handle(), + dataset.extent(0), + int_graph.data_handle(), + params.return_distances, + idx.distances() + .value_or(raft::make_device_matrix(res, 0, 0).view()) + .data_handle()); + } else { + RAFT_EXPECTS(!params.return_distances, + "Distance view not allocated. Using return_distances set to true requires " + "distance view to be allocated."); + } #pragma omp parallel for for (size_t i = 0; i < static_cast(dataset.extent(0)); i++) { @@ -1445,11 +1528,15 @@ index build( graph_degree = intermediate_degree; } - index idx{res, dataset.extent(0), static_cast(graph_degree)}; + index idx{res, + dataset.extent(0), + static_cast(graph_degree), + params.return_distances, + params.metric}; build(res, params, dataset, idx); return idx; } -} // namespace cuvs::neighbors::nn_descent::detail +} // namespace cuvs::neighbors::nn_descent::detail diff --git a/cpp/src/neighbors/detail/nn_descent_batch.cuh b/cpp/src/neighbors/detail/nn_descent_batch.cuh new file mode 100644 index 000000000..842dbe788 --- /dev/null +++ b/cpp/src/neighbors/detail/nn_descent_batch.cuh @@ -0,0 +1,736 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#undef RAFT_EXPLICIT_INSTANTIATE_ONLY + +#include "nn_descent.cuh" +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace cuvs::neighbors::nn_descent::detail::experimental { + +// +// Run balanced kmeans on a subsample of the dataset to get centroids +// +template , memory_type::host>> +void get_balanced_kmeans_centroids( + raft::resources const& res, + cuvs::distance::DistanceType metric, + mdspan, row_major, Accessor> dataset, + raft::device_matrix_view centroids) +{ + size_t num_rows = static_cast(dataset.extent(0)); + size_t num_cols = static_cast(dataset.extent(1)); + size_t n_clusters = centroids.extent(0); + size_t num_subsamples = + std::min(static_cast(num_rows / n_clusters), static_cast(num_rows * 0.1)); + + auto d_subsample_dataset = + raft::make_device_matrix(res, num_subsamples, num_cols); + raft::matrix::sample_rows( + res, raft::random::RngState{0}, dataset, d_subsample_dataset.view()); + + cuvs::cluster::kmeans::balanced_params kmeans_params; + kmeans_params.metric = metric; + + auto d_subsample_dataset_const_view = + raft::make_device_matrix_view( + d_subsample_dataset.data_handle(), num_subsamples, num_cols); + auto centroids_view = raft::make_device_matrix_view( + centroids.data_handle(), n_clusters, num_cols); + cuvs::cluster::kmeans::fit(res, kmeans_params, d_subsample_dataset_const_view, centroids_view); +} + +// +// Get the top k closest centroid indices for each data point +// Loads the data in batches onto device if data is on host for memory efficiency +// +template +void get_global_nearest_k( + raft::resources const& res, + size_t k, + size_t num_rows, + size_t n_clusters, + const T* dataset, + raft::host_matrix_view global_nearest_cluster, + raft::device_matrix_view centroids, + cuvs::distance::DistanceType metric) +{ + size_t num_cols = centroids.extent(1); + auto centroids_view = raft::make_device_matrix_view( + centroids.data_handle(), n_clusters, num_cols); + + cudaPointerAttributes attr; + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, dataset)); + float* ptr = reinterpret_cast(attr.devicePointer); + + size_t num_batches = n_clusters; + size_t batch_size = (num_rows + n_clusters) / n_clusters; + if (ptr == nullptr) { // data on host + + auto d_dataset_batch = + raft::make_device_matrix(res, batch_size, num_cols); + + auto nearest_clusters_idx = + raft::make_device_matrix(res, batch_size, k); + auto nearest_clusters_idxt = + raft::make_device_matrix(res, batch_size, k); + auto nearest_clusters_dist = + raft::make_device_matrix(res, batch_size, k); + + for (size_t i = 0; i < num_batches; i++) { + size_t batch_size_ = batch_size; + + if (i == num_batches - 1) { batch_size_ = num_rows - batch_size * i; } + raft::copy(d_dataset_batch.data_handle(), + dataset + i * batch_size * num_cols, + batch_size_ * num_cols, + resource::get_cuda_stream(res)); + + std::optional> norms_view; + cuvs::neighbors::brute_force::index brute_force_index( + res, centroids_view, norms_view, metric); + cuvs::neighbors::brute_force::search(res, + brute_force_index, + raft::make_const_mdspan(d_dataset_batch.view()), + nearest_clusters_idx.view(), + nearest_clusters_dist.view()); + + thrust::copy(raft::resource::get_thrust_policy(res), + nearest_clusters_idx.data_handle(), + nearest_clusters_idx.data_handle() + nearest_clusters_idx.size(), + nearest_clusters_idxt.data_handle()); + 
raft::copy(global_nearest_cluster.data_handle() + i * batch_size * k, + nearest_clusters_idxt.data_handle(), + batch_size_ * k, + resource::get_cuda_stream(res)); + } + } else { // data on device + auto nearest_clusters_idx = + raft::make_device_matrix(res, num_rows, k); + auto nearest_clusters_dist = + raft::make_device_matrix(res, num_rows, k); + + std::optional> norms_view; + cuvs::neighbors::brute_force::index brute_force_index( + res, centroids_view, norms_view, metric); + auto dataset_view = + raft::make_device_matrix_view(dataset, num_rows, num_cols); + cuvs::neighbors::brute_force::search(res, + brute_force_index, + dataset_view, + nearest_clusters_idx.view(), + nearest_clusters_dist.view()); + + auto nearest_clusters_idxt = + raft::make_device_matrix(res, batch_size, k); + for (size_t i = 0; i < num_batches; i++) { + size_t batch_size_ = batch_size; + + if (i == num_batches - 1) { batch_size_ = num_rows - batch_size * i; } + thrust::copy(raft::resource::get_thrust_policy(res), + nearest_clusters_idx.data_handle() + i * batch_size_ * k, + nearest_clusters_idx.data_handle() + (i + 1) * batch_size_ * k, + nearest_clusters_idxt.data_handle()); + raft::copy(global_nearest_cluster.data_handle() + i * batch_size_ * k, + nearest_clusters_idxt.data_handle(), + batch_size_ * k, + resource::get_cuda_stream(res)); + } + } +} + +// +// global_nearest_cluster [num_rows X k=2] : top 2 closest clusters for each data point +// inverted_indices [num_rows x k vector] : sparse vector for data indices for each cluster +// cluster_size [n_cluster] : cluster size for each cluster +// offset [n_cluster] : offset in inverted_indices for each cluster +// Loads the data in batches onto device if data is on host for memory efficiency +// +template +void get_inverted_indices(raft::resources const& res, + size_t n_clusters, + size_t& max_cluster_size, + size_t& min_cluster_size, + raft::host_matrix_view global_nearest_cluster, + raft::host_vector_view inverted_indices, + raft::host_vector_view cluster_size, + raft::host_vector_view offset) +{ + // build sparse inverted indices and get number of data points for each cluster + size_t num_rows = global_nearest_cluster.extent(0); + size_t k = global_nearest_cluster.extent(1); + + auto local_offset = raft::make_host_vector(n_clusters); + + max_cluster_size = 0; + min_cluster_size = std::numeric_limits::max(); + + std::fill(cluster_size.data_handle(), cluster_size.data_handle() + n_clusters, 0); + std::fill(local_offset.data_handle(), local_offset.data_handle() + n_clusters, 0); + + // TODO: this part isn't really a bottleneck but maybe worth trying omp parallel + // for with atomic add + for (size_t i = 0; i < num_rows; i++) { + for (size_t j = 0; j < k; j++) { + IdxT cluster_id = global_nearest_cluster(i, j); + cluster_size(cluster_id) += 1; + } + } + + offset(0) = 0; + for (size_t i = 1; i < n_clusters; i++) { + offset(i) = offset(i - 1) + cluster_size(i - 1); + } + for (size_t i = 0; i < num_rows; i++) { + for (size_t j = 0; j < k; j++) { + IdxT cluster_id = global_nearest_cluster(i, j); + inverted_indices(offset(cluster_id) + local_offset(cluster_id)) = i; + local_offset(cluster_id) += 1; + } + } + + max_cluster_size = static_cast( + *std::max_element(cluster_size.data_handle(), cluster_size.data_handle() + n_clusters)); + min_cluster_size = static_cast( + *std::min_element(cluster_size.data_handle(), cluster_size.data_handle() + n_clusters)); +} + +template +struct KeyValuePair { + KeyType key; + ValueType value; +}; + +template +struct CustomKeyComparator { + 
__device__ bool operator()(const KeyValuePair& a, + const KeyValuePair& b) const + { + if (a.key == b.key) { return a.value < b.value; } + return a.key < b.key; + } +}; + +template +RAFT_KERNEL merge_subgraphs(IdxT* cluster_data_indices, + size_t graph_degree, + size_t num_cluster_in_batch, + float* global_distances, + float* batch_distances, + IdxT* global_indices, + IdxT* batch_indices) +{ + size_t batch_row = blockIdx.x; + typedef cub::BlockMergeSort, BLOCK_SIZE, ITEMS_PER_THREAD> + BlockMergeSortType; + __shared__ typename cub::BlockMergeSort, BLOCK_SIZE, ITEMS_PER_THREAD>:: + TempStorage tmpSmem; + + extern __shared__ char sharedMem[]; + float* blockKeys = reinterpret_cast(sharedMem); + IdxT* blockValues = reinterpret_cast(&sharedMem[graph_degree * 2 * sizeof(float)]); + int16_t* uniqueMask = + reinterpret_cast(&sharedMem[graph_degree * 2 * (sizeof(float) + sizeof(IdxT))]); + + if (batch_row < num_cluster_in_batch) { + // load batch or global depending on threadIdx + size_t global_row = cluster_data_indices[batch_row]; + + KeyValuePair threadKeyValuePair[ITEMS_PER_THREAD]; + + size_t halfway = BLOCK_SIZE / 2; + size_t do_global = threadIdx.x < halfway; + + float* distances; + IdxT* indices; + + if (do_global) { + distances = global_distances; + indices = global_indices; + } else { + distances = batch_distances; + indices = batch_indices; + } + + size_t idxBase = (threadIdx.x * do_global + (threadIdx.x - halfway) * (1lu - do_global)) * + static_cast(ITEMS_PER_THREAD); + size_t arrIdxBase = (global_row * do_global + batch_row * (1lu - do_global)) * graph_degree; + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + size_t colId = idxBase + i; + if (colId < graph_degree) { + threadKeyValuePair[i].key = distances[arrIdxBase + colId]; + threadKeyValuePair[i].value = indices[arrIdxBase + colId]; + } else { + threadKeyValuePair[i].key = std::numeric_limits::max(); + threadKeyValuePair[i].value = std::numeric_limits::max(); + } + } + + __syncthreads(); + + BlockMergeSortType(tmpSmem).Sort(threadKeyValuePair, CustomKeyComparator{}); + + // load sorted result into shared memory to get unique values + idxBase = threadIdx.x * ITEMS_PER_THREAD; + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + size_t colId = idxBase + i; + if (colId < 2 * graph_degree) { + blockKeys[colId] = threadKeyValuePair[i].key; + blockValues[colId] = threadKeyValuePair[i].value; + } + } + + __syncthreads(); + + // get unique mask + if (threadIdx.x == 0) { uniqueMask[0] = 1; } + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + size_t colId = idxBase + i; + if (colId > 0 && colId < 2 * graph_degree) { + uniqueMask[colId] = static_cast(blockValues[colId] != blockValues[colId - 1]); + } + } + + __syncthreads(); + + // prefix sum + if (threadIdx.x == 0) { + for (int i = 1; i < 2 * graph_degree; i++) { + uniqueMask[i] += uniqueMask[i - 1]; + } + } + + __syncthreads(); + // load unique values to global memory + if (threadIdx.x == 0) { + global_distances[global_row * graph_degree] = blockKeys[0]; + global_indices[global_row * graph_degree] = blockValues[0]; + } + + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + size_t colId = idxBase + i; + if (colId > 0 && colId < 2 * graph_degree) { + bool is_unique = uniqueMask[colId] != uniqueMask[colId - 1]; + int16_t global_colId = uniqueMask[colId] - 1; + if (is_unique && static_cast(global_colId) < graph_degree) { + global_distances[global_row * graph_degree + global_colId] = blockKeys[colId]; + global_indices[global_row * graph_degree + global_colId] = blockValues[colId]; + } + } + } + } +} + +// 
+// builds knn graph using NN Descent and merge with global graph +// +template , memory_type::host>> +void build_and_merge(raft::resources const& res, + const index_params& params, + size_t num_data_in_cluster, + size_t graph_degree, + size_t int_graph_node_degree, + T* cluster_data, + IdxT* cluster_data_indices, + int* int_graph, + IdxT* inverted_indices, + IdxT* global_indices_d, + float* global_distances_d, + IdxT* batch_indices_h, + IdxT* batch_indices_d, + float* batch_distances_d, + GNND& nnd) +{ + nnd.build(cluster_data, num_data_in_cluster, int_graph, true, batch_distances_d); + + // remap indices +#pragma omp parallel for + for (size_t i = 0; i < num_data_in_cluster; i++) { + for (size_t j = 0; j < graph_degree; j++) { + size_t local_idx = int_graph[i * int_graph_node_degree + j]; + batch_indices_h[i * graph_degree + j] = inverted_indices[local_idx]; + } + } + + raft::copy(batch_indices_d, + batch_indices_h, + num_data_in_cluster * graph_degree, + raft::resource::get_cuda_stream(res)); + + size_t num_elems = graph_degree * 2; + size_t sharedMemSize = num_elems * (sizeof(float) + sizeof(IdxT) + sizeof(int16_t)); + + if (num_elems <= 128) { + merge_subgraphs + <<>>( + cluster_data_indices, + graph_degree, + num_data_in_cluster, + global_distances_d, + batch_distances_d, + global_indices_d, + batch_indices_d); + } else if (num_elems <= 512) { + merge_subgraphs + <<>>( + cluster_data_indices, + graph_degree, + num_data_in_cluster, + global_distances_d, + batch_distances_d, + global_indices_d, + batch_indices_d); + } else if (num_elems <= 1024) { + merge_subgraphs + <<>>( + cluster_data_indices, + graph_degree, + num_data_in_cluster, + global_distances_d, + batch_distances_d, + global_indices_d, + batch_indices_d); + } else if (num_elems <= 2048) { + merge_subgraphs + <<>>( + cluster_data_indices, + graph_degree, + num_data_in_cluster, + global_distances_d, + batch_distances_d, + global_indices_d, + batch_indices_d); + } else { + // this is as far as we can get due to the shared mem usage of cub::BlockMergeSort + RAFT_FAIL("The degree of knn is too large (%lu). It must be smaller than 1024", graph_degree); + } + raft::resource::sync_stream(res); +} + +// +// For each cluster, gather the data samples that belong to that cluster, and +// call build_and_merge +// +template +void cluster_nnd(raft::resources const& res, + const index_params& params, + size_t graph_degree, + size_t extended_graph_degree, + size_t max_cluster_size, + raft::host_matrix_view dataset, + IdxT* offsets, + IdxT* cluster_size, + IdxT* cluster_data_indices, + int* int_graph, + IdxT* inverted_indices, + IdxT* global_indices_h, + float* global_distances_h, + IdxT* batch_indices_h, + IdxT* batch_indices_d, + float* batch_distances_d, + const BuildConfig& build_config) +{ + size_t num_rows = dataset.extent(0); + size_t num_cols = dataset.extent(1); + + GNND nnd(res, build_config); + + auto cluster_data_matrix = + raft::make_host_matrix(max_cluster_size, num_cols); + + for (size_t cluster_id = 0; cluster_id < params.n_clusters; cluster_id++) { + RAFT_LOG_DEBUG( + "# Data on host. 
Running clusters: %lu / %lu", cluster_id + 1, params.n_clusters); + size_t num_data_in_cluster = cluster_size[cluster_id]; + size_t offset = offsets[cluster_id]; + +#pragma omp parallel for + for (size_t i = 0; i < num_data_in_cluster; i++) { + for (size_t j = 0; j < num_cols; j++) { + size_t global_row = (inverted_indices + offset)[i]; + cluster_data_matrix(i, j) = dataset(global_row, j); + } + } + + build_and_merge(res, + params, + num_data_in_cluster, + graph_degree, + extended_graph_degree, + cluster_data_matrix.data_handle(), + cluster_data_indices + offset, + int_graph, + inverted_indices + offset, + global_indices_h, + global_distances_h, + batch_indices_h, + batch_indices_d, + batch_distances_d, + nnd); + nnd.reset(res); + } +} + +template +void cluster_nnd(raft::resources const& res, + const index_params& params, + size_t graph_degree, + size_t extended_graph_degree, + size_t max_cluster_size, + raft::device_matrix_view dataset, + IdxT* offsets, + IdxT* cluster_size, + IdxT* cluster_data_indices, + int* int_graph, + IdxT* inverted_indices, + IdxT* global_indices_h, + float* global_distances_h, + IdxT* batch_indices_h, + IdxT* batch_indices_d, + float* batch_distances_d, + const BuildConfig& build_config) +{ + size_t num_rows = dataset.extent(0); + size_t num_cols = dataset.extent(1); + + GNND nnd(res, build_config); + + auto cluster_data_matrix = + raft::make_device_matrix(res, max_cluster_size, num_cols); + + for (size_t cluster_id = 0; cluster_id < params.n_clusters; cluster_id++) { + RAFT_LOG_DEBUG( + "# Data on device. Running clusters: %lu / %lu", cluster_id + 1, params.n_clusters); + size_t num_data_in_cluster = cluster_size[cluster_id]; + size_t offset = offsets[cluster_id]; + + auto cluster_data_view = raft::make_device_matrix_view( + cluster_data_matrix.data_handle(), num_data_in_cluster, num_cols); + auto cluster_data_indices_view = raft::make_device_vector_view( + cluster_data_indices + offset, num_data_in_cluster); + + auto dataset_IdxT = + raft::make_device_matrix_view(dataset.data_handle(), num_rows, num_cols); + raft::matrix::gather(res, dataset_IdxT, cluster_data_indices_view, cluster_data_view); + + build_and_merge(res, + params, + num_data_in_cluster, + graph_degree, + extended_graph_degree, + cluster_data_view.data_handle(), + cluster_data_indices + offset, + int_graph, + inverted_indices + offset, + global_indices_h, + global_distances_h, + batch_indices_h, + batch_indices_d, + batch_distances_d, + nnd); + nnd.reset(res); + } +} + +template , memory_type::host>> +void batch_build(raft::resources const& res, + const index_params& params, + mdspan, row_major, Accessor> dataset, + index& global_idx) +{ + size_t graph_degree = params.graph_degree; + size_t intermediate_degree = params.intermediate_graph_degree; + + size_t num_rows = static_cast(dataset.extent(0)); + size_t num_cols = static_cast(dataset.extent(1)); + + auto centroids = + raft::make_device_matrix(res, params.n_clusters, num_cols); + get_balanced_kmeans_centroids(res, params.metric, dataset, centroids.view()); + + size_t k = 2; + auto global_nearest_cluster = raft::make_host_matrix(num_rows, k); + get_global_nearest_k(res, + k, + num_rows, + params.n_clusters, + dataset.data_handle(), + global_nearest_cluster.view(), + centroids.view(), + params.metric); + + auto inverted_indices = raft::make_host_vector(num_rows * k); + auto cluster_size = raft::make_host_vector(params.n_clusters); + auto offset = raft::make_host_vector(params.n_clusters); + + size_t max_cluster_size, min_cluster_size; + 
get_inverted_indices(res, + params.n_clusters, + max_cluster_size, + min_cluster_size, + global_nearest_cluster.view(), + inverted_indices.view(), + cluster_size.view(), + offset.view()); + + if (intermediate_degree >= min_cluster_size) { + RAFT_LOG_WARN( + "Intermediate graph degree cannot be larger than minimum cluster size, reducing it to %lu", + dataset.extent(0)); + intermediate_degree = min_cluster_size - 1; + } + if (intermediate_degree < graph_degree) { + RAFT_LOG_WARN( + "Graph degree (%lu) cannot be larger than intermediate graph degree (%lu), reducing " + "graph_degree.", + graph_degree, + intermediate_degree); + graph_degree = intermediate_degree; + } + + size_t extended_graph_degree = + align32::roundUp(static_cast(graph_degree * (graph_degree <= 32 ? 1.0 : 1.3))); + size_t extended_intermediate_degree = align32::roundUp( + static_cast(intermediate_degree * (intermediate_degree <= 32 ? 1.0 : 1.3))); + + auto int_graph = raft::make_host_matrix( + max_cluster_size, static_cast(extended_graph_degree)); + + BuildConfig build_config{.max_dataset_size = max_cluster_size, + .dataset_dim = num_cols, + .node_degree = extended_graph_degree, + .internal_node_degree = extended_intermediate_degree, + .max_iterations = params.max_iterations, + .termination_threshold = params.termination_threshold, + .output_graph_degree = graph_degree}; + + auto global_indices_h = raft::make_managed_matrix(res, num_rows, graph_degree); + auto global_distances_h = raft::make_managed_matrix(res, num_rows, graph_degree); + + std::fill(global_indices_h.data_handle(), + global_indices_h.data_handle() + num_rows * graph_degree, + std::numeric_limits::max()); + std::fill(global_distances_h.data_handle(), + global_distances_h.data_handle() + num_rows * graph_degree, + std::numeric_limits::max()); + + auto batch_indices_h = + raft::make_host_matrix(max_cluster_size, graph_degree); + auto batch_indices_d = + raft::make_device_matrix(res, max_cluster_size, graph_degree); + auto batch_distances_d = + raft::make_device_matrix(res, max_cluster_size, graph_degree); + + auto cluster_data_indices = raft::make_device_vector(res, num_rows * k); + raft::copy(cluster_data_indices.data_handle(), + inverted_indices.data_handle(), + num_rows * k, + resource::get_cuda_stream(res)); + + cluster_nnd(res, + params, + graph_degree, + extended_graph_degree, + max_cluster_size, + dataset, + offset.data_handle(), + cluster_size.data_handle(), + cluster_data_indices.data_handle(), + int_graph.data_handle(), + inverted_indices.data_handle(), + global_indices_h.data_handle(), + global_distances_h.data_handle(), + batch_indices_h.data_handle(), + batch_indices_d.data_handle(), + batch_distances_d.data_handle(), + build_config); + + raft::copy(global_idx.graph().data_handle(), + global_indices_h.data_handle(), + num_rows * graph_degree, + raft::resource::get_cuda_stream(res)); + if (params.return_distances && global_idx.distances().has_value()) { + raft::copy(global_idx.distances().value().data_handle(), + global_distances_h.data_handle(), + num_rows * graph_degree, + raft::resource::get_cuda_stream(res)); + } +} + +template , memory_type::host>> +index batch_build(raft::resources const& res, + const index_params& params, + mdspan, row_major, Accessor> dataset) +{ + size_t intermediate_degree = params.intermediate_graph_degree; + size_t graph_degree = params.graph_degree; + + if (intermediate_degree < graph_degree) { + RAFT_LOG_WARN( + "Graph degree (%lu) cannot be larger than intermediate graph degree (%lu), reducing " + "graph_degree.", + 
graph_degree, + intermediate_degree); + graph_degree = intermediate_degree; + } + + index idx{ + res, dataset.extent(0), static_cast(graph_degree), params.return_distances}; + + batch_build(res, params, dataset, idx); + + return idx; +} + +} // namespace cuvs::neighbors::nn_descent::detail::experimental diff --git a/cpp/src/neighbors/detail/sparse_knn.cuh b/cpp/src/neighbors/detail/sparse_knn.cuh new file mode 100644 index 000000000..9c8e971b9 --- /dev/null +++ b/cpp/src/neighbors/detail/sparse_knn.cuh @@ -0,0 +1,437 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include "../../distance/sparse_distance.cuh" +#include "knn_merge_parts.cuh" +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cuvs::neighbors::detail { + +template +struct csr_batcher_t { + csr_batcher_t(value_idx batch_size, + value_idx n_rows, + const value_idx* csr_indptr, + const value_idx* csr_indices, + const value_t* csr_data) + : batch_start_(0), + batch_stop_(0), + batch_rows_(0), + total_rows_(n_rows), + batch_size_(batch_size), + csr_indptr_(csr_indptr), + csr_indices_(csr_indices), + csr_data_(csr_data), + batch_csr_start_offset_(0), + batch_csr_stop_offset_(0) + { + } + + void set_batch(int batch_num) + { + batch_start_ = batch_num * batch_size_; + batch_stop_ = batch_start_ + batch_size_ - 1; // zero-based indexing + + if (batch_stop_ >= total_rows_) batch_stop_ = total_rows_ - 1; // zero-based indexing + + batch_rows_ = (batch_stop_ - batch_start_) + 1; + } + + value_idx get_batch_csr_indptr_nnz(value_idx* batch_indptr, cudaStream_t stream) + { + raft::sparse::op::csr_row_slice_indptr(batch_start_, + batch_stop_, + csr_indptr_, + batch_indptr, + &batch_csr_start_offset_, + &batch_csr_stop_offset_, + stream); + + return batch_csr_stop_offset_ - batch_csr_start_offset_; + } + + void get_batch_csr_indices_data(value_idx* csr_indices, value_t* csr_data, cudaStream_t stream) + { + raft::sparse::op::csr_row_slice_populate(batch_csr_start_offset_, + batch_csr_stop_offset_, + csr_indices_, + csr_data_, + csr_indices, + csr_data, + stream); + } + + value_idx batch_rows() const { return batch_rows_; } + + value_idx batch_start() const { return batch_start_; } + + value_idx batch_stop() const { return batch_stop_; } + + private: + value_idx batch_size_; + value_idx batch_start_; + value_idx batch_stop_; + value_idx batch_rows_; + + value_idx total_rows_; + + const value_idx* csr_indptr_; + const value_idx* csr_indices_; + const value_t* csr_data_; + + value_idx batch_csr_start_offset_; + value_idx batch_csr_stop_offset_; +}; + +template +class sparse_knn_t { + public: + sparse_knn_t(const value_idx* idxIndptr_, + const value_idx* idxIndices_, + const value_t* idxData_, + size_t idxNNZ_, + int n_idx_rows_, + int n_idx_cols_, + const value_idx* queryIndptr_, + const value_idx* queryIndices_, + const value_t* queryData_, + size_t queryNNZ_, + int 
n_query_rows_, + int n_query_cols_, + value_idx* output_indices_, + value_t* output_dists_, + int k_, + raft::resources const& handle_, + size_t batch_size_index_ = 2 << 14, // approx 1M + size_t batch_size_query_ = 2 << 14, + cuvs::distance::DistanceType metric_ = cuvs::distance::DistanceType::L2Expanded, + float metricArg_ = 0) + : idxIndptr(idxIndptr_), + idxIndices(idxIndices_), + idxData(idxData_), + idxNNZ(idxNNZ_), + n_idx_rows(n_idx_rows_), + n_idx_cols(n_idx_cols_), + queryIndptr(queryIndptr_), + queryIndices(queryIndices_), + queryData(queryData_), + queryNNZ(queryNNZ_), + n_query_rows(n_query_rows_), + n_query_cols(n_query_cols_), + output_indices(output_indices_), + output_dists(output_dists_), + k(k_), + handle(handle_), + batch_size_index(batch_size_index_), + batch_size_query(batch_size_query_), + metric(metric_), + metricArg(metricArg_) + { + } + + void run() + { + using namespace raft::sparse; + + int n_batches_query = raft::ceildiv((size_t)n_query_rows, batch_size_query); + csr_batcher_t query_batcher( + batch_size_query, n_query_rows, queryIndptr, queryIndices, queryData); + + size_t rows_processed = 0; + + for (int i = 0; i < n_batches_query; i++) { + /** + * Compute index batch info + */ + query_batcher.set_batch(i); + + /** + * Slice CSR to rows in batch + */ + + rmm::device_uvector query_batch_indptr(query_batcher.batch_rows() + 1, + raft::resource::get_cuda_stream(handle)); + + value_idx n_query_batch_nnz = query_batcher.get_batch_csr_indptr_nnz( + query_batch_indptr.data(), raft::resource::get_cuda_stream(handle)); + + rmm::device_uvector query_batch_indices(n_query_batch_nnz, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector query_batch_data(n_query_batch_nnz, + raft::resource::get_cuda_stream(handle)); + + query_batcher.get_batch_csr_indices_data(query_batch_indices.data(), + query_batch_data.data(), + raft::resource::get_cuda_stream(handle)); + + // A 3-partition temporary merge space to scale the batching. 
2 parts for subsequent + // batches and 1 space for the results of the merge, which get copied back to the top + rmm::device_uvector merge_buffer_indices(0, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector merge_buffer_dists(0, raft::resource::get_cuda_stream(handle)); + + value_t* dists_merge_buffer_ptr; + value_idx* indices_merge_buffer_ptr; + + int n_batches_idx = raft::ceildiv((size_t)n_idx_rows, batch_size_index); + csr_batcher_t idx_batcher( + batch_size_index, n_idx_rows, idxIndptr, idxIndices, idxData); + + for (int j = 0; j < n_batches_idx; j++) { + idx_batcher.set_batch(j); + + merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3, + raft::resource::get_cuda_stream(handle)); + merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3, + raft::resource::get_cuda_stream(handle)); + + /** + * Slice CSR to rows in batch + */ + rmm::device_uvector idx_batch_indptr(idx_batcher.batch_rows() + 1, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector idx_batch_indices(0, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector idx_batch_data(0, raft::resource::get_cuda_stream(handle)); + + value_idx idx_batch_nnz = idx_batcher.get_batch_csr_indptr_nnz( + idx_batch_indptr.data(), raft::resource::get_cuda_stream(handle)); + + idx_batch_indices.resize(idx_batch_nnz, raft::resource::get_cuda_stream(handle)); + idx_batch_data.resize(idx_batch_nnz, raft::resource::get_cuda_stream(handle)); + + idx_batcher.get_batch_csr_indices_data( + idx_batch_indices.data(), idx_batch_data.data(), raft::resource::get_cuda_stream(handle)); + + /** + * Compute distances + */ + uint64_t dense_size = + (uint64_t)idx_batcher.batch_rows() * (uint64_t)query_batcher.batch_rows(); + rmm::device_uvector batch_dists(dense_size, + raft::resource::get_cuda_stream(handle)); + + RAFT_CUDA_TRY(cudaMemset(batch_dists.data(), 0, batch_dists.size() * sizeof(value_t))); + + compute_distances(idx_batcher, + query_batcher, + idx_batch_nnz, + n_query_batch_nnz, + idx_batch_indptr.data(), + idx_batch_indices.data(), + idx_batch_data.data(), + query_batch_indptr.data(), + query_batch_indices.data(), + query_batch_data.data(), + batch_dists.data()); + + // Build batch indices array + rmm::device_uvector batch_indices(batch_dists.size(), + raft::resource::get_cuda_stream(handle)); + + // populate batch indices array + value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows(); + + iota_fill( + batch_indices.data(), batch_rows, batch_cols, raft::resource::get_cuda_stream(handle)); + + /** + * Perform k-selection on batch & merge with other k-selections + */ + size_t merge_buffer_offset = batch_rows * k; + dists_merge_buffer_ptr = merge_buffer_dists.data() + merge_buffer_offset; + indices_merge_buffer_ptr = merge_buffer_indices.data() + merge_buffer_offset; + + perform_k_selection(idx_batcher, + query_batcher, + batch_dists.data(), + batch_indices.data(), + dists_merge_buffer_ptr, + indices_merge_buffer_ptr); + + value_t* dists_merge_buffer_tmp_ptr = dists_merge_buffer_ptr; + value_idx* indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr; + + // Merge results of difference batches if necessary + if (idx_batcher.batch_start() > 0) { + size_t merge_buffer_tmp_out = batch_rows * k * 2; + dists_merge_buffer_tmp_ptr = merge_buffer_dists.data() + merge_buffer_tmp_out; + indices_merge_buffer_tmp_ptr = merge_buffer_indices.data() + merge_buffer_tmp_out; + + merge_batches(idx_batcher, + query_batcher, + merge_buffer_dists.data(), + merge_buffer_indices.data(), + 
dists_merge_buffer_tmp_ptr, + indices_merge_buffer_tmp_ptr); + } + + // copy merged output back into merge buffer partition for next iteration + raft::copy_async(merge_buffer_indices.data(), + indices_merge_buffer_tmp_ptr, + batch_rows * k, + raft::resource::get_cuda_stream(handle)); + raft::copy_async(merge_buffer_dists.data(), + dists_merge_buffer_tmp_ptr, + batch_rows * k, + raft::resource::get_cuda_stream(handle)); + } + + // Copy final merged batch to output array + raft::copy_async(output_indices + (rows_processed * k), + merge_buffer_indices.data(), + query_batcher.batch_rows() * k, + raft::resource::get_cuda_stream(handle)); + raft::copy_async(output_dists + (rows_processed * k), + merge_buffer_dists.data(), + query_batcher.batch_rows() * k, + raft::resource::get_cuda_stream(handle)); + + rows_processed += query_batcher.batch_rows(); + } + } + + private: + void merge_batches(csr_batcher_t& idx_batcher, + csr_batcher_t& query_batcher, + value_t* merge_buffer_dists, + value_idx* merge_buffer_indices, + value_t* out_dists, + value_idx* out_indices) + { + // build translation buffer to shift resulting indices by the batch + std::vector id_ranges; + id_ranges.push_back(0); + id_ranges.push_back(idx_batcher.batch_start()); + + rmm::device_uvector trans(id_ranges.size(), raft::resource::get_cuda_stream(handle)); + raft::update_device( + trans.data(), id_ranges.data(), id_ranges.size(), raft::resource::get_cuda_stream(handle)); + + // combine merge buffers only if there's more than 1 partition to combine + cuvs::neighbors::detail::knn_merge_parts(merge_buffer_dists, + merge_buffer_indices, + out_dists, + out_indices, + query_batcher.batch_rows(), + 2, + k, + raft::resource::get_cuda_stream(handle), + trans.data()); + } + + void perform_k_selection(csr_batcher_t idx_batcher, + csr_batcher_t query_batcher, + value_t* batch_dists, + value_idx* batch_indices, + value_t* out_dists, + value_idx* out_indices) + { + // populate batch indices array + value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows(); + + // build translation buffer to shift resulting indices by the batch + std::vector id_ranges; + id_ranges.push_back(0); + id_ranges.push_back(idx_batcher.batch_start()); + + // in the case where the number of idx rows in the batch is < k, we + // want to adjust k. 
+ value_idx n_neighbors = std::min(static_cast(k), batch_cols); + + bool ascending = cuvs::distance::is_min_close(metric); + + // kernel to slice first (min) k cols and copy into batched merge buffer + cuvs::selection::select_k( + handle, + raft::make_device_matrix_view(batch_dists, batch_rows, batch_cols), + raft::make_device_matrix_view( + batch_indices, batch_rows, batch_cols), + raft::make_device_matrix_view(out_dists, batch_rows, n_neighbors), + raft::make_device_matrix_view(out_indices, batch_rows, n_neighbors), + ascending, + true); + } + + void compute_distances(csr_batcher_t& idx_batcher, + csr_batcher_t& query_batcher, + size_t idx_batch_nnz, + size_t query_batch_nnz, + value_idx* idx_batch_indptr, + value_idx* idx_batch_indices, + value_t* idx_batch_data, + value_idx* query_batch_indptr, + value_idx* query_batch_indices, + value_t* query_batch_data, + value_t* batch_dists) + { + /** + * Compute distances + */ + cuvs::distance::detail::sparse::distances_config_t dist_config(handle); + dist_config.b_nrows = idx_batcher.batch_rows(); + dist_config.b_ncols = n_idx_cols; + dist_config.b_nnz = idx_batch_nnz; + + dist_config.b_indptr = idx_batch_indptr; + dist_config.b_indices = idx_batch_indices; + dist_config.b_data = idx_batch_data; + + dist_config.a_nrows = query_batcher.batch_rows(); + dist_config.a_ncols = n_query_cols; + dist_config.a_nnz = query_batch_nnz; + + dist_config.a_indptr = query_batch_indptr; + dist_config.a_indices = query_batch_indices; + dist_config.a_data = query_batch_data; + + cuvs::distance::pairwiseDistance(batch_dists, dist_config, metric, metricArg); + } + + const value_idx *idxIndptr, *idxIndices, *queryIndptr, *queryIndices; + value_idx* output_indices; + const value_t *idxData, *queryData; + value_t* output_dists; + + size_t idxNNZ, queryNNZ, batch_size_index, batch_size_query; + + cuvs::distance::DistanceType metric; + + float metricArg; + + int n_idx_rows, n_idx_cols, n_query_rows, n_query_cols, k; + + raft::resources const& handle; +}; + +}; // namespace cuvs::neighbors::detail diff --git a/cpp/src/neighbors/dynamic_batching.cu b/cpp/src/neighbors/dynamic_batching.cu new file mode 100644 index 000000000..6be70353b --- /dev/null +++ b/cpp/src/neighbors/dynamic_batching.cu @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "detail/dynamic_batching.cuh" + +#include +#include +#include + +#include +#include + +namespace cuvs::neighbors::dynamic_batching { + +// NB: the (template) index parameter should be the last; it may contain the spaces and so split +// into multiple preprocessor token. Then it is consumed as __VA_ARGS__ +// +#define CUVS_INST_DYNAMIC_BATCHING_INDEX(T, IdxT, Namespace, ...) 
\ + template <> \ + template <> \ + index::index( \ + const raft::resources& res, \ + const cuvs::neighbors::dynamic_batching::index_params& params, \ + const Namespace ::__VA_ARGS__& upstream_index, \ + const typename Namespace ::__VA_ARGS__::search_params_type& upstream_params, \ + const cuvs::neighbors::filtering::base_filter* sample_filter) \ + : runner{new detail::batch_runner( \ + res, params, upstream_index, upstream_params, Namespace ::search, sample_filter)} \ + { \ + } + +#define CUVS_INST_DYNAMIC_BATCHING_SEARCH(T, IdxT) \ + void search(raft::resources const& res, \ + cuvs::neighbors::dynamic_batching::search_params const& params, \ + cuvs::neighbors::dynamic_batching::index const& index, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances) \ + { \ + return index.runner->search(res, params, queries, neighbors, distances); \ + } + +CUVS_INST_DYNAMIC_BATCHING_INDEX(float, uint32_t, cuvs::neighbors::cagra, index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(half, uint32_t, cuvs::neighbors::cagra, index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(int8_t, uint32_t, cuvs::neighbors::cagra, index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(uint8_t, + uint32_t, + cuvs::neighbors::cagra, + index); + +CUVS_INST_DYNAMIC_BATCHING_INDEX(float, int64_t, cuvs::neighbors::ivf_pq, index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(half, int64_t, cuvs::neighbors::ivf_pq, index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(int8_t, int64_t, cuvs::neighbors::ivf_pq, index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(uint8_t, int64_t, cuvs::neighbors::ivf_pq, index); + +CUVS_INST_DYNAMIC_BATCHING_INDEX(float, int64_t, cuvs::neighbors::ivf_flat, index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(int8_t, + int64_t, + cuvs::neighbors::ivf_flat, + index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(uint8_t, + int64_t, + cuvs::neighbors::ivf_flat, + index); + +CUVS_INST_DYNAMIC_BATCHING_SEARCH(float, int64_t); +CUVS_INST_DYNAMIC_BATCHING_SEARCH(half, int64_t); +CUVS_INST_DYNAMIC_BATCHING_SEARCH(int8_t, int64_t); +CUVS_INST_DYNAMIC_BATCHING_SEARCH(uint8_t, int64_t); +CUVS_INST_DYNAMIC_BATCHING_SEARCH(float, uint32_t); // uint32_t index type is needed for CAGRA +CUVS_INST_DYNAMIC_BATCHING_SEARCH(half, uint32_t); +CUVS_INST_DYNAMIC_BATCHING_SEARCH(int8_t, uint32_t); +CUVS_INST_DYNAMIC_BATCHING_SEARCH(uint8_t, uint32_t); + +#undef CUVS_INST_DYNAMIC_BATCHING_INDEX +#undef CUVS_INST_DYNAMIC_BATCHING_SEARCH + +} // namespace cuvs::neighbors::dynamic_batching diff --git a/cpp/src/neighbors/hnsw.cpp b/cpp/src/neighbors/hnsw.cpp index e6f3fbcc7..f165176ec 100644 --- a/cpp/src/neighbors/hnsw.cpp +++ b/cpp/src/neighbors/hnsw.cpp @@ -21,11 +21,14 @@ namespace cuvs::neighbors::hnsw { -#define CUVS_INST_HNSW_FROM_CAGRA(T) \ - std::unique_ptr> from_cagra( \ - raft::resources const& res, const cuvs::neighbors::cagra::index& cagra_index) \ - { \ - return detail::from_cagra(res, cagra_index); \ +#define CUVS_INST_HNSW_FROM_CAGRA(T) \ + std::unique_ptr> from_cagra( \ + raft::resources const& res, \ + const index_params& params, \ + const cuvs::neighbors::cagra::index& cagra_index, \ + std::optional> dataset) \ + { \ + return detail::from_cagra(res, params, cagra_index, dataset); \ } CUVS_INST_HNSW_FROM_CAGRA(float); @@ -34,6 +37,21 @@ CUVS_INST_HNSW_FROM_CAGRA(int8_t); #undef CUVS_INST_HNSW_FROM_CAGRA +#define CUVS_INST_HNSW_EXTEND(T) \ + void extend(raft::resources const& res, \ + const extend_params& params, \ + raft::host_matrix_view additional_dataset, \ + index& idx) \ + { \ + detail::extend(res, params, additional_dataset, 
idx); \ + } + +CUVS_INST_HNSW_EXTEND(float); +CUVS_INST_HNSW_EXTEND(uint8_t); +CUVS_INST_HNSW_EXTEND(int8_t); + +#undef CUVS_INST_HNSW_EXTEND + #define CUVS_INST_HNSW_SEARCH(T) \ void search(raft::resources const& res, \ const search_params& params, \ @@ -51,20 +69,25 @@ CUVS_INST_HNSW_SEARCH(int8_t); #undef CUVS_INST_HNSW_SEARCH -#define CUVS_INST_HNSW_DESERIALIZE(T) \ - void deserialize(raft::resources const& res, \ - const std::string& filename, \ - int dim, \ - cuvs::distance::DistanceType metric, \ - index** idx) \ - { \ - detail::deserialize(res, filename, dim, metric, idx); \ +#define CUVS_INST_HNSW_SERIALIZE(T) \ + void serialize(raft::resources const& res, const std::string& filename, const index& idx) \ + { \ + detail::serialize(res, filename, idx); \ + } \ + void deserialize(raft::resources const& res, \ + const index_params& params, \ + const std::string& filename, \ + int dim, \ + cuvs::distance::DistanceType metric, \ + index** idx) \ + { \ + detail::deserialize(res, params, filename, dim, metric, idx); \ } -CUVS_INST_HNSW_DESERIALIZE(float); -CUVS_INST_HNSW_DESERIALIZE(uint8_t); -CUVS_INST_HNSW_DESERIALIZE(int8_t); +CUVS_INST_HNSW_SERIALIZE(float); +CUVS_INST_HNSW_SERIALIZE(uint8_t); +CUVS_INST_HNSW_SERIALIZE(int8_t); -#undef CUVS_INST_HNSW_DESERIALIZE +#undef CUVS_INST_HNSW_SERIALIZE } // namespace cuvs::neighbors::hnsw diff --git a/cpp/src/neighbors/hnsw_c.cpp b/cpp/src/neighbors/hnsw_c.cpp index a19875641..0233a510a 100644 --- a/cpp/src/neighbors/hnsw_c.cpp +++ b/cpp/src/neighbors/hnsw_c.cpp @@ -31,6 +31,44 @@ #include namespace { + +template +void _from_cagra(cuvsResources_t res, + cuvsHnswIndexParams_t params, + cuvsCagraIndex_t cagra_index, + cuvsHnswIndex_t hnsw_index) +{ + auto res_ptr = reinterpret_cast(res); + auto index = reinterpret_cast*>(cagra_index->addr); + auto cpp_params = cuvs::neighbors::hnsw::index_params(); + cpp_params.hierarchy = static_cast(params->hierarchy); + cpp_params.ef_construction = params->ef_construction; + cpp_params.num_threads = params->num_threads; + std::optional> dataset = std::nullopt; + + auto hnsw_index_unique_ptr = + cuvs::neighbors::hnsw::from_cagra(*res_ptr, cpp_params, *index, dataset); + auto hnsw_index_ptr = hnsw_index_unique_ptr.release(); + hnsw_index->addr = reinterpret_cast(hnsw_index_ptr); +} + +template +void _extend(cuvsResources_t res, + cuvsHnswExtendParams_t params, + DLManagedTensor* additional_dataset, + cuvsHnswIndex index) +{ + auto res_ptr = reinterpret_cast(res); + auto index_ptr = reinterpret_cast*>(index.addr); + auto cpp_params = cuvs::neighbors::hnsw::extend_params(); + cpp_params.num_threads = params->num_threads; + + using additional_dataset_mdspan_type = raft::host_matrix_view; + auto additional_dataset_mds = + cuvs::core::from_dlpack(additional_dataset); + cuvs::neighbors::hnsw::extend(*res_ptr, cpp_params, additional_dataset_mds, *index_ptr); +} + template void _search(cuvsResources_t res, cuvsHnswSearchParams params, @@ -44,7 +82,7 @@ void _search(cuvsResources_t res, auto search_params = cuvs::neighbors::hnsw::search_params(); search_params.ef = params.ef; - search_params.num_threads = params.numThreads; + search_params.num_threads = params.num_threads; using queries_mdspan_type = raft::host_matrix_view; using neighbors_mdspan_type = raft::host_matrix_view; @@ -57,26 +95,42 @@ void _search(cuvsResources_t res, } template -void* _deserialize(cuvsResources_t res, const char* filename, int dim, cuvsDistanceType metric) +void _serialize(cuvsResources_t res, const char* filename, cuvsHnswIndex index) +{ 
+ auto res_ptr = reinterpret_cast(res); + auto index_ptr = reinterpret_cast*>(index.addr); + cuvs::neighbors::hnsw::serialize(*res_ptr, std::string(filename), *index_ptr); +} + +template +void* _deserialize(cuvsResources_t res, + cuvsHnswIndexParams_t params, + const char* filename, + int dim, + cuvsDistanceType metric) { auto res_ptr = reinterpret_cast(res); cuvs::neighbors::hnsw::index* index = nullptr; - cuvs::neighbors::hnsw::deserialize(*res_ptr, std::string(filename), dim, metric, &index); + auto cpp_params = cuvs::neighbors::hnsw::index_params(); + cpp_params.hierarchy = static_cast(params->hierarchy); + cuvs::neighbors::hnsw::deserialize( + *res_ptr, cpp_params, std::string(filename), dim, metric, &index); return index; } } // namespace -extern "C" cuvsError_t cuvsHnswSearchParamsCreate(cuvsHnswSearchParams_t* params) +extern "C" cuvsError_t cuvsHnswIndexParamsCreate(cuvsHnswIndexParams_t* params) { - return cuvs::core::translate_exceptions( - [=] { *params = new cuvsHnswSearchParams{.ef = 200, .numThreads = 0}; }); + return cuvs::core::translate_exceptions([=] { + *params = new cuvsHnswIndexParams{ + .hierarchy = cuvsHnswHierarchy::NONE, .ef_construction = 200, .num_threads = 2}; + }); } -extern "C" cuvsError_t cuvsHnswSearchParamsDestroy(cuvsHnswSearchParams_t params) +extern "C" cuvsError_t cuvsHnswIndexParamsDestroy(cuvsHnswIndexParams_t params) { return cuvs::core::translate_exceptions([=] { delete params; }); } - extern "C" cuvsError_t cuvsHnswIndexCreate(cuvsHnswIndex_t* index) { return cuvs::core::translate_exceptions([=] { *index = new cuvsHnswIndex{}; }); @@ -101,6 +155,66 @@ extern "C" cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index_c_ptr) }); } +extern "C" cuvsError_t cuvsHnswExtendParamsCreate(cuvsHnswExtendParams_t* params) +{ + return cuvs::core::translate_exceptions( + [=] { *params = new cuvsHnswExtendParams{.num_threads = 0}; }); +} + +extern "C" cuvsError_t cuvsHnswExtendParamsDestroy(cuvsHnswExtendParams_t params) +{ + return cuvs::core::translate_exceptions([=] { delete params; }); +} + +extern "C" cuvsError_t cuvsHnswFromCagra(cuvsResources_t res, + cuvsHnswIndexParams_t params, + cuvsCagraIndex_t cagra_index, + cuvsHnswIndex_t hnsw_index) +{ + return cuvs::core::translate_exceptions([=] { + auto index = *cagra_index; + hnsw_index->dtype = index.dtype; + if (index.dtype.code == kDLFloat) { + _from_cagra(res, params, cagra_index, hnsw_index); + } else if (index.dtype.code == kDLUInt) { + _from_cagra(res, params, cagra_index, hnsw_index); + } else if (index.dtype.code == kDLInt) { + _from_cagra(res, params, cagra_index, hnsw_index); + } else { + RAFT_FAIL("Unsupported dtype: %d", index.dtype.code); + } + }); +} + +extern "C" cuvsError_t cuvsHnswExtend(cuvsResources_t res, + cuvsHnswExtendParams_t params, + DLManagedTensor* additional_dataset, + cuvsHnswIndex_t index) +{ + return cuvs::core::translate_exceptions([=] { + if (index->dtype.code == kDLFloat) { + _extend(res, params, additional_dataset, *index); + } else if (index->dtype.code == kDLUInt) { + _extend(res, params, additional_dataset, *index); + } else if (index->dtype.code == kDLInt) { + _extend(res, params, additional_dataset, *index); + } else { + RAFT_FAIL("Unsupported dtype: %d", index->dtype.code); + } + }); +} + +extern "C" cuvsError_t cuvsHnswSearchParamsCreate(cuvsHnswSearchParams_t* params) +{ + return cuvs::core::translate_exceptions( + [=] { *params = new cuvsHnswSearchParams{.ef = 200, .num_threads = 0}; }); +} + +extern "C" cuvsError_t 
cuvsHnswSearchParamsDestroy(cuvsHnswSearchParams_t params) +{ + return cuvs::core::translate_exceptions([=] { delete params; }); +} + extern "C" cuvsError_t cuvsHnswSearch(cuvsResources_t res, cuvsHnswSearchParams_t params, cuvsHnswIndex_t index_c_ptr, @@ -140,7 +254,25 @@ extern "C" cuvsError_t cuvsHnswSearch(cuvsResources_t res, }); } +extern "C" cuvsError_t cuvsHnswSerialize(cuvsResources_t res, + const char* filename, + cuvsHnswIndex_t index) +{ + return cuvs::core::translate_exceptions([=] { + if (index->dtype.code == kDLFloat) { + _serialize(res, filename, *index); + } else if (index->dtype.code == kDLInt) { + _serialize(res, filename, *index); + } else if (index->dtype.code == kDLUInt) { + _serialize(res, filename, *index); + } else { + RAFT_FAIL("Unsupported index dtype: %d and bits: %d", index->dtype.code, index->dtype.bits); + } + }); +} + extern "C" cuvsError_t cuvsHnswDeserialize(cuvsResources_t res, + cuvsHnswIndexParams_t params, const char* filename, int dim, cuvsDistanceType metric, @@ -148,11 +280,14 @@ extern "C" cuvsError_t cuvsHnswDeserialize(cuvsResources_t res, { return cuvs::core::translate_exceptions([=] { if (index->dtype.code == kDLFloat && index->dtype.bits == 32) { - index->addr = reinterpret_cast(_deserialize(res, filename, dim, metric)); + index->addr = + reinterpret_cast(_deserialize(res, params, filename, dim, metric)); } else if (index->dtype.code == kDLUInt && index->dtype.bits == 8) { - index->addr = reinterpret_cast(_deserialize(res, filename, dim, metric)); + index->addr = + reinterpret_cast(_deserialize(res, params, filename, dim, metric)); } else if (index->dtype.code == kDLInt && index->dtype.bits == 8) { - index->addr = reinterpret_cast(_deserialize(res, filename, dim, metric)); + index->addr = + reinterpret_cast(_deserialize(res, params, filename, dim, metric)); } else { RAFT_FAIL("Unsupported dtype in file %s", filename); } diff --git a/cpp/src/neighbors/iface/iface.hpp b/cpp/src/neighbors/iface/iface.hpp index a329db429..98ef3fdd3 100644 --- a/cpp/src/neighbors/iface/iface.hpp +++ b/cpp/src/neighbors/iface/iface.hpp @@ -1,11 +1,31 @@ -#include +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once #include #include #include #include +#include #include +#include +#include + namespace cuvs::neighbors { using namespace raft; @@ -16,7 +36,7 @@ void build(const raft::device_resources& handle, const cuvs::neighbors::index_params* index_params, raft::mdspan, row_major, Accessor> index_dataset) { - interface.mutex_->lock(); + std::lock_guard(*interface.mutex_); if constexpr (std::is_same>::value) { auto idx = cuvs::neighbors::ivf_flat::build( @@ -32,8 +52,6 @@ void build(const raft::device_resources& handle, interface.index_.emplace(std::move(idx)); } resource::sync_stream(handle); - - interface.mutex_->unlock(); } template @@ -44,7 +62,7 @@ void extend( std::optional, layout_c_contiguous, Accessor2>> new_indices) { - interface.mutex_->lock(); + std::lock_guard(*interface.mutex_); if constexpr (std::is_same>::value) { auto idx = @@ -58,8 +76,6 @@ void extend( RAFT_FAIL("CAGRA does not implement the extend method"); } resource::sync_stream(handle); - - interface.mutex_->unlock(); } template @@ -70,7 +86,7 @@ void search(const raft::device_resources& handle, raft::device_matrix_view neighbors, raft::device_matrix_view distances) { - // interface.mutex_->lock(); + // std::lock_guard(*interface.mutex_); if constexpr (std::is_same>::value) { cuvs::neighbors::ivf_flat::search( handle, @@ -94,9 +110,7 @@ void search(const raft::device_resources& handle, neighbors, distances); } - resource::sync_stream(handle); - - // interface.mutex_->unlock(); + // resource::sync_stream(handle); } // for MG ANN only @@ -108,7 +122,7 @@ void search(const raft::device_resources& handle, raft::device_matrix_view d_neighbors, raft::device_matrix_view d_distances) { - // interface.mutex_->lock(); + // std::lock_guard(*interface.mutex_); int64_t n_rows = h_queries.extent(0); int64_t n_dims = h_queries.extent(1); @@ -120,8 +134,6 @@ void search(const raft::device_resources& handle, auto d_query_view = raft::make_const_mdspan(d_queries.view()); search(handle, interface, search_params, d_query_view, d_neighbors, d_distances); - - // interface.mutex_->unlock(); } template @@ -129,7 +141,7 @@ void serialize(const raft::device_resources& handle, const cuvs::neighbors::iface& interface, std::ostream& os) { - interface.mutex_->lock(); + std::lock_guard(*interface.mutex_); if constexpr (std::is_same>::value) { ivf_flat::serialize(handle, os, interface.index_.value()); @@ -138,8 +150,6 @@ void serialize(const raft::device_resources& handle, } else if constexpr (std::is_same>::value) { cagra::serialize(handle, os, interface.index_.value(), true); } - - interface.mutex_->unlock(); } template @@ -147,7 +157,7 @@ void deserialize(const raft::device_resources& handle, cuvs::neighbors::iface& interface, std::istream& is) { - interface.mutex_->lock(); + std::lock_guard(*interface.mutex_); if constexpr (std::is_same>::value) { ivf_flat::index idx(handle); @@ -162,8 +172,6 @@ void deserialize(const raft::device_resources& handle, cagra::deserialize(handle, is, &idx); interface.index_.emplace(std::move(idx)); } - - interface.mutex_->unlock(); } template @@ -171,7 +179,7 @@ void deserialize(const raft::device_resources& handle, cuvs::neighbors::iface& interface, const std::string& filename) { - interface.mutex_->lock(); + std::lock_guard(*interface.mutex_); std::ifstream is(filename, std::ios::in | std::ios::binary); if (!is) { RAFT_FAIL("Cannot open file %s", filename.c_str()); } @@ -191,8 +199,6 @@ void deserialize(const raft::device_resources& handle, } is.close(); - - interface.mutex_->unlock(); } -}; // namespace 
cuvs::neighbors \ No newline at end of file +}; // namespace cuvs::neighbors diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh index fb110d810..d6ffc1218 100644 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh @@ -132,6 +132,10 @@ RAFT_KERNEL build_index_kernel(const LabelT* labels, { const IdxT i = IdxT(blockDim.x) * IdxT(blockIdx.x) + threadIdx.x; if (i >= n_rows) { return; } + auto source_ix = source_ixs == nullptr ? i + batch_offset : source_ixs[i]; + // In the context of refinement, some indices may be invalid (the generating NN algorithm does + // not return enough valid items). Do not add the item to the index in this case. + if (source_ix == ivf::kInvalidRecord || source_ix == raft::upper_bound()) { return; } auto list_id = labels[i]; auto inlist_id = atomicAdd(list_sizes_ptr + list_id, 1); @@ -139,7 +143,7 @@ RAFT_KERNEL build_index_kernel(const LabelT* labels, auto* list_data = list_data_ptrs[list_id]; // Record the source vector id in the index - list_index[inlist_id] = source_ixs == nullptr ? i + batch_offset : source_ixs[i]; + list_index[inlist_id] = source_ix; // The data is written in interleaved groups of `index::kGroupSize` vectors using interleaved_group = raft::Pow2; @@ -151,7 +155,7 @@ RAFT_KERNEL build_index_kernel(const LabelT* labels, // Point to the source vector if constexpr (gather_src) { - source_vecs += source_ixs[i] * dim; + source_vecs += source_ix * dim; } else { source_vecs += i * dim; } diff --git a/cpp/src/neighbors/ivf_flat_c.cpp b/cpp/src/neighbors/ivf_flat_c.cpp old mode 100755 new mode 100644 index c14c1edc0..2acc6b678 --- a/cpp/src/neighbors/ivf_flat_c.cpp +++ b/cpp/src/neighbors/ivf_flat_c.cpp @@ -29,6 +29,8 @@ #include #include +#include + namespace { template diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh index 4c9867126..1d4acea1e 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh @@ -1754,6 +1754,12 @@ auto build(raft::resources const& handle, if constexpr (std::is_same_v) { raft::matrix::sample_rows(handle, random_state, dataset, trainset.view()); } else { + raft::common::nvtx::range fun_scope( + " ivf_pq::build(%zu, %zu)/sample rows with tmp trainset (%zu rows).", + size_t(n_rows), + size_t(dim), + size_t(n_rows_train)); + // TODO(tfeher): Enable codebook generation with any type T, and then remove trainset tmp. 
auto trainset_tmp = raft::make_device_mdarray( handle, big_memory_resource, raft::make_extents(n_rows_train, dim)); diff --git a/cpp/src/neighbors/mg/mg.cuh b/cpp/src/neighbors/mg/mg.cuh index d3f635bc4..e9cdc30f6 100644 --- a/cpp/src/neighbors/mg/mg.cuh +++ b/cpp/src/neighbors/mg/mg.cuh @@ -25,6 +25,8 @@ #include #include +#include + namespace cuvs::neighbors { using namespace raft; diff --git a/cpp/src/neighbors/nn_descent.cuh b/cpp/src/neighbors/nn_descent.cuh index 582da72c1..ed91dac91 100644 --- a/cpp/src/neighbors/nn_descent.cuh +++ b/cpp/src/neighbors/nn_descent.cuh @@ -17,9 +17,14 @@ #pragma once #include "detail/nn_descent.cuh" +#include "detail/nn_descent_batch.cuh" + +#include +#include #include #include +#include #include namespace cuvs::neighbors::nn_descent { @@ -61,7 +66,15 @@ auto build(raft::resources const& res, index_params const& params, raft::device_matrix_view dataset) -> index { - return detail::build(res, params, dataset); + if (params.n_clusters > 1) { + if constexpr (std::is_same_v) { + return detail::experimental::batch_build(res, params, dataset); + } else { + RAFT_FAIL("Batched nn-descent is only supported for float precision"); + } + } else { + return detail::build(res, params, dataset); + } } /** @@ -100,7 +113,15 @@ void build(raft::resources const& res, raft::device_matrix_view dataset, index& idx) { - detail::build(res, params, dataset, idx); + if (params.n_clusters > 1) { + if constexpr (std::is_same_v) { + detail::experimental::batch_build(res, params, dataset, idx); + } else { + RAFT_FAIL("Batched nn-descent is only supported for float precision"); + } + } else { + detail::build(res, params, dataset, idx); + } } /** @@ -135,7 +156,15 @@ auto build(raft::resources const& res, index_params const& params, raft::host_matrix_view dataset) -> index { - return detail::build(res, params, dataset); + if (params.n_clusters > 1) { + if constexpr (std::is_same_v) { + return detail::experimental::batch_build(res, params, dataset); + } else { + RAFT_FAIL("Batched nn-descent is only supported for float precision"); + } + } else { + return detail::build(res, params, dataset); + } } /** @@ -174,7 +203,15 @@ void build(raft::resources const& res, raft::host_matrix_view dataset, index& idx) { - detail::build(res, params, dataset, idx); + if (params.n_clusters > 1) { + if constexpr (std::is_same_v) { + detail::experimental::batch_build(res, params, dataset, idx); + } else { + RAFT_FAIL("Batched nn-descent is only supported for float precision"); + } + } else { + detail::build(res, params, dataset, idx); + } } /** @} */ // end group nn-descent diff --git a/cpp/src/neighbors/nn_descent_float.cu b/cpp/src/neighbors/nn_descent_float.cu index c6d356671..fa85db127 100644 --- a/cpp/src/neighbors/nn_descent_float.cu +++ b/cpp/src/neighbors/nn_descent_float.cu @@ -19,21 +19,38 @@ namespace cuvs::neighbors::nn_descent { -#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ - }; \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::host_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ +#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, 
\ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::device_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + }; \ + } \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::host_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ }; CUVS_INST_NN_DESCENT_BUILD(float, uint32_t); diff --git a/cpp/src/neighbors/nn_descent_half.cu b/cpp/src/neighbors/nn_descent_half.cu index 587993031..2ee45d435 100644 --- a/cpp/src/neighbors/nn_descent_half.cu +++ b/cpp/src/neighbors/nn_descent_half.cu @@ -19,21 +19,39 @@ namespace cuvs::neighbors::nn_descent { -#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ - }; \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::host_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ +#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::device_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ + }; \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::host_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ }; CUVS_INST_NN_DESCENT_BUILD(half, uint32_t); diff --git a/cpp/src/neighbors/nn_descent_index.cpp b/cpp/src/neighbors/nn_descent_index.cpp new file mode 100644 index 000000000..25d5b6af8 --- /dev/null +++ b/cpp/src/neighbors/nn_descent_index.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +namespace cuvs::neighbors::nn_descent { + +index_params::index_params(size_t graph_degree, cuvs::distance::DistanceType metric) +{ + this->graph_degree = graph_degree; + this->intermediate_graph_degree = 1.5 * graph_degree; + this->metric = metric; +} +} // namespace cuvs::neighbors::nn_descent \ No newline at end of file diff --git a/cpp/src/neighbors/nn_descent_int8.cu b/cpp/src/neighbors/nn_descent_int8.cu index 813a01746..e150f511b 100644 --- a/cpp/src/neighbors/nn_descent_int8.cu +++ b/cpp/src/neighbors/nn_descent_int8.cu @@ -19,21 +19,39 @@ namespace cuvs::neighbors::nn_descent { -#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ - }; \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::host_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ +#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::device_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ + }; \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::host_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ }; CUVS_INST_NN_DESCENT_BUILD(int8_t, uint32_t); diff --git a/cpp/src/neighbors/nn_descent_uint8.cu b/cpp/src/neighbors/nn_descent_uint8.cu index 9d73dd90f..d8657777b 100644 --- a/cpp/src/neighbors/nn_descent_uint8.cu +++ b/cpp/src/neighbors/nn_descent_uint8.cu @@ -19,21 +19,39 @@ namespace cuvs::neighbors::nn_descent { -#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return 
cuvs::neighbors::nn_descent::build(handle, params, dataset); \ - }; \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::host_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ +#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::device_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ + }; \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::host_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ }; CUVS_INST_NN_DESCENT_BUILD(uint8_t, uint32_t); diff --git a/cpp/src/neighbors/sparse_brute_force.cu b/cpp/src/neighbors/sparse_brute_force.cu new file mode 100644 index 000000000..e277961ec --- /dev/null +++ b/cpp/src/neighbors/sparse_brute_force.cu @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
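A worked example of the index_params constructor added in nn_descent_index.cpp above; the numbers follow directly from its body:

cuvs::neighbors::nn_descent::index_params p(/*graph_degree=*/64,
                                            cuvs::distance::DistanceType::L2Expanded);
// p.graph_degree              == 64
// p.intermediate_graph_degree == 96   (1.5 * 64)
// p.metric                    == L2Expanded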
+ */ + +#include + +#include "detail/sparse_knn.cuh" + +namespace cuvs::neighbors::brute_force { +template +sparse_index::sparse_index(raft::resources const& res, + raft::device_csr_matrix_view dataset, + cuvs::distance::DistanceType metric, + T metric_arg) + : dataset_(dataset), metric_(metric), metric_arg_(metric_arg) +{ +} + +auto build(raft::resources const& handle, + raft::device_csr_matrix_view dataset, + cuvs::distance::DistanceType metric, + float metric_arg) -> cuvs::neighbors::brute_force::sparse_index +{ + return sparse_index(handle, dataset, metric, metric_arg); +} + +void search(raft::resources const& handle, + const sparse_search_params& params, + const sparse_index& index, + raft::device_csr_matrix_view query, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) +{ + auto idx_structure = index.dataset().structure_view(); + auto query_structure = query.structure_view(); + int k = neighbors.extent(1); + + detail::sparse_knn_t(idx_structure.get_indptr().data(), + idx_structure.get_indices().data(), + index.dataset().get_elements().data(), + idx_structure.get_nnz(), + idx_structure.get_n_rows(), + idx_structure.get_n_cols(), + query_structure.get_indptr().data(), + query_structure.get_indices().data(), + query.get_elements().data(), + query_structure.get_nnz(), + query_structure.get_n_rows(), + query_structure.get_n_cols(), + neighbors.data_handle(), + distances.data_handle(), + k, + handle, + params.batch_size_index, + params.batch_size_query, + index.metric(), + index.metric_arg()) + .run(); +} +} // namespace cuvs::neighbors::brute_force diff --git a/cpp/src/preprocessing/quantize/detail/scalar.cuh b/cpp/src/preprocessing/quantize/detail/scalar.cuh new file mode 100644 index 000000000..fc132eb7f --- /dev/null +++ b/cpp/src/preprocessing/quantize/detail/scalar.cuh @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::preprocessing::quantize::detail { + +template +_RAFT_HOST_DEVICE bool fp_lt(const T& a, const T& b) +{ + return a < b; +} + +template <> +_RAFT_HOST_DEVICE bool fp_lt(const half& a, const half& b) +{ + return static_cast(a) < static_cast(b); +} + +template +struct quantize_op { + const T min_; + const T max_; + const QuantI q_type_min_ = std::numeric_limits::min(); + const QuantI q_type_max_ = std::numeric_limits::max(); + const TempT scalar_; + const TempT offset_; + + constexpr explicit quantize_op(T min, T max) + : min_(min), + max_(max), + scalar_(static_cast(max_) > static_cast(min_) + ? 
((static_cast(q_type_max_) - static_cast(q_type_min_)) / + (static_cast(max_) - static_cast(min_))) + : static_cast(1)), + offset_(static_cast(q_type_min_) - static_cast(min_) * scalar_) + { + } + + constexpr RAFT_INLINE_FUNCTION QuantI operator()(const T& x) const + { + if (!fp_lt(min_, x)) return q_type_min_; + if (!fp_lt(x, max_)) return q_type_max_; + return static_cast(lroundf(scalar_ * static_cast(x) + offset_)); + } + + constexpr RAFT_INLINE_FUNCTION T operator()(const QuantI& x) const + { + return static_cast((static_cast(x) - offset_) / scalar_); + } +}; + +template +std::tuple quantile_min_max(raft::resources const& res, + raft::device_matrix_view dataset, + double quantile) +{ + // settings for quantile approximation + constexpr size_t max_num_samples = 1000000; + constexpr int seed = 137; + + cudaStream_t stream = raft::resource::get_cuda_stream(res); + + // select subsample + raft::random::RngState rng(seed); + size_t n_elements = dataset.extent(0) * dataset.extent(1); + size_t subset_size = std::min(max_num_samples, n_elements); + auto subset = raft::make_device_vector(res, subset_size); + auto dataset_view = raft::make_device_vector_view(dataset.data_handle(), n_elements); + raft::random::sample_without_replacement( + res, rng, dataset_view, std::nullopt, subset.view(), std::nullopt); + + // quantile / sort and pick for now + thrust::sort(raft::resource::get_thrust_policy(res), + subset.data_handle(), + subset.data_handle() + subset_size); + + double half_quantile_pos = (0.5 + 0.5 * quantile) * subset_size; + int pos_max = std::ceil(half_quantile_pos) - 1; + int pos_min = subset_size - pos_max - 1; + + T minmax_h[2]; + raft::update_host(&(minmax_h[0]), subset.data_handle() + pos_min, 1, stream); + raft::update_host(&(minmax_h[1]), subset.data_handle() + pos_max, 1, stream); + raft::resource::sync_stream(res); + + return {minmax_h[0], minmax_h[1]}; +} + +template +std::tuple quantile_min_max(raft::resources const& res, + raft::host_matrix_view dataset, + double quantile) +{ + // settings for quantile approximation + constexpr size_t max_num_samples = 1000000; + constexpr int seed = 137; + + // select subsample + std::mt19937 rng(seed); + size_t n_elements = dataset.extent(0) * dataset.extent(1); + size_t subset_size = std::min(max_num_samples, n_elements); + std::vector subset; + std::sample(dataset.data_handle(), + dataset.data_handle() + n_elements, + std::back_inserter(subset), + subset_size, + rng); + + // quantile / sort and pick for now + thrust::sort(thrust::omp::par, subset.data(), subset.data() + subset_size, fp_lt); + double half_quantile_pos = (0.5 + 0.5 * quantile) * subset_size; + int pos_max = std::ceil(half_quantile_pos) - 1; + int pos_min = subset_size - pos_max - 1; + + return {subset[pos_min], subset[pos_max]}; +} + +template +cuvs::preprocessing::quantize::scalar::quantizer train( + raft::resources const& res, + const cuvs::preprocessing::quantize::scalar::params params, + raft::device_matrix_view dataset) +{ + RAFT_EXPECTS(params.quantile > 0.0 && params.quantile <= 1.0, + "quantile for scalar quantization needs to be within (0, 1] but is %f", + params.quantile); + + auto [min, max] = detail::quantile_min_max(res, dataset, params.quantile); + + RAFT_LOG_DEBUG("quantizer train min=%lf max=%lf.", double(min), double(max)); + + return cuvs::preprocessing::quantize::scalar::quantizer{min, max}; +} + +template +cuvs::preprocessing::quantize::scalar::quantizer train( + raft::resources const& res, + const cuvs::preprocessing::quantize::scalar::params params, + 
raft::host_matrix_view dataset) +{ + RAFT_EXPECTS(params.quantile > 0.0 && params.quantile <= 1.0, + "quantile for scalar quantization needs to be within (0, 1] but is %f", + params.quantile); + + auto [min, max] = detail::quantile_min_max(res, dataset, params.quantile); + + RAFT_LOG_DEBUG("quantizer train min=%lf max=%lf.", double(min), double(max)); + + return cuvs::preprocessing::quantize::scalar::quantizer{min, max}; +} + +template +void transform(raft::resources const& res, + const cuvs::preprocessing::quantize::scalar::quantizer& quantizer, + raft::device_matrix_view dataset, + raft::device_matrix_view out) +{ + cudaStream_t stream = raft::resource::get_cuda_stream(res); + + raft::linalg::map(res, out, quantize_op(quantizer.min_, quantizer.max_), dataset); +} + +template +void transform(raft::resources const& res, + const cuvs::preprocessing::quantize::scalar::quantizer& quantizer, + raft::host_matrix_view dataset, + raft::host_matrix_view out) +{ + auto main_op = quantize_op(quantizer.min_, quantizer.max_); + size_t n_elements = dataset.extent(0) * dataset.extent(1); + +#pragma omp parallel for + for (size_t i = 0; i < n_elements; ++i) { + out.data_handle()[i] = main_op(dataset.data_handle()[i]); + } +} + +template +void inverse_transform(raft::resources const& res, + const cuvs::preprocessing::quantize::scalar::quantizer& quantizer, + raft::device_matrix_view dataset, + raft::device_matrix_view out) +{ + cudaStream_t stream = raft::resource::get_cuda_stream(res); + + raft::linalg::map(res, out, quantize_op(quantizer.min_, quantizer.max_), dataset); +} + +template +void inverse_transform(raft::resources const& res, + const cuvs::preprocessing::quantize::scalar::quantizer& quantizer, + raft::host_matrix_view dataset, + raft::host_matrix_view out) +{ + auto main_op = quantize_op(quantizer.min_, quantizer.max_); + size_t n_elements = dataset.extent(0) * dataset.extent(1); + +#pragma omp parallel for + for (size_t i = 0; i < n_elements; ++i) { + out.data_handle()[i] = main_op(dataset.data_handle()[i]); + } +} + +} // namespace cuvs::preprocessing::quantize::detail diff --git a/cpp/src/preprocessing/quantize/scalar.cu b/cpp/src/preprocessing/quantize/scalar.cu new file mode 100644 index 000000000..9624ad4fe --- /dev/null +++ b/cpp/src/preprocessing/quantize/scalar.cu @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
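The affine map implemented by quantize_op above is q(x) = round(scalar * x + offset), with scalar = (q_max - q_min) / (max - min) and offset = q_min - min * scalar; values outside the trained [min, max] are clamped to the integer endpoints before rounding. Worked numbers for the float -> int8_t case with a trained range of [-1, 1]:

// scalar = (127 - (-128)) / (1 - (-1)) = 255 / 2 = 127.5
// offset = -128 - (-1) * 127.5         = -0.5
//
// quantize( 0.5f) = lroundf(127.5f * 0.5f - 0.5f) = lroundf(63.25f) = 63
// quantize(-2.0f) = -128                        // clamped: !fp_lt(min_, x)
// dequantize(63)  = (63 + 0.5) / 127.5 ≈ 0.498  // in-range error <= 0.5 / scalar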
+ */ + +#include "./detail/scalar.cuh" + +#include + +namespace cuvs::preprocessing::quantize::scalar { + +#define CUVS_INST_QUANTIZATION(T, QuantI) \ + auto train(raft::resources const& res, \ + const params params, \ + raft::device_matrix_view dataset) \ + ->quantizer \ + { \ + return detail::train(res, params, dataset); \ + } \ + auto train(raft::resources const& res, \ + const params params, \ + raft::host_matrix_view dataset) \ + ->quantizer \ + { \ + return detail::train(res, params, dataset); \ + } \ + void transform(raft::resources const& res, \ + const quantizer& quantizer, \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view out) \ + { \ + detail::transform(res, quantizer, dataset, out); \ + } \ + void transform(raft::resources const& res, \ + const quantizer& quantizer, \ + raft::host_matrix_view dataset, \ + raft::host_matrix_view out) \ + { \ + detail::transform(res, quantizer, dataset, out); \ + } \ + void inverse_transform(raft::resources const& res, \ + const quantizer& quantizer, \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view out) \ + { \ + detail::inverse_transform(res, quantizer, dataset, out); \ + } \ + void inverse_transform(raft::resources const& res, \ + const quantizer& quantizer, \ + raft::host_matrix_view dataset, \ + raft::host_matrix_view out) \ + { \ + detail::inverse_transform(res, quantizer, dataset, out); \ + } \ + template struct quantizer; + +CUVS_INST_QUANTIZATION(double, int8_t); +CUVS_INST_QUANTIZATION(float, int8_t); +CUVS_INST_QUANTIZATION(half, int8_t); + +#undef CUVS_INST_QUANTIZATION + +} // namespace cuvs::preprocessing::quantize::scalar \ No newline at end of file diff --git a/cpp/src/sparse/cluster/cluster_solvers.cuh b/cpp/src/sparse/cluster/cluster_solvers.cuh new file mode 100644 index 000000000..7b4cf6ab3 --- /dev/null +++ b/cpp/src/sparse/cluster/cluster_solvers.cuh @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
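A hedged end-to-end sketch of the float -> int8_t path instantiated above. The mdspan types and the 0.99 quantile are illustrative; only params, train, transform and inverse_transform come from this file:

void quantize_roundtrip(raft::resources const& res,
                        raft::device_matrix_view<const float, int64_t> dataset,
                        raft::device_matrix_view<int8_t, int64_t> quantized,
                        raft::device_matrix_view<float, int64_t> restored)
{
  cuvs::preprocessing::quantize::scalar::params params;
  params.quantile = 0.99;  // clip the two-sided 1% tails before scaling

  auto q = cuvs::preprocessing::quantize::scalar::train(res, params, dataset);
  cuvs::preprocessing::quantize::scalar::transform(res, q, dataset, quantized);
  cuvs::preprocessing::quantize::scalar::inverse_transform(res, q, quantized, restored);
}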
+ */ + +#ifndef __CLUSTER_SOLVERS_H +#define __CLUSTER_SOLVERS_H + +#pragma once + +#include +#include +#include + +#include // for std::pair + +namespace cuvs { +namespace spectral { + +using namespace raft::spectral::matrix; + +// aggregate of control params for Eigen Solver: +// +template +struct cluster_solver_config_t { + size_type_t n_clusters; + size_type_t maxIter; + + value_type_t tol; + + unsigned long long seed{123456}; +}; + +template +struct kmeans_solver_t { + explicit kmeans_solver_t( + cluster_solver_config_t const& config) + : config_(config) + { + } + + std::pair solve(raft::resources const& handle, + size_type_t n_obs_vecs, + size_type_t dim, + value_type_t const* __restrict__ obs, + index_type_t* __restrict__ codes) const + { + RAFT_EXPECTS(obs != nullptr, "Null obs buffer."); + RAFT_EXPECTS(codes != nullptr, "Null codes buffer."); + value_type_t residual{}; + index_type_t iters{}; + cuvs::cluster::kmeans::params km_params; + km_params.n_clusters = config_.n_clusters; + km_params.tol = config_.tol; + km_params.max_iter = config_.maxIter; + km_params.rng_state.seed = config_.seed; + + auto X = raft::make_device_matrix_view(obs, n_obs_vecs, dim); + auto labels = raft::make_device_vector_view(codes, n_obs_vecs); + auto centroids = + raft::make_device_matrix(handle, config_.n_clusters, dim); + auto weight = raft::make_device_vector(handle, n_obs_vecs); + thrust::fill(raft::resource::get_thrust_policy(handle), + weight.data_handle(), + weight.data_handle() + n_obs_vecs, + 1); + + auto sw = std::make_optional((raft::device_vector_view)weight.view()); + cuvs::cluster::kmeans::fit_predict(handle, + km_params, + X, + sw, + centroids.view(), + labels, + raft::make_host_scalar_view(&residual), + raft::make_host_scalar_view(&iters)); + return std::make_pair(residual, iters); + } + + auto const& get_config(void) const { return config_; } + + private: + cluster_solver_config_t config_; +}; + +} // namespace spectral +} // namespace cuvs + +#endif \ No newline at end of file diff --git a/cpp/src/sparse/cluster/detail/spectral.cuh b/cpp/src/sparse/cluster/detail/spectral.cuh new file mode 100644 index 000000000..571d92bf5 --- /dev/null +++ b/cpp/src/sparse/cluster/detail/spectral.cuh @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
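A small sketch wiring the config aggregate to the k-means solver defined above. The template-argument orders shown here are assumptions, and all numbers are illustrative:

cuvs::spectral::cluster_solver_config_t<int, float> cfg{
  /*n_clusters=*/8, /*maxIter=*/100, /*tol=*/1e-4f};  // seed keeps its 123456 default
cuvs::spectral::kmeans_solver_t<int, float, int> solver{cfg};

// residual is the final k-means objective, iters the iterations consumed:
// auto [residual, iters] = solver.solve(handle, n_obs, dim, d_obs, d_codes);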
+ */ + +#include "../cluster_solvers.cuh" +#include "../eigen_solvers.cuh" +#include "../partition.cuh" +#include +#include +#include +#include +#include + +#include + +namespace cuvs::sparse::cluster::spectral::detail { + +template +void fit_embedding(raft::resources const& handle, + int* rows, + int* cols, + T* vals, + int nnz, + int n, + int n_components, + T* out, + unsigned long long seed = 1234567) +{ + auto stream = raft::resource::get_cuda_stream(handle); + rmm::device_uvector src_offsets(n + 1, stream); + rmm::device_uvector dst_cols(nnz, stream); + rmm::device_uvector dst_vals(nnz, stream); + raft::sparse::convert::coo_to_csr( + handle, rows, cols, vals, nnz, n, src_offsets.data(), dst_cols.data(), dst_vals.data()); + + rmm::device_uvector eigVals(n_components + 1, stream); + rmm::device_uvector eigVecs(n * (n_components + 1), stream); + rmm::device_uvector labels(n, stream); + + raft::resource::sync_stream(handle, stream); + + /** + * Raft spectral clustering + */ + using index_type = int; + using value_type = T; + + index_type* ro = src_offsets.data(); + index_type* ci = dst_cols.data(); + value_type* vs = dst_vals.data(); + + raft::spectral::matrix::sparse_matrix_t const r_csr_m{ + handle, ro, ci, vs, n, nnz}; + + index_type neigvs = n_components + 1; + index_type maxiter = 4000; // default reset value (when set to 0); + value_type tol = 0.01; + index_type restart_iter = 15 + neigvs; // what cugraph is using + + cuvs::spectral::eigen_solver_config_t cfg{ + neigvs, maxiter, restart_iter, tol}; + + cfg.seed = seed; + + cuvs::spectral::lanczos_solver_t eig_solver{cfg}; + + // cluster computation here is irrelevant, + // hence define a no-op such solver to + // feed partition(): + // + struct no_op_cluster_solver_t { + using index_type_t = index_type; + using size_type_t = index_type; + using value_type_t = value_type; + + std::pair solve(raft::resources const& handle, + size_type_t n_obs_vecs, + size_type_t dim, + value_type_t const* __restrict__ obs, + index_type_t* __restrict__ codes) const + { + return std::make_pair(0, 0); + } + }; + + cuvs::spectral::partition(handle, + r_csr_m, + eig_solver, + no_op_cluster_solver_t{}, + labels.data(), + eigVals.data(), + eigVecs.data()); + + raft::copy(out, eigVecs.data() + n, n * n_components, stream); + + RAFT_CUDA_TRY(cudaGetLastError()); +} + +}; // namespace cuvs::sparse::cluster::spectral::detail \ No newline at end of file diff --git a/cpp/src/sparse/cluster/detail/spectral/modularity_maximization.hpp b/cpp/src/sparse/cluster/detail/spectral/modularity_maximization.hpp new file mode 100644 index 000000000..a42ad2dc1 --- /dev/null +++ b/cpp/src/sparse/cluster/detail/spectral/modularity_maximization.hpp @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
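fit_embedding above asks the Lanczos solver for n_components + 1 of the smallest Laplacian eigenpairs because the first eigenvector is the trivial constant one; clustering is bypassed via the no-op solver, and the final copy drops that first column:

// eigVecs holds (n_components + 1) eigenvectors of length n, back to back.
// Column 0 is the trivial eigenvector, so only columns 1..n_components survive:
//   raft::copy(out, eigVecs.data() + n, n * n_components, stream);
// i.e. out[i + j * n] = (j + 1)-th smallest eigenvector evaluated at vertex i.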
+ */ + +#pragma once + +#include +#include + +// TODO: Expose needed wrappers in RAFT's public API so we don't need to call detail APIs in cuVS +#include "../../cluster_solvers.cuh" +#include "../../eigen_solvers.cuh" +#include "spectral_util.cuh" +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include + +namespace cuvs { +namespace spectral { +namespace detail { + +// ========================================================= +// Spectral modularity_maximization +// ========================================================= + +/** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param clusters (Output, device memory, n entries) Cluster + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return error flag. + */ +template +std::tuple modularity_maximization( + raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas_h = raft::resource::get_cublas_handle(handle); + + std::tuple + stats; // # iters eigen solver, cluster solver residual, # iters cluster solver + + vertex_t n = csr_m.nrows_; + + // Compute eigenvectors of Modularity Matrix + + // Initialize Modularity Matrix + raft::spectral::matrix::modularity_matrix_t B{handle, csr_m}; + + auto eigen_config = eigen_solver.get_config(); + auto nEigVecs = eigen_config.n_eigVecs; + + // Compute eigenvectors corresponding to largest eigenvalues + std::get<0>(stats) = eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); + + // Whiten eigenvector matrix + transform_eigen_matrix(handle, n, nEigVecs, eigVecs); + + // notice that at this point the matrix has already been transposed, so we are scaling + // columns + auto dataset_view = raft::make_device_matrix_view(eigVecs, nEigVecs, n); + raft::linalg::row_normalize( + handle, raft::make_const_mdspan(dataset_view), dataset_view, raft::linalg::L2Norm); + + // Find partition clustering + auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); + + std::get<1>(stats) = pair_cluster.first; + std::get<2>(stats) = pair_cluster.second; + + return stats; +} +//=================================================== +// Analysis of graph partition +// ========================================================= + +/// Compute modularity +/** This function determines the modularity based on a graph and cluster assignments + * @param G Weighted graph in 
CSR format + * @param nClusters Number of clusters. + * @param clusters (Input, device memory, n entries) Cluster assignments. + * @param modularity On exit, modularity + */ +template +void analyzeModularity(raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + vertex_t nClusters, + vertex_t const* __restrict__ clusters, + weight_t& modularity) +{ + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + + vertex_t i; + vertex_t n = csr_m.nrows_; + weight_t partModularity, clustersize; + + auto cublas_h = raft::resource::get_cublas_handle(handle); + auto stream = raft::resource::get_cuda_stream(handle); + + // Device memory + raft::spectral::matrix::vector_t part_i(handle, n); + raft::spectral::matrix::vector_t Bx(handle, n); + + // Initialize cuBLAS + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // Initialize Modularity + raft::spectral::matrix::modularity_matrix_t B{handle, csr_m}; + + // Initialize output + modularity = 0; + + // Iterate through partitions + for (i = 0; i < nClusters; ++i) { + if (!construct_indicator(handle, i, n, clustersize, partModularity, clusters, part_i, Bx, B)) { + WARNING("empty partition"); + continue; + } + + // Record results + modularity += partModularity; + } + + modularity = modularity / B.diagonal_.nrm1(); +} + +} // namespace detail +} // namespace spectral +} // namespace cuvs diff --git a/cpp/src/sparse/cluster/detail/spectral/partition.hpp b/cpp/src/sparse/cluster/detail/spectral/partition.hpp new file mode 100644 index 000000000..77e83c17d --- /dev/null +++ b/cpp/src/sparse/cluster/detail/spectral/partition.hpp @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +// TODO: Expose needed wrappers in RAFT's public API so we don't need to call detail APIs in cuVS +#include + +#include "../../cluster_solvers.cuh" +#include "../../eigen_solvers.cuh" +#include "spectral_util.cuh" +#include + +#include +#include +#include +#include + +#include +#include + +#include + +namespace cuvs { +namespace spectral { +namespace detail { + +// ========================================================= +// Spectral partitioner +// ========================================================= + +/// Compute spectral graph partition +/** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. 
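In analyzeModularity above, each loop pass adds the quadratic form for one cluster; with x_i the 0/1 indicator vector of cluster i and B the modularity matrix, the reported value is

// modularity = ( sum_i x_i^T B x_i ) / || diag(B) ||_1

so an empty cluster contributes nothing: construct_indicator returns false and the loop continues.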
+ * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param clusters (Output, device memory, n entries) Partition + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return statistics: number of eigensolver iterations, . + */ +template +std::tuple partition( + raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas_h = raft::resource::get_cublas_handle(handle); + + std::tuple + stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, + // cluster solver residual, # iters cluster solver + + vertex_t n = csr_m.nrows_; + + // ------------------------------------------------------- + // Spectral partitioner + // ------------------------------------------------------- + + // Compute eigenvectors of Laplacian + + // Initialize Laplacian + /// sparse_matrix_t A{handle, graph}; + raft::spectral::matrix::laplacian_matrix_t L{handle, csr_m}; + + auto eigen_config = eigen_solver.get_config(); + auto nEigVecs = eigen_config.n_eigVecs; + + // Compute smallest eigenvalues and eigenvectors + std::get<0>(stats) = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); + + // Whiten eigenvector matrix + transform_eigen_matrix(handle, n, nEigVecs, eigVecs); + + // Find partition clustering + auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); + + std::get<1>(stats) = pair_cluster.first; + std::get<2>(stats) = pair_cluster.second; + + return stats; +} + +// ========================================================= +// Analysis of graph partition +// ========================================================= + +/// Compute cost function for partition +/** This function determines the edges cut by a partition and a cost + * function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * Graph is assumed to be weighted and undirected. + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param clusters (Input, device memory, n entries) Partition + * assignments. + * @param edgeCut On exit, weight of edges cut by partition. + * @param cost On exit, partition cost function. + * @return error flag. 
+ */ +template +void analyzePartition(raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + vertex_t nClusters, + const vertex_t* __restrict__ clusters, + weight_t& edgeCut, + weight_t& cost) +{ + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + + vertex_t i; + vertex_t n = csr_m.nrows_; + + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas_h = raft::resource::get_cublas_handle(handle); + + weight_t partEdgesCut, clustersize; + + // Device memory + raft::spectral::matrix::vector_t part_i(handle, n); + raft::spectral::matrix::vector_t Lx(handle, n); + + // Initialize cuBLAS + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // Initialize Laplacian + /// sparse_matrix_t A{handle, graph}; + raft::spectral::matrix::laplacian_matrix_t L{handle, csr_m}; + + // Initialize output + cost = 0; + edgeCut = 0; + + // Iterate through partitions + for (i = 0; i < nClusters; ++i) { + // Construct indicator vector for ith partition + if (!construct_indicator(handle, i, n, clustersize, partEdgesCut, clusters, part_i, Lx, L)) { + WARNING("empty partition"); + continue; + } + + // Record results + cost += partEdgesCut / clustersize; + edgeCut += partEdgesCut / 2; + } +} + +} // namespace detail +} // namespace spectral +} // namespace cuvs diff --git a/cpp/src/sparse/cluster/detail/spectral/spectral_util.cuh b/cpp/src/sparse/cluster/detail/spectral/spectral_util.cuh new file mode 100644 index 000000000..1d2e58e2a --- /dev/null +++ b/cpp/src/sparse/cluster/detail/spectral/spectral_util.cuh @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
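In analyzePartition above, partEdgesCut for cluster i is the quadratic form x_i^T L x_i, which for a graph Laplacian L equals the total weight of edges leaving cluster i. That makes the two accumulations:

// cost    += partEdgesCut / clustersize;  // ratio-cut term for cluster i
// edgeCut += partEdgesCut / 2;            // each cut edge is counted from both sides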
+ */ + +#pragma once + +#include +#include +#include +#include + +// TODO: Expose needed wrappers in RAFT's public API so we don't need to call detail APIs in cuVS +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuvs { +namespace spectral { + +template +void transform_eigen_matrix(raft::resources const& handle, + edge_t n, + vertex_t nEigVecs, + weight_t* eigVecs) +{ + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas_h = raft::resource::get_cublas_handle(handle); + auto thrust_exec_policy = raft::resource::get_thrust_policy(handle); + + const weight_t zero{0.0}; + const weight_t one{1.0}; + + // Whiten eigenvector matrix + for (auto i = 0; i < nEigVecs; ++i) { + weight_t mean, std; + + mean = thrust::reduce(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); + RAFT_CHECK_CUDA(stream); + mean /= n; + thrust::transform(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(mean), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::minus()); + RAFT_CHECK_CUDA(stream); + + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); + + std /= std::sqrt(static_cast(n)); + + thrust::transform(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(std), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::divides()); + RAFT_CHECK_CUDA(stream); + } + + // Transpose eigenvector matrix + // TODO: in-place transpose + { + raft::spectral::matrix::vector_t work(handle, nEigVecs * n); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + nEigVecs, + n, + &one, + eigVecs, + n, + &zero, + (weight_t*)NULL, + nEigVecs, + work.raw(), + nEigVecs, + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync( + eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice, stream)); + } +} + +namespace { +/// Functor to generate indicator vectors +/** For use in Thrust transform + */ +template +struct equal_to_i_op { + const index_type_t i; + + public: + equal_to_i_op(index_type_t _i) : i(_i) {} + template + __host__ __device__ void operator()(Tuple_ t) + { + thrust::get<1>(t) = (thrust::get<0>(t) == i) ? 
(value_type_t)1.0 : (value_type_t)0.0; + } +}; +} // namespace + +// Construct indicator vector for ith partition +// +template +bool construct_indicator(raft::resources const& handle, + edge_t index, + edge_t n, + weight_t& clustersize, + weight_t& partStats, + vertex_t const* __restrict__ clusters, + raft::spectral::matrix::vector_t& part_i, + raft::spectral::matrix::vector_t& Bx, + raft::spectral::matrix::laplacian_matrix_t const& B) +{ + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas_h = raft::resource::get_cublas_handle(handle); + auto thrust_exec_policy = raft::resource::get_thrust_policy(handle); + + thrust::for_each( + thrust_exec_policy, + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters), + thrust::device_pointer_cast(part_i.raw()))), + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters + n), + thrust::device_pointer_cast(part_i.raw() + n))), + equal_to_i_op(index)); + RAFT_CHECK_CUDA(stream); + + // Compute size of ith partition + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot( + cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream)); + + clustersize = round(clustersize); + if (clustersize < 0.5) { return false; } + + // Compute part stats + B.mv(1, part_i.raw(), 0, Bx.raw()); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); + + return true; +} + +} // namespace spectral +} // namespace cuvs diff --git a/cpp/src/sparse/cluster/eigen_solvers.cuh b/cpp/src/sparse/cluster/eigen_solvers.cuh new file mode 100644 index 000000000..1b2501d68 --- /dev/null +++ b/cpp/src/sparse/cluster/eigen_solvers.cuh @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
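construct_indicator above builds the 0/1 membership vector with equal_to_i_op and then derives both outputs from two dot products; the algebra matching the cuBLAS calls:

// clustersize = part_i . part_i        (number of ones, then rounded)
// partStats   = part_i . (B * part_i)  (quadratic form x^T B x, via B.mv)
// returns false when clustersize < 0.5, i.e. the partition is empty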
+ */ +#ifndef __EIGEN_SOLVERS_H +#define __EIGEN_SOLVERS_H + +#pragma once + +#include +#include + +namespace cuvs { +namespace spectral { + +// aggregate of control params for Eigen Solver: +// +template +struct eigen_solver_config_t { + size_type_t n_eigVecs; + size_type_t maxIter; + + size_type_t restartIter; + value_type_t tol; + + bool reorthogonalize{false}; + unsigned long long seed{ + 1234567}; // CAVEAT: this default value is now common to all instances of using seed in + // Lanczos; was not the case before: there were places where a default seed = 123456 + // was used; this may trigger slightly different # solver iterations +}; + +template +struct lanczos_solver_t { + explicit lanczos_solver_t( + eigen_solver_config_t const& config) + : config_(config) + { + } + + index_type_t solve_smallest_eigenvectors( + raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const + { + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + index_type_t iters{}; + raft::sparse::solver::computeSmallestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, + config_.seed); + return iters; + } + + index_type_t solve_largest_eigenvectors( + raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const + { + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + index_type_t iters{}; + raft::sparse::solver::computeLargestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, + config_.seed); + return iters; + } + + auto const& get_config(void) const { return config_; } + + private: + eigen_solver_config_t config_; +}; + +} // namespace spectral +} // namespace cuvs + +#endif diff --git a/cpp/src/sparse/cluster/modularity_maximization.cuh b/cpp/src/sparse/cluster/modularity_maximization.cuh new file mode 100644 index 000000000..71cba6927 --- /dev/null +++ b/cpp/src/sparse/cluster/modularity_maximization.cuh @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __MODULARITY_MAXIMIZATION_H +#define __MODULARITY_MAXIMIZATION_H + +#pragma once + +#include "detail/spectral/modularity_maximization.hpp" + +#include + +namespace cuvs { +namespace spectral { + +// ========================================================= +// Spectral modularity_maximization +// ========================================================= + +/** Compute partition for a weighted undirected graph. 
This + * partition attempts to minimize the cost function: + * Cost = \f$sum_i\f$ (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param handle raft handle for managing expensive resources + * @param csr_m Weighted graph in CSR format + * @param eigen_solver Eigensolver implementation + * @param cluster_solver Cluster solver implementation + * @param clusters (Output, device memory, n entries) Partition + * assignments. + * @param eigVals Output eigenvalue array pointer on device + * @param eigVecs Output eigenvector array pointer on device + * @return statistics: number of eigensolver iterations, . + */ +template +std::tuple modularity_maximization( + raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ + return cuvs::spectral::detail:: + modularity_maximization( + handle, csr_m, eigen_solver, cluster_solver, clusters, eigVals, eigVecs); +} +//=================================================== +// Analysis of graph partition +// ========================================================= + +/// Compute modularity +/** This function determines the modularity based on a graph and cluster assignments + * @param handle raft handle for managing expensive resources + * @param csr_m Weighted graph in CSR format + * @param nClusters Number of clusters. + * @param clusters (Input, device memory, n entries) Cluster assignments. + * @param modularity On exit, modularity + */ +template +void analyzeModularity(raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + vertex_t nClusters, + vertex_t const* __restrict__ clusters, + weight_t& modularity) +{ + cuvs::spectral::detail::analyzeModularity( + handle, csr_m, nClusters, clusters, modularity); +} + +} // namespace spectral +} // namespace cuvs + +#endif \ No newline at end of file diff --git a/cpp/src/sparse/cluster/partition.cuh b/cpp/src/sparse/cluster/partition.cuh new file mode 100644 index 000000000..df78a8a2d --- /dev/null +++ b/cpp/src/sparse/cluster/partition.cuh @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __PARTITION_H +#define __PARTITION_H + +#pragma once + +#include "detail/spectral/partition.hpp" + +#include + +namespace cuvs { +namespace spectral { + +// ========================================================= +// Spectral partitioner +// ========================================================= + +/// Compute spectral graph partition +/** Compute partition for a weighted undirected graph. 
This + * partition attempts to minimize the cost function: + * Cost = \f$sum_i\f$ (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param handle raft handle for managing expensive resources + * @param csr_m Weighted graph in CSR format + * @param eigen_solver Eigensolver implementation + * @param cluster_solver Cluster solver implementation + * @param clusters (Output, device memory, n entries) Partition + * assignments. + * @param eigVals Output eigenvalue array pointer on device + * @param eigVecs Output eigenvector array pointer on device + * @return statistics: number of eigensolver iterations, . + */ +template +std::tuple partition( + raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ + return cuvs::spectral::detail::partition( + handle, csr_m, eigen_solver, cluster_solver, clusters, eigVals, eigVecs); +} + +// ========================================================= +// Analysis of graph partition +// ========================================================= + +/// Compute cost function for partition +/** This function determines the edges cut by a partition and a cost + * function: + * Cost = \f$sum_i\f$ (Edges cut by ith partition)/(Vertices in ith partition) + * Graph is assumed to be weighted and undirected. + * + * @param handle raft handle for managing expensive resources + * @param csr_m Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param clusters (Input, device memory, n entries) Partition + * assignments. + * @param edgeCut On exit, weight of edges cut by partition. + * @param cost On exit, partition cost function. 
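Putting this header's pieces together, a hedged sketch of a complete call through the partition() wrapper above; the solver template-argument orders and all numeric values are assumptions, and the graph view plus output buffers are presumed pre-allocated:

using index_t = int;
using value_t = float;

index_t n_clusters = 8;
index_t n_eig      = n_clusters;  // one eigenvector per cluster is a common choice

cuvs::spectral::eigen_solver_config_t<index_t, value_t> eig_cfg{
  n_eig, /*maxIter=*/4000, /*restartIter=*/15 + n_eig, /*tol=*/0.01f};
cuvs::spectral::lanczos_solver_t<index_t, value_t> eig_solver{eig_cfg};

cuvs::spectral::cluster_solver_config_t<index_t, value_t> km_cfg{
  n_clusters, /*maxIter=*/100, /*tol=*/1e-4f};
cuvs::spectral::kmeans_solver_t<index_t, value_t, index_t> km_solver{km_cfg};

// csr: raft::spectral::matrix::sparse_matrix_t view of the weighted graph;
// d_clusters / d_eig_vals / d_eig_vecs: device buffers of size n, n_eig, n_eig * n.
auto [eig_iters, km_residual, km_iters] = cuvs::spectral::partition(
  handle, csr, eig_solver, km_solver, d_clusters, d_eig_vals, d_eig_vecs);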
+ */ +template +void analyzePartition(raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + vertex_t nClusters, + const vertex_t* __restrict__ clusters, + weight_t& edgeCut, + weight_t& cost) +{ + cuvs::spectral::detail::analyzePartition( + handle, csr_m, nClusters, clusters, edgeCut, cost); +} + +} // namespace spectral +} // namespace cuvs + +#endif \ No newline at end of file diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index f4d35e438..9224e88d8 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -94,7 +94,7 @@ endfunction() if(BUILD_TESTS) ConfigureTest( NAME NEIGHBORS_TEST PATH neighbors/brute_force.cu neighbors/brute_force_prefiltered.cu - neighbors/refine.cu GPUS 1 PERCENT 100 + neighbors/sparse_brute_force.cu neighbors/refine.cu GPUS 1 PERCENT 100 ) ConfigureTest( @@ -137,6 +137,8 @@ if(BUILD_TESTS) NAME NEIGHBORS_ANN_CAGRA_TEST PATH + neighbors/ann_cagra/bug_extreme_inputs_oob.cu + neighbors/ann_cagra/bug_multi_cta_crash.cu neighbors/ann_cagra/test_float_uint32_t.cu neighbors/ann_cagra/test_half_uint32_t.cu neighbors/ann_cagra/test_int8_t_uint32_t.cu @@ -173,6 +175,19 @@ if(BUILD_TESTS) 100 ) + ConfigureTest( + NAME + NEIGHBORS_DYNAMIC_BATCHING_TEST + PATH + neighbors/dynamic_batching/test_cagra.cu + neighbors/dynamic_batching/test_ivf_flat.cu + neighbors/dynamic_batching/test_ivf_pq.cu + GPUS + 1 + PERCENT + 100 + ) + if(BUILD_CAGRA_HNSWLIB) ConfigureTest(NAME NEIGHBORS_HNSW_TEST PATH neighbors/hnsw.cu GPUS 1 PERCENT 100) target_link_libraries(NEIGHBORS_HNSW_TEST PRIVATE hnswlib::hnswlib) @@ -203,19 +218,33 @@ if(BUILD_TESTS) distance/dist_l_inf.cu distance/dist_lp_unexp.cu distance/dist_russell_rao.cu + distance/gram.cu distance/masked_nn.cu + distance/sparse_distance.cu sparse/neighbors/cross_component_nn.cu GPUS 1 PERCENT 100 ) + + ConfigureTest( + NAME SPARSE_TEST PATH sparse/cluster/cluster_solvers.cu sparse/cluster/eigen_solvers.cu + sparse/cluster/spectral.cu GPUS 1 PERCENT 100 + ) + + ConfigureTest( + NAME PREPROCESSING_TEST PATH preprocessing/scalar_quantization.cu GPUS 1 PERCENT 100 + ) + ConfigureTest( NAME STATS_TEST PATH stats/trustworthiness.cu stats/silhouette_score.cu GPUS 1 PERCENT 100 ) endif() -if(BUILD_C_TESTS) +if(TARGET cuvs::c_api) + enable_language(C) + ConfigureTest(NAME INTEROP_TEST PATH core/interop.cu C_LIB) ConfigureTest( NAME DISTANCE_C_TEST PATH distance/run_pairwise_distance_c.c distance/pairwise_distance_c.cu @@ -239,19 +268,36 @@ if(BUILD_C_TESTS) target_link_libraries(NEIGHBORS_HNSW_TEST PRIVATE hnswlib::hnswlib) target_compile_definitions(NEIGHBORS_HNSW_TEST PUBLIC CUVS_BUILD_CAGRA_HNSWLIB) endif() -endif() - -# ################################################################################################## -# Install tests #################################################################################### -# ################################################################################################## -rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/gtests/libcuvs) - -if(BUILD_C_TESTS) - enable_language(C) add_executable(cuvs_c_test core/c_api.c) target_link_libraries(cuvs_c_test PUBLIC cuvs::c_api) add_executable(cuvs_c_neighbors_test neighbors/c_api.c) target_link_libraries(cuvs_c_neighbors_test PUBLIC cuvs::c_api) + + set_target_properties( + cuvs_c_test cuvs_c_neighbors_test + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" + INSTALL_RPATH "\$ORIGIN/../../../lib" + ) + + rapids_test_add( + NAME cuvs_c_test + COMMAND cuvs_c_test + GPUS 1 + 
PERCENT 100 + INSTALL_COMPONENT_SET testing + ) + rapids_test_add( + NAME cuvs_c_neighbors_test + COMMAND cuvs_c_neighbors_test + GPUS 1 + PERCENT 100 + INSTALL_COMPONENT_SET testing + ) endif() + +# ################################################################################################## +# Install tests #################################################################################### +# ################################################################################################## +rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/gtests/libcuvs) diff --git a/cpp/test/core/c_api.c b/cpp/test/core/c_api.c index a3dae6004..a51824d2b 100644 --- a/cpp/test/core/c_api.c +++ b/cpp/test/core/c_api.c @@ -73,6 +73,15 @@ int main() error = cuvsRMMMemoryResourceReset(); if (error == CUVS_ERROR) { exit(EXIT_FAILURE); } + // Alloc memory on host (pinned) + void* ptr3; + cuvsError_t alloc_error_pinned = cuvsRMMHostAlloc(&ptr3, 1024); + if (alloc_error_pinned == CUVS_ERROR) { exit(EXIT_FAILURE); } + + // Free memory + cuvsError_t free_error_pinned = cuvsRMMHostFree(ptr3, 1024); + if (free_error_pinned == CUVS_ERROR) { exit(EXIT_FAILURE); } + // Destroy resources error = cuvsResourcesDestroy(res); if (error == CUVS_ERROR) { exit(EXIT_FAILURE); } diff --git a/cpp/test/distance/gram.cu b/cpp/test/distance/gram.cu new file mode 100644 index 000000000..89b1525ea --- /dev/null +++ b/cpp/test/distance/gram.cu @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.cuh" +#include "gram_base.cuh" + +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +namespace cuvs::distance::kernels { + +struct GramMatrixInputs { + int n1; // feature vectors in matrix 1 + int n2; // featuer vectors in matrix 2 + int n_cols; // number of elements in a feature vector + bool is_row_major; + KernelParams kernel; + int ld1; + int ld2; + int ld_out; + // We will generate random input using the dimensions given here. + // The reference output is calculated by a custom kernel. +}; + +std::ostream& operator<<(std::ostream& os, const GramMatrixInputs& p) +{ + std::vector kernel_names{"linear", "poly", "rbf", "tanh"}; + os << "/" << p.n1 << "x" << p.n2 << "x" << p.n_cols << "/" + << (p.is_row_major ? 
"RowMajor/" : "ColMajor/") << kernel_names[p.kernel.kernel] << "/ld_" + << p.ld1 << "x" << p.ld2 << "x" << p.ld_out; + return os; +} + +const std::vector inputs = { + {42, 137, 2, false, {KernelType::LINEAR}}, + {42, 137, 2, true, {KernelType::LINEAR}}, + {42, 137, 2, false, {KernelType::LINEAR}, 64, 179, 181}, + {42, 137, 2, true, {KernelType::LINEAR}, 64, 179, 181}, + {137, 42, 2, false, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}}, + {137, 42, 2, true, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}}, + {137, 42, 2, false, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144}, + {137, 42, 2, true, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144}, + {42, 137, 2, false, {KernelType::TANH, 0, 0.5, 2.4}}, + {42, 137, 2, true, {KernelType::TANH, 0, 0.5, 2.4}}, + {42, 137, 2, false, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 49}, + {42, 137, 2, true, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 143}, + {3, 4, 2, false, {KernelType::RBF, 0, 0.5}}, + {42, 137, 2, false, {KernelType::RBF, 0, 0.5}}, + {42, 137, 2, true, {KernelType::RBF, 0, 0.5}}, + // Distance kernel does not support LD parameter yet. + //{42, 137, 2, false, {KernelType::RBF, 0, 0.5}, 64, 155, 49}, + // {42, 137, 2, true, {KernelType::RBF, 0, 0.5}, 64, 155, 143}, +}; + +template +class GramMatrixTest : public ::testing::TestWithParam { + protected: + GramMatrixTest() + : params(GetParam()), + handle(), + x1(0, raft::resource::get_cuda_stream(handle)), + x2(0, raft::resource::get_cuda_stream(handle)), + gram(0, raft::resource::get_cuda_stream(handle)), + gram_host(0) + { + auto stream = raft::resource::get_cuda_stream(handle); + + if (params.ld1 == 0) { params.ld1 = params.is_row_major ? params.n_cols : params.n1; } + if (params.ld2 == 0) { params.ld2 = params.is_row_major ? params.n_cols : params.n2; } + if (params.ld_out == 0) { params.ld_out = params.is_row_major ? params.n2 : params.n1; } + // Derive the size of the output from the offset of the last element. + size_t size = get_offset(params.n1 - 1, params.n_cols - 1, params.ld1, params.is_row_major) + 1; + x1.resize(size, stream); + size = get_offset(params.n2 - 1, params.n_cols - 1, params.ld2, params.is_row_major) + 1; + x2.resize(size, stream); + size = get_offset(params.n1 - 1, params.n2 - 1, params.ld_out, params.is_row_major) + 1; + + gram.resize(size, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(gram.data(), 0, gram.size() * sizeof(math_t), stream)); + gram_host.resize(gram.size()); + std::fill(gram_host.begin(), gram_host.end(), 0); + + raft::random::RngState rng(42137ULL); + raft::random::uniform(handle, rng, x1.data(), x1.size(), math_t(0), math_t(1)); + raft::random::uniform(handle, rng, x2.data(), x2.size(), math_t(0), math_t(1)); + } + + ~GramMatrixTest() override {} + + void runTest() + { + std::unique_ptr> kernel = + std::unique_ptr>(KernelFactory::create(params.kernel)); + + auto x1_span = + params.is_row_major + ? raft::make_device_strided_matrix_view( + x1.data(), params.n1, params.n_cols, params.ld1) + : raft::make_device_strided_matrix_view( + x1.data(), params.n1, params.n_cols, params.ld1); + auto x2_span = + params.is_row_major + ? raft::make_device_strided_matrix_view( + x2.data(), params.n2, params.n_cols, params.ld2) + : raft::make_device_strided_matrix_view( + x2.data(), params.n2, params.n_cols, params.ld2); + auto out_span = + params.is_row_major + ? 
raft::make_device_strided_matrix_view( + gram.data(), params.n1, params.n2, params.ld_out) + : raft::make_device_strided_matrix_view( + gram.data(), params.n1, params.n2, params.ld_out); + + (*kernel)(handle, x1_span, x2_span, out_span); + + auto stream = raft::resource::get_cuda_stream(handle); + naiveGramMatrixKernel(params.n1, + params.n2, + params.n_cols, + x1, + x2, + gram_host.data(), + params.ld1, + params.ld2, + params.ld_out, + params.is_row_major, + params.kernel, + stream, + handle); + + ASSERT_TRUE(cuvs::devArrMatchHost( + gram_host.data(), gram.data(), gram.size(), cuvs::CompareApprox(1e-6f), stream)); + } + + GramMatrixInputs params; + raft::resources handle; + + rmm::device_uvector x1; + rmm::device_uvector x2; + rmm::device_uvector gram; + + std::vector gram_host; +}; + +typedef GramMatrixTest GramMatrixTestFloat; +typedef GramMatrixTest GramMatrixTestDouble; + +TEST_P(GramMatrixTestFloat, Gram) { runTest(); } + +INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloat, ::testing::ValuesIn(inputs)); +}; // namespace cuvs::distance::kernels \ No newline at end of file diff --git a/cpp/test/distance/gram_base.cuh b/cpp/test/distance/gram_base.cuh new file mode 100644 index 000000000..326cdb4f8 --- /dev/null +++ b/cpp/test/distance/gram_base.cuh @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace cuvs { +namespace distance { +namespace kernels { + +// Get the offset of element [i,k]. +HDI int get_offset(int i, int k, int ld, bool is_row_major) +{ + return is_row_major ? i * ld + k : i + k * ld; +} + +// Calculate the Gram matrix on the host. 
diff --git a/cpp/test/distance/gram_base.cuh b/cpp/test/distance/gram_base.cuh
new file mode 100644
index 000000000..326cdb4f8
--- /dev/null
+++ b/cpp/test/distance/gram_base.cuh
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <cuvs/distance/grammian.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <cmath>
+#include <vector>
+
+namespace cuvs {
+namespace distance {
+namespace kernels {
+
+// Get the offset of element [i,k].
+HDI int get_offset(int i, int k, int ld, bool is_row_major)
+{
+  return is_row_major ? i * ld + k : i + k * ld;
+}
+
+// Calculate the Gram matrix on the host.
+template <typename math_t>
+void naiveGramMatrixKernel(int n1,
+                           int n2,
+                           int n_cols,
+                           const rmm::device_uvector<math_t>& x1,
+                           const rmm::device_uvector<math_t>& x2,
+                           math_t* gram_host,
+                           int ld1,
+                           int ld2,
+                           int ld_out,
+                           bool is_row_major,
+                           KernelParams kernel,
+                           cudaStream_t stream,
+                           const raft::resources& handle)
+{
+  std::vector<math_t> x1_host(x1.size());
+  raft::update_host(x1_host.data(), x1.data(), x1.size(), stream);
+  std::vector<math_t> x2_host(x2.size());
+  raft::update_host(x2_host.data(), x2.data(), x2.size(), stream);
+  raft::resource::sync_stream(handle, stream);
+
+  for (int i = 0; i < n1; i++) {
+    for (int j = 0; j < n2; j++) {
+      float d = 0;
+      for (int k = 0; k < n_cols; k++) {
+        if (kernel.kernel == KernelType::RBF) {
+          math_t diff = x1_host[get_offset(i, k, ld1, is_row_major)] -
+                        x2_host[get_offset(j, k, ld2, is_row_major)];
+          d += diff * diff;
+        } else {
+          d += x1_host[get_offset(i, k, ld1, is_row_major)] *
+               x2_host[get_offset(j, k, ld2, is_row_major)];
+        }
+      }
+      int idx  = get_offset(i, j, ld_out, is_row_major);
+      math_t v = 0;
+      switch (kernel.kernel) {
+        case (KernelType::LINEAR): gram_host[idx] = d; break;
+        case (KernelType::POLYNOMIAL):
+          v              = kernel.gamma * d + kernel.coef0;
+          gram_host[idx] = std::pow(v, kernel.degree);
+          break;
+        case (KernelType::TANH): gram_host[idx] = std::tanh(kernel.gamma * d + kernel.coef0); break;
+        case (KernelType::RBF): gram_host[idx] = exp(-kernel.gamma * d); break;
+      }
+    }
+  }
+}
+
+}  // namespace kernels
+}  // namespace distance
+}  // namespace cuvs
\ No newline at end of file
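The per-pair kernel values that `naiveGramMatrixKernel` produces are easy to verify by hand: `d` accumulates either a dot product (linear, polynomial, tanh) or a squared L2 distance (RBF), and the switch applies the kernel function to it. A small host-only sketch of the same math (standard C++ only; the sample vectors and parameters are made up here to mirror one of the test cases):

```cpp
// Host-only sketch of the per-pair kernel values the naive reference computes.
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
  std::vector<double> x = {0.5, 0.25}, y = {0.1, 0.8};
  double dot = 0, dist2 = 0;
  for (size_t k = 0; k < x.size(); k++) {
    dot += x[k] * y[k];
    double diff = x[k] - y[k];
    dist2 += diff * diff;
  }
  // Parameters chosen to match one of the POLYNOMIAL cases: gamma=0.5, coef0=2.4, degree=2.
  double gamma = 0.5, coef0 = 2.4;
  int degree   = 2;
  std::printf("linear     : %f\n", dot);
  std::printf("polynomial : %f\n", std::pow(gamma * dot + coef0, degree));
  std::printf("tanh       : %f\n", std::tanh(gamma * dot + coef0));
  std::printf("rbf        : %f\n", std::exp(-gamma * dist2));  // uses squared L2, not the dot product
  return 0;
}
```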
diff --git a/cpp/test/distance/sparse_distance.cu b/cpp/test/distance/sparse_distance.cu
new file mode 100644
index 000000000..f95487414
--- /dev/null
+++ b/cpp/test/distance/sparse_distance.cu
@@ -0,0 +1,850 @@
+/*
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.cuh"
+
+#include <cuvs/distance/distance.hpp>
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/resources.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <gtest/gtest.h>
+#include <vector>
+
+namespace cuvs {
+namespace distance {
+
+using namespace raft;
+using namespace raft::sparse;
+
+template <typename value_idx, typename value_t>
+struct SparseDistanceInputs {
+  value_idx n_cols;
+
+  std::vector<value_idx> indptr_h;
+  std::vector<value_idx> indices_h;
+  std::vector<value_t> data_h;
+
+  std::vector<value_t> out_dists_ref_h;
+
+  cuvs::distance::DistanceType metric;
+
+  float metric_arg = 0.0;
+};
+
+template <typename value_idx, typename value_t>
+::std::ostream& operator<<(::std::ostream& os, const SparseDistanceInputs<value_idx, value_t>& dims)
+{
+  return os;
+}
+
+template <typename value_idx, typename value_t>
+class SparseDistanceTest
+  : public ::testing::TestWithParam<SparseDistanceInputs<value_idx, value_t>> {
+ public:
+  SparseDistanceTest()
+    : params(::testing::TestWithParam<SparseDistanceInputs<value_idx, value_t>>::GetParam()),
+      indptr(0, resource::get_cuda_stream(handle)),
+      indices(0, resource::get_cuda_stream(handle)),
+      data(0, resource::get_cuda_stream(handle)),
+      out_dists(0, resource::get_cuda_stream(handle)),
+      out_dists_ref(0, resource::get_cuda_stream(handle))
+  {
+  }
+
+  void SetUp() override
+  {
+    make_data();
+
+    int out_size = static_cast<int>(params.indptr_h.size() - 1) *
+                   static_cast<int>(params.indptr_h.size() - 1);
+
+    out_dists.resize(out_size, resource::get_cuda_stream(handle));
+
+    auto out = raft::make_device_matrix_view(
+      out_dists.data(),
+      static_cast<value_idx>(params.indptr_h.size() - 1),
+      static_cast<value_idx>(params.indptr_h.size() - 1));
+
+    auto x_structure = raft::make_device_compressed_structure_view(
+      indptr.data(),
+      indices.data(),
+      static_cast<value_idx>(params.indptr_h.size() - 1),
+      params.n_cols,
+      static_cast<value_idx>(params.indices_h.size()));
+    auto x = raft::make_device_csr_matrix_view(data.data(), x_structure);
+
+    cuvs::distance::pairwise_distance(handle, x, x, out, params.metric, params.metric_arg);
+
+    RAFT_CUDA_TRY(cudaStreamSynchronize(resource::get_cuda_stream(handle)));
+  }
+
+  void compare()
+  {
+    ASSERT_TRUE(devArrMatch(out_dists_ref.data(),
+                            out_dists.data(),
+                            params.out_dists_ref_h.size(),
+                            CompareApprox<value_t>(1e-3)));
+  }
+
+ protected:
+  void make_data()
+  {
+    std::vector<value_idx> indptr_h  = params.indptr_h;
+    std::vector<value_idx> indices_h = params.indices_h;
+    std::vector<value_t> data_h      = params.data_h;
+
+    auto stream = resource::get_cuda_stream(handle);
+    indptr.resize(indptr_h.size(), stream);
+    indices.resize(indices_h.size(), stream);
+    data.resize(data_h.size(), stream);
+
+    update_device(indptr.data(), indptr_h.data(), indptr_h.size(), stream);
+    update_device(indices.data(), indices_h.data(), indices_h.size(), stream);
+    update_device(data.data(), data_h.data(), data_h.size(), stream);
+
+    std::vector<value_t> out_dists_ref_h = params.out_dists_ref_h;
+
+    out_dists_ref.resize((indptr_h.size() - 1) * (indptr_h.size() - 1), stream);
+
+    update_device(out_dists_ref.data(),
+                  out_dists_ref_h.data(),
+                  out_dists_ref_h.size(),
+                  resource::get_cuda_stream(handle));
+  }
+
+  raft::resources handle;
+
+  // input data
+  rmm::device_uvector<value_idx> indptr, indices;
+  rmm::device_uvector<value_t> data;
+
+  // output data
+  rmm::device_uvector<value_t> out_dists, out_dists_ref;
+
+  SparseDistanceInputs<value_idx, value_t> params;
+};
+
+const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
+  {5,
+   {0, 0, 1, 2},
+
+   {1, 2},
+   {0.5, 0.5},
+   {0, 1, 1, 1, 0, 1, 1, 1, 0},
+   cuvs::distance::DistanceType::CosineExpanded,
+   0.0},
+  {5,
+   {0, 0, 1, 2},
+
+   {1, 2},
+   {1.0, 1.0},
+   {0, 1, 1, 1, 0, 1, 1, 1, 0},
+   cuvs::distance::DistanceType::JaccardExpanded,
+   0.0},
+  {2,
+   {0, 2, 4, 6, 8},
+   {0, 1, 0, 1, 0, 1, 0, 1},  // indices
+   {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f},
+   {
+     // dense output
+     0.0,
+     4.0,
+     3026.0,
+     226.0,
+     4.0,
+     0.0,
+     2930.0,
+     234.0,
+     3026.0,
+     2930.0,
+     0.0,
+
1832.0, + 226.0, + 234.0, + 1832.0, + 0.0, + }, + cuvs::distance::DistanceType::L2Expanded, + 0.0}, + {2, + {0, 2, 4, 6, 8}, + {0, 1, 0, 1, 0, 1, 0, 1}, + {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}, + {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0}, + cuvs::distance::DistanceType::InnerProduct, + 0.0}, + {2, + {0, 2, 4, 6, 8}, + {0, 1, 0, 1, 0, 1, 0, 1}, // indices + {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f}, + { + // dense output + 0.0, + 4.0, + 3026.0, + 226.0, + 4.0, + 0.0, + 2930.0, + 234.0, + 3026.0, + 2930.0, + 0.0, + 1832.0, + 226.0, + 234.0, + 1832.0, + 0.0, + }, + cuvs::distance::DistanceType::L2Unexpanded, + 0.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0., 0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219, 0.58146987, 0.44940102, + 1., 0.76978799, 0.39419924, 0., 0.97577154, 0.48904013, 0.48300801, 0.45087445, + 0.73323749, 0.21050481, 0.54847744, 0.78021386, 0.54823225, 0.97577154, 0., 0.51413997, + 0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819, 1., 0.79593037, 0.48904013, + 0.51413997, 0., 0.28605559, 0.35772784, 1., 0.60889396, 0.43324829, 0.84923694, + 0.45658883, 0.48300801, 0.31195441, 0.28605559, 0., 0.58623212, 0.6745457, 0.60287165, + 0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784, 0.58623212, 0., + 0.77917274, 0.48390993, 0.24558392, 0.99166225, 0.58146987, 0.73323749, 0.67534399, 1., + 0.6745457, 0.77917274, 0., 0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481, + 0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0., 0.51360432, 0.68185144, + 1., 0.54847744, 0.8321819, 0.43324829, 0.67676228, 0.24558392, 0.76064776, 0.51360432, + 0., 1., 0.76978799, 0.78021386, 1., 0.84923694, 0.73155632, 0.99166225, + 0.61547536, 0.68185144, 1., 0.}, + cuvs::distance::DistanceType::CosineExpanded, + 0.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, + {0.0, + 0.42857142857142855, + 0.7142857142857143, + 0.75, + 0.2857142857142857, + 0.75, + 0.7142857142857143, + 0.5, + 1.0, + 0.6666666666666666, + 0.42857142857142855, + 0.0, + 0.75, + 0.625, + 0.375, + 0.42857142857142855, + 0.75, + 0.375, + 0.75, + 0.7142857142857143, + 0.7142857142857143, + 0.75, + 0.0, + 0.7142857142857143, + 0.42857142857142855, + 0.7142857142857143, + 0.6666666666666666, + 0.625, + 0.6666666666666666, + 1.0, + 0.75, + 0.625, + 0.7142857142857143, + 0.0, + 0.5, + 0.5714285714285714, + 1.0, + 0.8, + 0.5, + 0.6666666666666666, + 0.2857142857142857, + 0.375, + 0.42857142857142855, + 0.5, + 0.0, + 0.6666666666666666, + 0.7777777777777778, + 
0.4444444444444444, + 0.7777777777777778, + 0.75, + 0.75, + 0.42857142857142855, + 0.7142857142857143, + 0.5714285714285714, + 0.6666666666666666, + 0.0, + 0.7142857142857143, + 0.5, + 0.5, + 0.8571428571428571, + 0.7142857142857143, + 0.75, + 0.6666666666666666, + 1.0, + 0.7777777777777778, + 0.7142857142857143, + 0.0, + 0.42857142857142855, + 0.8571428571428571, + 0.8333333333333334, + 0.5, + 0.375, + 0.625, + 0.8, + 0.4444444444444444, + 0.5, + 0.42857142857142855, + 0.0, + 0.7777777777777778, + 0.75, + 1.0, + 0.75, + 0.6666666666666666, + 0.5, + 0.7777777777777778, + 0.5, + 0.8571428571428571, + 0.7777777777777778, + 0.0, + 1.0, + 0.6666666666666666, + 0.7142857142857143, + 1.0, + 0.6666666666666666, + 0.75, + 0.8571428571428571, + 0.8333333333333334, + 0.75, + 1.0, + 0.0}, + cuvs::distance::DistanceType::JaccardExpanded, + 0.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 3.3954660629919076, + 5.6469232737388815, + 6.373112846266441, + 4.0212880272531715, + 6.916281504639404, + 5.741508386786526, + 5.411470999663036, + 9.0, + 4.977014354725805, + 3.3954660629919076, + 0.0, + 7.56256082439209, + 5.540261147481582, + 4.832322929216881, + 4.62003193872216, + 6.498056792320361, + 4.309846252268695, + 6.317531174829905, + 6.016362684141827, + 5.6469232737388815, + 7.56256082439209, + 0.0, + 5.974878731322299, + 4.898357301336036, + 6.442097410320605, + 5.227077347287883, + 7.134101195584642, + 5.457753923371659, + 7.0, + 6.373112846266441, + 5.540261147481582, + 5.974878731322299, + 0.0, + 5.5507273748583, + 4.897749658726415, + 9.0, + 8.398776718824767, + 3.908281400328807, + 4.83431066343688, + 4.0212880272531715, + 4.832322929216881, + 4.898357301336036, + 5.5507273748583, + 0.0, + 6.632989819428174, + 7.438852294822894, + 5.6631570310967465, + 7.579428202635459, + 6.760811985364303, + 6.916281504639404, + 4.62003193872216, + 6.442097410320605, + 4.897749658726415, + 6.632989819428174, + 0.0, + 5.249404187382862, + 6.072559523278559, + 4.07661278488929, + 6.19678948003145, + 5.741508386786526, + 6.498056792320361, + 5.227077347287883, + 9.0, + 7.438852294822894, + 5.249404187382862, + 0.0, + 3.854811639654704, + 6.652724827169063, + 5.298236851430971, + 5.411470999663036, + 4.309846252268695, + 7.134101195584642, + 8.398776718824767, + 5.6631570310967465, + 6.072559523278559, + 3.854811639654704, + 0.0, + 7.529184598969917, + 6.903282911791188, + 9.0, + 6.317531174829905, + 5.457753923371659, + 3.908281400328807, + 7.579428202635459, + 4.07661278488929, + 6.652724827169063, + 7.529184598969917, + 0.0, + 7.0, + 4.977014354725805, + 6.016362684141827, + 7.0, + 4.83431066343688, + 6.760811985364303, + 6.19678948003145, + 5.298236851430971, + 6.903282911791188, + 7.0, + 0.0}, + cuvs::distance::DistanceType::Canberra, + 0.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 
6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 1.31462855332296, + 1.3690307816129905, + 1.698603990921237, + 1.3460470789553531, + 1.6636670712582544, + 1.2651744044972217, + 1.1938329352055201, + 1.8811409082590185, + 1.3653115050624267, + 1.31462855332296, + 0.0, + 1.9447722703291133, + 1.42818777206562, + 1.4685491458946494, + 1.3071999866010466, + 1.4988622861692171, + 0.9698559287406783, + 1.4972023224597841, + 1.5243383567266802, + 1.3690307816129905, + 1.9447722703291133, + 0.0, + 1.2748400840107568, + 1.0599569946448246, + 1.546591282841402, + 1.147526531928459, + 1.447002179128145, + 1.5982242387673176, + 1.3112533607072414, + 1.698603990921237, + 1.42818777206562, + 1.2748400840107568, + 0.0, + 1.038121552545461, + 1.011788365364402, + 1.3907391109256988, + 1.3128200942311496, + 1.19595706584447, + 1.3233328139624725, + 1.3460470789553531, + 1.4685491458946494, + 1.0599569946448246, + 1.038121552545461, + 0.0, + 1.3642741698145529, + 1.3493868683808095, + 1.394942694628328, + 1.572881849642552, + 1.380122665319464, + 1.6636670712582544, + 1.3071999866010466, + 1.546591282841402, + 1.011788365364402, + 1.3642741698145529, + 0.0, + 1.018961640373018, + 1.0114394258945634, + 0.8338711034820684, + 1.1247823842299223, + 1.2651744044972217, + 1.4988622861692171, + 1.147526531928459, + 1.3907391109256988, + 1.3493868683808095, + 1.018961640373018, + 0.0, + 0.7701238110357329, + 1.245486437864406, + 0.5551259549534626, + 1.1938329352055201, + 0.9698559287406783, + 1.447002179128145, + 1.3128200942311496, + 1.394942694628328, + 1.0114394258945634, + 0.7701238110357329, + 0.0, + 1.1886800117391216, + 1.0083692448135637, + 1.8811409082590185, + 1.4972023224597841, + 1.5982242387673176, + 1.19595706584447, + 1.572881849642552, + 0.8338711034820684, + 1.245486437864406, + 1.1886800117391216, + 0.0, + 1.3661374102525012, + 1.3653115050624267, + 1.5243383567266802, + 1.3112533607072414, + 1.3233328139624725, + 1.380122665319464, + 1.1247823842299223, + 0.5551259549534626, + 1.0083692448135637, + 1.3661374102525012, + 0.0}, + cuvs::distance::DistanceType::LpUnexpanded, + 2.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 0.9251771844789913, + 0.9036452083899731, + 0.9251771844789913, + 0.8706483735804971, + 0.9251771844789913, + 0.717493881903289, + 0.6920214832303888, + 0.9251771844789913, + 0.9251771844789913, + 0.9251771844789913, + 0.0, + 0.9036452083899731, + 0.8655339692155823, + 0.8706483735804971, + 0.8655339692155823, + 0.8655339692155823, + 0.6329837991017668, + 0.8655339692155823, + 
0.8655339692155823, + 0.9036452083899731, + 0.9036452083899731, + 0.0, + 0.7988276152181608, + 0.7028075145996631, + 0.9036452083899731, + 0.9036452083899731, + 0.9036452083899731, + 0.8429599432532096, + 0.9036452083899731, + 0.9251771844789913, + 0.8655339692155823, + 0.7988276152181608, + 0.0, + 0.48376552205293305, + 0.8206394616536681, + 0.8206394616536681, + 0.8206394616536681, + 0.8429599432532096, + 0.8206394616536681, + 0.8706483735804971, + 0.8706483735804971, + 0.7028075145996631, + 0.48376552205293305, + 0.0, + 0.8706483735804971, + 0.8706483735804971, + 0.8706483735804971, + 0.8429599432532096, + 0.8706483735804971, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.0, + 0.8853924473642432, + 0.535821510936138, + 0.6497196601457607, + 0.8853924473642432, + 0.717493881903289, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.0, + 0.5279604218147174, + 0.6658348373853169, + 0.33799874888632914, + 0.6920214832303888, + 0.6329837991017668, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.535821510936138, + 0.5279604218147174, + 0.0, + 0.662579808115858, + 0.5079750812968089, + 0.9251771844789913, + 0.8655339692155823, + 0.8429599432532096, + 0.8429599432532096, + 0.8429599432532096, + 0.6497196601457607, + 0.6658348373853169, + 0.662579808115858, + 0.0, + 0.8429599432532096, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.33799874888632914, + 0.5079750812968089, + 0.8429599432532096, + 0.0}, + cuvs::distance::DistanceType::Linf, + 0.0}, + + {15, + {0, 5, 8, 9, 15, 20, 26, 31, 34, 38, 45}, + {0, 1, 5, 6, 9, 1, 4, 14, 7, 3, 4, 7, 9, 11, 14, 0, 3, 7, 8, 12, 0, 2, 5, + 7, 8, 14, 4, 9, 10, 11, 13, 4, 10, 14, 5, 6, 8, 9, 0, 2, 3, 4, 6, 10, 11}, + {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507, 0.73789274, 0.08450219, + 1., 0.20184723, 0.18036963, 0.12581403, 0.13867603, 0.24040536, 0.11288773, 0.00290246, + 0.09120187, 0.31190555, 0.43245423, 0.16153588, 0.3233026, 0.05279589, 0.1387149, 0.05962761, + 0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881, 0.15605804, 0.3867739, + 0.24908977, 0.36413632, 0.37643732, 0.28910679, 0.0198409, 0.31461499, 0.24412279, 0.08327667, + 0.04444576, 0.05047969, 0.26190054, 0.2077349, 0.10803964}, + {1.05367121e-08, 8.35309089e-01, 1.00000000e+00, 9.24116813e-01, + 9.90039274e-01, 7.97613546e-01, 8.91271059e-01, 1.00000000e+00, + 6.64669302e-01, 8.59439512e-01, 8.35309089e-01, 1.05367121e-08, + 1.00000000e+00, 7.33151506e-01, 1.00000000e+00, 9.86880955e-01, + 9.19154851e-01, 5.38849774e-01, 1.00000000e+00, 8.98332369e-01, + 1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 8.03303970e-01, + 6.64465915e-01, 8.69374690e-01, 1.00000000e+00, 1.00000000e+00, + 1.00000000e+00, 1.00000000e+00, 9.24116813e-01, 7.33151506e-01, + 8.03303970e-01, 0.00000000e+00, 8.16225843e-01, 9.39818306e-01, + 7.27700415e-01, 7.30155528e-01, 8.89451011e-01, 8.05419635e-01, + 9.90039274e-01, 1.00000000e+00, 6.64465915e-01, 8.16225843e-01, + 0.00000000e+00, 6.38804490e-01, 1.00000000e+00, 1.00000000e+00, + 9.52559809e-01, 9.53789212e-01, 7.97613546e-01, 9.86880955e-01, + 8.69374690e-01, 9.39818306e-01, 6.38804490e-01, 0.0, + 1.00000000e+00, 9.72569112e-01, 8.24907516e-01, 8.07933016e-01, + 8.91271059e-01, 9.19154851e-01, 1.00000000e+00, 7.27700415e-01, + 1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 
7.63596268e-01, + 8.40131263e-01, 7.40428532e-01, 1.00000000e+00, 5.38849774e-01, + 1.00000000e+00, 7.30155528e-01, 1.00000000e+00, 9.72569112e-01, + 7.63596268e-01, 0.00000000e+00, 1.00000000e+00, 7.95485011e-01, + 6.64669302e-01, 1.00000000e+00, 1.00000000e+00, 8.89451011e-01, + 9.52559809e-01, 8.24907516e-01, 8.40131263e-01, 1.00000000e+00, + 0.00000000e+00, 8.51370877e-01, 8.59439512e-01, 8.98332369e-01, + 1.00000000e+00, 8.05419635e-01, 9.53789212e-01, 8.07933016e-01, + 7.40428532e-01, 7.95485011e-01, 8.51370877e-01, 1.49011612e-08}, + // Dataset is L1 normalized into pdfs + cuvs::distance::DistanceType::HellingerExpanded, + 0.0}, + + {4, + {0, 1, 1, 2, 4}, + {3, 2, 0, 1}, // indices + {0.99296, 0.42180, 0.11687, 0.305869}, + { + // dense output + 0.0, + 0.99296, + 1.41476, + 1.415707, + 0.99296, + 0.0, + 0.42180, + 0.42274, + 1.41476, + 0.42180, + 0.0, + 0.84454, + 1.41570, + 0.42274, + 0.84454, + 0.0, + }, + cuvs::distance::DistanceType::L1, + 0.0}, + {5, + {0, 3, 8, 12, 16, 20, 25, 30, 35, 40, 45}, + {0, 3, 4, 0, 1, 2, 3, 4, 1, 2, 3, 4, 0, 2, 3, 4, 0, 1, 3, 4, 0, 1, 2, + 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4}, + {0.70862347, 0.8232774, 0.12108795, 0.84527547, 0.94937088, 0.03258545, 0.99584118, 0.76835667, + 0.34426657, 0.2357925, 0.01274851, 0.11422017, 0.3437756, 0.31967718, 0.5956055, 0.31610373, + 0.04147273, 0.03724415, 0.21515727, 0.04751052, 0.50283183, 0.99957274, 0.01395933, 0.96032529, + 0.88438711, 0.46095378, 0.27432481, 0.54294211, 0.54280225, 0.59503329, 0.61364678, 0.22837736, + 0.56609561, 0.29809423, 0.76736686, 0.56460608, 0.98165371, 0.02140123, 0.19881268, 0.26057815, + 0.31648823, 0.89874295, 0.27366735, 0.5119944, 0.11416134}, + {// dense output + 0., 0.48769777, 1.88014197, 0.26127048, 0.26657011, 0.7874794, 0.76962708, 1.122858, + 1.1232498, 1.08166081, 0.48769777, 0., 1.31332116, 0.98318907, 0.42661815, 0.09279052, + 1.35187836, 1.38429055, 0.40658897, 0.56136388, 1.88014197, 1.31332116, 0., 1.82943642, + 1.54826077, 1.05918884, 1.59360067, 1.34698954, 0.60215168, 0.46993848, 0.26127048, 0.98318907, + 1.82943642, 0., 0.29945563, 1.08494093, 0.22934281, 0.82801925, 1.74288748, 1.50610116, + 0.26657011, 0.42661815, 1.54826077, 0.29945563, 0., 0.45060069, 0.77814948, 1.45245711, + 1.18328348, 0.82486987, 0.7874794, 0.09279052, 1.05918884, 1.08494093, 0.45060069, 0., + 1.29899154, 1.40683824, 0.48505269, 0.53862363, 0.76962708, 1.35187836, 1.59360067, 0.22934281, + 0.77814948, 1.29899154, 0., 0.33202426, 1.92108999, 1.88812175, 1.122858, 1.38429055, + 1.34698954, 0.82801925, 1.45245711, 1.40683824, 0.33202426, 0., 1.47318624, 1.92660889, + 1.1232498, 0.40658897, 0.60215168, 1.74288748, 1.18328348, 0.48505269, 1.92108999, 1.47318624, + 0., 0.24992619, 1.08166081, 0.56136388, 0.46993848, 1.50610116, 0.82486987, 0.53862363, + 1.88812175, 1.92660889, 0.24992619, 0.}, + cuvs::distance::DistanceType::CorrelationExpanded, + 0.0}, + {5, + {0, 1, 2, 4, 4, 5, 6, 7, 9, 9, 10}, + {1, 4, 0, 4, 1, 3, 0, 1, 3, 0}, + {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, + {// dense output + 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 0.8, 1., 1., 1., 1., 1., 1., 1., + 1., 0.8, 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., + 0.8, 1., 1., 1., 0., 1., 1., 0.8, 1., 1., 1., 1., 1., 1., 1., 0., 1., 0.8, 1., 1., + 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 0.8, 0.8, 1., 1., 1., 0.8, 0.8, 1., 0., 1., 1., + 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.8, 1., 1., 1., 0.8, 1., 1., 0.}, + cuvs::distance::DistanceType::RusselRaoExpanded, + 
0.0}, + {5, + {0, 1, 1, 3, 3, 4, 4, 6, 9, 10, 10}, + {0, 3, 4, 4, 2, 3, 0, 2, 3, 2}, + {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, + {// dense output + 0., 0.2, 0.6, 0.2, 0.4, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, + 0.6, 0.2, 0., 0.6, 0.4, 0., 0.4, 0.2, 0.4, 0.4, 0.6, 0.6, 0.4, 0.2, 0., 0.4, 0., + 0.2, 0., 0.4, 0.6, 0.2, 0., 0.4, 0.2, 0.2, 0.2, 0., 0.2, 0.6, 0.8, 0.4, 0.2, 0.2, + 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0., 0.6, 0.4, 0.4, 0.4, 0.6, 0.4, 0., 0.2, + 0.2, 0.4, 0.4, 0.6, 0.6, 0.6, 0.8, 0.6, 0.2, 0., 0.4, 0.6, 0.4, 0.2, 0.6, 0.2, 0.4, + 0.2, 0.2, 0.4, 0., 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0.}, + cuvs::distance::DistanceType::HammingUnexpanded, + 0.0}, + {3, + {0, 1, 2}, + {0, 1}, + {1.0, 1.0}, + {0.0, 0.83255, 0.83255, 0.0}, + cuvs::distance::DistanceType::JensenShannon, + 0.0}, + {2, + {0, 1, 3}, + {0, 0, 1}, + {1.0, 0.5, 0.5}, + {0, 0.4645014, 0.4645014, 0}, + cuvs::distance::DistanceType::JensenShannon, + 0.0}, + {3, + {0, 1, 2}, + {0, 0}, + {1.0, 1.0}, + {0.0, 0.0, 0.0, 0.0}, + cuvs::distance::DistanceType::JensenShannon, + 0.0}, + + {3, + {0, 1, 2}, + {0, 1}, + {1.0, 1.0}, + {0.0, 1.0, 1.0, 0.0}, + cuvs::distance::DistanceType::DiceExpanded, + 0.0}, + {3, + {0, 1, 3}, + {0, 0, 1}, + {1.0, 1.0, 1.0}, + {0, 0.333333, 0.333333, 0}, + cuvs::distance::DistanceType::DiceExpanded, + 0.0}, + +}; + +typedef SparseDistanceTest SparseDistanceTestF; +TEST_P(SparseDistanceTestF, Result) { compare(); } +INSTANTIATE_TEST_CASE_P(SparseDistanceTests, + SparseDistanceTestF, + ::testing::ValuesIn(inputs_i32_f)); + +} // end namespace distance +} // end namespace cuvs diff --git a/cpp/test/neighbors/ann_brute_force.cuh b/cpp/test/neighbors/ann_brute_force.cuh index c2afa4e8b..03d6e820c 100644 --- a/cpp/test/neighbors/ann_brute_force.cuh +++ b/cpp/test/neighbors/ann_brute_force.cuh @@ -114,12 +114,28 @@ class AnnBruteForceTest : public ::testing::TestWithParam(handle_); + brute_force::deserialize(handle_, std::string{"brute_force_index"}, &index_loaded); + brute_force::search(handle_, - idx, + index_loaded, search_queries_view, indices_out_view, dists_out_view, cuvs::neighbors::filtering::none_sample_filter{}); + raft::resource::sync_stream(handle_); + + ASSERT_TRUE(cuvs::neighbors::devArrMatchKnnPair(indices_naive_dev.data(), + indices_bruteforce_dev.data(), + distances_naive_dev.data(), + distances_bruteforce_dev.data(), + ps.num_queries, + ps.k, + 0.001f, + stream_, + true)); } } diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh index 37d42dd1d..8d5701439 100644 --- a/cpp/test/neighbors/ann_cagra.cuh +++ b/cpp/test/neighbors/ann_cagra.cuh @@ -361,8 +361,8 @@ class AnnCagraTest : public ::testing::TestWithParam { // not used for knn_graph building. 
     switch (ps.build_algo) {
       case graph_build_algo::IVF_PQ:
-        index_params.graph_build_params =
-          graph_build_params::ivf_pq_params(raft::matrix_extent<int64_t>(ps.n_rows, ps.dim));
+        index_params.graph_build_params = graph_build_params::ivf_pq_params(
+          raft::matrix_extent<int64_t>(ps.n_rows, ps.dim), index_params.metric);
         if (ps.ivf_pq_search_refine_ratio) {
           std::get<graph_build_params::ivf_pq_params>(
             index_params.graph_build_params)
@@ -370,8 +370,8 @@
         } break;
       case graph_build_algo::NN_DESCENT: {
-        index_params.graph_build_params =
-          graph_build_params::nn_descent_params(index_params.intermediate_graph_degree);
+        index_params.graph_build_params = graph_build_params::nn_descent_params(
+          index_params.intermediate_graph_degree, index_params.metric);
         break;
       }
       case graph_build_algo::AUTO:
@@ -389,7 +389,7 @@
       (const DataT*)database.data(), ps.n_rows, ps.dim);
     {
-      cagra::index<DataT, IdxT> index(handle_);
+      cagra::index<DataT, IdxT> index(handle_, index_params.metric);
       if (ps.host_dataset) {
         auto database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
         raft::copy(database_host.data_handle(), database.data(), database.size(), stream_);
@@ -758,11 +758,7 @@ class AnnCagraFilterTest : public ::testing::TestWithParam<AnnCagraInputs> {
     search_params.algo        = ps.algo;
     search_params.max_queries = ps.max_queries;
     search_params.team_size   = ps.team_size;
-
-    // TODO: setting search_params.itopk_size here breaks the filter tests, but is required for
-    // k>1024 skip these tests until fixed
-    if (ps.k >= 1024) { GTEST_SKIP(); }
-    // search_params.itopk_size = ps.itopk_size;
+    search_params.itopk_size  = ps.itopk_size;
 
     auto database_view = raft::make_device_matrix_view<const DataT, int64_t>(
       (const DataT*)database.data(), ps.n_rows, ps.dim);
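`graph_build_params` behaves like a tagged union: the test selects the build algorithm by assigning the corresponding parameter struct and then tunes it through `std::get`, the same pattern the reproducer file below uses with `refinement_rate`. A minimal standalone sketch of that idiom (the two structs here are simplified stand-ins, not the real cuVS types):

```cpp
// Minimal std::variant sketch of the pattern above: choose a build algorithm by
// assigning its params struct, then mutate the active alternative via std::get.
#include <cstdio>
#include <variant>

struct ivf_pq_params {
  float refinement_rate = 1.0f;  // stand-in field, mirroring the usage below
};
struct nn_descent_params {
  int intermediate_graph_degree = 128;
};

int main()
{
  std::variant<ivf_pq_params, nn_descent_params> graph_build_params;
  graph_build_params = ivf_pq_params{};  // select the IVF-PQ build path
  std::get<ivf_pq_params>(graph_build_params).refinement_rate = 2.0f;
  std::printf("refinement_rate = %.1f\n",
              std::get<ivf_pq_params>(graph_build_params).refinement_rate);
  return 0;
}
```

`std::get` throws `std::bad_variant_access` if the wrong alternative is active, so the switch on `ps.build_algo` and the variant assignment must stay in sync.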
diff --git a/cpp/test/neighbors/ann_cagra/bug_extreme_inputs_oob.cu b/cpp/test/neighbors/ann_cagra/bug_extreme_inputs_oob.cu
new file mode 100644
index 000000000..e21a54e9e
--- /dev/null
+++ b/cpp/test/neighbors/ann_cagra/bug_extreme_inputs_oob.cu
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuvs/neighbors/cagra.hpp>
+
+#include <raft/core/device_mdarray.hpp>
+
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/random/rng.cuh>
+
+#include <gtest/gtest.h>
+
+namespace cuvs::neighbors::cagra {
+
+class cagra_extreme_inputs_oob_test : public ::testing::Test {
+ public:
+  using data_type = float;
+
+ protected:
+  void run()
+  {
+    cagra::index_params ix_ps;
+    graph_build_params::ivf_pq_params gb_params{};
+    gb_params.refinement_rate       = 2;
+    ix_ps.graph_build_params        = gb_params;
+    ix_ps.graph_degree              = 64;
+    ix_ps.intermediate_graph_degree = 128;
+
+    [[maybe_unused]] auto ix = cagra::build(res, ix_ps, raft::make_const_mdspan(dataset->view()));
+    raft::resource::sync_stream(res);
+  }
+
+  void SetUp() override
+  {
+    dataset.emplace(raft::make_device_matrix<data_type, int64_t>(res, n_samples, n_dim));
+    raft::random::RngState r(1234ULL);
+    raft::random::normal(
+      res, r, dataset->data_handle(), n_samples * n_dim, data_type(0), data_type(1e20));
+    raft::resource::sync_stream(res);
+  }
+
+  void TearDown() override
+  {
+    dataset.reset();
+    raft::resource::sync_stream(res);
+  }
+
+ private:
+  raft::resources res;
+  std::optional<raft::device_matrix<data_type, int64_t>> dataset = std::nullopt;
+
+  constexpr static int64_t n_samples = 100000;
+  constexpr static int64_t n_dim     = 200;
+  constexpr static cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded;
+};
+
+TEST_F(cagra_extreme_inputs_oob_test, cagra_extreme_inputs_oob_test) { this->run(); }
+
+}  // namespace cuvs::neighbors::cagra
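The reproducer above relies on inputs whose pairwise arithmetic overflows `float`: with a standard deviation of 1e20, squared L2 terms land far beyond `FLT_MAX`. A standalone sketch (standard `<random>` only, no GPU) showing why that sigma puts the distance computation into the `inf` regime:

```cpp
// Why sigma = 1e20 is "extreme": squared differences of such values exceed
// FLT_MAX (~3.4e38), so single-precision L2 math saturates to infinity.
#include <cmath>
#include <cstdio>
#include <random>

int main()
{
  std::mt19937_64 rng(1234ULL);
  std::normal_distribution<float> dist(0.0f, 1e20f);
  float a = dist(rng), b = dist(rng);
  float d = a - b;
  // d is typically on the order of 1e20, so d*d ~ 1e40 overflows float.
  std::printf("diff^2 = %g (isinf: %d)\n", d * d, (int)std::isinf(d * d));
  return 0;
}
```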
diff --git a/cpp/test/neighbors/ann_cagra/bug_multi_cta_crash.cu b/cpp/test/neighbors/ann_cagra/bug_multi_cta_crash.cu
new file mode 100644
index 000000000..6f4aa059e
--- /dev/null
+++ b/cpp/test/neighbors/ann_cagra/bug_multi_cta_crash.cu
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuvs/neighbors/cagra.hpp>
+
+#include "../ann_cagra.cuh"
+
+#include <raft/core/device_mdarray.hpp>
+
+#include <raft/linalg/map.cuh>
+#include <raft/random/rng.cuh>
+
+#include <gtest/gtest.h>
+
+namespace cuvs::neighbors::cagra {
+
+class AnnCagraBugMultiCTACrash : public ::testing::TestWithParam<cagra::search_algo> {
+ public:
+  using data_type = half;
+
+ protected:
+  void run()
+  {
+    cagra::index_params cagra_index_params;
+    cagra_index_params.graph_degree              = 32;
+    cagra_index_params.intermediate_graph_degree = 48;
+
+    auto cagra_index =
+      cagra::build(res, cagra_index_params, raft::make_const_mdspan(dataset->view()));
+    raft::resource::sync_stream(res);
+
+    cagra::search_params cagra_search_params;
+    cagra_search_params.itopk_size        = 32;
+    cagra_search_params.thread_block_size = 256;
+    cagra_search_params.search_width      = 1;
+    cagra_search_params.max_iterations    = 0;
+    cagra_search_params.algo = ::testing::TestWithParam<cagra::search_algo>::GetParam();
+
+    // NOTE: when using one resource/stream for everything, the bug is NOT reproducible
+    raft::resources res_search;
+    cagra::search(res_search,
+                  cagra_search_params,
+                  cagra_index,
+                  raft::make_const_mdspan(queries->view()),
+                  neighbors->view(),
+                  distances->view());
+
+    raft::resource::sync_stream(res_search);
+  }
+
+  void SetUp() override
+  {
+    dataset.emplace(raft::make_device_matrix<data_type, int64_t>(res, n_samples, n_dim));
+    queries.emplace(raft::make_device_matrix<data_type, int64_t>(res, n_queries, n_dim));
+    neighbors.emplace(raft::make_device_matrix<uint32_t, int64_t>(res, n_queries, k));
+    distances.emplace(raft::make_device_matrix<float, int64_t>(res, n_queries, k));
+    raft::random::RngState r(1234ULL);
+    InitDataset(res, dataset->data_handle(), n_samples, n_dim, metric, r);
+    // NOTE: when initializing queries with "normal" data, the bug is NOT reproducible
+    raft::linalg::map(
+      res, queries->view(), raft::const_op<data_type>{raft::upper_bound<data_type>()});
+    // InitDataset(res, queries->data_handle(), n_queries, n_dim, metric, r);
+    raft::resource::sync_stream(res);
+  }
+
+  void TearDown() override
+  {
+    dataset.reset();
+    queries.reset();
+    neighbors.reset();
+    distances.reset();
+    raft::resource::sync_stream(res);
+  }
+
+ private:
+  raft::resources res;
+  std::optional<raft::device_matrix<data_type, int64_t>> dataset  = std::nullopt;
+  std::optional<raft::device_matrix<data_type, int64_t>> queries  = std::nullopt;
+  std::optional<raft::device_matrix<uint32_t, int64_t>> neighbors = std::nullopt;
+  std::optional<raft::device_matrix<float, int64_t>> distances    = std::nullopt;
+
+  constexpr static int64_t n_samples = 1183514;
+  constexpr static int64_t n_dim     = 100;
+  constexpr static int64_t n_queries = 30;
+  constexpr static int64_t k         = 10;
+  constexpr static cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded;
+};
+
+TEST_P(AnnCagraBugMultiCTACrash, AnnCagraBugMultiCTACrash) { this->run(); }
+
+INSTANTIATE_TEST_CASE_P(AnnCagraBugMultiCTACrashReproducer,
+                        AnnCagraBugMultiCTACrash,
+                        ::testing::Values(cagra::search_algo::MULTI_CTA));
+
+}  // namespace cuvs::neighbors::cagra
diff --git a/cpp/test/neighbors/ann_hnsw_c.cu b/cpp/test/neighbors/ann_hnsw_c.cu
index fc740b924..2a6401b1d 100644
--- a/cpp/test/neighbors/ann_hnsw_c.cu
+++ b/cpp/test/neighbors/ann_hnsw_c.cu
@@ -111,7 +111,9 @@ TEST(CagraHnswC, BuildSearch)
   cuvsHnswIndex_t hnsw_index;
   cuvsHnswIndexCreate(&hnsw_index);
   hnsw_index->dtype = index->dtype;
-  cuvsHnswDeserialize(res, "/tmp/cagra_hnswlib.index", 2, L2Expanded, hnsw_index);
+  cuvsHnswIndexParams_t hnsw_params;
+  cuvsHnswIndexParamsCreate(&hnsw_params);
+  cuvsHnswDeserialize(res, hnsw_params, "/tmp/cagra_hnswlib.index", 2, L2Expanded, hnsw_index);
 
   // search index
   cuvsHnswSearchParams_t search_params;
diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh
index 8cc46b2f7..23d84ca98 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cuh
+++ 
b/cpp/test/neighbors/ann_ivf_flat.cuh @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index fd4e330db..3a92b5e3d 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -379,7 +379,14 @@ class ivf_pq_test : public ::testing::TestWithParam { // Pack a few vectors back to the list. int row_offset = 5; int n_vec = 3; - ASSERT_TRUE(row_offset + n_vec < n_rows); + if (static_cast(row_offset + n_vec) > n_rows) { + RAFT_LOG_INFO( + "Skipping IVF-PQ check_packing/pack test for label %u due to insufficient data (%u " + "records)", + label, + uint32_t(n_rows)); + return; + } size_t offset = row_offset * index->pq_dim(); auto codes_to_pack = raft::make_device_matrix_view( codes.data_handle() + offset, n_vec, index->pq_dim()); @@ -393,7 +400,14 @@ class ivf_pq_test : public ::testing::TestWithParam { // Another test with the API that take list_data directly [[maybe_unused]] auto list_data = index->lists()[label]->data.view(); uint32_t n_take = 4; - ASSERT_TRUE(row_offset + n_take < n_rows); + if (static_cast(row_offset + n_take) > n_rows) { + RAFT_LOG_INFO( + "Skipping IVF-PQ check_packing/take test for label %u due to insufficient data (%u " + "records)", + label, + uint32_t(n_rows)); + return; + } auto codes2 = raft::make_device_matrix(handle_, n_take, index->pq_dim()); ivf_pq::helpers::codepacker::unpack( handle_, list_data, index->pq_bits(), row_offset, codes2.view()); diff --git a/cpp/test/neighbors/ann_nn_descent.cuh b/cpp/test/neighbors/ann_nn_descent.cuh index bce0f9899..09861a219 100644 --- a/cpp/test/neighbors/ann_nn_descent.cuh +++ b/cpp/test/neighbors/ann_nn_descent.cuh @@ -18,11 +18,16 @@ #include "../test_utils.cuh" #include "ann_utils.cuh" +#include #include + #include +#include #include +#include #include "naive_knn.cuh" +#include #include @@ -42,6 +47,15 @@ struct AnnNNDescentInputs { double min_recall; }; +struct AnnNNDescentBatchInputs { + std::pair recall_cluster; + int n_rows; + int dim; + int graph_degree; + cuvs::distance::DistanceType metric; + bool host_dataset; +}; + inline ::std::ostream& operator<<(::std::ostream& os, const AnnNNDescentInputs& p) { os << "dataset shape=" << p.n_rows << "x" << p.dim << ", graph_degree=" << p.graph_degree @@ -50,6 +64,14 @@ inline ::std::ostream& operator<<(::std::ostream& os, const AnnNNDescentInputs& return os; } +inline ::std::ostream& operator<<(::std::ostream& os, const AnnNNDescentBatchInputs& p) +{ + os << "dataset shape=" << p.n_rows << "x" << p.dim << ", graph_degree=" << p.graph_degree + << ", metric=" << static_cast(p.metric) << (p.host_dataset ? 
", host" : ", device") + << ", clusters=" << p.recall_cluster.second << std::endl; + return os; +} + template class AnnNNDescentTest : public ::testing::TestWithParam { public: @@ -65,7 +87,9 @@ class AnnNNDescentTest : public ::testing::TestWithParam { { size_t queries_size = ps.n_rows * ps.graph_degree; std::vector indices_NNDescent(queries_size); + std::vector distances_NNDescent(queries_size); std::vector indices_naive(queries_size); + std::vector distances_naive(queries_size); { rmm::device_uvector distances_naive_dev(queries_size, stream_); @@ -81,16 +105,17 @@ class AnnNNDescentTest : public ::testing::TestWithParam { ps.graph_degree, ps.metric); raft::update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_); + raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_); raft::resource::sync_stream(handle_); } - { { - cuvs::neighbors::nn_descent::index_params index_params; + nn_descent::index_params index_params; index_params.metric = ps.metric; index_params.graph_degree = ps.graph_degree; index_params.intermediate_graph_degree = 2 * ps.graph_degree; index_params.max_iterations = 100; + index_params.return_distances = true; auto database_view = raft::make_device_matrix_view( (const DataT*)database.data(), ps.n_rows, ps.dim); @@ -99,24 +124,171 @@ class AnnNNDescentTest : public ::testing::TestWithParam { if (ps.host_dataset) { auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); raft::copy(database_host.data_handle(), database.data(), database.size(), stream_); + raft::resource::sync_stream(handle_); auto database_host_view = raft::make_host_matrix_view( (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim); - auto index = - cuvs::neighbors::nn_descent::build(handle_, index_params, database_host_view); - raft::update_host( + auto index = nn_descent::build(handle_, index_params, database_host_view); + raft::copy( indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + index.distances().value().data_handle(), + queries_size, + stream_); + } + } else { - auto index = cuvs::neighbors::nn_descent::build(handle_, index_params, database_view); - raft::update_host( + auto index = nn_descent::build(handle_, index_params, database_view); + raft::copy( indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + index.distances().value().data_handle(), + queries_size, + stream_); + } }; } raft::resource::sync_stream(handle_); } + if (ps.metric == cuvs::distance::DistanceType::InnerProduct) { + std::transform( + distances_naive.begin(), distances_naive.end(), distances_naive.begin(), [](auto x) { + return -x; + }); + } + double min_recall = ps.min_recall; - EXPECT_TRUE(eval_recall( - indices_naive, indices_NNDescent, ps.n_rows, ps.graph_degree, 0.001, min_recall)); + EXPECT_TRUE(eval_neighbours(indices_naive, + indices_NNDescent, + distances_naive, + distances_NNDescent, + ps.n_rows, + ps.graph_degree, + 0.001, + min_recall)); + } + } + + void SetUp() override + { + database.resize(((size_t)ps.n_rows) * ps.dim, stream_); + raft::random::RngState r(1234ULL); + if constexpr (std::is_same{}) { + raft::random::normal(handle_, r, database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0)); + } else if constexpr (std::is_same{}) { + raft::random::uniformInt( + handle_, r, database.data(), ps.n_rows * ps.dim, DataT(-5), 
DataT(5)); + } else { + raft::random::uniformInt(handle_, r, database.data(), ps.n_rows * ps.dim, DataT(0), DataT(5)); + } + raft::resource::sync_stream(handle_); + } + + void TearDown() override + { + raft::resource::sync_stream(handle_); + database.resize(0, stream_); + } + + private: + raft::resources handle_; + rmm::cuda_stream_view stream_; + AnnNNDescentInputs ps; + rmm::device_uvector database; +}; + +template +class AnnNNDescentBatchTest : public ::testing::TestWithParam { + public: + AnnNNDescentBatchTest() + : stream_(raft::resource::get_cuda_stream(handle_)), + ps(::testing::TestWithParam::GetParam()), + database(0, stream_) + { + } + + void testNNDescentBatch() + { + size_t queries_size = ps.n_rows * ps.graph_degree; + std::vector indices_NNDescent(queries_size); + std::vector distances_NNDescent(queries_size); + std::vector indices_naive(queries_size); + std::vector distances_naive(queries_size); + + { + rmm::device_uvector distances_naive_dev(queries_size, stream_); + rmm::device_uvector indices_naive_dev(queries_size, stream_); + naive_knn(handle_, + distances_naive_dev.data(), + indices_naive_dev.data(), + database.data(), + database.data(), + ps.n_rows, + ps.n_rows, + ps.dim, + ps.graph_degree, + ps.metric); + raft::update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_); + raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_); + raft::resource::sync_stream(handle_); + } + + { + { + nn_descent::index_params index_params; + index_params.metric = ps.metric; + index_params.graph_degree = ps.graph_degree; + index_params.intermediate_graph_degree = 2 * ps.graph_degree; + index_params.max_iterations = 10; + index_params.return_distances = true; + index_params.n_clusters = ps.recall_cluster.second; + + auto database_view = raft::make_device_matrix_view( + (const DataT*)database.data(), ps.n_rows, ps.dim); + + { + if (ps.host_dataset) { + auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); + raft::copy(database_host.data_handle(), database.data(), database.size(), stream_); + auto database_host_view = raft::make_host_matrix_view( + (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim); + auto index = nn_descent::build(handle_, index_params, database_host_view); + raft::copy( + indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + index.distances().value().data_handle(), + queries_size, + stream_); + } + + } else { + auto index = nn_descent::build(handle_, index_params, database_view); + raft::copy( + indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + index.distances().value().data_handle(), + queries_size, + stream_); + } + }; + } + raft::resource::sync_stream(handle_); + } + double min_recall = ps.recall_cluster.first; + EXPECT_TRUE(eval_neighbours(indices_naive, + indices_NNDescent, + distances_naive, + distances_NNDescent, + ps.n_rows, + ps.graph_degree, + 0.01, + min_recall, + true, + static_cast(ps.graph_degree * 0.1))); } } @@ -142,16 +314,29 @@ class AnnNNDescentTest : public ::testing::TestWithParam { private: raft::resources handle_; rmm::cuda_stream_view stream_; - AnnNNDescentInputs ps; + AnnNNDescentBatchInputs ps; rmm::device_uvector database; }; -const std::vector inputs = raft::util::itertools::product( - {1000, 2000}, // n_rows - {3, 5, 7, 8, 17, 64, 128, 137, 192, 
256, 512, 619, 1024},  // dim
-  {32, 64},                                             // graph_degree
-  {cuvs::distance::DistanceType::L2Expanded},
-  {false, true},
-  {0.90});
+const std::vector<AnnNNDescentInputs> inputs =
+  raft::util::itertools::product<AnnNNDescentInputs>({2000, 4000},            // n_rows
+                                                     {4, 16, 64, 256, 1024},  // dim
+                                                     {32, 64},                // graph_degree
+                                                     {cuvs::distance::DistanceType::L2Expanded,
+                                                      cuvs::distance::DistanceType::InnerProduct,
+                                                      cuvs::distance::DistanceType::CosineExpanded},
+                                                     {false, true},
+                                                     {0.90});
+
+// TODO: Investigate why this test is failing.
+// Reference issue: https://github.com/rapidsai/raft/issues/2450
+const std::vector<AnnNNDescentBatchInputs> inputsBatch =
+  raft::util::itertools::product<AnnNNDescentBatchInputs>(
+    {std::make_pair(0.9, 3lu), std::make_pair(0.9, 2lu)},  // min_recall, n_clusters
+    {4000, 5000},                                          // n_rows
+    {192, 512},                                            // dim
+    {32, 64},                                              // graph_degree
+    {cuvs::distance::DistanceType::L2Expanded},
+    {false, true});
 
-}  // namespace cuvs::neighbors::nn_descent
+}  // namespace cuvs::neighbors::nn_descent
diff --git a/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu b/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu
index 64c0e0291..7a24f96a1 100644
--- a/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu
+++ b/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu
@@ -23,6 +23,12 @@ namespace cuvs::neighbors::nn_descent {
 typedef AnnNNDescentTest<float, float, std::uint32_t> AnnNNDescentTestF_U32;
 TEST_P(AnnNNDescentTestF_U32, AnnNNDescent) { this->testNNDescent(); }
 
+// typedef AnnNNDescentBatchTest<float, float, std::uint32_t> AnnNNDescentBatchTestF_U32;
+// TEST_P(AnnNNDescentBatchTestF_U32, AnnNNDescentBatch) { this->testNNDescentBatch(); }
+
 INSTANTIATE_TEST_CASE_P(AnnNNDescentTest, AnnNNDescentTestF_U32, ::testing::ValuesIn(inputs));
 
+// INSTANTIATE_TEST_CASE_P(AnnNNDescentBatchTest,
+//                         AnnNNDescentBatchTestF_U32,
+//                         ::testing::ValuesIn(inputsBatch));
+
 }  // namespace cuvs::neighbors::nn_descent
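Both the plain and batched NN-Descent tests funnel into the same recall evaluation, which at its core just counts how many expected neighbor ids show up in the actual top-k rows. A simplified standalone sketch of that calculation (this is the gist only, not the project's `eval_neighbours`, which additionally applies a distance-epsilon tie tolerance and the duplicate check patched below):

```cpp
// The gist of the recall check: per row, count expected ids found in the
// actual top-k list, then divide by the total number of expected ids.
#include <algorithm>
#include <cstdio>
#include <vector>

double recall(const std::vector<int>& expected, const std::vector<int>& actual,
              size_t rows, size_t k)
{
  size_t match = 0;
  for (size_t i = 0; i < rows; i++) {
    auto e0 = expected.begin() + i * k;
    auto a0 = actual.begin() + i * k;
    for (size_t j = 0; j < k; j++) {
      if (std::find(a0, a0 + k, *(e0 + j)) != a0 + k) { match++; }
    }
  }
  return double(match) / double(rows * k);
}

int main()
{
  // 2 rows, k=2: row 0 matches both ids, row 1 matches one of two.
  std::vector<int> expected = {1, 2, 3, 4}, actual = {1, 2, 3, 9};
  std::printf("recall = %.2f\n", recall(expected, actual, 2, 2));  // 0.75
  return 0;
}
```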
"; + } } } } @@ -264,7 +273,8 @@ auto eval_neighbours(const std::vector& expected_idx, size_t cols, double eps, double min_recall, - bool test_unique = true) -> testing::AssertionResult + bool test_unique = true, + size_t max_duplicates = 0) -> testing::AssertionResult { auto [actual_recall, match_count, total_count] = calc_recall(expected_idx, actual_idx, expected_dist, actual_dist, rows, cols, eps); @@ -284,7 +294,7 @@ auto eval_neighbours(const std::vector& expected_idx, << min_recall << "); eps = " << eps << ". "; } if (test_unique) - return check_unique_indices(actual_idx, rows, cols); + return check_unique_indices(actual_idx, rows, cols, max_duplicates); else return testing::AssertionSuccess(); } diff --git a/cpp/test/neighbors/dynamic_batching.cuh b/cpp/test/neighbors/dynamic_batching.cuh new file mode 100644 index 000000000..b64c5b01e --- /dev/null +++ b/cpp/test/neighbors/dynamic_batching.cuh @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "ann_utils.cuh" + +#include + +#include + +#include +#include +#include + +#include + +#include +#include +#include + +namespace cuvs::neighbors::dynamic_batching { + +struct dynamic_batching_spec { + int64_t n_queries = 1000; + int64_t n_rows = 100000; + int64_t dim = 128; + int64_t k = 10; + int64_t max_batch_size = 64; + size_t n_queues = 3; + bool conservative_dispatch = false; + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded; + int64_t max_concurrent_threads = 128; +}; + +inline ::std::ostream& operator<<(::std::ostream& os, const dynamic_batching_spec& p) +{ + os << "{n_queries=" << p.n_queries; + os << ", dataset shape=" << p.n_rows << "x" << p.dim; + os << ", metric=" << print_metric{p.metric}; + os << ", k=" << p.k; + os << ", max_batch_size=" << p.max_batch_size; + os << ", n_queues=" << p.n_queues; + os << ", conservative_dispatch=" << p.conservative_dispatch; + os << '}' << std::endl; + return os; +} + +template +using build_function = UpstreamT(const raft::resources&, + const typename UpstreamT::index_params_type&, + raft::device_matrix_view); + +template +using search_function = void(const raft::resources&, + const typename UpstreamT::search_params_type& params, + const UpstreamT& index, + raft::device_matrix_view, + raft::device_matrix_view, + raft::device_matrix_view, + const cuvs::neighbors::filtering::base_filter&); + +template UpstreamBuildF, + search_function UpstreamSearchF> +struct dynamic_batching_test : public ::testing::TestWithParam { + using distance_type = float; + using data_type = DataT; + using index_type = IdxT; + using upstream_type = UpstreamT; + + dynamic_batching_spec ps = ::testing::TestWithParam::GetParam(); + raft::resources res; + + // input data + std::optional> dataset = std::nullopt; + std::optional> queries = std::nullopt; + std::optional> neighbors_upsm = std::nullopt; + std::optional> neighbors_dynb = std::nullopt; + std::optional> distances_upsm = std::nullopt; + 
std::optional> distances_dynb = std::nullopt; + + // build parameters + cuvs::neighbors::index_params build_params_base{ps.metric}; + typename upstream_type::index_params_type build_params_upsm{build_params_base}; + dynamic_batching::index_params build_params_dynb{ + build_params_base, ps.k, ps.max_batch_size, ps.n_queues, ps.conservative_dispatch}; + + // search parameters + typename upstream_type::search_params_type search_params_upsm{}; + dynamic_batching::search_params search_params_dynb{}; + + // indexes + std::optional index_upsm = std::nullopt; + std::optional> index_dynb = std::nullopt; + + void build_all() + { + index_dynb.reset(); + index_upsm.reset(); + index_upsm = UpstreamBuildF(res, build_params_upsm, dataset->view()); + index_dynb.emplace(res, build_params_dynb, index_upsm.value(), search_params_upsm); + } + + void search_all() + { + // Search using upstream index - all queries at once + UpstreamSearchF(res, + search_params_upsm, + index_upsm.value(), + queries->view(), + neighbors_upsm->view(), + distances_upsm->view(), + filtering::none_sample_filter{}); + raft::resource::sync_stream(res); + + // Search with dynamic batching + // Streaming scenario: prepare concurrent resources + rmm::cuda_stream_pool worker_streams(ps.max_concurrent_threads); + std::vector> futures(ps.max_concurrent_threads); + std::vector resource_pool(0); + for (int64_t i = 0; i < ps.max_concurrent_threads; i++) { + resource_pool.push_back(res); // copies the resource + raft::resource::set_cuda_stream(resource_pool[i], worker_streams.get_stream(i)); + } + + // Try multiple batch sizes in a round-robin to improve test coverage + std::vector minibatch_sizes{1, 3, 7, 10}; + auto get_bs = [&minibatch_sizes](auto i) { + return minibatch_sizes[i % minibatch_sizes.size()]; + }; + int64_t i = 0; + for (int64_t offset = 0; offset < ps.n_queries; offset += get_bs(i++)) { + auto bs = std::min(get_bs(i), ps.n_queries - offset); + auto j = i % ps.max_concurrent_threads; + // wait for previous job in the same slot to finish + if (i >= ps.max_concurrent_threads) { futures[j].wait(); } + // submit a new job + futures[j] = std::async( + std::launch::async, + [&res = resource_pool[j], + ¶ms = search_params_dynb, + index = index_dynb.value(), + query_view = raft::make_device_matrix_view( + queries->data_handle() + offset * ps.dim, bs, ps.dim), + neighbors_view = raft::make_device_matrix_view( + neighbors_dynb->data_handle() + offset * ps.k, bs, ps.k), + distances_view = raft::make_device_matrix_view( + distances_dynb->data_handle() + offset * ps.k, bs, ps.k)]() { + dynamic_batching::search(res, params, index, query_view, neighbors_view, distances_view); + }); + } + + // finalize all resources + for (int64_t j = 0; j < ps.max_concurrent_threads && j < i; j++) { + futures[j].wait(); + raft::resource::sync_stream(resource_pool[j]); + } + raft::resource::sync_stream(res); + } + + /* + Check the dynamic batching generated neighbors against the upstream index. They both may be + imperfect w.r.t. the ground truth, but they shouldn't differ too much. 
+ */ + void check_neighbors() + { + auto stream = raft::resource::get_cuda_stream(res); + size_t queries_size = ps.n_queries * ps.k; + std::vector neighbors_upsm_host(queries_size); + std::vector neighbors_dynb_host(queries_size); + std::vector distances_upsm_host(queries_size); + std::vector distances_dynb_host(queries_size); + raft::copy(neighbors_upsm_host.data(), neighbors_upsm->data_handle(), queries_size, stream); + raft::copy(neighbors_dynb_host.data(), neighbors_dynb->data_handle(), queries_size, stream); + raft::copy(distances_upsm_host.data(), distances_upsm->data_handle(), queries_size, stream); + raft::copy(distances_dynb_host.data(), distances_dynb->data_handle(), queries_size, stream); + raft::resource::sync_stream(res); + ASSERT_TRUE(eval_neighbours(neighbors_upsm_host, + neighbors_dynb_host, + distances_upsm_host, + distances_dynb_host, + ps.n_queries, + ps.k, + 0.001, + 0.9)) + << ps; + } + + void SetUp() override + { + dataset.emplace(raft::make_device_matrix(res, ps.n_rows, ps.dim)); + queries.emplace(raft::make_device_matrix(res, ps.n_queries, ps.dim)); + neighbors_upsm.emplace(raft::make_device_matrix(res, ps.n_queries, ps.k)); + neighbors_dynb.emplace(raft::make_device_matrix(res, ps.n_queries, ps.k)); + distances_upsm.emplace( + raft::make_device_matrix(res, ps.n_queries, ps.k)); + distances_dynb.emplace( + raft::make_device_matrix(res, ps.n_queries, ps.k)); + + raft::random::RngState rng(666ULL); + if constexpr (std::is_same_v || std::is_same_v) { + raft::random::uniform( + res, rng, dataset->data_handle(), dataset->size(), data_type(0.1), data_type(2.0)); + raft::random::uniform( + res, rng, queries->data_handle(), queries->size(), data_type(0.1), data_type(2.0)); + } else { + raft::random::uniformInt( + res, rng, dataset->data_handle(), dataset->size(), data_type(1), data_type(20)); + raft::random::uniformInt( + res, rng, queries->data_handle(), queries->size(), data_type(1), data_type(20)); + } + raft::resource::sync_stream(res); + } + + void TearDown() override + { + index_dynb.reset(); + index_upsm.reset(); + dataset.reset(); + queries.reset(); + neighbors_upsm.reset(); + neighbors_dynb.reset(); + distances_upsm.reset(); + distances_dynb.reset(); + raft::resource::sync_stream(res); + } +}; + +inline std::vector generate_inputs() +{ + std::vector inputs{dynamic_batching_spec{}}; + + for (auto alt_n_queries : {10, 50, 100}) { + dynamic_batching_spec input{}; + input.n_queries = alt_n_queries; + inputs.push_back(input); + } + + for (auto alt_k : {100, 200}) { + dynamic_batching_spec input{}; + input.k = alt_k; + inputs.push_back(input); + } + + for (auto alt_max_batch_size : {4, 16, 128, 256, 512, 1024}) { + dynamic_batching_spec input{}; + input.max_batch_size = alt_max_batch_size; + inputs.push_back(input); + } + + for (auto alt_n_queues : {1, 2, 16, 32}) { + dynamic_batching_spec input{}; + input.n_queues = alt_n_queues; + inputs.push_back(input); + } + + for (auto alt_max_concurrent_threads : {1, 2, 16, 32}) { + dynamic_batching_spec input{}; + input.max_concurrent_threads = alt_max_concurrent_threads; + inputs.push_back(input); + } + + { + auto n = inputs.size(); + for (size_t i = 0; i < n; i++) { + auto input = inputs[i]; + input.conservative_dispatch = !input.conservative_dispatch; + inputs.push_back(input); + } + } + + return inputs; +} + +const std::vector inputs = generate_inputs(); + +} // namespace cuvs::neighbors::dynamic_batching diff --git a/cpp/test/neighbors/dynamic_batching/test_cagra.cu b/cpp/test/neighbors/dynamic_batching/test_cagra.cu new 
file mode 100644 index 000000000..604fc29cf --- /dev/null +++ b/cpp/test/neighbors/dynamic_batching/test_cagra.cu @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "../dynamic_batching.cuh" + +#include + +namespace cuvs::neighbors::dynamic_batching { + +using cagra_F32 = dynamic_batching_test, + cagra::build, + cagra::search>; + +using cagra_U8 = dynamic_batching_test, + cagra::build, + cagra::search>; + +template +static void set_default_cagra_params(fixture& that) +{ + that.build_params_upsm.intermediate_graph_degree = 128; + that.build_params_upsm.graph_degree = 64; + that.search_params_upsm.itopk_size = + std::clamp(raft::bound_by_power_of_two(that.ps.k) * 16, 128, 512); +} + +TEST_P(cagra_F32, single_cta) +{ + set_default_cagra_params(*this); + search_params_upsm.algo = cagra::search_algo::SINGLE_CTA; + build_all(); + search_all(); + check_neighbors(); +} + +TEST_P(cagra_F32, multi_cta) +{ + set_default_cagra_params(*this); + search_params_upsm.algo = cagra::search_algo::MULTI_CTA; + build_all(); + search_all(); + check_neighbors(); +} + +TEST_P(cagra_F32, multi_kernel) +{ + set_default_cagra_params(*this); + search_params_upsm.algo = cagra::search_algo::MULTI_KERNEL; + build_all(); + search_all(); + check_neighbors(); +} + +TEST_P(cagra_U8, defaults) +{ + set_default_cagra_params(*this); + build_all(); + search_all(); + check_neighbors(); +} + +INSTANTIATE_TEST_CASE_P(dynamic_batching, cagra_F32, ::testing::ValuesIn(inputs)); +INSTANTIATE_TEST_CASE_P(dynamic_batching, cagra_U8, ::testing::ValuesIn(inputs)); + +} // namespace cuvs::neighbors::dynamic_batching diff --git a/cpp/test/neighbors/dynamic_batching/test_ivf_flat.cu b/cpp/test/neighbors/dynamic_batching/test_ivf_flat.cu new file mode 100644 index 000000000..4922cffa3 --- /dev/null +++ b/cpp/test/neighbors/dynamic_batching/test_ivf_flat.cu @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "../dynamic_batching.cuh" + +#include + +namespace cuvs::neighbors::dynamic_batching { + +using ivf_flat_i8 = dynamic_batching_test, + ivf_flat::build, + ivf_flat::search>; + +TEST_P(ivf_flat_i8, defaults) +{ + build_params_upsm.n_lists = std::round(std::sqrt(ps.n_rows)); + search_params_upsm.n_probes = + std::max(std::min(build_params_upsm.n_lists, 10), + raft::div_rounding_up_safe(build_params_upsm.n_lists, 50)); + build_all(); + search_all(); + check_neighbors(); +} + +INSTANTIATE_TEST_CASE_P(dynamic_batching, ivf_flat_i8, ::testing::ValuesIn(inputs)); + +} // namespace cuvs::neighbors::dynamic_batching diff --git a/cpp/test/neighbors/dynamic_batching/test_ivf_pq.cu b/cpp/test/neighbors/dynamic_batching/test_ivf_pq.cu new file mode 100644 index 000000000..ec57e0b57 --- /dev/null +++ b/cpp/test/neighbors/dynamic_batching/test_ivf_pq.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "../dynamic_batching.cuh" + +#include + +namespace cuvs::neighbors::dynamic_batching { + +using ivf_pq_f16 = + dynamic_batching_test, ivf_pq::build, ivf_pq::search>; + +TEST_P(ivf_pq_f16, defaults) +{ + build_params_upsm.n_lists = std::round(std::sqrt(ps.n_rows)); + search_params_upsm.n_probes = + std::max(std::min(build_params_upsm.n_lists, 10), + raft::div_rounding_up_safe(build_params_upsm.n_lists, 50)); + build_all(); + search_all(); + check_neighbors(); +} + +INSTANTIATE_TEST_CASE_P(dynamic_batching, ivf_pq_f16, ::testing::ValuesIn(inputs)); + +} // namespace cuvs::neighbors::dynamic_batching diff --git a/cpp/test/neighbors/hnsw.cu b/cpp/test/neighbors/hnsw.cu index 9fb88be05..20ee83a11 100644 --- a/cpp/test/neighbors/hnsw.cu +++ b/cpp/test/neighbors/hnsw.cu @@ -108,7 +108,8 @@ class AnnHNSWTest : public ::testing::TestWithParam { cuvs::neighbors::hnsw::search_params search_params; search_params.ef = ps.ef; - auto hnsw_index = cuvs::neighbors::hnsw::from_cagra(handle_, index); + cuvs::neighbors::hnsw::index_params hnsw_params; + auto hnsw_index = cuvs::neighbors::hnsw::from_cagra(handle_, hnsw_params, index); auto queries_HNSW_view = raft::make_host_matrix_view(queries_h.data(), ps.n_queries, ps.dim); auto indices_HNSW_view = diff --git a/cpp/test/neighbors/sparse_brute_force.cu b/cpp/test/neighbors/sparse_brute_force.cu new file mode 100644 index 000000000..cb68989d4 --- /dev/null +++ b/cpp/test/neighbors/sparse_brute_force.cu @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.cuh" + +#include +#include +#include + +#include +#include + +namespace cuvs { +namespace neighbors { + +using namespace raft; +using namespace raft::sparse; + +template +struct SparseKNNInputs { + value_idx n_cols; + + std::vector indptr_h; + std::vector indices_h; + std::vector data_h; + + std::vector out_dists_ref_h; + std::vector out_indices_ref_h; + + int k; + + int batch_size_index = 2; + int batch_size_query = 2; + + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2SqrtExpanded; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const SparseKNNInputs& dims) +{ + return os; +} + +template +class SparseKNNTest : public ::testing::TestWithParam> { + public: + SparseKNNTest() + : params(::testing::TestWithParam>::GetParam()), + indptr(0, resource::get_cuda_stream(handle)), + indices(0, resource::get_cuda_stream(handle)), + data(0, resource::get_cuda_stream(handle)), + out_indices(0, resource::get_cuda_stream(handle)), + out_dists(0, resource::get_cuda_stream(handle)), + out_indices_ref(0, resource::get_cuda_stream(handle)), + out_dists_ref(0, resource::get_cuda_stream(handle)) + { + } + + protected: + void SetUp() override + { + n_rows = params.indptr_h.size() - 1; + nnz = params.indices_h.size(); + k = params.k; + + make_data(); + + auto index_structure = + raft::make_device_compressed_structure_view( + indptr.data(), indices.data(), n_rows, params.n_cols, nnz); + auto index_csr = raft::make_device_csr_matrix_view(data.data(), index_structure); + + auto index = cuvs::neighbors::brute_force::build(handle, index_csr, params.metric); + + cuvs::neighbors::brute_force::sparse_search_params search_params; + search_params.batch_size_index = params.batch_size_index; + search_params.batch_size_query = params.batch_size_query; + + cuvs::neighbors::brute_force::search( + handle, + search_params, + index, + index_csr, + raft::make_device_matrix_view(out_indices.data(), n_rows, k), + raft::make_device_matrix_view(out_dists.data(), n_rows, k)); + + RAFT_CUDA_TRY(cudaStreamSynchronize(resource::get_cuda_stream(handle))); + } + + void compare() + { + ASSERT_TRUE(devArrMatch( + out_dists_ref.data(), out_dists.data(), n_rows * k, CompareApprox(1e-4))); + ASSERT_TRUE( + devArrMatch(out_indices_ref.data(), out_indices.data(), n_rows * k, Compare())); + } + + protected: + void make_data() + { + std::vector indptr_h = params.indptr_h; + std::vector indices_h = params.indices_h; + std::vector data_h = params.data_h; + + auto stream = resource::get_cuda_stream(handle); + indptr.resize(indptr_h.size(), stream); + indices.resize(indices_h.size(), stream); + data.resize(data_h.size(), stream); + + update_device(indptr.data(), indptr_h.data(), indptr_h.size(), stream); + update_device(indices.data(), indices_h.data(), indices_h.size(), stream); + update_device(data.data(), data_h.data(), data_h.size(), stream); + + std::vector out_dists_ref_h = params.out_dists_ref_h; + std::vector out_indices_ref_h = params.out_indices_ref_h; + + out_indices_ref.resize(out_indices_ref_h.size(), stream); + out_dists_ref.resize(out_dists_ref_h.size(), stream); + + update_device( + out_indices_ref.data(), out_indices_ref_h.data(), out_indices_ref_h.size(), stream); + update_device(out_dists_ref.data(), out_dists_ref_h.data(), out_dists_ref_h.size(), stream); + + out_dists.resize(n_rows * k, stream); + out_indices.resize(n_rows * k, stream); + } + + 
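Condensed from the `SetUp` above, a hedged sketch of the sparse brute-force round trip: the index is built from a CSR matrix and then queried with the same matrix. The CSR view template parameters, the output view index type, and the raw output buffers are assumptions, since the fixture's template arguments are supplied elsewhere; includes are the same ones this test file pulls in.

```cpp
// Hedged sketch of the sparse brute-force path exercised by SetUp above
// (includes as in this test file). View template parameters are assumptions.
void sparse_knn_sketch(raft::resources const& handle,
                       raft::device_csr_matrix_view<const float, int, int, int> index_csr,
                       int n_rows,
                       int k,
                       int* out_indices,  // device buffer with n_rows * k entries
                       float* out_dists)  // device buffer with n_rows * k entries
{
  namespace bf = cuvs::neighbors::brute_force;

  // The metric is fixed when the index is built, as in the test inputs.
  auto index = bf::build(handle, index_csr, cuvs::distance::DistanceType::L2SqrtExpanded);

  // Batch sizes tile the index and query rows to bound workspace memory.
  bf::sparse_search_params search_params;
  search_params.batch_size_index = 2;
  search_params.batch_size_query = 2;

  bf::search(handle,
             search_params,
             index,
             index_csr,
             raft::make_device_matrix_view<int, int64_t>(out_indices, n_rows, k),
             raft::make_device_matrix_view<float, int64_t>(out_dists, n_rows, k));
}
```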
raft::resources handle; + + int n_rows, nnz, k; + + // input data + rmm::device_uvector indptr, indices; + rmm::device_uvector data; + + // output data + rmm::device_uvector out_indices; + rmm::device_uvector out_dists; + + rmm::device_uvector out_indices_ref; + rmm::device_uvector out_dists_ref; + + SparseKNNInputs params; +}; + +const std::vector> inputs_i32_f = { + {9, // ncols + {0, 2, 4, 6, 8}, // indptr + {0, 4, 0, 3, 0, 2, 0, 8}, // indices + {0.0f, 1.0f, 5.0f, 6.0f, 5.0f, 6.0f, 0.0f, 1.0f}, // data + {0, 1.41421, 0, 7.87401, 0, 7.87401, 0, 1.41421}, // dists + {0, 3, 1, 0, 2, 0, 3, 0}, // inds + 2, + 2, + 2, + cuvs::distance::DistanceType::L2SqrtExpanded}}; +typedef SparseKNNTest SparseKNNTestF; +TEST_P(SparseKNNTestF, Result) { compare(); } +INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF, ::testing::ValuesIn(inputs_i32_f)); + +}; // end namespace neighbors +}; // end namespace cuvs diff --git a/cpp/test/preprocessing/scalar_quantization.cu b/cpp/test/preprocessing/scalar_quantization.cu new file mode 100644 index 000000000..2fdfe7555 --- /dev/null +++ b/cpp/test/preprocessing/scalar_quantization.cu @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.cuh" +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::preprocessing::quantize::scalar { + +template +struct QuantizationInputs { + cuvs::preprocessing::quantize::scalar::params quantization_params; + int rows; + int cols; + T min = T(-1.0); + T max = T(1.0); + double threshold = 2e-2; +}; + +template +std::ostream& operator<<(std::ostream& os, const QuantizationInputs& inputs) +{ + return os << "quantization_quantile:<" << inputs.quantization_params.quantile + << "> rows:" << inputs.rows << " cols:" << inputs.cols << " min:" << (double)inputs.min + << " max:" << (double)inputs.max; +} + +template +class QuantizationTest : public ::testing::TestWithParam> { + public: + QuantizationTest() + : params_(::testing::TestWithParam>::GetParam()), + stream(raft::resource::get_cuda_stream(handle)), + input_(0, stream) + { + } + + double getRelativeErrorStddev(const T* array_a, const T* array_b, size_t size, float quantile) + { + // relative error elementwise + rmm::device_uvector relative_error(size, stream); + raft::linalg::binaryOp( + relative_error.data(), + array_a, + array_b, + size, + [] __device__(double a, double b) { + return a != b ? 
(raft::abs(a - b) / raft::max(raft::abs(a), raft::abs(b))) : 0; + }, + stream); + + // sort by size --> remove largest errors to account for quantile chosen + thrust::sort(raft::resource::get_thrust_policy(handle), + relative_error.data(), + relative_error.data() + size); + int elements_to_consider = + std::ceil(double(params_.quantization_params.quantile) * double(size)); + + rmm::device_uvector mu(1, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(mu.data(), 0, sizeof(double), stream)); + + rmm::device_uvector error_stddev(1, stream); + raft::stats::stddev(error_stddev.data(), + relative_error.data(), + mu.data(), + 1, + elements_to_consider, + false, + true, + stream); + + double error_stddev_h; + raft::update_host(&error_stddev_h, error_stddev.data(), 1, stream); + raft::resource::sync_stream(handle, stream); + return error_stddev_h; + } + + protected: + void testScalarQuantization() + { + // dataset identical on host / device + auto dataset = raft::make_device_matrix_view( + (const T*)(input_.data()), rows_, cols_); + auto dataset_h = raft::make_host_matrix_view( + (const T*)(host_input_.data()), rows_, cols_); + + size_t print_size = std::min(input_.size(), 20ul); + + // train quantizer_1 on device + auto quantizer_1 = + cuvs::preprocessing::quantize::scalar::train(handle, params_.quantization_params, dataset); + std::cerr << "Q1: min = " << (double)quantizer_1.min_ << ", max = " << (double)quantizer_1.max_ + << std::endl; + + { + auto quantized_input_h = raft::make_host_matrix(rows_, cols_); + auto quantized_input_d = raft::make_device_matrix(handle, rows_, cols_); + cuvs::preprocessing::quantize::scalar::transform( + handle, quantizer_1, dataset, quantized_input_d.view()); + cuvs::preprocessing::quantize::scalar::transform( + handle, quantizer_1, dataset_h, quantized_input_h.view()); + + { + raft::print_device_vector("Input array: ", input_.data(), print_size, std::cerr); + + rmm::device_uvector quantization_for_print(print_size, stream); + raft::linalg::unaryOp(quantization_for_print.data(), + quantized_input_d.data_handle(), + print_size, + raft::cast_op{}, + stream); + raft::resource::sync_stream(handle, stream); + raft::print_device_vector( + "Quantized array 1: ", quantization_for_print.data(), print_size, std::cerr); + } + + // test (inverse) transform host/device equal + ASSERT_TRUE(devArrMatchHost(quantized_input_h.data_handle(), + quantized_input_d.data_handle(), + input_.size(), + cuvs::Compare(), + stream)); + + auto quantized_input_h_const_view = raft::make_host_matrix_view( + quantized_input_h.data_handle(), rows_, cols_); + auto re_transformed_input_h = raft::make_host_matrix(rows_, cols_); + cuvs::preprocessing::quantize::scalar::inverse_transform( + handle, quantizer_1, quantized_input_h_const_view, re_transformed_input_h.view()); + + auto quantized_input_d_const_view = raft::make_device_matrix_view( + quantized_input_d.data_handle(), rows_, cols_); + auto re_transformed_input_d = raft::make_device_matrix(handle, rows_, cols_); + cuvs::preprocessing::quantize::scalar::inverse_transform( + handle, quantizer_1, quantized_input_d_const_view, re_transformed_input_d.view()); + raft::print_device_vector( + "re-transformed array: ", re_transformed_input_d.data_handle(), print_size, std::cerr); + + { + double l2_error = getRelativeErrorStddev(dataset.data_handle(), + re_transformed_input_d.data_handle(), + input_.size(), + params_.quantization_params.quantile); + std::cerr << "error stddev = " << l2_error << ", threshold = " << params_.threshold + << std::endl; + // test (inverse) 
transform close to original dataset + ASSERT_TRUE(l2_error < params_.threshold); + } + } + + // train quantizer_2 on host + auto quantizer_2 = + cuvs::preprocessing::quantize::scalar::train(handle, params_.quantization_params, dataset_h); + std::cerr << "Q2: min = " << (double)quantizer_2.min_ << ", max = " << (double)quantizer_2.max_ + << std::endl; + + // check both quantizers are the same (valid if sampling is identical) + if (input_.size() <= 1000000) { + ASSERT_TRUE((double)quantizer_1.min_ == (double)quantizer_2.min_); + ASSERT_TRUE((double)quantizer_1.max_ == (double)quantizer_2.max_); + } + + { + // test transform host/device equal + auto quantized_input_h = raft::make_host_matrix(rows_, cols_); + auto quantized_input_d = raft::make_device_matrix(handle, rows_, cols_); + cuvs::preprocessing::quantize::scalar::transform( + handle, quantizer_2, dataset, quantized_input_d.view()); + cuvs::preprocessing::quantize::scalar::transform( + handle, quantizer_2, dataset_h, quantized_input_h.view()); + + { + rmm::device_uvector quantization_for_print(print_size, stream); + raft::linalg::unaryOp(quantization_for_print.data(), + quantized_input_d.data_handle(), + print_size, + raft::cast_op{}, + stream); + raft::resource::sync_stream(handle, stream); + raft::print_device_vector( + "Quantized array 2: ", quantization_for_print.data(), print_size, std::cerr); + } + + ASSERT_TRUE(devArrMatchHost(quantized_input_h.data_handle(), + quantized_input_d.data_handle(), + input_.size(), + cuvs::Compare(), + stream)); + } + + // sort_by_key (input, quantization) -- check <= on result + { + auto quantized_input = raft::make_device_matrix(handle, rows_, cols_); + cuvs::preprocessing::quantize::scalar::transform( + handle, quantizer_1, dataset, quantized_input.view()); + thrust::sort_by_key(raft::resource::get_thrust_policy(handle), + input_.data(), + input_.data() + input_.size(), + quantized_input.data_handle()); + std::vector quantized_input_sorted_host(input_.size()); + raft::update_host( + quantized_input_sorted_host.data(), quantized_input.data_handle(), input_.size(), stream); + raft::resource::sync_stream(handle, stream); + + for (size_t i = 0; i < input_.size() - 1; ++i) { + ASSERT_TRUE(quantized_input_sorted_host[i] <= quantized_input_sorted_host[i + 1]); + } + } + } + + void SetUp() override + { + rows_ = params_.rows; + cols_ = params_.cols; + + int n_elements = rows_ * cols_; + input_.resize(n_elements, stream); + host_input_.resize(n_elements); + + // random input + unsigned long long int seed = 1234ULL; + raft::random::RngState r(seed); + uniform(handle, r, input_.data(), input_.size(), params_.min, params_.max); + + raft::update_host(host_input_.data(), input_.data(), input_.size(), stream); + + raft::resource::sync_stream(handle, stream); + } + + private: + raft::resources handle; + cudaStream_t stream; + + QuantizationInputs params_; + int rows_; + int cols_; + rmm::device_uvector input_; + std::vector host_input_; +}; + +template +const std::vector> inputs = { + {{1.0}, 5, 5, T(0.0), T(1.0)}, + {{0.98}, 10, 20, T(0.0), T(1.0)}, + {{0.90}, 1000, 1500, T(-500.0), T(100.0)}, + {{0.59}, 100, 200}, + {{0.1}, 1, 1, T(0.0), T(1.0)}, + {{0.01}, 50, 50, T(0.0), T(1.0)}, + {{0.94}, 10, 20, T(-1.0), T(0.0)}, + {{0.95}, 10, 2, T(50.0), T(100.0)}, + {{0.95}, 10, 20, T(-500.0), T(-100.0)}, + {{0.95}, 10, 20, T(5.0), T(5.0)}, +}; + +typedef QuantizationTest QuantizationTest_float_int8t; +TEST_P(QuantizationTest_float_int8t, ScalarQuantizationTest) { this->testScalarQuantization(); } + +typedef 
QuantizationTest QuantizationTest_double_int8t; +TEST_P(QuantizationTest_double_int8t, ScalarQuantizationTest) { this->testScalarQuantization(); } + +typedef QuantizationTest QuantizationTest_half_int8t; +TEST_P(QuantizationTest_half_int8t, ScalarQuantizationTest) { this->testScalarQuantization(); } + +INSTANTIATE_TEST_CASE_P(QuantizationTest, + QuantizationTest_float_int8t, + ::testing::ValuesIn(inputs)); +INSTANTIATE_TEST_CASE_P(QuantizationTest, + QuantizationTest_double_int8t, + ::testing::ValuesIn(inputs)); +INSTANTIATE_TEST_CASE_P(QuantizationTest, + QuantizationTest_half_int8t, + ::testing::ValuesIn(inputs)); + +} // namespace cuvs::preprocessing::quantize::scalar diff --git a/cpp/test/sparse/cluster/cluster_solvers.cu b/cpp/test/sparse/cluster/cluster_solvers.cu new file mode 100644 index 000000000..c0b6c1a78 --- /dev/null +++ b/cpp/test/sparse/cluster/cluster_solvers.cu @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../../../src/sparse/cluster/cluster_solvers.cuh" +#include "../../../src/sparse/cluster/eigen_solvers.cuh" +#include "../../../src/sparse/cluster/modularity_maximization.cuh" +#include +#include +#include + +#include + +#include +#include + +namespace cuvs { +namespace spectral { + +TEST(Raft, ClusterSolvers) +{ + using namespace raft::spectral::matrix; + using index_type = int; + using value_type = double; + + raft::resources h; + + index_type maxiter{100}; + value_type tol{1.0e-10}; + unsigned long long seed{100110021003}; + + auto stream = raft::resource::get_cuda_stream(h); + + index_type n{100}; + index_type d{10}; + index_type k{5}; + + // nullptr expected to trigger exceptions: + // + value_type* eigvecs{nullptr}; + index_type* codes{nullptr}; + + cluster_solver_config_t cfg{k, maxiter, tol, seed}; + + kmeans_solver_t cluster_solver{cfg}; + + EXPECT_ANY_THROW(cluster_solver.solve(h, n, d, eigvecs, codes)); +} + +TEST(Raft, ModularitySolvers) +{ + using namespace raft::spectral::matrix; + using index_type = int; + using value_type = double; + + raft::resources h; + ASSERT_EQ(0, raft::resource::get_device_id(h)); + + index_type neigvs{10}; + index_type maxiter{100}; + index_type restart_iter{10}; + value_type tol{1.0e-10}; + bool reorthog{true}; + + // nullptr expected to trigger exceptions: + // + index_type* clusters{nullptr}; + value_type* eigvals{nullptr}; + value_type* eigvecs{nullptr}; + + unsigned long long seed{100110021003}; + + eigen_solver_config_t eig_cfg{ + neigvs, maxiter, restart_iter, tol, reorthog, seed}; + lanczos_solver_t eig_solver{eig_cfg}; + + index_type k{5}; + + cluster_solver_config_t clust_cfg{k, maxiter, tol, seed}; + kmeans_solver_t cluster_solver{clust_cfg}; + + auto stream = raft::resource::get_cuda_stream(h); + sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; + + EXPECT_ANY_THROW(cuvs::spectral::modularity_maximization( + h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + + value_type modularity{0}; + 
EXPECT_ANY_THROW(spectral::analyzeModularity(h, sm, k, clusters, modularity)); +} + +} // namespace spectral +} // namespace cuvs diff --git a/cpp/test/sparse/cluster/eigen_solvers.cu b/cpp/test/sparse/cluster/eigen_solvers.cu new file mode 100644 index 000000000..8de0b49e7 --- /dev/null +++ b/cpp/test/sparse/cluster/eigen_solvers.cu @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../../../src/sparse/cluster/eigen_solvers.cuh" +#include "../../../src/sparse/cluster/partition.cuh" +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace cuvs { +namespace spectral { + +TEST(Raft, EigenSolvers) +{ + raft::common::nvtx::range fun_scope("test::EigenSolvers"); + using namespace raft::spectral::matrix; + using index_type = int; + using value_type = double; + + raft::resources h; + ASSERT_EQ(0, raft::resource::get_device_id(h)); + + index_type* ro{nullptr}; + index_type* ci{nullptr}; + value_type* vs{nullptr}; + index_type nnz = 0; + index_type nrows = 0; + + sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; + ASSERT_EQ(nullptr, sm1.row_offsets_); + + index_type neigvs{10}; + index_type maxiter{100}; + index_type restart_iter{10}; + value_type tol{1.0e-10}; + bool reorthog{true}; + + // nullptr expected to trigger exceptions: + // + value_type* eigvals{nullptr}; + value_type* eigvecs{nullptr}; + std::uint64_t seed{100110021003}; + + eigen_solver_config_t cfg{ + neigvs, maxiter, restart_iter, tol, reorthog, seed}; + + lanczos_solver_t eig_solver{cfg}; + + EXPECT_ANY_THROW(eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs)); + + EXPECT_ANY_THROW(eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs)); +} + +TEST(Raft, SpectralSolvers) +{ + raft::common::nvtx::range fun_scope("test::SpectralSolvers"); + using namespace raft::spectral::matrix; + using index_type = int; + using value_type = double; + + raft::resources h; + ASSERT_EQ(0, raft::resource::get_device_id(h)); + + index_type neigvs{10}; + index_type maxiter{100}; + index_type restart_iter{10}; + value_type tol{1.0e-10}; + bool reorthog{true}; + + // nullptr expected to trigger exceptions: + // + index_type* clusters{nullptr}; + value_type* eigvals{nullptr}; + value_type* eigvecs{nullptr}; + + unsigned long long seed{100110021003}; + + eigen_solver_config_t eig_cfg{ + neigvs, maxiter, restart_iter, tol, reorthog, seed}; + lanczos_solver_t eig_solver{eig_cfg}; + + index_type k{5}; + + cluster_solver_config_t clust_cfg{k, maxiter, tol, seed}; + kmeans_solver_t cluster_solver{clust_cfg}; + + sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; + EXPECT_ANY_THROW( + spectral::partition(h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + + value_type edgeCut{0}; + value_type cost{0}; + EXPECT_ANY_THROW(spectral::analyzePartition(h, sm, k, clusters, edgeCut, cost)); +} + +} // namespace spectral +} // namespace cuvs diff --git a/cpp/test/sparse/cluster/spectral.cu 
b/cpp/test/sparse/cluster/spectral.cu new file mode 100644 index 000000000..7d0cdef9d --- /dev/null +++ b/cpp/test/sparse/cluster/spectral.cu @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../../test_utils.cuh" + +#include "../../../src/sparse/cluster/modularity_maximization.cuh" +#include "../../../src/sparse/cluster/partition.cuh" +#include + +#include + +#include +#include + +namespace cuvs { +namespace cluster { + +/** + * Warning: There appears to be a CUDA 12.2 bug in cusparse that causes an + * alignment issue. We've fixed the bug in our code through a workaround + * (see raft/sparse/linalg/spmm.hpp for fix). This test is meant to fail + * in the case where the fix is accidentally reverted, so that it doesn't + * break any downstream libraries that depend on RAFT + */ +TEST(Raft, Spectral) +{ + raft::handle_t handle; + + std::vector h_offsets({0, 2, 4, 7, 10, 12, 14}); + std::vector h_indices({1, 2, 0, 2, 0, 1, 3, 2, 4, 5, 3, 5, 3, 4}); + std::vector h_values( + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); + std::vector expected_clustering({1, 1, 1, 0, 0, 0}); + + int32_t n_clusters{2}; + int32_t n_eigenvectors{2}; + int32_t evs_max_it{100}; + int32_t kmean_max_it{100}; + int32_t restartIter_lanczos = 15 + n_eigenvectors; + float evs_tol{0.001}; + float kmean_tol{0.001}; + unsigned long long seed1{1234567}; + unsigned long long seed2{12345678}; + bool reorthog{false}; + + rmm::device_uvector offsets(h_offsets.size(), handle.get_stream()); + rmm::device_uvector indices(h_indices.size(), handle.get_stream()); + rmm::device_uvector values(h_indices.size(), handle.get_stream()); + rmm::device_uvector clustering(expected_clustering.size(), handle.get_stream()); + rmm::device_uvector eigenvalues(n_eigenvectors, handle.get_stream()); + rmm::device_uvector eigenvectors(n_eigenvectors * expected_clustering.size(), + handle.get_stream()); + + rmm::device_uvector exp_dev(expected_clustering.size(), handle.get_stream()); + + raft::update_device( + exp_dev.data(), expected_clustering.data(), expected_clustering.size(), handle.get_stream()); + + raft::update_device(offsets.data(), h_offsets.data(), h_offsets.size(), handle.get_stream()); + raft::update_device(indices.data(), h_indices.data(), h_indices.size(), handle.get_stream()); + raft::update_device(values.data(), h_values.data(), h_values.size(), handle.get_stream()); + + raft::spectral::matrix::sparse_matrix_t const matrix{ + handle, + offsets.data(), + indices.data(), + values.data(), + static_cast(offsets.size() - 1), + static_cast(indices.size())}; + + cuvs::spectral::eigen_solver_config_t eig_cfg{ + n_eigenvectors, evs_max_it, restartIter_lanczos, evs_tol, reorthog, seed1}; + cuvs::spectral::lanczos_solver_t eig_solver{eig_cfg}; + + cuvs::spectral::cluster_solver_config_t clust_cfg{ + n_clusters, kmean_max_it, kmean_tol, seed2}; + cuvs::spectral::kmeans_solver_t cluster_solver{clust_cfg}; + + 
cuvs::spectral::partition(handle, + matrix, + eig_solver, + cluster_solver, + clustering.data(), + eigenvalues.data(), + eigenvectors.data()); + + ASSERT_TRUE(devArrMatch(expected_clustering.data(), + exp_dev.data(), + exp_dev.size(), + 1, + cuvs::Compare(), + handle.get_stream())); +} + +} // namespace cluster +} // namespace cuvs \ No newline at end of file diff --git a/cpp/test/sparse/cluster/spectral_matrix.cu b/cpp/test/sparse/cluster/spectral_matrix.cu new file mode 100644 index 000000000..37a4202b8 --- /dev/null +++ b/cpp/test/sparse/cluster/spectral_matrix.cu @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +#include +#include + +namespace cuvs { +namespace spectral { +namespace matrix { +namespace { +template +struct csr_view_t { + index_type* offsets; + index_type* indices; + value_type* edge_data; + index_type number_of_vertices; + index_type number_of_edges; +}; +} // namespace +TEST(Raft, SpectralMatrices) +{ + using index_type = int; + using value_type = double; + + raft::resources h; + ASSERT_EQ(0, raft::resource::get_device_id(h)); + + csr_view_t csr_v{nullptr, nullptr, nullptr, 0, 0}; + + int const sz = 10; + vector_t d_v{h, sz}; + + index_type* ro{nullptr}; + index_type* ci{nullptr}; + value_type* vs{nullptr}; + index_type nnz = 0; + index_type nrows = 0; + sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; + sparse_matrix_t sm2{h, csr_v}; + ASSERT_EQ(nullptr, sm1.row_offsets_); + ASSERT_EQ(nullptr, sm2.row_offsets_); + + auto stream = resource::get_cuda_stream(h); + + auto cnstr_lm1 = [&h, ro, ci, vs, nrows, nnz](void) { + laplacian_matrix_t lm1{h, ro, ci, vs, nrows, nnz}; + }; + EXPECT_ANY_THROW(cnstr_lm1()); // because of nullptr ptr args + + auto cnstr_lm2 = [&h, &sm2](void) { laplacian_matrix_t lm2{h, sm2}; }; + EXPECT_ANY_THROW(cnstr_lm2()); // because of nullptr ptr args + + auto cnstr_mm1 = [&h, ro, ci, vs, nrows, nnz](void) { + modularity_matrix_t mm1{h, ro, ci, vs, nrows, nnz}; + }; + EXPECT_ANY_THROW(cnstr_mm1()); // because of nullptr ptr args + + auto cnstr_mm2 = [&h, &sm2](void) { modularity_matrix_t mm2{h, sm2}; }; + EXPECT_ANY_THROW(cnstr_mm2()); // because of nullptr ptr args +} + +} // namespace matrix +} // namespace spectral +} // namespace cuvs diff --git a/cpp/test/sparse/gram.cu b/cpp/test/sparse/gram.cu new file mode 100644 index 000000000..d7af30a1c --- /dev/null +++ b/cpp/test/sparse/gram.cu @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
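Ahead of the Gram fixture below, the kernel API it exercises can be reduced to a hedged sketch of the dense path. The names (`KernelParams`, `KernelFactory`, `GramMatrixBase`, the call operator) appear verbatim in `runTest` further down; the strided-view signature and the enclosing namespace are assumptions, and the includes are the ones this test file already pulls in, so they are not restated.

```cpp
// Hedged sketch of the dense Gram path from runTest below (includes as in
// this test file; the view layout type is an assumption).
void gram_sketch(raft::resources const& handle,
                 raft::device_matrix_view<const float, int, raft::layout_stride> x1,
                 raft::device_matrix_view<const float, int, raft::layout_stride> x2,
                 raft::device_matrix_view<float, int, raft::layout_stride> out)
{
  using namespace cuvs::distance::kernels;

  // RBF with gamma = 0.5, one of the configurations in the `inputs` list below.
  KernelParams params{KernelType::RBF, 0, 0.5};

  // The factory returns an owning raw pointer to a kernel functor; the call
  // operator launches the Gram computation out(i, j) = K(x1(i), x2(j)).
  auto kernel = std::unique_ptr<GramMatrixBase<float>>(KernelFactory<float>::create(params));
  (*kernel)(handle, x1, x2, out);
}
```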
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "../distance/gram_base.cuh" +#include "../test_utils.cuh" + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +namespace cuvs::distance::kernels::sparse { + +/** + * Structure to describe the format of the input matrices: + * - DENSE: dense, dense + * - MIX: CSR, dense + * - CSR: CSR, CSR + */ +enum SparseType { DENSE, MIX, CSR }; + +struct GramMatrixInputs { + int n1; // feature vectors in matrix 1 + int n2; // feature vectors in matrix 2 + int n_cols; // number of elements in a feature vector + bool is_row_major; + SparseType sparse_input; + KernelParams kernel; + int ld1; + int ld2; + int ld_out; + // We will generate random input using the dimensions given here. + // The reference output is calculated by a custom kernel. +}; + +std::ostream& operator<<(std::ostream& os, const GramMatrixInputs& p) +{ + std::vector kernel_names{"linear", "poly", "rbf", "tanh"}; + os << "/" << p.n1 << "x" << p.n2 << "x" << p.n_cols << "/" + << (p.is_row_major ? "RowMajor/" : "ColMajor/") + << (p.sparse_input == SparseType::DENSE + ? "DenseDense/" + : (p.sparse_input == SparseType::MIX ? "CsrDense/" : "CsrCsr/")) + << kernel_names[p.kernel.kernel] << "/ld_" << p.ld1 << "x" << p.ld2 << "x" << p.ld_out; + return os; +} + +/*struct KernelParams { + // Kernel function parameters + KernelType kernel; //!< Type of the kernel function + int degree; //!< Degree of polynomial kernel (ignored by others) + double gamma; //!< multiplier in the + double coef0; //!< additive constant in poly and tanh kernels +};*/ + +// const KernelParams linear_kernel_params{.kernel=KernelType::LINEAR}; + +// {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, {KernelType::TANH, 0, 0.5, 2.4}, {KernelType::RBF, 0, 0.5} +const std::vector inputs = raft::util::itertools::product( + {42}, + {137}, + {2}, + {true, false}, + {SparseType::DENSE, SparseType::MIX, SparseType::CSR}, + {KernelParams{KernelType::LINEAR}, + KernelParams{KernelType::POLYNOMIAL, 2, 0.5, 2.4}, + KernelParams{KernelType::TANH, 0, 0.5, 2.4}, + KernelParams{KernelType::RBF, 0, 0.5}}); + +// (ld_1, ld_2, ld_out) not supported by RBF and CSR +const std::vector inputs_ld = raft::util::itertools::product( + {137}, + {42}, + {2}, + {true, false}, + {SparseType::DENSE, SparseType::MIX}, + {KernelParams{KernelType::LINEAR}, + KernelParams{KernelType::POLYNOMIAL, 2, 0.5, 2.4}, + KernelParams{KernelType::TANH, 0, 0.5, 2.4}}, + {159}, + {73}, + {144}); + +// (ld_1, ld_2) are supported by CSR +const std::vector inputs_ld_csr = + raft::util::itertools::product( + {42}, + {137}, + {2}, + {true, false}, + {SparseType::CSR, SparseType::MIX}, + {KernelParams{KernelType::LINEAR}, + KernelParams{KernelType::POLYNOMIAL, 2, 0.5, 2.4}, + KernelParams{KernelType::TANH, 0, 0.5, 2.4}}, + {64}, + {155}, + {0}); + +template +class GramMatrixTest : public ::testing::TestWithParam { + protected: + GramMatrixTest() + : params(GetParam()), + stream(raft::resource::get_cuda_stream(handle)), + x1(0, stream), + x2(0, stream), + x1_csr_indptr(0, stream), + x1_csr_indices(0, stream), + x1_csr_data(0, stream), + x2_csr_indptr(0, stream), + x2_csr_indices(0, stream), + x2_csr_data(0, stream), + gram(0, stream), + gram_host(0) + { + if (params.ld1 == 0) { params.ld1 = params.is_row_major ? params.n_cols : params.n1; } + if (params.ld2 == 0) { params.ld2 = params.is_row_major ?
params.n_cols : params.n2; } + if (params.ld_out == 0) { params.ld_out = params.is_row_major ? params.n2 : params.n1; } + // Derive the size of the output from the offset of the last element. + size_t size = get_offset(params.n1 - 1, params.n_cols - 1, params.ld1, params.is_row_major) + 1; + x1.resize(size, stream); + size = get_offset(params.n2 - 1, params.n_cols - 1, params.ld2, params.is_row_major) + 1; + x2.resize(size, stream); + size = get_offset(params.n1 - 1, params.n2 - 1, params.ld_out, params.is_row_major) + 1; + + gram.resize(size, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(gram.data(), 0, gram.size() * sizeof(math_t), stream)); + gram_host.resize(gram.size()); + std::fill(gram_host.begin(), gram_host.end(), 0); + + raft::random::RngState r(42137ULL); + raft::random::uniform(handle, r, x1.data(), x1.size(), math_t(0), math_t(1)); + raft::random::uniform(handle, r, x2.data(), x2.size(), math_t(0), math_t(1)); + + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + } + + ~GramMatrixTest() override {} + + int prepareCsr(math_t* dense, int n_rows, int ld, int* indptr, int* indices, math_t* data) + { + int nnz = 0; + double eps = 1e-6; + int n_cols = params.n_cols; + bool is_row_major = params.is_row_major; + size_t dense_size = get_offset(n_rows - 1, n_cols - 1, ld, is_row_major) + 1; + + std::vector dense_host(dense_size); + raft::update_host(dense_host.data(), dense, dense_size, stream); + raft::resource::sync_stream(handle, stream); + + std::vector indptr_host(n_rows + 1); + std::vector indices_host(n_rows * n_cols); + std::vector data_host(n_rows * n_cols); + + // create csr matrix from dense (with threshold) + for (int i = 0; i < n_rows; ++i) { + indptr_host[i] = nnz; + for (int j = 0; j < n_cols; ++j) { + math_t value = dense_host[get_offset(i, j, ld, is_row_major)]; + if (value > eps) { + indices_host[nnz] = j; + data_host[nnz] = value; + nnz++; + } + } + } + indptr_host[n_rows] = nnz; + + // fill back dense matrix from CSR + std::fill(dense_host.data(), dense_host.data() + dense_size, 0); + for (int i = 0; i < n_rows; ++i) { + for (int idx = indptr_host[i]; idx < indptr_host[i + 1]; ++idx) { + dense_host[get_offset(i, indices_host[idx], ld, is_row_major)] = data_host[idx]; + } + } + + raft::update_device(dense, dense_host.data(), dense_size, stream); + raft::update_device(indptr, indptr_host.data(), n_rows + 1, stream); + raft::update_device(indices, indices_host.data(), nnz, stream); + raft::update_device(data, data_host.data(), nnz, stream); + raft::resource::sync_stream(handle, stream); + return nnz; + } + + void runTest() + { + std::unique_ptr> kernel = + std::unique_ptr>(KernelFactory::create(params.kernel)); + + auto x1_span = + params.is_row_major + ? raft::make_device_strided_matrix_view( + x1.data(), params.n1, params.n_cols, params.ld1) + : raft::make_device_strided_matrix_view( + x1.data(), params.n1, params.n_cols, params.ld1); + auto x2_span = + params.is_row_major + ? raft::make_device_strided_matrix_view( + x2.data(), params.n2, params.n_cols, params.ld2) + : raft::make_device_strided_matrix_view( + x2.data(), params.n2, params.n_cols, params.ld2); + auto out_span = + params.is_row_major + ? 
raft::make_device_strided_matrix_view( + gram.data(), params.n1, params.n2, params.ld_out) + : raft::make_device_strided_matrix_view( + gram.data(), params.n1, params.n2, params.ld_out); + + if (params.sparse_input == SparseType::DENSE) { + (*kernel)(handle, x1_span, x2_span, out_span); + } else { + x1_csr_indptr.reserve(params.n1 + 1, stream); + x1_csr_indices.reserve(params.n1 * params.n_cols, stream); + x1_csr_data.reserve(params.n1 * params.n_cols, stream); + int x1_nnz = prepareCsr(x1.data(), + params.n1, + params.ld1, + x1_csr_indptr.data(), + x1_csr_indices.data(), + x1_csr_data.data()); + + auto x1_csr_structure = raft::make_device_compressed_structure_view( + x1_csr_indptr.data(), x1_csr_indices.data(), params.n1, params.n_cols, x1_nnz); + auto x1_csr = raft::device_csr_matrix_view( + raft::device_span(x1_csr_data.data(), x1_csr_structure.get_nnz()), + x1_csr_structure); + + if (params.sparse_input == SparseType::MIX) { + (*kernel)(handle, x1_csr, x2_span, out_span); + } else { + x2_csr_indptr.reserve(params.n2 + 1, stream); + x2_csr_indices.reserve(params.n2 * params.n_cols, stream); + x2_csr_data.reserve(params.n2 * params.n_cols, stream); + int x2_nnz = prepareCsr(x2.data(), + params.n2, + params.ld2, + x2_csr_indptr.data(), + x2_csr_indices.data(), + x2_csr_data.data()); + + auto x2_csr_structure = raft::make_device_compressed_structure_view( + x2_csr_indptr.data(), x2_csr_indices.data(), params.n2, params.n_cols, x2_nnz); + auto x2_csr = raft::device_csr_matrix_view( + raft::device_span(x2_csr_data.data(), x2_csr_structure.get_nnz()), + x2_csr_structure); + + (*kernel)(handle, x1_csr, x2_csr, out_span); + } + } + // Something in gram is executing not on the 'stream' and therefore + // a full device sync is required + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + naiveGramMatrixKernel(params.n1, + params.n2, + params.n_cols, + x1, + x2, + gram_host.data(), + params.ld1, + params.ld2, + params.ld_out, + params.is_row_major, + params.kernel, + stream, + handle); + raft::resource::sync_stream(handle, stream); + + ASSERT_TRUE(cuvs::devArrMatchHost( + gram_host.data(), gram.data(), gram.size(), cuvs::CompareApprox(1e-6f), stream)); + } + + raft::resources handle; + cudaStream_t stream = 0; + GramMatrixInputs params; + + rmm::device_uvector x1; + rmm::device_uvector x2; + + rmm::device_uvector x1_csr_indptr; + rmm::device_uvector x1_csr_indices; + rmm::device_uvector x1_csr_data; + rmm::device_uvector x2_csr_indptr; + rmm::device_uvector x2_csr_indices; + rmm::device_uvector x2_csr_data; + + rmm::device_uvector gram; + std::vector gram_host; +}; + +typedef GramMatrixTest GramMatrixTestFloatStandard; +typedef GramMatrixTest GramMatrixTestFloatLd; +typedef GramMatrixTest GramMatrixTestFloatLdCsr; + +TEST_P(GramMatrixTestFloatStandard, Gram) { runTest(); } +TEST_P(GramMatrixTestFloatLd, Gram) { runTest(); } +TEST_P(GramMatrixTestFloatLdCsr, Gram) { runTest(); } + +INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloatStandard, ::testing::ValuesIn(inputs)); +INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloatLd, ::testing::ValuesIn(inputs_ld)); +INSTANTIATE_TEST_SUITE_P(GramMatrixTests, + GramMatrixTestFloatLdCsr, + ::testing::ValuesIn(inputs_ld_csr)); +}; // namespace cuvs::distance::kernels::sparse \ No newline at end of file diff --git a/dependencies.yaml b/dependencies.yaml index a5b07e250..98cac5300 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -39,6 +39,7 @@ files: - bench - bench_python - rapids_build_setuptools + - cupy test_cpp: output: none includes: 
@@ -74,14 +75,14 @@ files: - rapids_build - cuda - rust - py_build_py_cuvs: + py_build_cuvs: output: pyproject pyproject_dir: python/cuvs extras: table: build-system includes: - build - py_rapids_build_py_cuvs: + py_rapids_build_cuvs: output: pyproject pyproject_dir: python/cuvs extras: @@ -90,7 +91,7 @@ files: includes: - rapids_build - build_py_cuvs - py_run_py_cuvs: + py_run_cuvs: output: pyproject pyproject_dir: python/cuvs extras: @@ -99,7 +100,7 @@ files: - cuda_wheels - run_py_cuvs - depends_on_pylibraft - py_test_py_cuvs: + py_test_cuvs: output: pyproject pyproject_dir: python/cuvs extras: @@ -116,7 +117,7 @@ files: table: build-system includes: - rapids_build_setuptools - py_rapids_build_py_cuvs_bench: + py_rapids_build_cuvs_bench: output: pyproject pyproject_dir: python/cuvs_bench extras: @@ -213,11 +214,11 @@ dependencies: - matrix: cuda: "12.*" packages: - - &cuda_python12 cuda-python>=12.0,<13.0a0 + - &cuda_python12 cuda-python>=12.0,<13.0a0,<=12.6.0 - matrix: cuda: "11.*" packages: - - &cuda_python11 cuda-python>=11.7.1,<12.0a0 + - &cuda_python11 cuda-python>=11.7.1,<12.0a0,<=11.8.3 - matrix: packages: - &cuda_python cuda-python @@ -470,17 +471,18 @@ dependencies: common: - output_types: [conda, pyproject, requirements] packages: - - hnswlib=0.6.2 - nlohmann_json>=3.11.2 - glog>=0.6.0 - h5py>=3.8.0 - benchmark>=1.8.2 - openblas + - libcuvs==24.12.*,>=0.0.0a0 bench_python: common: - output_types: [conda, pyproject, requirements] packages: - click + - cuvs==24.12.*,>=0.0.0a0 - matplotlib - pandas - pyyaml @@ -488,7 +490,7 @@ dependencies: common: - output_types: conda packages: - - &librmm_unsuffixed librmm==24.10.* + - &librmm_unsuffixed librmm==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -501,18 +503,18 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - librmm-cu12==24.10.* + - librmm-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - librmm-cu11==24.10.* + - librmm-cu11==24.12.*,>=0.0.0a0 - {matrix: null, packages: [*librmm_unsuffixed]} depends_on_pylibraft: common: - output_types: conda packages: - - &pylibraft_unsuffixed pylibraft==24.10.* + - &pylibraft_unsuffixed pylibraft==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -525,10 +527,10 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - pylibraft-cu12==24.10.* + - pylibraft-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - pylibraft-cu11==24.10.* + - pylibraft-cu11==24.12.*,>=0.0.0a0 - {matrix: null, packages: [*pylibraft_unsuffixed]} diff --git a/docs/source/c_api/neighbors_bruteforce_c.rst b/docs/source/c_api/neighbors_bruteforce_c.rst index af0356eee..a12175209 100644 --- a/docs/source/c_api/neighbors_bruteforce_c.rst +++ b/docs/source/c_api/neighbors_bruteforce_c.rst @@ -32,3 +32,11 @@ Index search :project: cuvs :members: :content-only: + +Index serialize +--------------- + +.. doxygengroup:: bruteforce_c_index_serialize + :project: cuvs + :members: + :content-only: diff --git a/docs/source/c_api/neighbors_hnsw_c.rst b/docs/source/c_api/neighbors_hnsw_c.rst index 4d83cd3e3..22ffc236d 100644 --- a/docs/source/c_api/neighbors_hnsw_c.rst +++ b/docs/source/c_api/neighbors_hnsw_c.rst @@ -26,16 +26,38 @@ Index :members: :content-only: +Index extend parameters +----------------------- + +.. 
doxygengroup:: hnsw_c_extend_params + :project: cuvs + :members: + :content-only: + +Index extend +------------ +.. doxygengroup:: hnsw_c_index_extend + :project: cuvs + :members: + :content-only: + +Index load +---------- +.. doxygengroup:: hnsw_c_index_load + :project: cuvs + :members: + :content-only: + Index search ------------ -.. doxygengroup:: cagra_c_index_search +.. doxygengroup:: hnsw_c_index_search :project: cuvs :members: :content-only: Index serialize ------------- +--------------- .. doxygengroup:: hnsw_c_index_serialize :project: cuvs diff --git a/docs/source/c_api/neighbors_ivf_flat_c.rst b/docs/source/c_api/neighbors_ivf_flat_c.rst index 9e1ccc0d1..1254d70ef 100644 --- a/docs/source/c_api/neighbors_ivf_flat_c.rst +++ b/docs/source/c_api/neighbors_ivf_flat_c.rst @@ -48,3 +48,11 @@ Index search :project: cuvs :members: :content-only: + +Index serialize +--------------- + +.. doxygengroup:: ivf_flat_c_index_serialize + :project: cuvs + :members: + :content-only: diff --git a/docs/source/c_api/neighbors_ivf_pq_c.rst b/docs/source/c_api/neighbors_ivf_pq_c.rst index 070719609..260057b8c 100644 --- a/docs/source/c_api/neighbors_ivf_pq_c.rst +++ b/docs/source/c_api/neighbors_ivf_pq_c.rst @@ -48,3 +48,11 @@ Index search :project: cuvs :members: :content-only: + +Index serialize +--------------- + +.. doxygengroup:: ivf_pq_c_index_serialize + :project: cuvs + :members: + :content-only: diff --git a/docs/source/cpp_api.rst b/docs/source/cpp_api.rst index 49732dc92..34f48a88f 100644 --- a/docs/source/cpp_api.rst +++ b/docs/source/cpp_api.rst @@ -10,5 +10,6 @@ C++ API Documentation cpp_api/cluster.rst cpp_api/distance.rst cpp_api/neighbors.rst + cpp_api/preprocessing.rst cpp_api/selection.rst cpp_api/stats.rst diff --git a/docs/source/cpp_api/neighbors.rst b/docs/source/cpp_api/neighbors.rst index d55d58eb0..ab810ab53 100644 --- a/docs/source/cpp_api/neighbors.rst +++ b/docs/source/cpp_api/neighbors.rst @@ -11,6 +11,7 @@ Nearest Neighbors neighbors_bruteforce.rst neighbors_cagra.rst + neighbors_dynamic_batching.rst neighbors_hnsw.rst neighbors_ivf_flat.rst neighbors_ivf_pq.rst diff --git a/docs/source/cpp_api/neighbors_bruteforce.rst b/docs/source/cpp_api/neighbors_bruteforce.rst index 3adcb01c5..f75e26b3c 100644 --- a/docs/source/cpp_api/neighbors_bruteforce.rst +++ b/docs/source/cpp_api/neighbors_bruteforce.rst @@ -34,3 +34,11 @@ Index search :project: cuvs :members: :content-only: + +Index serialize +--------------- + +.. doxygengroup:: bruteforce_cpp_index_serialize + :project: cuvs + :members: + :content-only: diff --git a/docs/source/cpp_api/neighbors_dynamic_batching.rst b/docs/source/cpp_api/neighbors_dynamic_batching.rst new file mode 100644 index 000000000..adc5cb56a --- /dev/null +++ b/docs/source/cpp_api/neighbors_dynamic_batching.rst @@ -0,0 +1,45 @@ +Dynamic Batching +================ + +Dynamic Batching allows grouping small search requests into batches to increase the device occupancy and throughput while keeping the latency within limits. + +.. role:: py(code) + :language: c++ + :class: highlight + +``#include `` + +namespace *cuvs::neighbors::dynamic_batching* + +Index build parameters +---------------------- + +.. doxygengroup:: dynamic_batching_cpp_index_params + :project: cuvs + :members: + :content-only: + +Index search parameters +----------------------- + +.. doxygengroup:: dynamic_batching_cpp_search_params + :project: cuvs + :members: + :content-only: + +Index +----- + +.. 
doxygengroup:: dynamic_batching_cpp_index + :project: cuvs + :members: + :content-only: + + +Index search +------------ + +.. doxygengroup:: dynamic_batching_cpp_search + :project: cuvs + :members: + :content-only: diff --git a/docs/source/cpp_api/neighbors_hnsw.rst b/docs/source/cpp_api/neighbors_hnsw.rst index b0af88af0..00dd3a213 100644 --- a/docs/source/cpp_api/neighbors_hnsw.rst +++ b/docs/source/cpp_api/neighbors_hnsw.rst @@ -27,10 +27,25 @@ Index :members: :content-only: -Index load +Index extend parameters +----------------------- + +.. doxygengroup:: hnsw_cpp_extend_params + :project: cuvs + :members: + :content-only: + +Index extend ------------ +.. doxygengroup:: hnsw_cpp_index_extend + :project: cuvs + :members: + :content-only: -.. doxygengroup:: hnsw_cpp_index_search +Index load +---------- + +.. doxygengroup:: hnsw_cpp_index_load :project: cuvs :members: :content-only: @@ -43,10 +58,10 @@ Index search :members: :content-only: -Index deserialize +Index serialize --------------- -.. doxygengroup:: hnsw_cpp_index_deserialize +.. doxygengroup:: hnsw_cpp_index_serialize :project: cuvs :members: :content-only: diff --git a/docs/source/cpp_api/preprocessing.rst b/docs/source/cpp_api/preprocessing.rst new file mode 100644 index 000000000..1c2b0f051 --- /dev/null +++ b/docs/source/cpp_api/preprocessing.rst @@ -0,0 +1,12 @@ +Preprocessing +============= + +.. role:: py(code) + :language: c++ + :class: highlight + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + preprocessing_quantize.rst diff --git a/docs/source/cpp_api/preprocessing_quantize.rst b/docs/source/cpp_api/preprocessing_quantize.rst new file mode 100644 index 000000000..b660c61c5 --- /dev/null +++ b/docs/source/cpp_api/preprocessing_quantize.rst @@ -0,0 +1,20 @@ +Quantize +======== + +This page provides C++ class references for the publicly-exposed elements of the +`cuvs/preprocessing/quantize` package. + +.. role:: py(code) + :language: c++ + :class: highlight + +Scalar +------ + +``#include `` + +namespace *cuvs::preprocessing::quantize::scalar* + +.. doxygengroup:: scalar + :project: cuvs + diff --git a/docs/source/cuvs_bench/index.rst b/docs/source/cuvs_bench/index.rst index 61ac622d2..81fb7537c 100644 --- a/docs/source/cuvs_bench/index.rst +++ b/docs/source/cuvs_bench/index.rst @@ -93,20 +93,20 @@ We provide images for GPU enabled systems, as well as systems without a GPU. The - `cuvs-bench-datasets`: Contains the GPU and CPU benchmarks with million-scale datasets already included in the container. Best suited for users that want to run multiple million scale datasets already included in the image. - `cuvs-bench-cpu`: Contains only CPU benchmarks with minimal size. Best suited for users that want the smallest containers to reproduce benchmarks on systems without a GPU. -Nightly images are located in `dockerhub `_, meanwhile release (stable) versions are located in `NGC `_, starting with release 24.10. +Nightly images are located in `dockerhub `_, meanwhile release (stable) versions are located in `NGC `_, starting with release 24.10. -The following command pulls the nightly container for python version 10, cuda version 12, and CUVS version 23.10: +The following command pulls the nightly container for Python version 3.10, CUDA version 12.0, and cuVS version 24.10: .. code-block:: bash - docker pull rapidsai/cuvs_bench:24.10a-cuda12.0-py3.10 #substitute cuvs_bench for the exact desired container. + docker pull rapidsai/cuvs-bench:24.10a-cuda12.0-py3.10 #substitute cuvs-bench for the exact desired container. 
The CUDA and Python versions can be changed to any of the supported values: - Supported CUDA versions: 11.4 and 12.x - Supported Python versions: 3.9 and 3.10. You can also see the exact versions on the dockerhub site: -- `cuVS bench images `_ +- `cuVS bench images `_ - `cuVS bench with datasets preloaded images `_ - `cuVS bench CPU only images `_ @@ -583,7 +583,7 @@ A default `datasets.yaml` is provided by CUVS in `${CUVS_HOME}/python/cuvs-ann-b dims: 128 distance: euclidean -Configuration files for ANN algorithms supported by `cuvs-bench` are provided in `${CUVS_HOME}/python/cuvs-bench/src/cuvs_bench/run/conf`. `cuvs_cagra` algorithm configuration looks like: +Configuration files for ANN algorithms supported by `cuvs-bench` are provided in `${CUVS_HOME}/python/cuvs_bench/cuvs_bench/config/algos`. `cuvs_cagra` algorithm configuration looks like: .. code-block:: yaml @@ -767,4 +767,4 @@ Add a new entry to `algos.yaml` to map the name of the algorithm to its binary e requires_gpu: true `executable` : specifies the name of the binary that will build/search the index. It is assumed to be available in `cuvs/cpp/build/`. -`requires_gpu` : denotes whether an algorithm requires GPU to run. \ No newline at end of file +`requires_gpu` : denotes whether an algorithm requires GPU to run. diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md index e54336852..7702f80b3 100644 --- a/docs/source/developer_guide.md +++ b/docs/source/developer_guide.md @@ -187,7 +187,7 @@ RAFT relies on `clang-format` to enforce code style across all C++ and CUDA sour 1. Do not split empty functions/records/namespaces. 2. Two-space indentation everywhere, including the line continuations. 3. Disable reflowing of comments. - The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-24.10/cpp/.clang-format). + The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-24.12/cpp/.clang-format). [`doxygen`](https://doxygen.nl/) is used as documentation generator and also as a documentation linter. In order to run doxygen as a linter on C++/CUDA code, run @@ -205,7 +205,7 @@ you can run `codespell -i 3 -w .` from the repository root directory. This will bring up an interactive prompt to select which spelling fixes to apply. ### #include style -[include_checker.py](https://github.com/rapidsai/raft/blob/branch-24.10/cpp/scripts/include_checker.py) is used to enforce the include style as follows: +[include_checker.py](https://github.com/rapidsai/raft/blob/branch-24.12/cpp/scripts/include_checker.py) is used to enforce the include style as follows: 1. `#include "..."` should be used for referencing local files only. It is acceptable to be used for referencing files in a sub-folder/parent-folder of the same algorithm, but should never be used to include files in other algorithms or between algorithms and the primitives or other dependencies. 2. `#include <...>` should be used for referencing everything else @@ -230,7 +230,7 @@ Call CUDA APIs via the provided helper macros `RAFT_CUDA_TRY`, `RAFT_CUBLAS_TRY` ## Logging ### Introduction -Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-24.10/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all. 
+Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-24.12/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all. ### Usage ```cpp diff --git a/docs/source/index.rst b/docs/source/index.rst index 647061ae5..286836c18 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,19 +1,8 @@ cuVS: Vector Search and Clustering on the GPU ============================================= - Welcome to cuVS, the premier library for GPU-accelerated vector search and clustering! cuVS provides several core building blocks for constructing new algorithms, as well as end-to-end vector search and clustering algorithms for use either standalone or through a growing list of :doc:`integrations `. -There are several benefits to using cuVS and GPUs for vector search, including - -#. Fast index build -#. Latency critical and high throughput search -#. Parameter tuning -#. Cost savings -#. Interoperability (build on GPU, deploy on CPU) -#. Multiple language support -#. Building blocks for composing new or accelerating existing algorithms - Useful Resources ################ @@ -26,6 +15,67 @@ Useful Resources - `Issue tracker `_: Report issues or request features. + +What is cuVS? +############# + +cuVS contains state-of-the-art implementations of several algorithms for running approximate and exact nearest neighbors and clustering on the GPU. It can be used directly or through the various databases and other libraries that have integrated it. The primary goal of cuVS is to simplify the use of GPUs for vector similarity search and clustering. + +Vector search is an information retrieval method that has been growing in popularity over the past few years, partly because of the rising importance of multimedia embeddings created from unstructured data and the need to perform semantic search on the embeddings to find items which are semantically similar to each other. + +Vector search is also used in *data mining and machine learning* tasks and comprises an important step in many *clustering* and *visualization* algorithms like `UMAP `_, `t-SNE `_, K-means, and `HDBSCAN `_. + +Finally, faster vector search enables interactions between dense vectors and graphs. Converting a pile of dense vectors into nearest neighbors graphs unlocks the entire world of graph analysis algorithms, such as those found in `GraphBLAS `_ and `cuGraph `_. + +Below are some common use-cases for vector search + +Semantic search +~~~~~~~~~~~~~~~ +- Generative AI & Retrieval augmented generation (RAG) +- Recommender systems +- Computer vision +- Image search +- Text search +- Audio search +- Molecular search +- Model training + + +Data mining +~~~~~~~~~~~ +- Clustering algorithms +- Visualization algorithms +- Sampling algorithms +- Class balancing +- Ensemble methods +- k-NN graph construction + +Why cuVS? +######### + +There are several benefits to using cuVS and GPUs for vector search, including + +1. Fast index build +2. Latency critical and high throughput search +3. Parameter tuning +4. Cost savings +5. Interoperability (build on GPU, deploy on CPU) +6. Multiple language support +7. Building blocks for composing new or accelerating existing algorithms + +In addition to the items above, cuVS shoulders the responsibility of keeping non-trivial accelerated code up to date as new NVIDIA architectures and CUDA versions are released. 
This provides a delightful development experience, guaranteeing that any libraries, databases, or applications built on top of it will always receive the best performance and scale. + +cuVS Technology Stack +##################### + +cuVS is built on top of the RAPIDS RAFT library of high-performance machine learning primitives and provides all the necessary routines for vector search and clustering on the GPU. + +.. image:: ../../img/tech_stack.png + :width: 600 + :alt: cuVS is built on top of low-level CUDA libraries and provides many important routines that enable vector search and clustering on the GPU + + + Contents ######## diff --git a/docs/source/python_api/neighbors_brute_force.rst b/docs/source/python_api/neighbors_brute_force.rst index 5fdc3658f..d756a6c80 100644 --- a/docs/source/python_api/neighbors_brute_force.rst +++ b/docs/source/python_api/neighbors_brute_force.rst @@ -20,3 +20,13 @@ Index search ############ .. autofunction:: cuvs.neighbors.brute_force.search + +Index save +########## + +.. autofunction:: cuvs.neighbors.brute_force.save + +Index load +########## + +.. autofunction:: cuvs.neighbors.brute_force.load diff --git a/docs/source/python_api/neighbors_cagra.rst b/docs/source/python_api/neighbors_cagra.rst index 09b2e2694..e7155efb8 100644 --- a/docs/source/python_api/neighbors_cagra.rst +++ b/docs/source/python_api/neighbors_cagra.rst @@ -34,3 +34,13 @@ Index search ############ .. autofunction:: cuvs.neighbors.cagra.search + +Index save +########## + +.. autofunction:: cuvs.neighbors.cagra.save + +Index load +########## + +.. autofunction:: cuvs.neighbors.cagra.load diff --git a/docs/source/python_api/neighbors_hnsw.rst b/docs/source/python_api/neighbors_hnsw.rst index 9922805b3..64fe5493b 100644 --- a/docs/source/python_api/neighbors_hnsw.rst +++ b/docs/source/python_api/neighbors_hnsw.rst @@ -28,3 +28,13 @@ Index search ############ .. autofunction:: cuvs.neighbors.hnsw.search + +Index save +########## + +.. autofunction:: cuvs.neighbors.hnsw.save + +Index load +########## + +.. autofunction:: cuvs.neighbors.hnsw.load diff --git a/docs/source/python_api/neighbors_ivf_flat.rst b/docs/source/python_api/neighbors_ivf_flat.rst index 5514e5e43..f2c21e68a 100644 --- a/docs/source/python_api/neighbors_ivf_flat.rst +++ b/docs/source/python_api/neighbors_ivf_flat.rst @@ -32,3 +32,13 @@ Index search ############ .. autofunction:: cuvs.neighbors.ivf_flat.search + +Index save +########## + +.. autofunction:: cuvs.neighbors.ivf_flat.save + +Index load +########## + +.. autofunction:: cuvs.neighbors.ivf_flat.load diff --git a/docs/source/python_api/neighbors_ivf_pq.rst b/docs/source/python_api/neighbors_ivf_pq.rst index e3625ba67..57668fbc3 100644 --- a/docs/source/python_api/neighbors_ivf_pq.rst +++ b/docs/source/python_api/neighbors_ivf_pq.rst @@ -32,3 +32,13 @@ Index search ############ .. autofunction:: cuvs.neighbors.ivf_pq.search + +Index save +########## + +.. autofunction:: cuvs.neighbors.ivf_pq.save + +Index load +########## + +..
autofunction:: cuvs.neighbors.ivf_pq.load diff --git a/examples/c/CMakeLists.txt b/examples/c/CMakeLists.txt index ec8ca827a..2a7e70522 100644 --- a/examples/c/CMakeLists.txt +++ b/examples/c/CMakeLists.txt @@ -42,3 +42,11 @@ target_link_libraries(CAGRA_C_EXAMPLE PRIVATE cuvs::c_api $") target_link_libraries(L2_C_EXAMPLE PRIVATE cuvs::c_api $) + +add_executable(IVF_FLAT_C_EXAMPLE src/ivf_flat_c_example.c) +target_include_directories(IVF_FLAT_C_EXAMPLE PUBLIC "$") +target_link_libraries(IVF_FLAT_C_EXAMPLE PRIVATE cuvs::c_api $) + +add_executable(IVF_PQ_C_EXAMPLE src/ivf_pq_c_example.c) +target_include_directories(IVF_PQ_C_EXAMPLE PUBLIC "$") +target_link_libraries(IVF_PQ_C_EXAMPLE PRIVATE cuvs::c_api $) diff --git a/examples/c/src/common.h b/examples/c/src/common.h new file mode 100644 index 000000000..60b9b73cf --- /dev/null +++ b/examples/c/src/common.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include + +/** + * @brief Initialize Tensor for kDLFloat. + * + * @param[in] t_d Pointer to a vector + * @param[in] t_shape[] Two-element array storing the number of rows and columns of the matrix. + * @param[out] t_tensor Stores the initialized DLManagedTensor. + */ +void float_tensor_initialize(float* t_d, int64_t t_shape[2], DLManagedTensor* t_tensor) { + t_tensor->dl_tensor.data = t_d; + t_tensor->dl_tensor.device.device_type = kDLCUDA; + t_tensor->dl_tensor.ndim = 2; + t_tensor->dl_tensor.dtype.code = kDLFloat; + t_tensor->dl_tensor.dtype.bits = 32; + t_tensor->dl_tensor.dtype.lanes = 1; + t_tensor->dl_tensor.shape = t_shape; + t_tensor->dl_tensor.strides = NULL; +} + +/** + * @brief Initialize Tensor for kDLInt. + * + * @param[in] t_d Pointer to a vector + * @param[in] t_shape[] Two-element array storing the number of rows and columns of the matrix. + * @param[out] t_tensor Stores the initialized DLManagedTensor. + */ +void int_tensor_initialize(int64_t* t_d, int64_t t_shape[], DLManagedTensor* t_tensor) { + t_tensor->dl_tensor.data = t_d; + t_tensor->dl_tensor.device.device_type = kDLCUDA; + t_tensor->dl_tensor.ndim = 2; + t_tensor->dl_tensor.dtype.code = kDLInt; + t_tensor->dl_tensor.dtype.bits = 64; + t_tensor->dl_tensor.dtype.lanes = 1; + t_tensor->dl_tensor.shape = t_shape; + t_tensor->dl_tensor.strides = NULL; +} + +/** + * @brief Fill a vector with random values. + * + * @param[out] Vec Pointer to a vector + * @param[in] n_rows the number of rows in the matrix. + * @param[in] n_cols the number of columns in the matrix. + * @param[in] min Minimum value among random values. + * @param[in] max Maximum value among random values.
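+ * @note Re-seeds rand() with the current wall-clock time on every call, so two
+ * calls that land within the same second will produce identical data.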
+ */ +void generate_dataset(float * Vec, int n_rows, int n_cols, float min, float max) { + float scale; + float * ptr = Vec; + srand((unsigned int)time(NULL)); + for (int i = 0; i < n_rows; i++) { + for (int j = 0; j < n_cols; j++) { + scale = rand()/(float)RAND_MAX; + ptr = Vec + i * n_cols + j; + *ptr = min + scale * (max - min); + } + } +} + +/** + * @brief Print the results. + * + * @param[in] neighbor Pointer to a neighbor vector + * @param[in] distances Pointer to a distances vector. + * @param[in] n_rows the number of rows in the matrix. + * @param[in] n_cols the number of columns in the matrix. + */ +void print_results(int64_t * neighbor, float* distances, int n_rows, int n_cols) { + int64_t * pn = neighbor; + float * pd = distances; + for (int i = 0; i < n_rows; ++i) { + printf("Query %d neighbor indices: =[", i); + for (int j = 0; j < n_cols; ++j) { + pn = neighbor + i * n_cols + j; + printf(" %ld", *pn); + } + printf("]\n"); + printf("Query %d neighbor distances: =[", i); + for (int j = 0; j < n_cols; ++j) { + pd = distances + i * n_cols + j; + printf(" %f", *pd); + } + printf("]\n"); + } +} + diff --git a/examples/c/src/ivf_flat_c_example.c b/examples/c/src/ivf_flat_c_example.c new file mode 100644 index 000000000..c068d04f8 --- /dev/null +++ b/examples/c/src/ivf_flat_c_example.c @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include "common.h" + +void ivf_flat_build_search_simple(cuvsResources_t *res, DLManagedTensor * dataset_tensor, DLManagedTensor * queries_tensor) { + // Create default index params + cuvsIvfFlatIndexParams_t index_params; + cuvsIvfFlatIndexParamsCreate(&index_params); + index_params->n_lists = 1024; // default value + index_params->kmeans_n_iters = 20; // default value + index_params->kmeans_trainset_fraction = 0.1; + //index_params->metric default is L2Expanded + + // Create IVF-Flat index + cuvsIvfFlatIndex_t index; + cuvsIvfFlatIndexCreate(&index); + + printf("Building IVF-Flat index\n"); + // Build the IVF-Flat Index + cuvsError_t build_status = cuvsIvfFlatBuild(*res, index_params, dataset_tensor, index); + if (build_status != CUVS_SUCCESS) { + printf("%s.\n", cuvsGetLastErrorText()); + cuvsIvfFlatIndexDestroy(index); + cuvsIvfFlatIndexParamsDestroy(index_params); + return; + } + + // Create output arrays.
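+ // The search below fills two row-major [n_queries, topk] device buffers:
+ // `neighbors` receives the int64 dataset indices of the top-k matches per
+ // query and `distances` the corresponding distances (L2 by default here).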
+ int64_t topk = 10; + int64_t n_queries = queries_tensor->dl_tensor.shape[0]; + + // Allocate memory for `neighbors` and `distances` output + int64_t *neighbors_d; + float *distances_d; + cuvsRMMAlloc(*res, (void**) &neighbors_d, sizeof(int64_t) * n_queries * topk); + cuvsRMMAlloc(*res, (void**) &distances_d, sizeof(float) * n_queries * topk); + + DLManagedTensor neighbors_tensor; + int64_t neighbors_shape[2] = {n_queries, topk}; + int_tensor_initialize(neighbors_d, neighbors_shape, &neighbors_tensor); + + DLManagedTensor distances_tensor; + int64_t distances_shape[2] = {n_queries, topk}; + float_tensor_initialize(distances_d, distances_shape, &distances_tensor); + + // Create default search params + cuvsIvfFlatSearchParams_t search_params; + cuvsIvfFlatSearchParamsCreate(&search_params); + search_params->n_probes = 50; + + // Search the `index` built using `ivfFlatBuild` + cuvsError_t search_status = cuvsIvfFlatSearch(*res, search_params, index, + queries_tensor, &neighbors_tensor, &distances_tensor); + if (search_status != CUVS_SUCCESS) { + printf("%s.\n", cuvsGetLastErrorText()); + } + + int64_t *neighbors = (int64_t *)malloc(n_queries * topk * sizeof(int64_t)); + float *distances = (float *)malloc(n_queries * topk * sizeof(float)); + memset(neighbors, 0, n_queries * topk * sizeof(int64_t)); + memset(distances, 0, n_queries * topk * sizeof(float)); + + cudaMemcpy(neighbors, neighbors_d, sizeof(int64_t) * n_queries * topk, cudaMemcpyDefault); + cudaMemcpy(distances, distances_d, sizeof(float) * n_queries * topk, cudaMemcpyDefault); + + print_results(neighbors, distances, 2, topk); + + free(distances); + free(neighbors); + + cuvsRMMFree(*res, neighbors_d, sizeof(int64_t) * n_queries * topk); + cuvsRMMFree(*res, distances_d, sizeof(float) * n_queries * topk); + + cuvsIvfFlatSearchParamsDestroy(search_params); + cuvsIvfFlatIndexDestroy(index); + cuvsIvfFlatIndexParamsDestroy(index_params); +} + +void ivf_flat_build_extend_search(cuvsResources_t *res, DLManagedTensor * trainset_tensor, DLManagedTensor * dataset_tensor, DLManagedTensor * queries_tensor) { + int64_t *data_indices_d; + int64_t n_dataset = dataset_tensor->dl_tensor.shape[0]; + cuvsRMMAlloc(*res, (void**) &data_indices_d, sizeof(int64_t) * n_dataset); + DLManagedTensor data_indices_tensor; + int64_t data_indices_shape[1] = {n_dataset}; + int_tensor_initialize(data_indices_d, data_indices_shape, &data_indices_tensor); + data_indices_tensor.dl_tensor.ndim = 1; + + printf("\nRun k-means clustering using the training set\n"); + + int64_t *data_indices = (int64_t *)malloc(n_dataset * sizeof(int64_t)); + int64_t * ptr = data_indices; + for (int i = 0; i < n_dataset; i++) { + *ptr = i; + ptr++; + } + ptr = NULL; + cudaMemcpy(data_indices_d, data_indices, sizeof(int64_t) * n_dataset, cudaMemcpyDefault); + + // Create default index params + cuvsIvfFlatIndexParams_t index_params; + cuvsIvfFlatIndexParamsCreate(&index_params); + index_params->n_lists = 100; + index_params->add_data_on_build = false; + //index_params->metric default is L2Expanded + + // Create IVF-Flat index + cuvsIvfFlatIndex_t index; + cuvsIvfFlatIndexCreate(&index); + + // Build the IVF-Flat Index + cuvsError_t build_status = cuvsIvfFlatBuild(*res, index_params, trainset_tensor, index); + if (build_status != CUVS_SUCCESS) { + printf("%s.\n", cuvsGetLastErrorText()); + cuvsIvfFlatIndexDestroy(index); + cuvsIvfFlatIndexParamsDestroy(index_params); + return; + } + + printf("Filling index with the dataset vectors\n"); + cuvsError_t extend_status = cuvsIvfFlatExtend(*res,
dataset_tensor, &data_indices_tensor, index); + if (extend_status != CUVS_SUCCESS) { + printf("%s.\n", cuvsGetLastErrorText()); + return; + } + + // Create output arrays. + int64_t topk = 10; + int64_t n_queries = queries_tensor->dl_tensor.shape[0]; + + //Allocate memory for `neighbors` and `distances` output + int64_t *neighbors_d; + float *distances_d; + cuvsRMMAlloc(*res, (void**) &neighbors_d, sizeof(int64_t) * n_queries * topk); + cuvsRMMAlloc(*res, (void**) &distances_d, sizeof(float) * n_queries * topk); + + DLManagedTensor neighbors_tensor; + int64_t neighbors_shape[2] = {n_queries, topk}; + int_tensor_initialize(neighbors_d, neighbors_shape, &neighbors_tensor); + + DLManagedTensor distances_tensor; + int64_t distances_shape[2] = {n_queries, topk}; + float_tensor_initialize(distances_d, distances_shape, &distances_tensor); + + // Create default search params + cuvsIvfFlatSearchParams_t search_params; + cuvsIvfFlatSearchParamsCreate(&search_params); + search_params->n_probes = 10; + + // Search the `index` built using `ivfFlatBuild` + cuvsError_t search_status = cuvsIvfFlatSearch(*res, search_params, index, + queries_tensor, &neighbors_tensor, &distances_tensor); + if (search_status != CUVS_SUCCESS) { + printf("%s.\n", cuvsGetLastErrorText()); + exit(-1); + } + + int64_t *neighbors = (int64_t *)malloc(n_queries * topk * sizeof(int64_t)); + float *distances = (float *)malloc(n_queries * topk * sizeof(float)); + memset(neighbors, 0, n_queries * topk * sizeof(int64_t)); + memset(distances, 0, n_queries * topk * sizeof(float)); + + cudaMemcpy(neighbors, neighbors_d, sizeof(int64_t) * n_queries * topk, cudaMemcpyDefault); + cudaMemcpy(distances, distances_d, sizeof(float) * n_queries * topk, cudaMemcpyDefault); + + print_results(neighbors, distances, 2, topk); + + free(distances); + free(neighbors); + free(data_indices); + cuvsRMMFree(*res, data_indices_d, sizeof(int64_t) * n_dataset); + cuvsRMMFree(*res, neighbors_d, sizeof(int64_t) * n_queries * topk); + cuvsRMMFree(*res, distances_d, sizeof(float) * n_queries * topk); + + cuvsIvfFlatSearchParamsDestroy(search_params); + cuvsIvfFlatIndexDestroy(index); + cuvsIvfFlatIndexParamsDestroy(index_params); +} + +int main() { + // Create input arrays. 
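+ // A small synthetic workload: 10k 3-dimensional dataset vectors and 10
+ // queries, filled with uniform random values by generate_dataset() (common.h).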
+ int64_t n_samples = 10000; + int64_t n_dim = 3; + int64_t n_queries = 10; + float *dataset = (float *)malloc(n_samples * n_dim * sizeof(float)); + float *queries = (float *)malloc(n_queries * n_dim * sizeof(float)); + generate_dataset(dataset, n_samples, n_dim, -10.0, 10.0); + generate_dataset(queries, n_queries, n_dim, -1.0, 1.0); + + // Create a cuvsResources_t object + cuvsResources_t res; + cuvsResourcesCreate(&res); + + // Allocate memory for `dataset` + float *dataset_d; + cuvsRMMAlloc(res, (void**) &dataset_d, sizeof(float) * n_samples * n_dim); + // Use DLPack to represent `dataset_d` as a tensor + cudaMemcpy(dataset_d, dataset, sizeof(float) * n_samples * n_dim, cudaMemcpyDefault); + + DLManagedTensor dataset_tensor; + int64_t dataset_shape[2] = {n_samples, n_dim}; + float_tensor_initialize(dataset_d, dataset_shape, &dataset_tensor); + + // Allocate memory for `queries` + float *queries_d; + cuvsRMMAlloc(res, (void**) &queries_d, sizeof(float) * n_queries * n_dim); + + // Use DLPack to represent `queries` as tensors + cudaMemcpy(queries_d, queries, sizeof(float) * n_queries * n_dim, cudaMemcpyDefault); + + DLManagedTensor queries_tensor; + int64_t queries_shape[2] = {n_queries, n_dim}; + float_tensor_initialize(queries_d, queries_shape, &queries_tensor); + + // Simple build and search example. + ivf_flat_build_search_simple(&res, &dataset_tensor, &queries_tensor); + + float *trainset_d; + int64_t n_trainset = n_samples * 0.1; + float *trainset = (float *)malloc(n_trainset * n_dim * sizeof(float)); + for (int i = 0; i < n_trainset; i++) { + for (int j = 0; j < n_dim; j++) { + *(trainset + i * n_dim + j) = *(dataset + i * n_dim + j); + } + } + cuvsRMMAlloc(res, (void**) &trainset_d, sizeof(float) * n_trainset * n_dim); + cudaMemcpy(trainset_d, trainset, sizeof(float) * n_trainset * n_dim, cudaMemcpyDefault); + DLManagedTensor trainset_tensor; + int64_t trainset_shape[2] = {n_trainset, n_dim}; + float_tensor_initialize(trainset_d, trainset_shape, &trainset_tensor); + + // Build and extend example. + ivf_flat_build_extend_search(&res, &trainset_tensor, &dataset_tensor, &queries_tensor); + + cuvsRMMFree(res, trainset_d, sizeof(float) * n_trainset * n_dim); + cuvsRMMFree(res, queries_d, sizeof(float) * n_queries * n_dim); + cuvsRMMFree(res, dataset_d, sizeof(float) * n_samples * n_dim); + cuvsResourcesDestroy(res); + free(trainset); + free(dataset); + free(queries); +} diff --git a/examples/c/src/ivf_pq_c_example.c b/examples/c/src/ivf_pq_c_example.c new file mode 100644 index 000000000..b6d6b485b --- /dev/null +++ b/examples/c/src/ivf_pq_c_example.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include +#include +#include + +#include +#include "common.h" + +void ivf_pq_build_search(cuvsResources_t *res, DLManagedTensor * dataset_tensor, DLManagedTensor * queries_tensor) { + // Create default index params + cuvsIvfPqIndexParams_t index_params; + cuvsIvfPqIndexParamsCreate(&index_params); + index_params->n_lists = 1024; // default value + index_params->kmeans_trainset_fraction = 0.1; + //index_params->metric default is L2Expanded + index_params->pq_bits = 8; + index_params->pq_dim = 2; + + // Create IVF-PQ index + cuvsIvfPqIndex_t index; + cuvsIvfPqIndexCreate(&index); + + printf("Building IVF-PQ index\n"); + + // Build the IVF-PQ Index + cuvsError_t build_status = cuvsIvfPqBuild(*res, index_params, dataset_tensor, index); + if (build_status != CUVS_SUCCESS) { + printf("%s.\n", cuvsGetLastErrorText()); + cuvsIvfPqIndexDestroy(index); + cuvsIvfPqIndexParamsDestroy(index_params); + return; + } + + // Create output arrays. + int64_t topk = 10; + int64_t n_queries = queries_tensor->dl_tensor.shape[0]; + + //Allocate memory for `neighbors` and `distances` output + int64_t *neighbors_d; + float *distances_d; + cuvsRMMAlloc(*res, (void**) &neighbors_d, sizeof(int64_t) * n_queries * topk); + cuvsRMMAlloc(*res, (void**) &distances_d, sizeof(float) * n_queries * topk); + + DLManagedTensor neighbors_tensor; + int64_t neighbors_shape[2] = {n_queries, topk}; + int_tensor_initialize(neighbors_d, neighbors_shape, &neighbors_tensor); + + DLManagedTensor distances_tensor; + int64_t distances_shape[2] = {n_queries, topk}; + float_tensor_initialize(distances_d, distances_shape, &distances_tensor); + + // Create default search params + cuvsIvfPqSearchParams_t search_params; + cuvsIvfPqSearchParamsCreate(&search_params); + search_params->n_probes = 50; + search_params->internal_distance_dtype = CUDA_R_16F; + search_params->lut_dtype = CUDA_R_16F; + + // Search the `index` built using `cuvsIvfPqBuild` + cuvsError_t search_status = cuvsIvfPqSearch(*res, search_params, index, + queries_tensor, &neighbors_tensor, &distances_tensor); + if (search_status != CUVS_SUCCESS) { + printf("%s.\n", cuvsGetLastErrorText()); + exit(-1); + } + + int64_t *neighbors = (int64_t *)malloc(n_queries * topk * sizeof(int64_t)); + float *distances = (float *)malloc(n_queries * topk * sizeof(float)); + memset(neighbors, 0, n_queries * topk * sizeof(int64_t)); + memset(distances, 0, n_queries * topk * sizeof(float)); + + cudaMemcpy(neighbors, neighbors_d, sizeof(int64_t) * n_queries * topk, cudaMemcpyDefault); + cudaMemcpy(distances, distances_d, sizeof(float) * n_queries * topk, cudaMemcpyDefault); + + printf("\nOriginal results:\n"); + print_results(neighbors, distances, 2, topk); + + // Re-ranking operation: refine the initial search results by computing exact distances + int64_t topk_refined = 7; + int64_t *neighbors_refined_d; + float *distances_refined_d; + cuvsRMMAlloc(*res, (void**) &neighbors_refined_d, sizeof(int64_t) * n_queries * topk_refined); + cuvsRMMAlloc(*res, (void**) &distances_refined_d, sizeof(float) * n_queries * topk_refined); + + DLManagedTensor neighbors_refined_tensor; + int64_t neighbors_refined_shape[2] = {n_queries, topk_refined}; + int_tensor_initialize(neighbors_refined_d, neighbors_refined_shape, &neighbors_refined_tensor); + + DLManagedTensor distances_refined_tensor; + int64_t distances_refined_shape[2] = {n_queries, topk_refined}; + float_tensor_initialize(distances_refined_d, distances_refined_shape, &distances_refined_tensor); + + // Note, refinement requires the original dataset and 
the queries. + // Don't forget to specify the same distance metric as used by the index. + cuvsError_t refine_status = cuvsRefine(*res, dataset_tensor, queries_tensor, + &neighbors_tensor, index_params->metric, + &neighbors_refined_tensor, &distances_refined_tensor); + if (refine_status != CUVS_SUCCESS) { + printf("%s.\n", cuvsGetLastErrorText()); + exit(-1); + } + + int64_t *neighbors_refine = (int64_t *)malloc(n_queries * topk_refined * sizeof(int64_t)); + float *distances_refine = (float *)malloc(n_queries * topk_refined * sizeof(float)); + memset(neighbors_refine, 0, n_queries * topk_refined * sizeof(int64_t)); + memset(distances_refine, 0, n_queries * topk_refined * sizeof(float)); + + cudaMemcpy(neighbors_refine, neighbors_refined_d, sizeof(int64_t) * n_queries * topk_refined, cudaMemcpyDefault); + cudaMemcpy(distances_refine, distances_refined_d, sizeof(float) * n_queries * topk_refined, cudaMemcpyDefault); + + printf("\nRefined results:\n"); + print_results(neighbors_refine, distances_refine, 2, topk_refined); + + free(distances_refine); + free(neighbors_refine); + + free(distances); + free(neighbors); + + cuvsRMMFree(*res, neighbors_refined_d, sizeof(int64_t) * n_queries * topk_refined); + cuvsRMMFree(*res, distances_refined_d, sizeof(float) * n_queries * topk_refined); + + cuvsRMMFree(*res, neighbors_d, sizeof(int64_t) * n_queries * topk); + cuvsRMMFree(*res, distances_d, sizeof(float) * n_queries * topk); + + cuvsIvfPqSearchParamsDestroy(search_params); + cuvsIvfPqIndexDestroy(index); + cuvsIvfPqIndexParamsDestroy(index_params); +} + +int main() { + // Create input arrays. + int64_t n_samples = 10000; + int64_t n_dim = 3; + int64_t n_queries = 10; + float *dataset = (float *)malloc(n_samples * n_dim * sizeof(float)); + float *queries = (float *)malloc(n_queries * n_dim * sizeof(float)); + generate_dataset(dataset, n_samples, n_dim, -10.0, 10.0); + generate_dataset(queries, n_queries, n_dim, -1.0, 1.0); + + // Create a cuvsResources_t object + cuvsResources_t res; + cuvsResourcesCreate(&res); + + // Allocate memory for `dataset` + float *dataset_d; + cuvsRMMAlloc(res, (void**) &dataset_d, sizeof(float) * n_samples * n_dim); + // Use DLPack to represent `dataset_d` as a tensor + cudaMemcpy(dataset_d, dataset, sizeof(float) * n_samples * n_dim, cudaMemcpyDefault); + + DLManagedTensor dataset_tensor; + int64_t dataset_shape[2] = {n_samples, n_dim}; + float_tensor_initialize(dataset_d, dataset_shape, &dataset_tensor); + + // Allocate memory for `queries` + float *queries_d; + cuvsRMMAlloc(res, (void**) &queries_d, sizeof(float) * n_queries * n_dim); + + // Use DLPack to represent `queries` as tensors + cudaMemcpy(queries_d, queries, sizeof(float) * n_queries * n_dim, cudaMemcpyDefault); + + DLManagedTensor queries_tensor; + int64_t queries_shape[2] = {n_queries, n_dim}; + float_tensor_initialize(queries_d, queries_shape, &queries_tensor); + + // Simple build and search example. + ivf_pq_build_search(&res, &dataset_tensor, &queries_tensor); + + cuvsRMMFree(res, queries_d, sizeof(float) * n_queries * n_dim); + cuvsRMMFree(res, dataset_d, sizeof(float) * n_samples * n_dim); + cuvsResourcesDestroy(res); + free(dataset); + free(queries); +} diff --git a/examples/cmake/thirdparty/fetch_rapids.cmake b/examples/cmake/thirdparty/fetch_rapids.cmake index f64a924cf..6f4c627ed 100644 --- a/examples/cmake/thirdparty/fetch_rapids.cmake +++ b/examples/cmake/thirdparty/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License.
# Use this variable to update RAPIDS and RAFT versions -set(RAPIDS_VERSION "24.10") +set(RAPIDS_VERSION "24.12") if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index 092b65ed9..951e0ad0c 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -38,6 +38,7 @@ include(../cmake/thirdparty/get_cuvs.cmake) # -------------- compile tasks ----------------- # add_executable(CAGRA_EXAMPLE src/cagra_example.cu) add_executable(CAGRA_PERSISTENT_EXAMPLE src/cagra_persistent_example.cu) +add_executable(DYNAMIC_BATCHING_EXAMPLE src/dynamic_batching_example.cu) add_executable(IVF_FLAT_EXAMPLE src/ivf_flat_example.cu) add_executable(IVF_PQ_EXAMPLE src/ivf_pq_example.cu) add_executable(VAMANA_EXAMPLE src/vamana_example.cu) @@ -48,6 +49,9 @@ target_link_libraries(CAGRA_EXAMPLE PRIVATE cuvs::cuvs $ Threads::Threads ) +target_link_libraries( + DYNAMIC_BATCHING_EXAMPLE PRIVATE cuvs::cuvs $ Threads::Threads +) target_link_libraries(IVF_PQ_EXAMPLE PRIVATE cuvs::cuvs $) target_link_libraries(IVF_FLAT_EXAMPLE PRIVATE cuvs::cuvs $) target_link_libraries(VAMANA_EXAMPLE PRIVATE cuvs::cuvs $) diff --git a/examples/cpp/src/common.cuh b/examples/cpp/src/common.cuh index 1c93dec0e..8e109a764 100644 --- a/examples/cpp/src/common.cuh +++ b/examples/cpp/src/common.cuh @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + #include #include #include @@ -28,6 +30,8 @@ #include #include +#include + // Fill dataset and queries with synthetic data. void generate_dataset(raft::device_resources const &dev_resources, raft::device_matrix_view dataset, diff --git a/examples/cpp/src/dynamic_batching_example.cu b/examples/cpp/src/dynamic_batching_example.cu new file mode 100644 index 000000000..95f66a454 --- /dev/null +++ b/examples/cpp/src/dynamic_batching_example.cu @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "common.cuh" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +// A helper to split the dataset into chunks +template +auto slice_matrix(const DeviceMatrixOrView &source, + typename DeviceMatrixOrView::index_type offset_rows, + typename DeviceMatrixOrView::index_type count_rows) { + auto n_cols = source.extent(1); + return raft::make_device_matrix_view< + typename DeviceMatrixOrView::element_type, + typename DeviceMatrixOrView::index_type>( + const_cast( + source.data_handle()) + + offset_rows * n_cols, + count_rows, n_cols); +} + +// A helper to measure the execution time of a function +template +void time_it(std::string label, F f, Args &&...xs) { + auto start = std::chrono::system_clock::now(); + f(std::forward(xs)...); + auto end = std::chrono::system_clock::now(); + auto t = std::chrono::duration_cast(end - start); + auto t_ms = double(t.count()) / 1000.0; + std::cout << "[" << label << "] execution time: " << t_ms << " ms" + << std::endl; +} + +/** + * Wrap waiting on a stream work into an async C++ future object. + * This is similar to recording and waiting on CUDA events, but in C++11 API. + */ +struct cuda_work_completion_promise { + + cuda_work_completion_promise(const raft::resources &res) { + auto *promise = new std::promise; + RAFT_CUDA_TRY(cudaLaunchHostFunc(raft::resource::get_cuda_stream(res), + completion_callback, + reinterpret_cast(promise))); + value_ = promise->get_future(); + } + + /** + * Waiting on the produced `future` object has the same effect as + * cudaEventSynchronize if an event was recorded at the time of creation of + * this promise object. + */ + auto get_future() -> std::future && { return std::move(value_); } + +private: + std::future value_; + + static void completion_callback(void *ptr) { + auto *promise = reinterpret_cast *>(ptr); + promise->set_value(); + delete promise; + } +}; + +void dynamic_batching_example( + raft::resources const &res, + raft::device_matrix_view dataset, + raft::device_matrix_view queries) { + using namespace cuvs::neighbors; + + // Number of neighbors to search + int64_t topk = 100; + + // Streaming scenario: maximum number of requests in-flight + constexpr int64_t kMaxJobs = 1000; + // Streaming scenario: number of concurrent CUDA streams + constexpr int64_t kNumWorkerStreams = 5; + + // Split the queries into two subsets to run every experiment twice and thus + // surface any initialization overheads. 
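+ // (the first run absorbs one-off warm-up costs such as memory-pool growth
+ // and module loading, so timing both runs surfaces the difference)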
+ int64_t n_queries_a = queries.extent(0) / 2; + int64_t n_queries_b = queries.extent(0) - n_queries_a; + + auto queries_a = slice_matrix(queries, 0, n_queries_a); + auto queries_b = slice_matrix(queries, n_queries_a, n_queries_b); + + // create output arrays + auto neighbors = + raft::make_device_matrix(res, queries.extent(0), topk); + auto distances = + raft::make_device_matrix(res, queries.extent(0), topk); + // slice them same as queries + auto neighbors_a = slice_matrix(neighbors, 0, n_queries_a); + auto distances_a = slice_matrix(distances, 0, n_queries_a); + auto neighbors_b = slice_matrix(neighbors, n_queries_a, n_queries_b); + auto distances_b = slice_matrix(distances, n_queries_a, n_queries_b); + + // use default index parameters + cagra::index_params orig_index_params; + + std::cout << "Building CAGRA index (search graph)" << std::endl; + auto orig_index = cagra::build(res, orig_index_params, dataset); + + std::cout << "CAGRA index has " << orig_index.size() << " vectors" + << std::endl; + std::cout << "CAGRA graph has degree " << orig_index.graph_degree() + << ", graph size [" << orig_index.graph().extent(0) << ", " + << orig_index.graph().extent(1) << "]" << std::endl; + + // use default search parameters + cagra::search_params orig_search_params; + // get a decent recall by increasing the internal topk list + orig_search_params.itopk_size = 512; + orig_search_params.algo = cagra::search_algo::SINGLE_CTA; + + // Set up dynamic batching parameters + dynamic_batching::index_params dynb_index_params{ + /* default-initializing the parent `neighbors::index_params` + (not used anyway) */ + {}, + /* Set the K in advance (the batcher needs to allocate buffers) */ + topk, + /* Configure the number and the size of IO buffers */ + 64, + kNumWorkerStreams}; + + // "build" the index (it's a low-cost index wrapping), + // that is we need to pass the original index and its search params here + dynamic_batching::index dynb_index( + res, dynb_index_params, orig_index, orig_search_params); + + // You can implement job priorities by varying the deadlines of individual + // requests + dynamic_batching::search_params dynb_search_params; + dynb_search_params.dispatch_timeout_ms = 0.1; + + // Define the big-batch setting as a baseline for measuring the throughput. 
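+ // The lambda submits an entire query subset as a single cagra::search call
+ // and then blocks until the stream drains, so its wall time reflects pure
+ // batch throughput of the underlying index.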
+ auto search_batch_orig = + [&res, &orig_index, &orig_search_params]( + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) { + cagra::search(res, orig_search_params, orig_index, queries, neighbors, + distances); + raft::resource::sync_stream(res); + }; + + // Launch the baseline search: check the big-batch performance + time_it("standard/batch A", search_batch_orig, queries_a, neighbors_a, + distances_a); + time_it("standard/batch B", search_batch_orig, queries_b, neighbors_b, + distances_b); + + // Streaming scenario: prepare concurrent resources + rmm::cuda_stream_pool worker_streams{kNumWorkerStreams}; + std::vector resource_pool(0); + for (int64_t i = 0; i < kNumWorkerStreams; i++) { + resource_pool.push_back(res); + raft::resource::set_cuda_stream(resource_pool[i], + worker_streams.get_stream(i)); + } + + // Streaming scenario: + // send queries one-by-one, with a maximum kMaxJobs in-flight + auto search_async_orig = + [&resource_pool, &orig_index, &orig_search_params]( + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) { + auto work_size = queries.extent(0); + std::array, kMaxJobs> futures; + for (int64_t i = 0; i < work_size + kMaxJobs; i++) { + // wait for previous job in the same slot to finish + if (i >= kMaxJobs) { + futures[i % kMaxJobs].wait(); + } + // submit a new job + if (i < work_size) { + auto &res = resource_pool[i % kNumWorkerStreams]; + cagra::search(res, orig_search_params, orig_index, + slice_matrix(queries, i, 1), + slice_matrix(neighbors, i, 1), + slice_matrix(distances, i, 1)); + futures[i % kMaxJobs] = + cuda_work_completion_promise(res).get_future(); + } + } + }; + + // Streaming scenario with dynamic batching: + // send queries one-by-one, with a maximum kMaxJobs in-flight, + // yet allow grouping the sequential requests (subject to deadlines) + auto search_async_dynb = + [&resource_pool, &dynb_index, &dynb_search_params]( + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) { + auto work_size = queries.extent(0); + std::array, kMaxJobs> futures; + for (int64_t i = 0; i < work_size + kMaxJobs; i++) { + // wait for previous job in the same slot to finish + if (i >= kMaxJobs) { + futures[i % kMaxJobs].wait(); + } + // submit a new job + if (i < work_size) { + auto &res = resource_pool[i % kNumWorkerStreams]; + dynamic_batching::search(res, dynb_search_params, dynb_index, + slice_matrix(queries, i, 1), + slice_matrix(neighbors, i, 1), + slice_matrix(distances, i, 1)); + futures[i % kMaxJobs] = + cuda_work_completion_promise(res).get_future(); + } + } + }; + + // Try to handle the same amount of work in the async setting using the + // standard implementation. + time_it("standard/async A", search_async_orig, queries_a, neighbors_a, + distances_a); + time_it("standard/async B", search_async_orig, queries_b, neighbors_b, + distances_b); + + // Do the same using dynamic batching + time_it("dynamic_batching/async A", search_async_dynb, queries_a, neighbors_a, + distances_a); + time_it("dynamic_batching/async B", search_async_dynb, queries_b, neighbors_b, + distances_b); +} + +int main() { + raft::device_resources res; + + // Set the raft resource to use a pool for internal memory allocations + // (workspace) and limit the available workspace size. + raft::resource::set_workspace_to_pool_resource(res, + 12ull * 1024 * 1024 * 1024ull); + + // Create input arrays. 
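+ // 1M x 128 float32 vectors occupy ~0.5 GB of device memory; the build and
+ // search temporaries draw on the 12 GiB workspace pool configured above.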
+ int64_t n_samples = 1000000; + int64_t n_dim = 128; + int64_t n_queries = 10000; + auto dataset = + raft::make_device_matrix(res, n_samples, n_dim); + auto queries = + raft::make_device_matrix(res, n_queries, n_dim); + generate_dataset(res, dataset.view(), queries.view()); + + // run the interesting part of the program + dynamic_batching_example(res, raft::make_const_mdspan(dataset.view()), + raft::make_const_mdspan(queries.view())); +} diff --git a/img/tech_stack.png b/img/tech_stack.png new file mode 100644 index 000000000..2b3eeedba Binary files /dev/null and b/img/tech_stack.png differ diff --git a/notebooks/VectorSearch_QuestionRetrieval.ipynb b/notebooks/VectorSearch_QuestionRetrieval.ipynb index 21d59975b..1115a5920 100644 --- a/notebooks/VectorSearch_QuestionRetrieval.ipynb +++ b/notebooks/VectorSearch_QuestionRetrieval.ipynb @@ -160,7 +160,7 @@ }, "outputs": [], "source": [ - "pq_index_mem = pq_index.pq_dim * pq_index.size * pq_index.pq_bits\n", + "pq_index_mem = params.pq_dim * corpus_embeddings.shape[0] * params.pq_bits\n", "print(\"IVF-PQ memory footprint: {:.1f} MB\".format(pq_index_mem / 2**20))\n", "\n", "original_mem = corpus_embeddings.shape[0] * corpus_embeddings.shape[1] * 4\n", diff --git a/notebooks/VectorSearch_QuestionRetrieval_Milvus.ipynb b/notebooks/VectorSearch_QuestionRetrieval_Milvus.ipynb new file mode 100644 index 000000000..09a6cca43 --- /dev/null +++ b/notebooks/VectorSearch_QuestionRetrieval_Milvus.ipynb @@ -0,0 +1,732 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f5499b54", + "metadata": {}, + "source": [ + "\n", + "# Similar Questions Retrieval - Milvus - CAGRA-HNSW\n", + "\n", + "This notebook is inspired by the [similar search example of Sentence-Transformers](https://www.sbert.net/examples/applications/semantic-search/README.html#similar-questions-retrieval), and adapted to be used with [Milvus](https://milvus.io) and [cuVS](https://rapids.ai/cuvs/).\n", + "\n", + "The model was pre-trained on the [Natural Questions dataset](https://ai.google.com/research/NaturalQuestions). It consists of about 100k real Google search queries, together with an annotated passage from Wikipedia that provides the answer. It is an example of an asymmetric search task. As corpus, we use the smaller [Simple English Wikipedia](http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz) so that it fits easily into memory.\n", + "\n", + "The steps to install the latest Milvus package are available in the [Milvus documentation](https://milvus.io/docs/quickstart.md)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8d55ede", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:47:21.149465Z", + "iopub.status.busy": "2024-11-08T14:47:21.149218Z", + "iopub.status.idle": "2024-11-08T14:47:23.440275Z", + "shell.execute_reply": "2024-11-08T14:47:23.439436Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "!pip install sentence_transformers torch pymilvus pymilvus[bulk_writer] dask dask[distributed]\n", + "\n", + "# Note: if you have a Hopper based GPU, like an H100, use these to install:\n", + "# pip install torch --index-url https://download.pytorch.org/whl/cu118\n", + "# pip install sentence_transformers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb1e81c3", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:47:23.444058Z", + "iopub.status.busy": "2024-11-08T14:47:23.443683Z", + "iopub.status.idle": "2024-11-08T14:47:24.219903Z", + "shell.execute_reply": "2024-11-08T14:47:24.219228Z" + } + }, + "outputs": [], + "source": [ + "!nvidia-smi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee4c5cc0", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:47:24.223131Z", + "iopub.status.busy": "2024-11-08T14:47:24.222874Z", + "iopub.status.idle": "2024-11-08T14:47:34.024085Z", + "shell.execute_reply": "2024-11-08T14:47:34.023435Z" + } + }, + "outputs": [], + "source": [ + "import dask.array as da\n", + "import gzip\n", + "import json\n", + "import math\n", + "import numpy as np\n", + "import os\n", + "import pymilvus\n", + "import time\n", + "import torch\n", + "\n", + "from minio import Minio\n", + "from multiprocessing import Process\n", + "from sentence_transformers import SentenceTransformer, CrossEncoder, util\n", + "from typing import List\n", + "\n", + "\n", + "from pymilvus import (\n", + " connections, utility\n", + ")\n", + "from pymilvus.bulk_writer import LocalBulkWriter, BulkFileType # pip install pymilvus[bulk_writer]\n", + "\n", + "if not torch.cuda.is_available():\n", + " print(\"Warning: No GPU found. Please add GPU to your notebook\")" + ] + }, + { + "cell_type": "markdown", + "id": "47cabaca", + "metadata": {}, + "source": [ + "# Setup Milvus Collection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fcd259c", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:47:34.027677Z", + "iopub.status.busy": "2024-11-08T14:47:34.027288Z", + "iopub.status.idle": "2024-11-08T14:47:34.109212Z", + "shell.execute_reply": "2024-11-08T14:47:34.108609Z" + } + }, + "outputs": [], + "source": [ + "DIM = 768\n", + "MILVUS_PORT = 30004\n", + "MILVUS_HOST = f\"http://localhost:{MILVUS_PORT}\"\n", + "ID_FIELD=\"id\"\n", + "EMBEDDING_FIELD=\"embedding\"\n", + "\n", + "collection_name = \"simple_wiki\"\n", + "\n", + "def get_milvus_client():\n", + " return pymilvus.MilvusClient(uri=MILVUS_HOST)\n", + "\n", + "client = get_milvus_client()\n", + "\n", + "fields = [\n", + " pymilvus.FieldSchema(name=ID_FIELD, dtype=pymilvus.DataType.INT64, is_primary=True),\n", + " pymilvus.FieldSchema(name=EMBEDDING_FIELD, dtype=pymilvus.DataType.FLOAT_VECTOR, dim=DIM)\n", + "]\n", + "\n", + "schema = pymilvus.CollectionSchema(fields)\n", + "schema.verify()\n", + "\n", + "if collection_name in client.list_collections():\n", + " print(f\"Collection '{collection_name}' already exists. 
Deleting collection...\")\n", + "    client.drop_collection(collection_name)\n", + "\n", + "client.create_collection(collection_name, schema=schema, dimension=DIM, vector_field_name=EMBEDDING_FIELD)\n", + "collection = pymilvus.Collection(name=collection_name, using=client._using)\n", + "collection.release()\n", + "collection.drop_index()\n" + ] + }, + { + "cell_type": "markdown", + "id": "00bd20f5", + "metadata": {}, + "source": [ + "# Setup Sentence Transformer model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a1a6307", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:47:34.111782Z", + "iopub.status.busy": "2024-11-08T14:47:34.111556Z", + "iopub.status.idle": "2024-11-08T14:47:39.654323Z", + "shell.execute_reply": "2024-11-08T14:47:39.653386Z" + } + }, + "outputs": [], + "source": [ + "# We use the Bi-Encoder to encode all passages, so that we can use it with semantic search\n", + "model_name = 'nq-distilbert-base-v1'\n", + "bi_encoder = SentenceTransformer(model_name)\n", + "\n", + "# As dataset, we use Simple English Wikipedia. Compared to the full English Wikipedia, it has only\n", + "# about 170k articles. We split these articles into paragraphs and encode them with the bi-encoder\n", + "\n", + "wikipedia_filepath = 'data/simplewiki-2020-11-01.jsonl.gz'\n", + "\n", + "if not os.path.exists(wikipedia_filepath):\n", + "    util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath)\n", + "\n", + "passages = []\n", + "with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as fIn:\n", + "    for line in fIn:\n", + "        data = json.loads(line.strip())\n", + "        for paragraph in data['paragraphs']:\n", + "            # We encode the passages as [title, text]\n", + "            passages.append([data['title'], paragraph])\n", + "\n", + "# If you like, you can also limit the number of passages you want to use\n", + "print(\"Passages:\", len(passages))\n", + "\n", + "# To speed things up, pre-computed embeddings are downloaded.\n", + "# The provided file encoded the passages with the model 'nq-distilbert-base-v1'\n", + "if model_name == 'nq-distilbert-base-v1':\n", + "    embeddings_filepath = 'simplewiki-2020-11-01-nq-distilbert-base-v1.pt'\n", + "    if not os.path.exists(embeddings_filepath):\n", + "        util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01-nq-distilbert-base-v1.pt', embeddings_filepath)\n", + "\n", + "    corpus_embeddings = torch.load(embeddings_filepath, map_location='cpu', weights_only=True).float() # Convert embedding file to float\n", + "    #if torch.cuda.is_available():\n", + "    #    corpus_embeddings = corpus_embeddings.to('cuda')\n", + "else: # Here, we compute the corpus_embeddings from scratch (which can take a while depending on the GPU)\n", + "    corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True).to('cpu')" + ] + }, + { + "cell_type": "markdown", + "id": "1f4e9b9d", + "metadata": {}, + "source": [ + "# Vector Search using Milvus and RAPIDS cuVS \n", + "Now that our embeddings are ready to be indexed and that the model has been loaded, we can use Milvus and RAPIDS cuVS to do our vector search.\n", + "\n", + "This is done in three steps: first we ingest all the vectors into the Milvus collection, then we build the Milvus index, and finally we search it."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "563751c1", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:47:39.658832Z", + "iopub.status.busy": "2024-11-08T14:47:39.658374Z", + "iopub.status.idle": "2024-11-08T14:49:47.244768Z", + "shell.execute_reply": "2024-11-08T14:49:47.244162Z" + } + }, + "outputs": [], + "source": [ + "# minio\n", + "MINIO_PORT = 30009\n", + "MINIO_URL = f\"localhost:{MINIO_PORT}\"\n", + "MINIO_SECRET_KEY = \"minioadmin\"\n", + "MINIO_ACCESS_KEY = \"minioadmin\"\n", + "\n", + "def upload_to_minio(file_paths: List[List[str]], remote_paths: List[List[str]], bucket_name=\"milvus-bucket\"):\n", + " minio_client = Minio(endpoint=MINIO_URL, access_key=MINIO_ACCESS_KEY, secret_key=MINIO_SECRET_KEY, secure=False)\n", + " if not minio_client.bucket_exists(bucket_name):\n", + " minio_client.make_bucket(bucket_name)\n", + "\n", + " for local_batch, remote_batch in zip(file_paths, remote_paths):\n", + " for local_file, remote_file in zip(local_batch, remote_batch):\n", + " minio_client.fput_object(bucket_name, \n", + " object_name=remote_file,\n", + " file_path=local_file,\n", + " part_size=512 * 1024 * 1024,\n", + " num_parallel_uploads=5)\n", + " \n", + " \n", + "def ingest_data_bulk(collection_name, vectors, schema: pymilvus.CollectionSchema, log_times=True, bulk_writer_type=\"milvus\", debug=False):\n", + " print(f\"- Ingesting {len(vectors) // 1000}k vectors, Bulk\")\n", + " tic = time.perf_counter()\n", + " collection = pymilvus.Collection(collection_name, using=get_milvus_client()._using)\n", + " remote_path = None\n", + "\n", + " if bulk_writer_type == 'milvus':\n", + " # # Prepare source data for faster ingestion\n", + " writer = LocalBulkWriter(\n", + " schema=schema,\n", + " local_path='bulk_data',\n", + " segment_size=512 * 1024 * 1024, # Default value\n", + " file_type=BulkFileType.NPY\n", + " )\n", + " for id, vec in enumerate(vectors):\n", + " writer.append_row({ID_FIELD: id, EMBEDDING_FIELD: vec})\n", + "\n", + " if debug:\n", + " print(writer.batch_files)\n", + " def callback(file_list):\n", + " if debug:\n", + " print(f\" - Commit successful\")\n", + " print(file_list)\n", + " writer.commit(call_back=callback)\n", + " files_to_upload = writer.batch_files\n", + " elif bulk_writer_type == 'dask':\n", + " # Prepare source data for faster ingestion\n", + " if not os.path.isdir(\"bulk_data\"):\n", + " os.mkdir(\"bulk_data\")\n", + "\n", + " from dask.distributed import Client, LocalCluster\n", + " cluster = LocalCluster(n_workers=1, threads_per_worker=1)\n", + " client = Client(cluster)\n", + "\n", + " chunk_size = 100000\n", + " da_vectors = da.from_array(vectors, chunks=(chunk_size, vectors.shape[1]))\n", + " da_ids = da.arange(len(vectors), chunks=(chunk_size,))\n", + " da.to_npy_stack(\"bulk_data/da_embedding/\", da_vectors)\n", + " da.to_npy_stack(\"bulk_data/da_id/\", da_ids)\n", + " files_to_upload = []\n", + " remote_path = []\n", + " for chunk_nb in range(math.ceil(len(vectors) / chunk_size)):\n", + " files_to_upload.append([f\"bulk_data/da_embedding/{chunk_nb}.npy\", f\"bulk_data/da_id/{chunk_nb}.npy\"])\n", + " remote_path.append([f\"bulk_data/da_{chunk_nb}/embedding.npy\", f\"bulk_data/da__{chunk_nb}/id.npy\"])\n", + "\n", + " elif bulk_writer_type == 'numpy':\n", + " # Directly save NPY files\n", + " np.save(\"bulk_data/embedding.npy\", vectors)\n", + " np.save(\"bulk_data/id.npy\", np.arange(len(vectors)))\n", + " files_to_upload = [[\"bulk_data/embedding.npy\", \"bulk_data/id.npy\"]]\n", + " else:\n", 
+ " raise ValueError(\"Invalid bulk writer type\")\n", + " \n", + " toc = time.perf_counter()\n", + " if log_times:\n", + " print(f\" - File save time: {toc - tic:.2f} seconds\")\n", + " # Import data\n", + " if remote_path is None:\n", + " remote_path = files_to_upload\n", + " upload_to_minio(files_to_upload, remote_path)\n", + " \n", + " job_ids = [utility.do_bulk_insert(collection_name, batch, using=get_milvus_client()._using) for batch in remote_path]\n", + "\n", + " while True:\n", + " tasks = [utility.get_bulk_insert_state(job_id, using=get_milvus_client()._using) for job_id in job_ids]\n", + " success = all(task.state_name == \"Completed\" for task in tasks)\n", + " failure = any(task.state_name == \"Failed\" for task in tasks)\n", + " for i in range(len(tasks)):\n", + " task = tasks[i]\n", + " if debug:\n", + " print(f\" - Task {i}/{len(tasks)} state: {task.state_name}, Progress percent: {task.infos['progress_percent']}, Imported row count: {task.row_count}\")\n", + " if task.state_name == \"Failed\":\n", + " print(task)\n", + " if success or failure:\n", + " break\n", + " time.sleep(2)\n", + "\n", + " added_entities = str(sum([task.row_count for task in tasks]))\n", + " failure = failure or added_entities != str(len(vectors))\n", + " if failure:\n", + " print(f\"- Ingestion failed. Added entities: {added_entities}\")\n", + " toc = time.perf_counter()\n", + " if log_times:\n", + " datasize = vectors.nbytes / 1024 / 1024\n", + " print(f\"- Ingestion time: {toc - tic:.2f} seconds. ({(datasize / (toc-tic)):.2f}MB/s)\")\n", + "\n", + "ingest_data_bulk(collection_name, np.array(corpus_embeddings), schema, bulk_writer_type='dask', log_times=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad90b4be", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:49:47.247498Z", + "iopub.status.busy": "2024-11-08T14:49:47.247268Z", + "iopub.status.idle": "2024-11-08T14:50:00.737502Z", + "shell.execute_reply": "2024-11-08T14:50:00.736808Z" + } + }, + "outputs": [], + "source": [ + "# Setups the IVFPQ index\n", + "\n", + "index_params = dict(\n", + " index_type=\"GPU_IVF_PQ\",\n", + " metric_type=\"L2\",\n", + " params={\"nlist\": 150, # Number of clusters\n", + " \"m\": 96}) # Product Quantization dimension\n", + "\n", + "# Drop the index if it exists\n", + "if collection.has_index():\n", + " collection.release()\n", + " collection.drop_index()\n", + "\n", + "# Create the index\n", + "tic = time.perf_counter()\n", + "collection.create_index(field_name=EMBEDDING_FIELD, index_params=index_params)\n", + "collection.load()\n", + "toc = time.perf_counter()\n", + "print(f\"- Index creation time: {toc - tic:.4f} seconds. 
({index_params})\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "c75acea7", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:00.740443Z", + "iopub.status.busy": "2024-11-08T14:50:00.740142Z", + "iopub.status.idle": "2024-11-08T14:50:00.745403Z", + "shell.execute_reply": "2024-11-08T14:50:00.744672Z" + } + }, + "outputs": [], + "source": [ + "# Search the index\n", + "def search_cuvs_pq(query, top_k = 5, n_probe = 30):\n", + "    # Encode the query using the bi-encoder and find potentially relevant passages\n", + "    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)\n", + "\n", + "    search_params = {\"nprobe\": n_probe}\n", + "    tic = time.perf_counter()\n", + "    hits = collection.search(\n", + "        data=np.array(question_embedding[None].cpu()), anns_field=EMBEDDING_FIELD, param=search_params, limit=top_k\n", + "    )\n", + "    toc = time.perf_counter()\n", + "\n", + "    # Output of top-k hits\n", + "    print(\"Input question:\", query)\n", + "    print(\"Results (after {:.3f} ms):\".format((toc - tic)*1000))\n", + "    for k in range(top_k):\n", + "        print(\"\\t{:.3f}\\t{}\".format(hits[0][k].distance, passages[hits[0][k].id]))" + ] + }, + { + "cell_type": "markdown", + "id": "07935bca", + "metadata": {}, + "source": [ + "The ideal use-case for the IVF-PQ algorithm is when there is a need to reduce the memory footprint while maintaining good accuracy." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c27d4715", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:00.748001Z", + "iopub.status.busy": "2024-11-08T14:50:00.747783Z", + "iopub.status.idle": "2024-11-08T14:50:01.785914Z", + "shell.execute_reply": "2024-11-08T14:50:01.785223Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_pq(query=\"Who was Grace Hopper?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc375518", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:01.788877Z", + "iopub.status.busy": "2024-11-08T14:50:01.788640Z", + "iopub.status.idle": "2024-11-08T14:50:01.813820Z", + "shell.execute_reply": "2024-11-08T14:50:01.813153Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_pq(query=\"Who was Alan Turing?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab154181", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:01.816625Z", + "iopub.status.busy": "2024-11-08T14:50:01.816362Z", + "iopub.status.idle": "2024-11-08T14:50:01.839593Z", + "shell.execute_reply": "2024-11-08T14:50:01.838986Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_pq(query = \"What is creating tides?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "836344ec", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:01.842319Z", + "iopub.status.busy": "2024-11-08T14:50:01.842022Z", + "iopub.status.idle": "2024-11-08T14:50:15.969324Z", + "shell.execute_reply": "2024-11-08T14:50:15.968562Z" + } + }, + "outputs": [], + "source": [ + "# Drop the current index if it exists\n", + "if collection.has_index():\n", + "    collection.release()\n", + "    collection.drop_index()\n", + "\n", + "# Create the IVF Flat index\n", + "index_params = dict(\n", + "    index_type=\"GPU_IVF_FLAT\",\n", + "    metric_type=\"L2\",\n", + "    params={\"nlist\": 150}) # Number of clusters\n", + "tic = time.perf_counter()\n", + "collection.create_index(field_name=EMBEDDING_FIELD, index_params=index_params)\n", + "collection.load()\n", +
"toc = time.perf_counter()\n", + "print(f\"- Index creation time: {toc - tic:.4f} seconds. ({index_params})\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "2d6017ed", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:15.972764Z", + "iopub.status.busy": "2024-11-08T14:50:15.972368Z", + "iopub.status.idle": "2024-11-08T14:50:15.977806Z", + "shell.execute_reply": "2024-11-08T14:50:15.977064Z" + } + }, + "outputs": [], + "source": [ + "def search_cuvs_flat(query, top_k = 5, n_probe = 30):\n", + " # Encode the query using the bi-encoder and find potentially relevant passages\n", + " question_embedding = bi_encoder.encode(query, convert_to_tensor=True)\n", + " \n", + " search_params = {\"nprobe\": n_probe}\n", + " tic = time.perf_counter()\n", + " hits = collection.search(\n", + " data=np.array(question_embedding[None].cpu()), anns_field=EMBEDDING_FIELD, param=search_params, limit=top_k\n", + " )\n", + " toc = time.perf_counter()\n", + "\n", + " # Output of top-k hits\n", + " print(\"Input question:\", query)\n", + " print(\"Results (after {:.3f} ms):\".format((toc - tic)*1000))\n", + " for k in range(top_k):\n", + " print(\"\\t{:.3f}\\t{}\".format(hits[0][k].distance, passages[hits[0][k].id]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5cfb644", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:15.980796Z", + "iopub.status.busy": "2024-11-08T14:50:15.980408Z", + "iopub.status.idle": "2024-11-08T14:50:16.009271Z", + "shell.execute_reply": "2024-11-08T14:50:16.008579Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_flat(query=\"Who was Grace Hopper?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5694d00", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:16.012253Z", + "iopub.status.busy": "2024-11-08T14:50:16.011924Z", + "iopub.status.idle": "2024-11-08T14:50:16.043432Z", + "shell.execute_reply": "2024-11-08T14:50:16.042751Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_flat(query=\"Who was Alan Turing?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fcfc3c5b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:16.046439Z", + "iopub.status.busy": "2024-11-08T14:50:16.046093Z", + "iopub.status.idle": "2024-11-08T14:50:16.071322Z", + "shell.execute_reply": "2024-11-08T14:50:16.070614Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_flat(query = \"What is creating tides?\")" + ] + }, + { + "cell_type": "markdown", + "id": "a59d7b32-0832-4c3a-864e-aeb2e6e7fe1f", + "metadata": {}, + "source": [ + "## Using CAGRA: Hybrid GPU-CPU graph-based Vector Search\n", + "\n", + "CAGRA is a graph-based nearest neighbors implementation with state-of-the art performance for both small- and large-batch sized vector searches. \n", + "\n", + "CAGRA follows the same steps as IVF-FLAT and IVF-PQ in Milvus, but is also able to be adapted for querying on CPU.\n", + "This means that CAGRA is able to profit from a high training speed on GPU, as well as a low inference time on CPU, that minimize latency even on the smallest queries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5ce4dab", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:16.074449Z", + "iopub.status.busy": "2024-11-08T14:50:16.074128Z", + "iopub.status.idle": "2024-11-08T14:50:30.479027Z", + "shell.execute_reply": "2024-11-08T14:50:30.478265Z" + } + }, + "outputs": [], + "source": [ + "# Drop the current index if it exists\n", + "if collection.has_index():\n", + " collection.release()\n", + " collection.drop_index()\n", + "\n", + "# Create the IVF Flat index\n", + "index_params = dict(\n", + " index_type=\"GPU_CAGRA\",\n", + " metric_type=\"L2\",\n", + " params={\"graph_degree\": 64, \"intermediate_graph_degree\": 128, \"build_algo\": \"NN_DESCENT\", \"adapt_for_cpu\": True})\n", + "tic = time.perf_counter()\n", + "collection.create_index(field_name=EMBEDDING_FIELD, index_params=index_params)\n", + "collection.load()\n", + "toc = time.perf_counter()\n", + "print(f\"- Index creation time: {toc - tic:.4f} seconds. ({index_params})\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "df229e21-f6b6-4d6c-ad54-2724f8738934", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:30.481748Z", + "iopub.status.busy": "2024-11-08T14:50:30.481474Z", + "iopub.status.idle": "2024-11-08T14:50:30.486324Z", + "shell.execute_reply": "2024-11-08T14:50:30.485696Z" + } + }, + "outputs": [], + "source": [ + "def search_cuvs_cagra(query, top_k = 5, itopk = 32):\n", + " # Encode the query using the bi-encoder and find potentially relevant passages\n", + " question_embedding = bi_encoder.encode(query, convert_to_tensor=True)\n", + "\n", + " search_params = {\"params\": {\"itopk\": itopk, \"ef\": 35}}\n", + " tic = time.perf_counter()\n", + " hits = collection.search(\n", + " data=np.array(question_embedding[None].cpu()), anns_field=EMBEDDING_FIELD, param=search_params, limit=top_k\n", + " )\n", + " toc = time.perf_counter()\n", + "\n", + " # Output of top-k hits\n", + " print(\"Input question:\", query)\n", + " print(\"Results (after {:.3f} ms):\".format((toc - tic)*1000))\n", + " for k in range(top_k):\n", + " print(\"\\t{:.3f}\\t{}\".format(hits[0][k].distance, passages[hits[0][k].id]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5e862fd-b7e5-4423-8fbf-36918f02c8f3", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:30.489077Z", + "iopub.status.busy": "2024-11-08T14:50:30.488790Z", + "iopub.status.idle": "2024-11-08T14:50:30.513998Z", + "shell.execute_reply": "2024-11-08T14:50:30.513319Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_cagra(query=\"Who was Grace Hopper?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb8a5b7b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:30.516748Z", + "iopub.status.busy": "2024-11-08T14:50:30.516521Z", + "iopub.status.idle": "2024-11-08T14:50:30.538982Z", + "shell.execute_reply": "2024-11-08T14:50:30.538269Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_cagra(query=\"Who was Alan Turing?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c89810a", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:30.541508Z", + "iopub.status.busy": "2024-11-08T14:50:30.541287Z", + "iopub.status.idle": "2024-11-08T14:50:30.562722Z", + "shell.execute_reply": "2024-11-08T14:50:30.562085Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_cagra(query=\"What is creating tides?\")" 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/cuvs/README.md b/python/cuvs/README.md index e69de29bb..27b494811 100644 --- a/python/cuvs/README.md +++ b/python/cuvs/README.md @@ -0,0 +1,3 @@ +# cuVS + +cuVS contains state-of-the-art implementations of several algorithms for running approximate nearest neighbors and clustering on the GPU. It can be used directly or through the various databases and other libraries that have integrated it. The primary goal of cuVS is to simplify the use of GPUs for vector similarity search and clustering. diff --git a/python/cuvs/cuvs/neighbors/brute_force/__init__.py b/python/cuvs/cuvs/neighbors/brute_force/__init__.py index b88c4b464..6aa0e4bb2 100644 --- a/python/cuvs/cuvs/neighbors/brute_force/__init__.py +++ b/python/cuvs/cuvs/neighbors/brute_force/__init__.py @@ -13,6 +13,6 @@ # limitations under the License. -from .brute_force import Index, build, search +from .brute_force import Index, build, load, save, search -__all__ = ["Index", "build", "search"] +__all__ = ["Index", "build", "search", "save", "load"] diff --git a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pxd b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pxd index 183827916..f1fc14ba7 100644 --- a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pxd +++ b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pxd @@ -47,3 +47,11 @@ cdef extern from "cuvs/neighbors/brute_force.h" nogil: DLManagedTensor* neighbors, DLManagedTensor* distances, cuvsFilter filter) except + + + cuvsError_t cuvsBruteForceSerialize(cuvsResources_t res, + const char * filename, + cuvsBruteForceIndex_t index) except + + + cuvsError_t cuvsBruteForceDeserialize(cuvsResources_t res, + const char * filename, + cuvsBruteForceIndex_t index) except + diff --git a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx index 559302ccc..9d43bfb29 100644 --- a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx +++ b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx @@ -24,6 +24,7 @@ from cuvs.common.resources import auto_sync_resources from cython.operator cimport dereference as deref from libc.stdint cimport uint32_t from libcpp cimport bool +from libcpp.string cimport string from cuvs.common cimport cydlpack from cuvs.distance_type cimport cuvsDistanceType @@ -31,9 +32,9 @@ from cuvs.distance_type cimport cuvsDistanceType from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array from cuvs.distance import DISTANCE_TYPES +from cuvs.neighbors.common import _check_input_array from cuvs.common.c_api cimport cuvsResources_t @@ -256,3 +257,88 @@ def search(Index index, )) return (distances, neighbors) + + +@auto_sync_resources +def save(filename, Index index, bool include_dataset=True, resources=None): + """ + Saves the index to a file. 
+ + The serialization format can be subject to changes, therefore loading + an index saved with a previous version of cuvs is not guaranteed + to work. + + Parameters + ---------- + filename : string + Name of the file. + index : Index + Trained Brute Force index. + {resources_docstring} + + Examples + -------- + >>> import cupy as cp + >>> from cuvs.neighbors import brute_force + >>> n_samples = 50000 + >>> n_features = 50 + >>> dataset = cp.random.random_sample((n_samples, n_features), + ... dtype=cp.float32) + >>> # Build index + >>> index = brute_force.build(dataset) + >>> # Serialize and deserialize the brute_force index built + >>> brute_force.save("my_index.bin", index) + >>> index_loaded = brute_force.load("my_index.bin") + """ + cdef string c_filename = filename.encode('utf-8') + cdef cuvsResources_t res = resources.get_c_obj() + check_cuvs(cuvsBruteForceSerialize(res, + c_filename.c_str(), + index.index)) + + +@auto_sync_resources +def load(filename, resources=None): + """ + Loads index from file. + + The serialization format can be subject to changes, therefore loading + an index saved with a previous version of cuvs is not guaranteed + to work. + + + Parameters + ---------- + filename : string + Name of the file. + {resources_docstring} + + Returns + ------- + index : Index + + Examples + -------- + >>> import cupy as cp + >>> from cuvs.neighbors import brute_force + >>> n_samples = 50000 + >>> n_features = 50 + >>> dataset = cp.random.random_sample((n_samples, n_features), + ... dtype=cp.float32) + >>> # Build index + >>> index = brute_force.build(dataset) + >>> # Serialize and deserialize the brute_force index built + >>> brute_force.save("my_index.bin", index) + >>> index_loaded = brute_force.load("my_index.bin") + """ + cdef Index idx = Index() + cdef cuvsResources_t res = resources.get_c_obj() + cdef string c_filename = filename.encode('utf-8') + + check_cuvs(cuvsBruteForceDeserialize( + res, + c_filename.c_str(), + idx.index + )) + idx.trained = True + return idx diff --git a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx index 95209dbeb..752aef741 100644 --- a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx +++ b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx @@ -32,7 +32,8 @@ from cuvs.common cimport cydlpack from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array + +from cuvs.neighbors.common import _check_input_array from libc.stdint cimport ( int8_t, diff --git a/python/cuvs/cuvs/neighbors/common.py b/python/cuvs/cuvs/neighbors/common.py new file mode 100644 index 000000000..c14b9f8c9 --- /dev/null +++ b/python/cuvs/cuvs/neighbors/common.py @@ -0,0 +1,36 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def _check_input_array(cai, exp_dt, exp_rows=None, exp_cols=None): + if cai.dtype not in exp_dt: + raise TypeError("dtype %s not supported" % cai.dtype) + + if not cai.c_contiguous: + raise ValueError("Row major input is expected") + + if exp_cols is not None and cai.shape[1] != exp_cols: + raise ValueError( + "Incorrect number of columns, expected {} got {}".format( + exp_cols, cai.shape[1] + ) + ) + + if exp_rows is not None and cai.shape[0] != exp_rows: + raise ValueError( + "Incorrect number of rows, expected {} , got {}".format( + exp_rows, cai.shape[0] + ) + ) diff --git a/python/cuvs/cuvs/neighbors/filters/filters.pyx b/python/cuvs/cuvs/neighbors/filters/filters.pyx index 3a81cb786..9bc2a905c 100644 --- a/python/cuvs/cuvs/neighbors/filters/filters.pyx +++ b/python/cuvs/cuvs/neighbors/filters/filters.pyx @@ -20,11 +20,11 @@ import numpy as np from libc.stdint cimport uintptr_t from cuvs.common cimport cydlpack +from cuvs.neighbors.common import _check_input_array from .filters cimport BITMAP, NO_FILTER, cuvsFilter from pylibraft.common.cai_wrapper import wrap_array -from pylibraft.neighbors.common import _check_input_array cdef class Prefilter: diff --git a/python/cuvs/cuvs/neighbors/hnsw/__init__.py b/python/cuvs/cuvs/neighbors/hnsw/__init__.py index 5efcdf68b..fafff7d03 100644 --- a/python/cuvs/cuvs/neighbors/hnsw/__init__.py +++ b/python/cuvs/cuvs/neighbors/hnsw/__init__.py @@ -13,10 +13,23 @@ # limitations under the License. -from .hnsw import Index, SearchParams, from_cagra, load, save, search +from .hnsw import ( + ExtendParams, + Index, + IndexParams, + SearchParams, + extend, + from_cagra, + load, + save, + search, +) __all__ = [ + "IndexParams", "Index", + "ExtendParams", + "extend", "SearchParams", "load", "save", diff --git a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd index 1cdc97406..e0c517933 100644 --- a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd +++ b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd @@ -20,14 +20,25 @@ from libc.stdint cimport int32_t, uintptr_t from cuvs.common.c_api cimport cuvsError_t, cuvsResources_t from cuvs.common.cydlpack cimport DLDataType, DLManagedTensor from cuvs.distance_type cimport cuvsDistanceType +from cuvs.neighbors.cagra.cagra cimport cuvsCagraIndex_t cdef extern from "cuvs/neighbors/hnsw.h" nogil: - ctypedef struct cuvsHnswSearchParams: - int32_t ef - int32_t numThreads - ctypedef cuvsHnswSearchParams* cuvsHnswSearchParams_t + ctypedef enum cuvsHnswHierarchy: + NONE + CPU + + ctypedef struct cuvsHnswIndexParams: + cuvsHnswHierarchy hierarchy + int32_t ef_construction + int32_t num_threads + + ctypedef cuvsHnswIndexParams* cuvsHnswIndexParams_t + + cuvsError_t cuvsHnswIndexParamsCreate(cuvsHnswIndexParams_t* params) + + cuvsError_t cuvsHnswIndexParamsDestroy(cuvsHnswIndexParams_t params) ctypedef struct cuvsHnswIndex: uintptr_t addr @@ -39,6 +50,31 @@ cdef extern from "cuvs/neighbors/hnsw.h" nogil: cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index) + ctypedef struct cuvsHnswExtendParams: + int32_t num_threads + + ctypedef cuvsHnswExtendParams* cuvsHnswExtendParams_t + + cuvsError_t cuvsHnswExtendParamsCreate(cuvsHnswExtendParams_t* params) + + cuvsError_t cuvsHnswExtendParamsDestroy(cuvsHnswExtendParams_t params) + + cuvsError_t cuvsHnswFromCagra(cuvsResources_t res, + cuvsHnswIndexParams_t params, + cuvsCagraIndex_t cagra_index, + cuvsHnswIndex_t hnsw_index) except + + + cuvsError_t cuvsHnswExtend(cuvsResources_t res, + cuvsHnswExtendParams_t params, + DLManagedTensor* data, + cuvsHnswIndex_t 
index) except + + + ctypedef struct cuvsHnswSearchParams: + int32_t ef + int32_t num_threads + + ctypedef cuvsHnswSearchParams* cuvsHnswSearchParams_t + cuvsError_t cuvsHnswSearch(cuvsResources_t res, cuvsHnswSearchParams* params, cuvsHnswIndex_t index, @@ -46,7 +82,12 @@ cdef extern from "cuvs/neighbors/hnsw.h" nogil: DLManagedTensor* neighbors, DLManagedTensor* distances) except + + cuvsError_t cuvsHnswSerialize(cuvsResources_t res, + const char * filename, + cuvsHnswIndex_t index) except + + cuvsError_t cuvsHnswDeserialize(cuvsResources_t res, + cuvsHnswIndexParams_t params, const char * filename, int32_t dim, cuvsDistanceType metric, diff --git a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx index 018fcfef9..4c44350e8 100644 --- a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx +++ b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx @@ -21,6 +21,7 @@ from libcpp.string cimport string from cuvs.common.exceptions import check_cuvs from cuvs.common.resources import auto_sync_resources +from cuvs.neighbors.common import _check_input_array from cuvs.common cimport cydlpack @@ -36,44 +37,65 @@ import uuid from pylibraft.common import auto_convert_output from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array -cdef class SearchParams: +cdef class IndexParams: """ - HNSW search parameters + Parameters to build index for HNSW nearest neighbor search Parameters ---------- - ef: int, default = 200 - Maximum number of candidate list size used during search. - num_threads: int, default = 0 - Number of CPU threads used to increase search parallelism. - When set to 0, the number of threads is automatically determined - using OpenMP's `omp_get_max_threads()`. + hierarchy : string, default = "none" (optional) + The hierarchy of the HNSW index. Valid values are ["none", "cpu"]. + - "none": No hierarchy is built. + - "cpu": Hierarchy is built using CPU. + ef_construction : int, default = 200 (optional) + Maximum number of candidate list size used during construction + when hierarchy is `cpu`. + num_threads : int, default = 2 (optional) + Number of CPU threads used to increase construction parallelism + when hierarchy is `cpu`. + NOTE: Constructing the hierarchy when converting from a CAGRA graph + is highly sensitive to parallelism, and increasing the number of + threads can reduce the quality of the index. """ - cdef cuvsHnswSearchParams params + cdef cuvsHnswIndexParams* params + + def __cinit__(self): + check_cuvs(cuvsHnswIndexParamsCreate(&self.params)) + + def __dealloc__(self): + check_cuvs(cuvsHnswIndexParamsDestroy(self.params)) def __init__(self, *, - ef=200, - num_threads=0): - self.params.ef = ef - self.params.numThreads = num_threads + hierarchy="none", + ef_construction=200, + num_threads=2): + if hierarchy == "none": + self.params.hierarchy = cuvsHnswHierarchy.NONE + elif hierarchy == "cpu": + self.params.hierarchy = cuvsHnswHierarchy.CPU + else: + raise ValueError("Invalid hierarchy type." 
+ " Valid values are 'none' and 'cpu'.") + self.params.ef_construction = ef_construction + self.params.num_threads = num_threads - def __repr__(self): - attr_str = [attr + "=" + str(getattr(self, attr)) - for attr in [ - "ef", "num_threads"]] - return "SearchParams(type=HNSW, " + (", ".join(attr_str)) + ")" + @property + def hierarchy(self): + if self.params.hierarchy == cuvsHnswHierarchy.NONE: + return "none" + elif self.params.hierarchy == cuvsHnswHierarchy.CPU: + return "cpu" @property - def ef(self): - return self.params.ef + def ef_construction(self): + return self.params.ef_construction @property def num_threads(self): - return self.params.numThreads + return self.params.num_threads cdef class Index: @@ -103,13 +125,44 @@ cdef class Index: return "Index(type=HNSW, metric=L2" + (", ".join(attr_str)) + ")" +cdef class ExtendParams: + """ + Parameters to extend the HNSW index with new data + + Parameters + ---------- + num_threads : int, default = 0 (optional) + Number of CPU threads used to increase construction parallelism. + When set to 0, the number of threads is automatically determined. + """ + + cdef cuvsHnswExtendParams* params + + def __cinit__(self): + check_cuvs(cuvsHnswExtendParamsCreate(&self.params)) + + def __dealloc__(self): + check_cuvs(cuvsHnswExtendParamsDestroy(self.params)) + + def __init__(self, *, + num_threads=0): + self.params.num_threads = num_threads + + @property + def num_threads(self): + return self.params.num_threads + + @auto_sync_resources -def save(filename, cagra.Index index, resources=None): +def save(filename, Index index, resources=None): """ Saves the CAGRA index to a file as an hnswlib index. - The saved index is immutable and can only be searched by the hnswlib - wrapper in cuVS, as the format is not compatible with the original - hnswlib. + If the index was constructed with `hnsw.IndexParams(hierarchy="none")`, + then the saved index is immutable and can only be searched by the hnswlib + wrapper in cuVS, as the format is not compatible with the original hnswlib. + However, if the index was constructed with + `hnsw.IndexParams(hierarchy="cpu")`, then the saved index is mutable and + compatible with the original hnswlib. Saving / loading the index is experimental. The serialization format is subject to change. @@ -119,7 +172,7 @@ def save(filename, cagra.Index index, resources=None): filename : string Name of the file. index : Index - Trained CAGRA index. + Trained HNSW index. {resources_docstring} Examples @@ -131,23 +184,28 @@ def save(filename, cagra.Index index, resources=None): >>> dataset = cp.random.random_sample((n_samples, n_features), ... 
dtype=cp.float32) >>> # Build index - >>> index = cagra.build(cagra.IndexParams(), dataset) + >>> cagra_index = cagra.build(cagra.IndexParams(), dataset) >>> # Serialize and deserialize the cagra index built - >>> hnsw.save("my_index.bin", index) + >>> hnsw_index = hnsw.from_cagra(hnsw.IndexParams(), cagra_index) + >>> hnsw.save("my_index.bin", hnsw_index) """ cdef string c_filename = filename.encode('utf-8') cdef cuvsResources_t res = resources.get_c_obj() - check_cuvs(cagra.cuvsCagraSerializeToHnswlib(res, - c_filename.c_str(), - index.index)) + check_cuvs(cuvsHnswSerialize(res, + c_filename.c_str(), + index.index)) @auto_sync_resources -def load(filename, dim, dtype, metric="sqeuclidean", resources=None): +def load(IndexParams index_params, filename, dim, dtype, metric="sqeuclidean", + resources=None): """ - Loads base-layer-only hnswlib index from file, which was originally - saved as a built CAGRA index. The loaded index is immutable and can only - be searched by the hnswlib wrapper in cuVS, as the format is not + Loads an HNSW index. + If the index was constructed with `hnsw.IndexParams(hierarchy="none")`, + then the loaded index is immutable and can only be searched by the hnswlib + wrapper in cuVS, as the format is not compatible with the original hnswlib. + However, if the index was constructed with + `hnsw.IndexParams(hierarchy="cpu")`, then the loaded index is mutable and compatible with the original hnswlib. Saving / loading the index is experimental. The serialization format is @@ -156,6 +214,8 @@ def load(filename, dim, dtype, metric="sqeuclidean", resources=None): Parameters ---------- + index_params : IndexParams + Parameters that were used to convert CAGRA index to HNSW index. filename : string Name of the file. dim : int @@ -214,6 +274,7 @@ def load(filename, dim, dtype, metric="sqeuclidean", resources=None): check_cuvs(cuvsHnswDeserialize( res, + index_params.params, c_filename.c_str(), dim, distance_type, @@ -224,26 +285,30 @@ def load(filename, dim, dtype, metric="sqeuclidean", resources=None): @auto_sync_resources -def from_cagra(cagra.Index index, temporary_index_path=None, resources=None): +def from_cagra(IndexParams index_params, cagra.Index cagra_index, + temporary_index_path=None, resources=None): """ - Returns an hnsw base-layer-only index from a CAGRA index. - - NOTE: This method uses the filesystem to write the CAGRA index in - `/tmp/.bin` or the parameter `temporary_index_path` - if not None before reading it as an hnsw index, - then deleting the temporary file. The returned index is immutable - and can only be searched by the hnsw wrapper in cuVS, as the - format is not compatible with the original hnswlib library. - By `base_layer_only`, we mean that the hnsw index is created - without the additional layers that are used for the hierarchical - search in hnswlib. Instead, the base layer is used for the search. + Returns an HNSW index from a CAGRA index. + + NOTE: When `index_params.hierarchy` is: + 1. `NONE`: This method uses the filesystem to write the CAGRA index + in `/tmp/.bin` before reading it as an + hnswlib index, then deleting the temporary file. The + returned index is immutable and can only be searched by + the hnswlib wrapper in cuVS, as the format is not + compatible with the original hnswlib. + 2. `CPU`: The returned index is mutable and can be extended with + additional vectors. The serialized index is also compatible + with the original hnswlib library. Saving / loading the index is experimental. The serialization format is subject to change. 
Parameters ---------- - index : Index + index_params : IndexParams + Parameters to convert the CAGRA index to HNSW index. + cagra_index : cagra.Index Trained CAGRA index. temporary_index_path : string, default = None Path to save the temporary index file. If None, the temporary file @@ -262,18 +327,107 @@ def from_cagra(cagra.Index index, temporary_index_path=None, resources=None): >>> # Build index >>> index = cagra.build(cagra.IndexParams(), dataset) >>> # Serialize the CAGRA index to hnswlib base layer only index format - >>> hnsw_index = hnsw.from_cagra(index) + >>> hnsw_index = hnsw.from_cagra(hnsw.IndexParams(), index) """ - uuid_num = uuid.uuid4() - filename = temporary_index_path if temporary_index_path else \ - f"/tmp/{uuid_num}.bin" - save(filename, index, resources=resources) - hnsw_index = load(filename, index.dim, np.dtype(index.active_index_type), - "sqeuclidean", resources=resources) - os.remove(filename) + + cdef Index hnsw_index = Index() + cdef cuvsResources_t res = resources.get_c_obj() + check_cuvs(cuvsHnswFromCagra( + res, + index_params.params, + cagra_index.index, + hnsw_index.index + )) + + hnsw_index.trained = True return hnsw_index +@auto_sync_resources +def extend(ExtendParams extend_params, Index index, data, resources=None): + """ + Extends the HNSW index with new data. + + Parameters + ---------- + extend_params : ExtendParams + index : Index + Trained HNSW index. + data : Host array interface compliant matrix shape (n_samples, dim) + Supported dtype [float32, int8, uint8] + {resources_docstring} + + Examples + -------- + >>> import numpy as np + >>> from cuvs.neighbors import hnsw, cagra + >>> + >>> n_samples = 50000 + >>> n_features = 50 + >>> dataset = np.random.random_sample((n_samples, n_features)) + >>> + >>> # Build index + >>> index = cagra.build(hnsw.IndexParams(), dataset) + >>> # Load index + >>> hnsw_index = hnsw.from_cagra(hnsw.IndexParams(hierarchy="cpu"), index) + >>> # Extend the index with new data + >>> new_data = np.random.random_sample((n_samples, n_features)) + >>> hnsw.extend(hnsw.ExtendParams(), hnsw_index, new_data) + """ + + data_ai = wrap_array(data) + _check_input_array(data_ai, [np.dtype('float32'), + np.dtype('uint8'), + np.dtype('int8')]) + + cdef cydlpack.DLManagedTensor* data_dlpack = cydlpack.dlpack_c(data_ai) + cdef cuvsResources_t res = resources.get_c_obj() + + check_cuvs(cuvsHnswExtend( + res, + extend_params.params, + data_dlpack, + index.index + )) + + +cdef class SearchParams: + """ + HNSW search parameters + + Parameters + ---------- + ef: int, default = 200 + Maximum number of candidate list size used during search. + num_threads: int, default = 0 + Number of CPU threads used to increase search parallelism. + When set to 0, the number of threads is automatically determined + using OpenMP's `omp_get_max_threads()`. + """ + + cdef cuvsHnswSearchParams params + + def __init__(self, *, + ef=200, + num_threads=0): + self.params.ef = ef + self.params.num_threads = num_threads + + def __repr__(self): + attr_str = [attr + "=" + str(getattr(self, attr)) + for attr in [ + "ef", "num_threads"]] + return "SearchParams(type=HNSW, " + (", ".join(attr_str)) + ")" + + @property + def ef(self): + return self.params.ef + + @property + def num_threads(self): + return self.params.num_threads + + @auto_sync_resources @auto_convert_output def search(SearchParams search_params, @@ -290,15 +444,15 @@ def search(SearchParams search_params, ---------- search_params : SearchParams index : Index - Trained CAGRA index. 
- queries : CUDA array interface compliant matrix shape (n_samples, dim) + Trained HNSW index. + queries : CPU array interface compliant matrix shape (n_samples, dim) Supported dtype [float, int] k : int The number of neighbors. - neighbors : Optional CUDA array interface compliant matrix shape + neighbors : Optional CPU array interface compliant matrix shape (n_queries, k), dtype uint64_t. If supplied, neighbor indices will be written here in-place. (default None) - distances : Optional CUDA array interface compliant matrix shape + distances : Optional CPU array interface compliant matrix shape (n_queries, k) If supplied, the distances to the neighbors will be written here in-place. (default None) {resources_docstring} @@ -323,7 +477,7 @@ def search(SearchParams search_params, ... num_threads=0 ... ) >>> # Convert CAGRA index to HNSW - >>> hnsw_index = hnsw.from_cagra(index) + >>> hnsw_index = hnsw.from_cagra(hnsw.IndexParams(), index) >>> # Using a pooling allocator reduces overhead of temporary array >>> # creation during search. This is useful if multiple searches >>> # are performed with same query size. diff --git a/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx b/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx index 25b9b2aee..7a169e1a0 100644 --- a/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx +++ b/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx @@ -31,9 +31,9 @@ from cuvs.distance_type cimport cuvsDistanceType from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array from cuvs.distance import DISTANCE_TYPES +from cuvs.neighbors.common import _check_input_array from libc.stdint cimport ( int8_t, diff --git a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx index 3add1df75..531302ee6 100644 --- a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx +++ b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx @@ -31,9 +31,9 @@ from cuvs.distance_type cimport cuvsDistanceType from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array from cuvs.distance import DISTANCE_TYPES +from cuvs.neighbors.common import _check_input_array from libc.stdint cimport ( int8_t, diff --git a/python/cuvs/cuvs/neighbors/refine.pyx b/python/cuvs/cuvs/neighbors/refine.pyx index 0eccc4108..b7aa35dca 100644 --- a/python/cuvs/cuvs/neighbors/refine.pyx +++ b/python/cuvs/cuvs/neighbors/refine.pyx @@ -31,13 +31,13 @@ from cuvs.distance_type cimport cuvsDistanceType from pylibraft.common import auto_convert_output, device_ndarray from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array from cuvs.distance import DISTANCE_TYPES from cuvs.common.c_api cimport cuvsResources_t from cuvs.common.exceptions import check_cuvs +from cuvs.neighbors.common import _check_input_array @auto_sync_resources diff --git a/python/cuvs/cuvs/test/test_cagra.py b/python/cuvs/cuvs/test/test_cagra.py index 92b88f013..56e132c23 100644 --- a/python/cuvs/cuvs/test/test_cagra.py +++ b/python/cuvs/cuvs/test/test_cagra.py @@ -122,8 +122,9 @@ def run_cagra_build_search_test( @pytest.mark.parametrize("dtype", [np.float32, 
np.int8, np.uint8]) @pytest.mark.parametrize("array_type", ["device", "host"]) @pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"]) +@pytest.mark.parametrize("metric", ["euclidean"]) def test_cagra_dataset_dtype_host_device( - dtype, array_type, inplace, build_algo + dtype, array_type, inplace, build_algo, metric ): # Note that inner_product tests use normalized input which we cannot # represent in int8, therefore we test only sqeuclidean metric here. @@ -132,6 +133,7 @@ def test_cagra_dataset_dtype_host_device( inplace=inplace, array_type=array_type, build_algo=build_algo, + metric=metric, ) diff --git a/python/cuvs/cuvs/test/test_hnsw.py b/python/cuvs/cuvs/test/test_hnsw.py index 0ae97266b..20f583ae8 100644 --- a/python/cuvs/cuvs/test/test_hnsw.py +++ b/python/cuvs/cuvs/test/test_hnsw.py @@ -23,7 +23,7 @@ def run_hnsw_build_search_test( - n_rows=1000, + n_rows=10000, n_cols=10, n_queries=100, k=10, @@ -32,6 +32,7 @@ def run_hnsw_build_search_test( build_algo="ivf_pq", intermediate_graph_degree=128, graph_degree=64, + hierarchy="none", search_params={}, ): dataset = generate_data((n_rows, n_cols), dtype) @@ -41,6 +42,85 @@ def run_hnsw_build_search_test( pytest.skip( "inner_product metric is not supported for int8/uint8 data" ) + + build_params = cagra.IndexParams( + metric=metric, + intermediate_graph_degree=intermediate_graph_degree, + graph_degree=graph_degree, + build_algo=build_algo, + ) + + index = cagra.build(build_params, dataset) + + assert index.trained + + hnsw_params = hnsw.IndexParams(hierarchy=hierarchy, num_threads=1) + hnsw_index = hnsw.from_cagra(hnsw_params, index) + + queries = generate_data((n_queries, n_cols), dtype) + + search_params = hnsw.SearchParams(**search_params) + + out_dist, out_idx = hnsw.search(search_params, hnsw_index, queries, k) + + # Calculate reference values with sklearn + skl_metric = { + "sqeuclidean": "sqeuclidean", + "inner_product": "cosine", + "euclidean": "euclidean", + }[metric] + nn_skl = NearestNeighbors( + n_neighbors=k, algorithm="brute", metric=skl_metric + ) + nn_skl.fit(dataset) + skl_dist, skl_idx = nn_skl.kneighbors(queries, return_distance=True) + + recall = calc_recall(out_idx, skl_idx) + assert recall > 0.95 + + +@pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8]) +@pytest.mark.parametrize("k", [10, 20]) +@pytest.mark.parametrize("ef", [30, 40]) +@pytest.mark.parametrize("num_threads", [2, 4]) +@pytest.mark.parametrize("metric", ["sqeuclidean", "inner_product"]) +@pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"]) +@pytest.mark.parametrize("hierarchy", ["none", "cpu"]) +def test_hnsw(dtype, k, ef, num_threads, metric, build_algo, hierarchy): + # Note that inner_product tests use normalized input which we cannot + # represent in int8, therefore we test only sqeuclidean metric here. 
+ run_hnsw_build_search_test( + dtype=dtype, + k=k, + metric=metric, + build_algo=build_algo, + hierarchy=hierarchy, + search_params={"ef": ef, "num_threads": num_threads}, + ) + + +def run_hnsw_extend_test( + n_rows=10000, + add_rows=2000, + n_cols=10, + n_queries=100, + k=10, + dtype=np.float32, + metric="sqeuclidean", + build_algo="ivf_pq", + intermediate_graph_degree=128, + graph_degree=64, + search_params={}, +): + dataset = generate_data((n_rows, n_cols), dtype) + add_dataset = generate_data((add_rows, n_cols), dtype) + if metric == "inner_product": + dataset = normalize(dataset, norm="l2", axis=1) + add_dataset = normalize(add_dataset, norm="l2", axis=1) + if dtype in [np.int8, np.uint8]: + pytest.skip( + "inner_product metric is not supported for int8/uint8 data" + ) if build_algo == "nn_descent": pytest.skip("inner_product metric is not supported for nn_descent") @@ -55,7 +135,9 @@ def run_hnsw_build_search_test( assert index.trained - hnsw_index = hnsw.from_cagra(index) + hnsw_params = hnsw.IndexParams(hierarchy="cpu", num_threads=1) + hnsw_index = hnsw.from_cagra(hnsw_params, index) + hnsw.extend(hnsw.ExtendParams(), hnsw_index, add_dataset) queries = generate_data((n_queries, n_cols), dtype) @@ -72,10 +154,11 @@ def run_hnsw_build_search_test( nn_skl = NearestNeighbors( n_neighbors=k, algorithm="brute", metric=skl_metric ) - nn_skl.fit(dataset) + nn_skl.fit(np.vstack([dataset, add_dataset])) skl_dist, skl_idx = nn_skl.kneighbors(queries, return_distance=True) recall = calc_recall(out_idx, skl_idx) + print(recall) assert recall > 0.95 @@ -85,10 +168,10 @@ def run_hnsw_build_search_test( @pytest.mark.parametrize("num_threads", [2, 4]) @pytest.mark.parametrize("metric", ["sqeuclidean"]) @pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"]) -def test_hnsw(dtype, k, ef, num_threads, metric, build_algo): +def test_hnsw_extend(dtype, k, ef, num_threads, metric, build_algo): # Note that inner_product tests use normalized input which we cannot # represent in int8, therefore we test only sqeuclidean metric here. 
- run_hnsw_build_search_test( + run_hnsw_extend_test( dtype=dtype, k=k, metric=metric, diff --git a/python/cuvs/cuvs/test/test_serialization.py b/python/cuvs/cuvs/test/test_serialization.py index 4ffccf121..1f4a54e87 100644 --- a/python/cuvs/cuvs/test/test_serialization.py +++ b/python/cuvs/cuvs/test/test_serialization.py @@ -17,7 +17,7 @@ import pytest from pylibraft.common import device_ndarray -from cuvs.neighbors import cagra, ivf_flat, ivf_pq +from cuvs.neighbors import brute_force, cagra, ivf_flat, ivf_pq from cuvs.test.ann_utils import generate_data @@ -35,6 +35,10 @@ def test_save_load_ivf_pq(): run_save_load(ivf_pq, np.float32) +def test_save_load_brute_force(): + run_save_load(brute_force, np.float32) + + def run_save_load(ann_module, dtype): n_rows = 10000 n_cols = 50 @@ -43,8 +47,11 @@ def run_save_load(ann_module, dtype): dataset = generate_data((n_rows, n_cols), dtype) dataset_device = device_ndarray(dataset) - build_params = ann_module.IndexParams() - index = ann_module.build(build_params, dataset_device) + if ann_module == brute_force: + index = ann_module.build(dataset_device) + else: + build_params = ann_module.IndexParams() + index = ann_module.build(build_params, dataset_device) assert index.trained filename = "my_index.bin" @@ -54,20 +61,29 @@ def run_save_load(ann_module, dtype): queries = generate_data((n_queries, n_cols), dtype) queries_device = device_ndarray(queries) - search_params = ann_module.SearchParams() k = 10 - - distance_dev, neighbors_dev = ann_module.search( - search_params, index, queries_device, k - ) + if ann_module == brute_force: + distance_dev, neighbors_dev = ann_module.search( + index, queries_device, k + ) + else: + search_params = ann_module.SearchParams() + distance_dev, neighbors_dev = ann_module.search( + search_params, index, queries_device, k + ) neighbors = neighbors_dev.copy_to_host() dist = distance_dev.copy_to_host() del index - distance_dev, neighbors_dev = ann_module.search( - search_params, loaded_index, queries_device, k - ) + if ann_module == brute_force: + distance_dev, neighbors_dev = ann_module.search( + loaded_index, queries_device, k + ) + else: + distance_dev, neighbors_dev = ann_module.search( + search_params, loaded_index, queries_device, k + ) neighbors2 = neighbors_dev.copy_to_host() dist2 = distance_dev.copy_to_host() diff --git a/python/cuvs/pyproject.toml b/python/cuvs/pyproject.toml index e26b9e7fc..92e4993c7 100644 --- a/python/cuvs/pyproject.toml +++ b/python/cuvs/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "nvidia-curand", "nvidia-cusolver", "nvidia-cusparse", - "pylibraft==24.10.*", + "pylibraft==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ "Intended Audience :: Developers", @@ -133,7 +133,14 @@ build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" +[tool.pydistcheck] +select = [ + # NOTE: size threshold is managed via CLI args in CI scripts + "distro-too-large-compressed", +] + [tool.pytest.ini_options] filterwarnings = [ "error", + "ignore:.*cuda..* module is deprecated.*:DeprecationWarning" ] diff --git a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml index f1a7f272c..90a561bca 100644 --- a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml +++ b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml @@ -4,8 +4,11 @@ constraints: groups: base: build: - graph_degree: [32, 64, 128, 256] + graph_degree: [32, 64, 96, 128] intermediate_graph_degree: [32, 64, 96, 128] graph_build_algo: ["NN_DESCENT"] + hierarchy: ["none", "cpu"] + ef_construction: [64, 128, 256, 512] + num_threads: [2, 5, 10] search: ef: [10, 20, 40, 60, 80, 120, 200, 400, 600, 800] diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py index 2b4213016..88ec55dfa 100644 --- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py +++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py @@ -15,69 +15,206 @@ # limitations under the License. # import argparse +import importlib import os import sys +import warnings -import cupy as cp -import numpy as np -import rmm -from pylibraft.common import DeviceResources -from rmm.allocators.cupy import rmm_cupy_allocator +from .utils import memmap_bin_file, suffix_from_dtype, write_bin -from cuvs.neighbors.brute_force import knn -from .utils import memmap_bin_file, suffix_from_dtype, write_bin +def import_with_fallback(primary_lib, secondary_lib=None, alias=None): + """ + Attempt to import a primary library, with an optional fallback to a + secondary library. + Optionally assigns the imported module to a global alias. + + Parameters + ---------- + primary_lib : str + Name of the primary library to import. + secondary_lib : str, optional + Name of the secondary library to use as a fallback. If `None`, + no fallback is attempted. + alias : str, optional + Alias to assign the imported module globally. + + Returns + ------- + module or None + The imported module if successful; otherwise, `None`. + + Examples + -------- + >>> xp = import_with_fallback('cupy', 'numpy') + >>> mod = import_with_fallback('nonexistent_lib') + >>> if mod is None: + ... print("Library not found.") + """ + try: + module = importlib.import_module(primary_lib) + except ImportError: + if secondary_lib is not None: + try: + module = importlib.import_module(secondary_lib) + except ImportError: + module = None + else: + module = None + if alias and module is not None: + globals()[alias] = module + return module + + +xp = import_with_fallback("cupy", "numpy") +rmm = import_with_fallback("rmm") +gpu_system = False + +def force_fallback_to_numpy(): + global xp, gpu_system + xp = import_with_fallback("numpy") + gpu_system = False + warnings.warn( + "Consider using a GPU-based system to greatly accelerate " + " generating groundtruths using cuVS." 
+ ) + + +if rmm is not None: + gpu_system = True + try: + from pylibraft.common import DeviceResources + from rmm.allocators.cupy import rmm_cupy_allocator -def generate_random_queries(n_queries, n_features, dtype=np.float32): + from cuvs.neighbors.brute_force import build, search + except ImportError: + # RMM is available, cupy is available, but cuVS is not + force_fallback_to_numpy() +else: + # No RMM, no cuVS, but cupy is available + force_fallback_to_numpy() + + +def generate_random_queries(n_queries, n_features, dtype=xp.float32): print("Generating random queries") - if np.issubdtype(dtype, np.integer): - queries = cp.random.randint( + if xp.issubdtype(dtype, xp.integer): + queries = xp.random.randint( 0, 255, size=(n_queries, n_features), dtype=dtype ) else: - queries = cp.random.uniform(size=(n_queries, n_features)).astype(dtype) + queries = xp.random.uniform(size=(n_queries, n_features)).astype(dtype) return queries def choose_random_queries(dataset, n_queries): print("Choosing random vector from dataset as query vectors") - query_idx = np.random.choice( + query_idx = xp.random.choice( dataset.shape[0], size=(n_queries,), replace=False ) return dataset[query_idx, :] +def cpu_search(dataset, queries, k, metric="squeclidean"): + """ + Find the k nearest neighbors for each query point in the dataset using the + specified metric. + + Parameters + ---------- + dataset : numpy.ndarray + An array of shape (n_samples, n_features) representing the dataset. + queries : numpy.ndarray + An array of shape (n_queries, n_features) representing the query + points. + k : int + The number of nearest neighbors to find. + metric : str, optional + The distance metric to use. Can be 'squeclidean' or 'inner_product'. + Default is 'squeclidean'. + + Returns + ------- + distances : numpy.ndarray + An array of shape (n_queries, k) containing the distances + (for 'squeclidean') or similarities + (for 'inner_product') to the k nearest neighbors for each query. + indices : numpy.ndarray + An array of shape (n_queries, k) containing the indices of the + k nearest neighbors in the dataset for each query. + + """ + if metric == "squeclidean": + diff = queries[:, xp.newaxis, :] - dataset[xp.newaxis, :, :] + dist_sq = xp.sum(diff**2, axis=2) # Shape: (n_queries, n_samples) + + indices = xp.argpartition(dist_sq, kth=k - 1, axis=1)[:, :k] + distances = xp.take_along_axis(dist_sq, indices, axis=1) + + sorted_idx = xp.argsort(distances, axis=1) + distances = xp.take_along_axis(distances, sorted_idx, axis=1) + indices = xp.take_along_axis(indices, sorted_idx, axis=1) + + elif metric == "inner_product": + similarities = xp.dot( + queries, dataset.T + ) # Shape: (n_queries, n_samples) + + neg_similarities = -similarities + indices = xp.argpartition(neg_similarities, kth=k - 1, axis=1)[:, :k] + distances = xp.take_along_axis(similarities, indices, axis=1) + + sorted_idx = xp.argsort(-distances, axis=1) + + else: + raise ValueError( + "Unsupported metric in cuvs-bench-cpu. " + "Use 'squeclidean' or 'inner_product' or use the GPU package" + "to use any distance supported by cuVS." 
+ ) + + distances = xp.take_along_axis(distances, sorted_idx, axis=1) + indices = xp.take_along_axis(indices, sorted_idx, axis=1) + + return distances, indices + + def calc_truth(dataset, queries, k, metric="sqeuclidean"): - handle = DeviceResources() n_samples = dataset.shape[0] n = 500000 # batch size for processing neighbors i = 0 indices = None distances = None - queries = cp.asarray(queries, dtype=cp.float32) + queries = xp.asarray(queries, dtype=xp.float32) + + if gpu_system: + resources = DeviceResources() while i < n_samples: print("Step {0}/{1}:".format(i // n, n_samples // n)) n_batch = n if i + n <= n_samples else n_samples - i - X = cp.asarray(dataset[i : i + n_batch, :], cp.float32) + X = xp.asarray(dataset[i : i + n_batch, :], xp.float32) - D, Ind = knn(X, queries, k, metric=metric, handle=handle) - handle.sync() + if gpu_system: + index = build(X, metric=metric, resources=resources) + D, Ind = search(index, queries, k, resources=resources) + resources.sync() + else: + D, Ind = cpu_search(X, queries, metric=metric) - D, Ind = cp.asarray(D), cp.asarray(Ind) + D, Ind = xp.asarray(D), xp.asarray(Ind) Ind += i # shift neighbor index by offset i if distances is None: distances = D indices = Ind else: - distances = cp.concatenate([distances, D], axis=1) - indices = cp.concatenate([indices, Ind], axis=1) - idx = cp.argsort(distances, axis=1)[:, :k] - distances = cp.take_along_axis(distances, idx, axis=1) - indices = cp.take_along_axis(indices, idx, axis=1) + distances = xp.concatenate([distances, D], axis=1) + indices = xp.concatenate([indices, Ind], axis=1) + idx = xp.argsort(distances, axis=1)[:, :k] + distances = xp.take_along_axis(distances, idx, axis=1) + indices = xp.take_along_axis(indices, idx, axis=1) i += n_batch @@ -85,11 +222,15 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): def main(): - pool = rmm.mr.PoolMemoryResource( - rmm.mr.CudaMemoryResource(), initial_pool_size=2**30 - ) - rmm.mr.set_current_device_resource(pool) - cp.cuda.set_allocator(rmm_cupy_allocator) + if gpu_system and xp.__name__ == "cupy": + pool = rmm.mr.PoolMemoryResource( + rmm.mr.CudaMemoryResource(), initial_pool_size=2**30 + ) + rmm.mr.set_current_device_resource(pool) + xp.cuda.set_allocator(rmm_cupy_allocator) + else: + # RMM is available, but cupy is not + force_fallback_to_numpy() parser = argparse.ArgumentParser( prog="generate_groundtruth", @@ -196,7 +337,7 @@ def main(): "Dataset size {:6.1f} GB, shape {}, dtype {}".format( dataset.size * dataset.dtype.itemsize / 1e9, dataset.shape, - np.dtype(dtype), + xp.dtype(dtype), ) ) @@ -229,11 +370,11 @@ def main(): write_bin( os.path.join(args.output, "groundtruth.neighbors.ibin"), - indices.astype(np.uint32), + indices.astype(xp.uint32), ) write_bin( os.path.join(args.output, "groundtruth.distances.fbin"), - distances.astype(np.float32), + distances.astype(xp.float32), ) diff --git a/python/cuvs_bench/cuvs_bench/run/data_export.py b/python/cuvs_bench/cuvs_bench/run/data_export.py index 997dab500..1d0ac40a0 100644 --- a/python/cuvs_bench/cuvs_bench/run/data_export.py +++ b/python/cuvs_bench/cuvs_bench/run/data_export.py @@ -17,7 +17,6 @@ import json import os import traceback -import warnings import pandas as pd @@ -170,44 +169,6 @@ def convert_json_to_csv_build(dataset, dataset_path): traceback.print_exc() -def append_build_data(write, build_file): - """ - Append build data to the search DataFrame. 
- - Parameters - ---------- - write : pandas.DataFrame - The DataFrame containing the search data to which build - data will be appended. - build_file : str - The file path to the build CSV file. - """ - if os.path.exists(build_file): - build_df = pd.read_csv(build_file) - write_ncols = len(write.columns) - # Initialize columns for build data - build_columns = [ - "build time", - "build threads", - "build cpu_time", - "build GPU", - ] - write = write.assign(**{col: None for col in build_columns}) - # Append additional columns if available - for col_name in build_df.columns[6:]: - write[col_name] = None - # Match build rows with search rows by index_name - for s_index, search_row in write.iterrows(): - for b_index, build_row in build_df.iterrows(): - if search_row["index_name"] == build_row["index_name"]: - write.iloc[s_index, write_ncols:] = build_row[2:].values - break - else: - warnings.warn( - f"Build CSV not found for {build_file}, build params not appended." - ) - - def convert_json_to_csv_search(dataset, dataset_path): """ Convert search JSON files to CSV format. @@ -232,7 +193,7 @@ def convert_json_to_csv_search(dataset, dataset_path): ) algo_name = clean_algo_name(algo_name) df["name"] = df["name"].str.split("/").str[0] - write_data = pd.DataFrame( + write = pd.DataFrame( { "algo_name": [algo_name] * len(df), "index_name": df["name"], @@ -242,11 +203,35 @@ def convert_json_to_csv_search(dataset, dataset_path): } ) # Append build data - append_build_data(write_data, build_file) + for name in df: + if name not in skip_search_cols: + write[name] = df[name] + if os.path.exists(build_file): + build_df = pd.read_csv(build_file) + write_ncols = len(write.columns) + write["build time"] = None + write["build threads"] = None + write["build cpu_time"] = None + write["build GPU"] = None + + for col_idx in range(6, len(build_df.columns)): + col_name = build_df.columns[col_idx] + write[col_name] = None + + for s_index, search_row in write.iterrows(): + for b_index, build_row in build_df.iterrows(): + if search_row["index_name"] == build_row["index_name"]: + write.iloc[s_index, write_ncols] = build_df.iloc[ + b_index, 2 + ] + write.iloc[ + s_index, write_ncols + 1 : + ] = build_df.iloc[b_index, 3:] + break # Write search data and compute frontiers - write_data.to_csv(file.replace(".json", ",raw.csv"), index=False) - write_frontier(file, write_data, "throughput") - write_frontier(file, write_data, "latency") + write.to_csv(file.replace(".json", ",raw.csv"), index=False) + write_frontier(file, write, "throughput") + write_frontier(file, write, "latency") except Exception as e: print(f"Error processing search file {file}: {e}. 
Skipping...") traceback.print_exc() diff --git a/python/cuvs_bench/pyproject.toml b/python/cuvs_bench/pyproject.toml index 41ebad116..5b17f7228 100644 --- a/python/cuvs_bench/pyproject.toml +++ b/python/cuvs_bench/pyproject.toml @@ -19,6 +19,7 @@ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ "click", + "cuvs==24.12.*,>=0.0.0a0", "matplotlib", "pandas", "pyyaml", diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 52125bef3..79aa5756a 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -6,7 +6,7 @@ members = [ resolver = "2" [workspace.package] -version = "24.10.0" +version = "24.12.0" edition = "2021" repository = "https://github.com/rapidsai/cuvs" homepage = "https://github.com/rapidsai/cuvs" diff --git a/rust/cuvs/Cargo.toml b/rust/cuvs/Cargo.toml index 7e5b18143..13cc658e3 100644 --- a/rust/cuvs/Cargo.toml +++ b/rust/cuvs/Cargo.toml @@ -9,7 +9,7 @@ authors.workspace = true license.workspace = true [dependencies] -ffi = { package = "cuvs-sys", path = "../cuvs-sys", version = "24.10.0" } +ffi = { package = "cuvs-sys", path = "../cuvs-sys", version = "24.12.0" } ndarray = "0.15" [dev-dependencies]