From 6f72fe7c4161b030c212e68c5858dcd59199a813 Mon Sep 17 00:00:00 2001
From: Ray Douglass <ray@raydouglass.com>
Date: Thu, 19 Sep 2024 12:06:59 -0400
Subject: [PATCH 01/47] DOC v24.12 Updates [skip ci]

---
 .../cuda11.8-conda/devcontainer.json          |  6 ++---
 .devcontainer/cuda11.8-pip/devcontainer.json  |  8 +++----
 .../cuda12.5-conda/devcontainer.json          |  6 ++---
 .devcontainer/cuda12.5-pip/devcontainer.json  |  8 +++----
 .github/workflows/build.yaml                  | 14 +++++------
 .github/workflows/pr.yaml                     | 24 +++++++++----------
 .github/workflows/test.yaml                   |  8 +++----
 README.md                                     |  2 +-
 VERSION                                       |  2 +-
 .../all_cuda-118_arch-aarch64.yaml            |  4 ++--
 .../all_cuda-118_arch-x86_64.yaml             |  4 ++--
 .../all_cuda-125_arch-aarch64.yaml            |  4 ++--
 .../all_cuda-125_arch-x86_64.yaml             |  4 ++--
 .../bench_ann_cuda-118_arch-aarch64.yaml      |  4 ++--
 .../bench_ann_cuda-118_arch-x86_64.yaml       |  4 ++--
 .../bench_ann_cuda-125_arch-aarch64.yaml      |  4 ++--
 .../bench_ann_cuda-125_arch-x86_64.yaml       |  4 ++--
 dependencies.yaml                             | 12 +++++-----
 docs/source/developer_guide.md                |  6 ++---
 examples/cmake/thirdparty/fetch_rapids.cmake  |  2 +-
 python/cuvs/pyproject.toml                    |  2 +-
 rust/Cargo.toml                               |  2 +-
 rust/cuvs/Cargo.toml                          |  2 +-
 23 files changed, 68 insertions(+), 68 deletions(-)

diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
index 13103e8f7..05f11c005 100644
--- a/.devcontainer/cuda11.8-conda/devcontainer.json
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -5,17 +5,17 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-mambaforge-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-conda"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index 74d62afcc..b4c507f86 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -5,24 +5,24 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-ucx1.17.0-openmpi-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-ucx1.17.0-openmpi-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-pip"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/cuda:24.10": {
+    "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": {
       "version": "11.8",
       "installcuBLAS": true,
       "installcuSOLVER": true,
       "installcuRAND": true,
       "installcuSPARSE": true
     },
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/ucx",
diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json
index d6902d3f9..4f8d628c2 100644
--- a/.devcontainer/cuda12.5-conda/devcontainer.json
+++ b/.devcontainer/cuda12.5-conda/devcontainer.json
@@ -5,17 +5,17 @@
     "args": {
       "CUDA": "12.5",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:24.10-cpp-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.12-cpp-mambaforge-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-conda"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json
index 3dcf52e83..8e6ba4de8 100644
--- a/.devcontainer/cuda12.5-pip/devcontainer.json
+++ b/.devcontainer/cuda12.5-pip/devcontainer.json
@@ -5,24 +5,24 @@
     "args": {
       "CUDA": "12.5",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ucx1.17.0-openmpi-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.12-cpp-cuda12.5-ucx1.17.0-openmpi-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-pip"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/cuda:24.10": {
+    "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": {
       "version": "12.5",
       "installcuBLAS": true,
       "installcuSOLVER": true,
       "installcuRAND": true,
       "installcuSPARSE": true
     },
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/ucx",
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index db20bdbc1..7ac02e365 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   rust-build:
     needs: cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -50,7 +50,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -59,7 +59,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -70,7 +70,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -82,7 +82,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-cuvs:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -92,7 +92,7 @@ jobs:
   wheel-publish-cuvs:
     needs: wheel-build-cuvs
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 07b10e85a..4e3fb600a 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -24,29 +24,29 @@ jobs:
       - wheel-tests-cuvs
       - devcontainer
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12
     with:
       build_type: pull-request
       node_type: cpu16
   conda-cpp-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12
     with:
       build_type: pull-request
   conda-cpp-checks:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.12
     with:
       build_type: pull-request
       enable_check_symbols: true
@@ -54,19 +54,19 @@ jobs:
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12
     with:
       build_type: pull-request
   conda-python-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
     with:
       build_type: pull-request
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -76,7 +76,7 @@ jobs:
   rust-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -86,20 +86,20 @@ jobs:
   wheel-build-cuvs:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
     with:
       build_type: pull-request
       script: ci/build_wheel_cuvs.sh
   wheel-tests-cuvs:
     needs: wheel-build-cuvs
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
     with:
       build_type: pull-request
       script: ci/test_wheel_cuvs.sh
   devcontainer:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.12
     with:
       arch: '["amd64"]'
       cuda: '["12.5"]'
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 0821233a1..5f60c0a34 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -26,7 +26,7 @@ jobs:
       symbol_exclusions: (void (thrust::|cub::)|raft_cutlass)
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -34,7 +34,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -42,7 +42,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-tests-cuvs:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/README.md b/README.md
index e697c61ed..e23b94616 100755
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@ mamba install -c conda-forge -c nvidia -c rapidsai cuvs
 If installing a version that has not yet been released, the `rapidsai` channel can be replaced with `rapidsai-nightly`:
 
 ```bash
-mamba install -c conda-forge -c nvidia -c rapidsai-nightly cuvs=24.10
+mamba install -c conda-forge -c nvidia -c rapidsai-nightly cuvs=24.12
 ```
 
 Please see the [Build and Install Guide](https://docs.rapids.ai/api/cuvs/stable/build/) for more information on installing cuVS and building from source.
diff --git a/VERSION b/VERSION
index 7c7ba0443..af28c42b5 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-24.10.00
+24.12.00
diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index cfcb56225..cb5c804d9 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -35,7 +35,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
-- librmm==24.10.*,>=0.0.0a0
+- librmm==24.12.*,>=0.0.0a0
 - make
 - nccl>=2.9.9
 - ninja
@@ -45,7 +45,7 @@ dependencies:
 - openblas
 - pre-commit
 - pydata-sphinx-theme
-- pylibraft==24.10.*,>=0.0.0a0
+- pylibraft==24.12.*,>=0.0.0a0
 - pytest-cov
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index dc519d1b5..3b126c1dc 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -35,7 +35,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
-- librmm==24.10.*,>=0.0.0a0
+- librmm==24.12.*,>=0.0.0a0
 - make
 - nccl>=2.9.9
 - ninja
@@ -45,7 +45,7 @@ dependencies:
 - openblas
 - pre-commit
 - pydata-sphinx-theme
-- pylibraft==24.10.*,>=0.0.0a0
+- pylibraft==24.12.*,>=0.0.0a0
 - pytest-cov
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml
index b32650e44..0eafb709c 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -32,7 +32,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- librmm==24.10.*,>=0.0.0a0
+- librmm==24.12.*,>=0.0.0a0
 - make
 - nccl>=2.9.9
 - ninja
@@ -41,7 +41,7 @@ dependencies:
 - openblas
 - pre-commit
 - pydata-sphinx-theme
-- pylibraft==24.10.*,>=0.0.0a0
+- pylibraft==24.12.*,>=0.0.0a0
 - pytest-cov
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index d40fc3b99..fc15743c0 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -32,7 +32,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- librmm==24.10.*,>=0.0.0a0
+- librmm==24.12.*,>=0.0.0a0
 - make
 - nccl>=2.9.9
 - ninja
@@ -41,7 +41,7 @@ dependencies:
 - openblas
 - pre-commit
 - pydata-sphinx-theme
-- pylibraft==24.10.*,>=0.0.0a0
+- pylibraft==24.12.*,>=0.0.0a0
 - pytest-cov
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
index c6e8b05a2..47d012c03 100644
--- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -33,7 +33,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
-- librmm==24.10.*,>=0.0.0a0
+- librmm==24.12.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.9.9
 - ninja
@@ -41,7 +41,7 @@ dependencies:
 - nvcc_linux-aarch64=11.8
 - openblas
 - pandas
-- pylibraft==24.10.*,>=0.0.0a0
+- pylibraft==24.12.*,>=0.0.0a0
 - pyyaml
 - sysroot_linux-aarch64==2.17
 name: bench_ann_cuda-118_arch-aarch64
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index d6c023ae9..ae7a64e44 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -33,7 +33,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
-- librmm==24.10.*,>=0.0.0a0
+- librmm==24.12.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.9.9
 - ninja
@@ -41,7 +41,7 @@ dependencies:
 - nvcc_linux-64=11.8
 - openblas
 - pandas
-- pylibraft==24.10.*,>=0.0.0a0
+- pylibraft==24.12.*,>=0.0.0a0
 - pyyaml
 - sysroot_linux-64==2.17
 name: bench_ann_cuda-118_arch-x86_64
diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
index 4d0ca9496..3807661eb 100644
--- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
@@ -30,14 +30,14 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- librmm==24.10.*,>=0.0.0a0
+- librmm==24.12.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.9.9
 - ninja
 - nlohmann_json>=3.11.2
 - openblas
 - pandas
-- pylibraft==24.10.*,>=0.0.0a0
+- pylibraft==24.12.*,>=0.0.0a0
 - pyyaml
 - sysroot_linux-aarch64==2.17
 name: bench_ann_cuda-125_arch-aarch64
diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
index 7dd67ab5e..14182f865 100644
--- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
@@ -30,14 +30,14 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- librmm==24.10.*,>=0.0.0a0
+- librmm==24.12.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.9.9
 - ninja
 - nlohmann_json>=3.11.2
 - openblas
 - pandas
-- pylibraft==24.10.*,>=0.0.0a0
+- pylibraft==24.12.*,>=0.0.0a0
 - pyyaml
 - sysroot_linux-64==2.17
 name: bench_ann_cuda-125_arch-x86_64
diff --git a/dependencies.yaml b/dependencies.yaml
index 9fcbeaae2..956f33196 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -481,7 +481,7 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &librmm_unsuffixed librmm==24.10.*,>=0.0.0a0
+          - &librmm_unsuffixed librmm==24.12.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -494,18 +494,18 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - librmm-cu12==24.10.*,>=0.0.0a0
+              - librmm-cu12==24.12.*,>=0.0.0a0
           - matrix:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
-              - librmm-cu11==24.10.*,>=0.0.0a0
+              - librmm-cu11==24.12.*,>=0.0.0a0
           - {matrix: null, packages: [*librmm_unsuffixed]}
   depends_on_pylibraft:
     common:
       - output_types: conda
         packages:
-          - &pylibraft_unsuffixed pylibraft==24.10.*,>=0.0.0a0
+          - &pylibraft_unsuffixed pylibraft==24.12.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -518,10 +518,10 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - pylibraft-cu12==24.10.*,>=0.0.0a0
+              - pylibraft-cu12==24.12.*,>=0.0.0a0
           - matrix:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
-              - pylibraft-cu11==24.10.*,>=0.0.0a0
+              - pylibraft-cu11==24.12.*,>=0.0.0a0
           - {matrix: null, packages: [*pylibraft_unsuffixed]}
diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md
index 516819b1c..c4a099fab 100644
--- a/docs/source/developer_guide.md
+++ b/docs/source/developer_guide.md
@@ -187,7 +187,7 @@ RAFT relies on `clang-format` to enforce code style across all C++ and CUDA sour
 1. Do not split empty functions/records/namespaces.
 2. Two-space indentation everywhere, including the line continuations.
 3. Disable reflowing of comments.
-   The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-24.10/cpp/.clang-format).
+   The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-24.12/cpp/.clang-format).
 
 [`doxygen`](https://doxygen.nl/) is used as documentation generator and also as a documentation linter.
 In order to run doxygen as a linter on C++/CUDA code, run
@@ -205,7 +205,7 @@ you can run  `codespell -i 3 -w .` from the repository root directory.
 This will bring up an interactive prompt to select which spelling fixes to apply.
 
 ### #include style
-[include_checker.py](https://github.com/rapidsai/raft/blob/branch-24.10/cpp/scripts/include_checker.py) is used to enforce the include style as follows:
+[include_checker.py](https://github.com/rapidsai/raft/blob/branch-24.12/cpp/scripts/include_checker.py) is used to enforce the include style as follows:
 1. `#include "..."` should be used for referencing local files only. It is acceptable to be used for referencing files in a sub-folder/parent-folder of the same algorithm, but should never be used to include files in other algorithms or between algorithms and the primitives or other dependencies.
 2. `#include <...>` should be used for referencing everything else
 
@@ -230,7 +230,7 @@ Call CUDA APIs via the provided helper macros `RAFT_CUDA_TRY`, `RAFT_CUBLAS_TRY`
 ## Logging
 
 ### Introduction
-Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-24.10/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all.
+Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-24.12/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all.
 
 ### Usage
 ```cpp
diff --git a/examples/cmake/thirdparty/fetch_rapids.cmake b/examples/cmake/thirdparty/fetch_rapids.cmake
index f64a924cf..6f4c627ed 100644
--- a/examples/cmake/thirdparty/fetch_rapids.cmake
+++ b/examples/cmake/thirdparty/fetch_rapids.cmake
@@ -12,7 +12,7 @@
 # the License.
 
 # Use this variable to update RAPIDS and RAFT versions
-set(RAPIDS_VERSION "24.10")
+set(RAPIDS_VERSION "24.12")
 
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake)
     file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake
diff --git a/python/cuvs/pyproject.toml b/python/cuvs/pyproject.toml
index 68bd9a868..bf62f5adf 100644
--- a/python/cuvs/pyproject.toml
+++ b/python/cuvs/pyproject.toml
@@ -37,7 +37,7 @@ dependencies = [
     "nvidia-curand",
     "nvidia-cusolver",
     "nvidia-cusparse",
-    "pylibraft==24.10.*,>=0.0.0a0",
+    "pylibraft==24.12.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index 52125bef3..79aa5756a 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -6,7 +6,7 @@ members = [
 resolver = "2"
 
 [workspace.package]
-version = "24.10.0"
+version = "24.12.0"
 edition = "2021"
 repository = "https://github.com/rapidsai/cuvs"
 homepage = "https://github.com/rapidsai/cuvs"
diff --git a/rust/cuvs/Cargo.toml b/rust/cuvs/Cargo.toml
index 7e5b18143..13cc658e3 100644
--- a/rust/cuvs/Cargo.toml
+++ b/rust/cuvs/Cargo.toml
@@ -9,7 +9,7 @@ authors.workspace = true
 license.workspace = true
 
 [dependencies]
-ffi = { package = "cuvs-sys", path = "../cuvs-sys", version = "24.10.0" }
+ffi = { package = "cuvs-sys", path = "../cuvs-sys", version = "24.12.0" }
 ndarray = "0.15"
 
 [dev-dependencies]

From 2ad639702b3912e4eb037a6817335e48dc90ad73 Mon Sep 17 00:00:00 2001
From: Micka <ide.mickael@gmail.com>
Date: Fri, 27 Sep 2024 20:21:10 +0200
Subject: [PATCH 02/47] Fix Question Retrieval notebook (#352)

Authors:
  - Micka (https://github.com/lowener)
  - Corey J. Nolet (https://github.com/cjnolet)
  - rhdong (https://github.com/rhdong)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/352
---
 notebooks/VectorSearch_QuestionRetrieval.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/notebooks/VectorSearch_QuestionRetrieval.ipynb b/notebooks/VectorSearch_QuestionRetrieval.ipynb
index 21d59975b..1115a5920 100644
--- a/notebooks/VectorSearch_QuestionRetrieval.ipynb
+++ b/notebooks/VectorSearch_QuestionRetrieval.ipynb
@@ -160,7 +160,7 @@
    },
    "outputs": [],
    "source": [
-    "pq_index_mem = pq_index.pq_dim * pq_index.size * pq_index.pq_bits\n",
+    "pq_index_mem = params.pq_dim * corpus_embeddings.shape[0] * params.pq_bits\n",
     "print(\"IVF-PQ memory footprint: {:.1f} MB\".format(pq_index_mem / 2**20))\n",
     "\n",
     "original_mem = corpus_embeddings.shape[0] * corpus_embeddings.shape[1] * 4\n",

From 397e56e0df4a430edc7f8e16e572fc8a03a0e0c0 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Fri, 4 Oct 2024 13:21:33 -0400
Subject: [PATCH 03/47] Prune workflows based on changed files (#392)

Contributes to https://github.com/rapidsai/build-planning/issues/94

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cuvs/pull/392
---
 .github/workflows/pr.yaml | 46 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 43 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 4e3fb600a..e18e82df0 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -12,6 +12,7 @@ concurrency:
 jobs:
   pr-builder:
     needs:
+      - changed-files
       - checks
       - conda-cpp-build
       - conda-cpp-tests
@@ -25,6 +26,42 @@ jobs:
       - devcontainer
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12
+    if: always()
+    with:
+      needs: ${{ toJSON(needs) }}
+  changed-files:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12
+    with:
+      files_yaml: |
+        test_cpp:
+          - '**'
+          - '!.devcontainer/**'
+          - '!.pre-commit-config.yaml'
+          - '!README.md'
+          - '!docs/**'
+          - '!img/**'
+          - '!notebooks/**'
+          - '!python/**'
+          - '!rust/**'
+          - '!thirdparty/LICENSES/**'
+        test_notebooks:
+          - '**'
+          - '!.devcontainer/**'
+          - '!.pre-commit-config.yaml'
+          - '!README.md'
+          - '!rust/**'
+          - '!thirdparty/LICENSES/**'
+        test_python:
+          - '**'
+          - '!.devcontainer/**'
+          - '!.pre-commit-config.yaml'
+          - '!README.md'
+          - '!docs/**'
+          - '!img/**'
+          - '!notebooks/**'
+          - '!rust/**'
+          - '!thirdparty/LICENSES/**'
   checks:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12
@@ -38,9 +75,10 @@ jobs:
       build_type: pull-request
       node_type: cpu16
   conda-cpp-tests:
-    needs: conda-cpp-build
+    needs: [conda-cpp-build, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12
+    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp
     with:
       build_type: pull-request
   conda-cpp-checks:
@@ -58,9 +96,10 @@ jobs:
     with:
       build_type: pull-request
   conda-python-tests:
-    needs: conda-python-build
+    needs: [conda-python-build, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
+    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
   docs-build:
@@ -91,9 +130,10 @@ jobs:
       build_type: pull-request
       script: ci/build_wheel_cuvs.sh
   wheel-tests-cuvs:
-    needs: wheel-build-cuvs
+    needs: [wheel-build-cuvs, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
+    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
       script: ci/test_wheel_cuvs.sh

From 7debf51ae3bd9817143544b4f6593688fcb159f2 Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Tue, 8 Oct 2024 22:19:51 +0200
Subject: [PATCH 04/47] Fix NVTX annotations (#400)

1) Replace the domain name from `raft` to `cuvs` to avoid confusion when using tools such as NSYS to analyze the program timeline.
2) Use C++17 feature `__has_include` instead of a CMake script to find out if NVTX available in the benchmark executable. It turns out our CMake check has been not reliable due to not finding include directories correctly.

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Micka (https://github.com/lowener)

URL: https://github.com/rapidsai/cuvs/pull/400
---
 cpp/bench/ann/CMakeLists.txt                  | 25 +++----------------
 cpp/bench/ann/src/common/util.hpp             |  3 ++-
 cpp/src/cluster/detail/kmeans.cuh             | 18 ++++++-------
 cpp/src/cluster/detail/kmeans_balanced.cuh    | 10 ++++----
 .../neighbors/detail/cagra/cagra_build.cuh    |  3 ++-
 .../neighbors/detail/cagra/cagra_search.cuh   |  4 +--
 .../detail/cagra/cagra_serialize.cuh          |  8 +++---
 cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh     |  6 +++++
 8 files changed, 33 insertions(+), 44 deletions(-)

diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt
index 8cbf8c8b3..ac1301221 100644
--- a/cpp/bench/ann/CMakeLists.txt
+++ b/cpp/bench/ann/CMakeLists.txt
@@ -87,21 +87,6 @@ if(CUVS_ANN_BENCH_USE_FAISS)
   include(cmake/thirdparty/get_faiss)
 endif()
 
-# ##################################################################################################
-# * Enable NVTX if available
-
-# Note: ANN_BENCH wrappers have extra NVTX code not related to raft::nvtx.They track gbench
-# benchmark cases and iterations. This is to make limited NVTX available to all algos, not just
-# raft/cuVS.
-if(TARGET CUDA::nvtx3)
-  set(_CMAKE_REQUIRED_INCLUDES_ORIG ${CMAKE_REQUIRED_INCLUDES})
-  get_target_property(CMAKE_REQUIRED_INCLUDES CUDA::nvtx3 INTERFACE_INCLUDE_DIRECTORIES)
-  unset(NVTX3_HEADERS_FOUND CACHE)
-  # Check the headers explicitly to make sure the cpu-only build succeeds
-  CHECK_INCLUDE_FILE_CXX(nvtx3/nvToolsExt.h NVTX3_HEADERS_FOUND)
-  set(CMAKE_REQUIRED_INCLUDES ${_CMAKE_REQUIRED_INCLUDES_ORIG})
-endif()
-
 # ##################################################################################################
 # * Target function -------------------------------------------------------------
 
@@ -127,12 +112,9 @@ function(ConfigureAnnBench)
     add_dependencies(${BENCH_NAME} ANN_BENCH)
   else()
     add_executable(${BENCH_NAME} ${ConfigureAnnBench_PATH})
-    target_compile_definitions(
-      ${BENCH_NAME} PRIVATE ANN_BENCH_BUILD_MAIN
-                            $<$<BOOL:${NVTX3_HEADERS_FOUND}>:ANN_BENCH_NVTX3_HEADERS_FOUND>
-    )
+    target_compile_definitions(${BENCH_NAME} PRIVATE ANN_BENCH_BUILD_MAIN>)
     target_link_libraries(
-      ${BENCH_NAME} PRIVATE benchmark::benchmark $<$<BOOL:${NVTX3_HEADERS_FOUND}>:CUDA::nvtx3>
+      ${BENCH_NAME} PRIVATE benchmark::benchmark $<$<TARGET_EXISTS:CUDA::nvtx3>:CUDA::nvtx3>
     )
   endif()
 
@@ -300,7 +282,7 @@ if(CUVS_ANN_BENCH_SINGLE_EXE)
   target_link_libraries(
     ANN_BENCH
     PRIVATE raft::raft nlohmann_json::nlohmann_json benchmark::benchmark dl fmt::fmt-header-only
-            spdlog::spdlog_header_only $<$<BOOL:${NVTX3_HEADERS_FOUND}>:CUDA::nvtx3>
+            spdlog::spdlog_header_only $<$<TARGET_EXISTS:CUDA::nvtx3>:CUDA::nvtx3>
   )
   set_target_properties(
     ANN_BENCH
@@ -318,7 +300,6 @@ if(CUVS_ANN_BENCH_SINGLE_EXE)
     ANN_BENCH
     PRIVATE
       $<$<BOOL:${CUDAToolkit_FOUND}>:ANN_BENCH_LINK_CUDART="libcudart.so.${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}.${CUDAToolkit_VERSION_PATCH}">
-      $<$<BOOL:${NVTX3_HEADERS_FOUND}>:ANN_BENCH_NVTX3_HEADERS_FOUND>
   )
 
   target_link_options(ANN_BENCH PRIVATE -export-dynamic)
diff --git a/cpp/bench/ann/src/common/util.hpp b/cpp/bench/ann/src/common/util.hpp
index c3db2bb4b..dbde74ccc 100644
--- a/cpp/bench/ann/src/common/util.hpp
+++ b/cpp/bench/ann/src/common/util.hpp
@@ -18,7 +18,8 @@
 #include "ann_types.hpp"
 #include "cuda_stub.hpp"  // cuda-related utils
 
-#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND
+#if __has_include(<nvtx3/nvToolsExt.h>)
+#define ANN_BENCH_NVTX3_HEADERS_FOUND
 #include <nvtx3/nvToolsExt.h>
 #endif
 
diff --git a/cpp/src/cluster/detail/kmeans.cuh b/cpp/src/cluster/detail/kmeans.cuh
index 9b673bca3..3d054f0fd 100644
--- a/cpp/src/cluster/detail/kmeans.cuh
+++ b/cpp/src/cluster/detail/kmeans.cuh
@@ -15,12 +15,12 @@
  */
 #pragma once
 
+#include "../../core/nvtx.hpp"
 #include "kmeans_common.cuh"
 
 #include <cuvs/cluster/kmeans.hpp>
 #include <cuvs/distance/distance.hpp>
 
-#include <raft/common/nvtx.hpp>
 #include <raft/core/cudart_utils.hpp>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/host_mdarray.hpp>
@@ -71,7 +71,7 @@ void initRandom(raft::resources const& handle,
                 raft::device_matrix_view<const DataT, IndexT> X,
                 raft::device_matrix_view<DataT, IndexT> centroids)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("initRandom");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("initRandom");
   auto n_clusters = params.n_clusters;
   cuvs::cluster::kmeans::detail::shuffleAndGather<DataT, IndexT>(
     handle, X, centroids, n_clusters, params.rng_state.seed);
@@ -98,7 +98,7 @@ void kmeansPlusPlus(raft::resources const& handle,
                     raft::device_matrix_view<DataT, IndexT> centroidsRawData,
                     rmm::device_uvector<char>& workspace)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("kmeansPlusPlus");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("kmeansPlusPlus");
   cudaStream_t stream = raft::resource::get_cuda_stream(handle);
   auto n_samples      = X.extent(0);
   auto n_features     = X.extent(1);
@@ -372,7 +372,7 @@ void kmeans_fit_main(raft::resources const& handle,
                      raft::host_scalar_view<IndexT> n_iter,
                      rmm::device_uvector<char>& workspace)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("kmeans_fit_main");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("kmeans_fit_main");
   raft::logger::get(RAFT_NAME).set_level(params.verbosity);
   cudaStream_t stream = raft::resource::get_cuda_stream(handle);
   auto n_samples      = X.extent(0);
@@ -590,7 +590,7 @@ void initScalableKMeansPlusPlus(raft::resources const& handle,
                                 raft::device_matrix_view<DataT, IndexT> centroidsRawData,
                                 rmm::device_uvector<char>& workspace)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "initScalableKMeansPlusPlus");
   cudaStream_t stream = raft::resource::get_cuda_stream(handle);
   auto n_samples      = X.extent(0);
@@ -841,7 +841,7 @@ void kmeans_fit(raft::resources const& handle,
                 raft::host_scalar_view<DataT> inertia,
                 raft::host_scalar_view<IndexT> n_iter)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("kmeans_fit");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("kmeans_fit");
   auto n_samples      = X.extent(0);
   auto n_features     = X.extent(1);
   auto n_clusters     = pams.n_clusters;
@@ -1009,7 +1009,7 @@ void kmeans_predict(raft::resources const& handle,
                     bool normalize_weight,
                     raft::host_scalar_view<DataT> inertia)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("kmeans_predict");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("kmeans_predict");
   auto n_samples      = X.extent(0);
   auto n_features     = X.extent(1);
   cudaStream_t stream = raft::resource::get_cuda_stream(handle);
@@ -1153,7 +1153,7 @@ void kmeans_fit_predict(raft::resources const& handle,
                         raft::host_scalar_view<DataT> inertia,
                         raft::host_scalar_view<IndexT> n_iter)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("kmeans_fit_predict");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("kmeans_fit_predict");
   if (!centroids.has_value()) {
     auto n_features = X.extent(1);
     auto centroids_matrix =
@@ -1217,7 +1217,7 @@ void kmeans_transform(raft::resources const& handle,
                       raft::device_matrix_view<const DataT> centroids,
                       raft::device_matrix_view<DataT> X_new)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("kmeans_transform");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("kmeans_transform");
   raft::logger::get(RAFT_NAME).set_level(pams.verbosity);
   cudaStream_t stream = raft::resource::get_cuda_stream(handle);
   auto n_samples      = X.extent(0);
diff --git a/cpp/src/cluster/detail/kmeans_balanced.cuh b/cpp/src/cluster/detail/kmeans_balanced.cuh
index 34bb22e85..3f1ad2334 100644
--- a/cpp/src/cluster/detail/kmeans_balanced.cuh
+++ b/cpp/src/cluster/detail/kmeans_balanced.cuh
@@ -20,10 +20,10 @@
 #include "kmeans_common.cuh"
 #include <cuvs/cluster/kmeans.hpp>
 
+#include "../../core/nvtx.hpp"
 #include "../../distance/distance.cuh"
 
 #include <cuvs/distance/distance.hpp>
-#include <raft/common/nvtx.hpp>
 #include <raft/core/cudart_utils.hpp>
 #include <raft/core/logger-ext.hpp>
 #include <raft/core/operators.hpp>
@@ -378,7 +378,7 @@ void compute_norm(const raft::resources& handle,
                   FinOpT norm_fin_op,
                   std::optional<rmm::device_async_resource_ref> mr = std::nullopt)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("compute_norm");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("compute_norm");
   auto stream = raft::resource::get_cuda_stream(handle);
   rmm::device_uvector<MathT> mapped_dataset(
     0, stream, mr.value_or(raft::resource::get_workspace_resource(handle)));
@@ -434,7 +434,7 @@ void predict(const raft::resources& handle,
              const MathT* dataset_norm                        = nullptr)
 {
   auto stream = raft::resource::get_cuda_stream(handle);
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "predict(%zu, %u)", static_cast<size_t>(n_rows), n_clusters);
   auto mem_res = mr.value_or(raft::resource::get_workspace_resource(handle));
   auto [max_minibatch_size, _mem_per_row] =
@@ -603,7 +603,7 @@ auto adjust_centers(MathT* centers,
                     rmm::cuda_stream_view stream,
                     rmm::device_async_resource_ref device_memory) -> bool
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "adjust_centers(%zu, %u)", static_cast<size_t>(n_rows), n_clusters);
   if (n_clusters == 0) { return false; }
   constexpr static std::array kPrimes{29,   71,   113,  173,  229,  281,  349,  409,  463,  541,
@@ -1036,7 +1036,7 @@ void build_hierarchical(const raft::resources& handle,
   auto stream  = raft::resource::get_cuda_stream(handle);
   using LabelT = uint32_t;
 
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "build_hierarchical(%zu, %u)", static_cast<size_t>(n_rows), n_clusters);
 
   IdxT n_mesoclusters = std::min(n_clusters, static_cast<IdxT>(std::sqrt(n_clusters) + 0.5));
diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index e5495dc3e..9e4d453e3 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -15,6 +15,7 @@
  */
 #pragma once
 
+#include "../../../core/nvtx.hpp"
 #include "../../vpq_dataset.cuh"
 #include "graph_core.cuh"
 #include <cuvs/neighbors/cagra.hpp>
@@ -130,7 +131,7 @@ void build_knn_graph(
                "Currently only L2Expanded or InnerProduct metric are supported");
 
   uint32_t node_degree = knn_graph.extent(1);
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "cagra::build_graph(%zu, %zu, %u)",
     size_t(dataset.extent(0)),
     size_t(dataset.extent(1)),
diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
index 4c15b8e14..95c158675 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include "../../../core/nvtx.hpp"
 #include "factory.cuh"
 #include "sample_filter_utils.cuh"
 #include "search_plan.cuh"
@@ -23,7 +24,6 @@
 
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/host_mdspan.hpp>
-#include <raft/core/nvtx.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
 
@@ -66,7 +66,7 @@ void search_main_core(raft::resources const& res,
     params.max_queries = std::min<size_t>(queries.extent(0), deviceProp.maxGridSize[1]);
   }
 
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "cagra::search(max_queries = %u, k = %u, dim = %zu)",
     params.max_queries,
     topk,
diff --git a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh
index f86ed9ef6..4c3fe5e81 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh
@@ -21,10 +21,10 @@
 #include <raft/core/logger-ext.hpp>
 #include <raft/core/mdarray.hpp>
 #include <raft/core/mdspan_types.hpp>
-#include <raft/core/nvtx.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/serialize.hpp>
 
+#include "../../../core/nvtx.hpp"
 #include "../dataset_serialize.hpp"
 
 #include <cstddef>
@@ -53,7 +53,7 @@ void serialize(raft::resources const& res,
                const index<T, IdxT>& index_,
                bool include_dataset)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("cagra::serialize");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("cagra::serialize");
 
   RAFT_LOG_DEBUG(
     "Saving CAGRA index, size %zu, dim %u", static_cast<size_t>(index_.size()), index_.dim());
@@ -103,7 +103,7 @@ void serialize_to_hnswlib(raft::resources const& res,
 {
   // static_assert(std::is_same_v<IdxT, int> or std::is_same_v<IdxT, uint32_t>,
   //               "An hnswlib index can only be trained with int32 or uint32 IdxT");
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("cagra::serialize");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("cagra::serialize");
   RAFT_LOG_DEBUG("Saving CAGRA index to hnswlib format, size %zu, dim %u",
                  static_cast<size_t>(index_.size()),
                  index_.dim());
@@ -234,7 +234,7 @@ void serialize_to_hnswlib(raft::resources const& res,
 template <typename T, typename IdxT>
 void deserialize(raft::resources const& res, std::istream& is, index<T, IdxT>* index_)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("cagra::deserialize");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("cagra::deserialize");
 
   char dtype_string[4];
   is.read(dtype_string, 4);
diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh
index c65ea8108..f0f464950 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh
@@ -1729,6 +1729,12 @@ auto build(raft::resources const& handle,
     if constexpr (std::is_same_v<T, float>) {
       raft::matrix::sample_rows<T, int64_t>(handle, random_state, dataset, trainset.view());
     } else {
+      raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
+        "   ivf_pq::build(%zu, %zu)/sample rows with tmp trainset (%zu rows).",
+        size_t(n_rows),
+        size_t(dim),
+        size_t(n_rows_train));
+
       // TODO(tfeher): Enable codebook generation with any type T, and then remove trainset tmp.
       auto trainset_tmp = raft::make_device_mdarray<T>(
         handle, big_memory_resource, raft::make_extents<int64_t>(n_rows_train, dim));

From e55e655e1ac6fb10ba846e808d3003ce20c580f5 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Fri, 11 Oct 2024 15:08:21 -0500
Subject: [PATCH 05/47] make conda installs in CI stricter (#406)

Contributes to https://github.com/rapidsai/build-planning/issues/106

Proposes specifying the RAPIDS version in `conda install` calls that install CI artifacts, to reduce the risk of CI jobs picking up artifacts from other releases.

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Mike Sarahan (https://github.com/msarahan)

URL: https://github.com/rapidsai/cuvs/pull/406
---
 ci/build_docs.sh  | 11 ++++++-----
 ci/build_rust.sh  |  6 ++++--
 ci/test_cpp.sh    |  5 ++++-
 ci/test_python.sh |  5 ++++-
 4 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 460cc3899..bce93c605 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -6,6 +6,9 @@ set -euo pipefail
 rapids-logger "Create test conda environment"
 . /opt/conda/etc/profile.d/conda.sh
 
+RAPIDS_VERSION="$(rapids-version)"
+export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
+
 rapids-dependency-file-generator \
   --output conda \
   --file-key docs \
@@ -28,11 +31,9 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
   --channel "${PYTHON_CHANNEL}" \
-  libcuvs cuvs
+  "libcuvs=${RAPIDS_VERSION}" \
+  "cuvs=${RAPIDS_VERSION}"
 
-export RAPIDS_VERSION="$(rapids-version)"
-export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
-export RAPIDS_VERSION_NUMBER="$RAPIDS_VERSION_MAJOR_MINOR"
 export RAPIDS_DOCS_DIR="$(mktemp -d)"
 
 rapids-logger "Build CPP docs"
@@ -54,4 +55,4 @@ mkdir -p "${RAPIDS_DOCS_DIR}/cuvs/"html
 mv _html/* "${RAPIDS_DOCS_DIR}/cuvs/html"
 popd
 
-rapids-upload-docs
+RAPIDS_VERSION_NUMBER="${RAPIDS_VERSION_MAJOR_MINOR}" rapids-upload-docs
diff --git a/ci/build_rust.sh b/ci/build_rust.sh
index 31d0de053..309501c32 100755
--- a/ci/build_rust.sh
+++ b/ci/build_rust.sh
@@ -6,6 +6,8 @@ set -euo pipefail
 rapids-logger "Create test conda environment"
 . /opt/conda/etc/profile.d/conda.sh
 
+RAPIDS_VERSION="$(rapids-version)"
+
 rapids-dependency-file-generator \
   --output conda \
   --file-key rust \
@@ -32,7 +34,7 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
 # installing libcuvs/libraft will speed up the rust build substantially
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
-  libcuvs  \
-  libraft
+  "libcuvs=${RAPIDS_VERSION}" \
+  "libraft=${RAPIDS_VERSION}"
 
 bash ./build.sh rust
diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh
index 6dfc2cf71..134dc4421 100755
--- a/ci/test_cpp.sh
+++ b/ci/test_cpp.sh
@@ -5,6 +5,8 @@ set -euo pipefail
 
 . /opt/conda/etc/profile.d/conda.sh
 
+RAPIDS_VERSION="$(rapids-version)"
+
 rapids-logger "Generate C++ testing dependencies"
 rapids-dependency-file-generator \
   --output conda \
@@ -26,7 +28,8 @@ rapids-print-env
 
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
-  libcuvs libcuvs-tests
+  "libcuvs=${RAPIDS_VERSION}" \
+  "libcuvs-tests=${RAPIDS_VERSION}"
 
 rapids-logger "Check GPU usage"
 nvidia-smi
diff --git a/ci/test_python.sh b/ci/test_python.sh
index 93bc597cf..b9c394062 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -5,6 +5,8 @@ set -euo pipefail
 
 . /opt/conda/etc/profile.d/conda.sh
 
+RAPIDS_VERSION="$(rapids-version)"
+
 rapids-logger "Generate Python testing dependencies"
 rapids-dependency-file-generator \
   --output conda \
@@ -31,7 +33,8 @@ rapids-print-env
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
   --channel "${PYTHON_CHANNEL}" \
-  libcuvs cuvs
+  "libcuvs=${RAPIDS_VERSION}" \
+  "cuvs=${RAPIDS_VERSION}"
 
 rapids-logger "Check GPU usage"
 nvidia-smi

From f62b217f97c9e14b340f11bcbfe556fcad9ed816 Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Tue, 15 Oct 2024 10:07:41 +0200
Subject: [PATCH 06/47] Add --no-lap-sync cmd option to ann-bench (#405)

Add a command-line option to disable the CUDA event/stream synchronization on each iteration. Only one sync is done per benchmark loop in this case instead. As a result, the measured QPS is observed due to:
1) A small `cudaEventSynchronize` is removed from the benchmark loop;
2) If a GPU algorithm has little to no sync between the GPU and CPU, the kernel launch latency and other CPU overheads are completely hidden.

The new option is experimental and disabled by default.

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)

URL: https://github.com/rapidsai/cuvs/pull/405
---
 cpp/bench/ann/src/common/benchmark.hpp | 161 +++++++++++++++----------
 1 file changed, 98 insertions(+), 63 deletions(-)

diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index db3e533e0..06e1e27af 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -119,7 +119,8 @@ template <typename T>
 void bench_build(::benchmark::State& state,
                  std::shared_ptr<const dataset<T>> dataset,
                  configuration::index index,
-                 bool force_overwrite)
+                 bool force_overwrite,
+                 bool no_lap_sync)
 {
   // NB: these two thread-local vars can be used within algo wrappers
   cuvs::bench::benchmark_thread_id = state.thread_index();
@@ -149,9 +150,22 @@ void bench_build(::benchmark::State& state,
   cuda_timer gpu_timer{algo};
   {
     nvtx_case nvtx{state.name()};
+    /* Note: GPU timing
+
+    The GPU time is measured between construction and destruction of `cuda_lap` objects (`gpu_all`
+    and `gpu_lap` variables) and added to the `gpu_timer` object.
+
+    We sync with the GPU (cudaEventSynchronize) either each iteration (lifetime of the `gpu_lap`
+    variable) or once per benchmark loop (lifetime of the `gpu_all` variable). The decision is
+
+    controlled by the `no_lap_sync` argument. In either case, we need at least one sync throughout
+    the benchmark loop to make sure the GPU has finished its work before we measure the total run
+    time.
+    */
+    [[maybe_unused]] auto gpu_all = gpu_timer.lap(no_lap_sync);
     for (auto _ : state) {
       [[maybe_unused]] auto ntx_lap = nvtx.lap();
-      [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
+      [[maybe_unused]] auto gpu_lap = gpu_timer.lap(!no_lap_sync);
       try {
         algo->build(base_set, index_size);
       } catch (const std::exception& e) {
@@ -173,7 +187,8 @@ template <typename T>
 void bench_search(::benchmark::State& state,
                   configuration::index index,
                   std::size_t search_param_ix,
-                  std::shared_ptr<const dataset<T>> dataset)
+                  std::shared_ptr<const dataset<T>> dataset,
+                  bool no_lap_sync)
 {
   // NB: these two thread-local vars can be used within algo wrappers
   cuvs::bench::benchmark_thread_id = state.thread_index();
@@ -300,25 +315,29 @@ void bench_search(::benchmark::State& state,
     // Initialize with algo, so that the timer.lap() object can sync with algo::get_sync_stream()
     cuda_timer gpu_timer{a};
     auto start = std::chrono::high_resolution_clock::now();
-    for (auto _ : state) {
-      [[maybe_unused]] auto ntx_lap = nvtx.lap();
-      [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
-      try {
-        a->search(query_set + batch_offset * dataset->dim(),
-                  n_queries,
-                  k,
-                  neighbors_ptr + out_offset * k,
-                  distances_ptr + out_offset * k);
-      } catch (const std::exception& e) {
-        state.SkipWithError("Benchmark loop: " + std::string(e.what()));
-        break;
-      }
+    {
+      /* See the note above: GPU timing */
+      [[maybe_unused]] auto gpu_all = gpu_timer.lap(no_lap_sync);
+      for (auto _ : state) {
+        [[maybe_unused]] auto ntx_lap = nvtx.lap();
+        [[maybe_unused]] auto gpu_lap = gpu_timer.lap(!no_lap_sync);
+        try {
+          a->search(query_set + batch_offset * dataset->dim(),
+                    n_queries,
+                    k,
+                    neighbors_ptr + out_offset * k,
+                    distances_ptr + out_offset * k);
+        } catch (const std::exception& e) {
+          state.SkipWithError("Benchmark loop: " + std::string(e.what()));
+          break;
+        }
 
-      // advance to the next batch
-      batch_offset = (batch_offset + queries_stride) % query_set_size;
-      out_offset   = (out_offset + n_queries) % query_set_size;
+        // advance to the next batch
+        batch_offset = (batch_offset + queries_stride) % query_set_size;
+        out_offset   = (out_offset + n_queries) % query_set_size;
 
-      queries_processed += n_queries;
+        queries_processed += n_queries;
+      }
     }
     auto end      = std::chrono::high_resolution_clock::now();
     auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count();
@@ -379,44 +398,51 @@ void bench_search(::benchmark::State& state,
 inline void printf_usage()
 {
   ::benchmark::PrintDefaultHelp();
-  fprintf(stdout,
-          "          [--build|--search] \n"
-          "          [--force]\n"
-          "          [--data_prefix=<prefix>]\n"
-          "          [--index_prefix=<prefix>]\n"
-          "          [--override_kv=<key:value1:value2:...:valueN>]\n"
-          "          [--mode=<latency|throughput>\n"
-          "          [--threads=min[:max]]\n"
-          "          <conf>.json\n"
-          "\n"
-          "Note the non-standard benchmark parameters:\n"
-          "  --build: build mode, will build index\n"
-          "  --search: search mode, will search using the built index\n"
-          "            one and only one of --build and --search should be specified\n"
-          "  --force: force overwriting existing index files\n"
-          "  --data_prefix=<prefix>:"
-          " prepend <prefix> to dataset file paths specified in the <conf>.json (default = "
-          "'data/').\n"
-          "  --index_prefix=<prefix>:"
-          " prepend <prefix> to index file paths specified in the <conf>.json (default = "
-          "'index/').\n"
-          "  --override_kv=<key:value1:value2:...:valueN>:"
-          " override a build/search key one or more times multiplying the number of configurations;"
-          " you can use this parameter multiple times to get the Cartesian product of benchmark"
-          " configs.\n"
-          "  --mode=<latency|throughput>"
-          " run the benchmarks in latency (accumulate times spent in each batch) or "
-          " throughput (pipeline batches and measure end-to-end) mode\n"
-          "  --threads=min[:max] specify the number threads to use for throughput benchmark."
-          " Power of 2 values between 'min' and 'max' will be used. If only 'min' is specified,"
-          " then a single test is run with 'min' threads. By default min=1, max=<num hyper"
-          " threads>.\n");
+  fprintf(
+    stdout,
+    "          [--build|--search] \n"
+    "          [--force]\n"
+    "          [--data_prefix=<prefix>]\n"
+    "          [--index_prefix=<prefix>]\n"
+    "          [--override_kv=<key:value1:value2:...:valueN>]\n"
+    "          [--mode=<latency|throughput>\n"
+    "          [--threads=min[:max]]\n"
+    "          [--no-lap-sync]\n"
+    "          <conf>.json\n"
+    "\n"
+    "Note the non-standard benchmark parameters:\n"
+    "  --build: build mode, will build index\n"
+    "  --search: search mode, will search using the built index\n"
+    "            one and only one of --build and --search should be specified\n"
+    "  --force: force overwriting existing index files\n"
+    "  --data_prefix=<prefix>:"
+    " prepend <prefix> to dataset file paths specified in the <conf>.json (default = "
+    "'data/').\n"
+    "  --index_prefix=<prefix>:"
+    " prepend <prefix> to index file paths specified in the <conf>.json (default = "
+    "'index/').\n"
+    "  --override_kv=<key:value1:value2:...:valueN>:"
+    " override a build/search key one or more times multiplying the number of configurations;"
+    " you can use this parameter multiple times to get the Cartesian product of benchmark"
+    " configs.\n"
+    "  --mode=<latency|throughput>"
+    " run the benchmarks in latency (accumulate times spent in each batch) or "
+    " throughput (pipeline batches and measure end-to-end) mode\n"
+    "  --threads=min[:max] specify the number threads to use for throughput benchmark."
+    " Power of 2 values between 'min' and 'max' will be used. If only 'min' is specified,"
+    " then a single test is run with 'min' threads. By default min=1, max=<num hyper"
+    " threads>.\n"
+    "  --no-lap-sync disable CUDA event synchronization between benchmark iterations. If a GPU"
+    " algorithm has no sync with CPU, this can make the GPU processing significantly lag behind the"
+    " CPU scheduling. Then this also hides the scheduling latencies and thus improves the measured"
+    " throughput (QPS). Note there's a sync at the end of the benchmark loop in any case.\n");
 }
 
 template <typename T>
 void register_build(std::shared_ptr<const dataset<T>> dataset,
                     std::vector<configuration::index> indices,
-                    bool force_overwrite)
+                    bool force_overwrite,
+                    bool no_lap_sync)
 {
   for (auto index : indices) {
     auto suf      = static_cast<std::string>(index.build_param["override_suffix"]);
@@ -425,7 +451,7 @@ void register_build(std::shared_ptr<const dataset<T>> dataset,
     std::replace(file_suf.begin(), file_suf.end(), '/', '-');
     index.file += file_suf;
     auto* b = ::benchmark::RegisterBenchmark(
-      index.name + suf, bench_build<T>, dataset, index, force_overwrite);
+      index.name + suf, bench_build<T>, dataset, index, force_overwrite, no_lap_sync);
     b->Unit(benchmark::kSecond);
     b->MeasureProcessCPUTime();
     b->UseRealTime();
@@ -436,14 +462,16 @@ template <typename T>
 void register_search(std::shared_ptr<const dataset<T>> dataset,
                      std::vector<configuration::index> indices,
                      Mode metric_objective,
-                     const std::vector<int>& threads)
+                     const std::vector<int>& threads,
+                     bool no_lap_sync)
 {
   for (auto index : indices) {
     for (std::size_t i = 0; i < index.search_params.size(); i++) {
       auto suf = static_cast<std::string>(index.search_params[i]["override_suffix"]);
       index.search_params[i].erase("override_suffix");
 
-      auto* b = ::benchmark::RegisterBenchmark(index.name + suf, bench_search<T>, index, i, dataset)
+      auto* b = ::benchmark::RegisterBenchmark(
+                  index.name + suf, bench_search<T>, index, i, dataset, no_lap_sync)
                   ->Unit(benchmark::kMillisecond)
                   /**
                    * The following are important for getting accuracy QPS measurements on both CPU
@@ -470,7 +498,8 @@ void dispatch_benchmark(std::string cmdline,
                         std::string index_prefix,
                         kv_series override_kv,
                         Mode metric_objective,
-                        const std::vector<int>& threads)
+                        const std::vector<int>& threads,
+                        bool no_lap_sync)
 {
   ::benchmark::AddCustomContext("command_line", cmdline);
   for (auto [key, value] : host_info()) {
@@ -514,7 +543,7 @@ void dispatch_benchmark(std::string cmdline,
         more_indices.push_back(modified_index);
       }
     }
-    register_build<T>(dataset, more_indices, force_overwrite);
+    register_build<T>(dataset, more_indices, force_overwrite, no_lap_sync);
   } else if (search_mode) {
     if (file_exists(query_file)) {
       log_info("Using the query file '%s'", query_file.c_str());
@@ -543,7 +572,7 @@ void dispatch_benchmark(std::string cmdline,
       index.search_params = apply_overrides(index.search_params, override_kv);
       index.file          = combine_path(index_prefix, index.file);
     }
-    register_search<T>(dataset, indices, metric_objective, threads);
+    register_search<T>(dataset, indices, metric_objective, threads, no_lap_sync);
   }
 }
 
@@ -571,6 +600,7 @@ inline auto run_main(int argc, char** argv) -> int
   bool force_overwrite                = false;
   bool build_mode                     = false;
   bool search_mode                    = false;
+  bool no_lap_sync                    = false;
   std::string data_prefix             = "data";
   std::string index_prefix            = "index";
   std::string new_override_kv         = "";
@@ -604,6 +634,7 @@ inline auto run_main(int argc, char** argv) -> int
     if (parse_bool_flag(argv[i], "--force", force_overwrite) ||
         parse_bool_flag(argv[i], "--build", build_mode) ||
         parse_bool_flag(argv[i], "--search", search_mode) ||
+        parse_bool_flag(argv[i], "--no-lap-sync", no_lap_sync) ||
         parse_string_flag(argv[i], "--data_prefix", data_prefix) ||
         parse_string_flag(argv[i], "--index_prefix", index_prefix) ||
         parse_string_flag(argv[i], "--mode", mode) ||
@@ -686,7 +717,8 @@ inline auto run_main(int argc, char** argv) -> int
                               index_prefix,
                               override_kv,
                               metric_objective,
-                              threads);
+                              threads,
+                              no_lap_sync);
   } else if (dtype == "half") {
     dispatch_benchmark<half>(cmdline,
                              conf,
@@ -697,7 +729,8 @@ inline auto run_main(int argc, char** argv) -> int
                              index_prefix,
                              override_kv,
                              metric_objective,
-                             threads);
+                             threads,
+                             no_lap_sync);
   } else if (dtype == "uint8") {
     dispatch_benchmark<std::uint8_t>(cmdline,
                                      conf,
@@ -708,7 +741,8 @@ inline auto run_main(int argc, char** argv) -> int
                                      index_prefix,
                                      override_kv,
                                      metric_objective,
-                                     threads);
+                                     threads,
+                                     no_lap_sync);
   } else if (dtype == "int8") {
     dispatch_benchmark<std::int8_t>(cmdline,
                                     conf,
@@ -719,7 +753,8 @@ inline auto run_main(int argc, char** argv) -> int
                                     index_prefix,
                                     override_kv,
                                     metric_objective,
-                                    threads);
+                                    threads,
+                                    no_lap_sync);
   } else {
     log_error("datatype '%s' is not supported", dtype.c_str());
     return -1;

From c86e74d6d12cd3396fdf1ed9fa3b96d858b1fa5f Mon Sep 17 00:00:00 2001
From: Divye Gala <divyegala@gmail.com>
Date: Thu, 17 Oct 2024 11:38:13 -0400
Subject: [PATCH 07/47] Add `click` package to `cuvs-bench` conda recipe (#408)

This package is available in `dependencies.yaml`, but due to an oversight was not added to conda metas.

Authors:
  - Divye Gala (https://github.com/divyegala)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cuvs/pull/408
---
 conda/recipes/cuvs_bench/meta.yaml     | 1 +
 conda/recipes/cuvs_bench_cpu/meta.yaml | 1 +
 2 files changed, 2 insertions(+)

diff --git a/conda/recipes/cuvs_bench/meta.yaml b/conda/recipes/cuvs_bench/meta.yaml
index 9ecbf82bb..67d66efce 100644
--- a/conda/recipes/cuvs_bench/meta.yaml
+++ b/conda/recipes/cuvs_bench/meta.yaml
@@ -82,6 +82,7 @@ requirements:
 
   run:
     - benchmark
+    - click
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
     {% if cuda_major == "11" %}
     - cudatoolkit
diff --git a/conda/recipes/cuvs_bench_cpu/meta.yaml b/conda/recipes/cuvs_bench_cpu/meta.yaml
index 0ce5db744..95bea92ef 100644
--- a/conda/recipes/cuvs_bench_cpu/meta.yaml
+++ b/conda/recipes/cuvs_bench_cpu/meta.yaml
@@ -55,6 +55,7 @@ requirements:
 
   run:
     - benchmark
+    - click
     - glog {{ glog_version }}
     - h5py {{ h5py_version }}
     - matplotlib

From f708fe388ee206105fd7894388283995e88ab7f9 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Thu, 17 Oct 2024 14:13:24 -0400
Subject: [PATCH 08/47] We need to enable the c_api by default (#416)

Remove a collection of unneccesarily complex CMake logic.

Major change is that we explicitly opt-in to building the C API bindings by default since it is a hard requirement for our python bindings, and the project has numerous conditions to disable it.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Ben Frederickson (https://github.com/benfred)

URL: https://github.com/rapidsai/cuvs/pull/416
---
 cpp/CMakeLists.txt      | 18 ++++--------------
 cpp/test/CMakeLists.txt | 38 +++++++++++++++++++++++++++++---------
 2 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 3e98a247e..746245791 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -53,8 +53,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 option(BUILD_SHARED_LIBS "Build cuvs shared libraries" ON)
 option(BUILD_TESTS "Build cuvs unit-tests" ON)
-option(BUILD_C_LIBRARY "Build cuVS C API library" OFF)
-option(BUILD_C_TESTS "Build cuVS C API tests" OFF)
+option(BUILD_C_LIBRARY "Build cuVS C API library" ON)
 option(BUILD_CUVS_BENCH "Build cuVS ann benchmarks" OFF)
 option(BUILD_CAGRA_HNSWLIB "Build CAGRA+hnswlib interface" ON)
 option(BUILD_MG_ALGOS "Build with multi-GPU support" ON)
@@ -72,21 +71,12 @@ option(DISABLE_OPENMP "Disable OpenMP" OFF)
 option(CUVS_NVTX "Enable nvtx markers" OFF)
 option(CUVS_RAFT_CLONE_ON_PIN "Explicitly clone RAFT branch when pinned to non-feature branch" ON)
 
-if((BUILD_TESTS OR BUILD_C_LIBRARY) AND NOT BUILD_CPU_ONLY)
-
-endif()
-
 if(BUILD_CPU_ONLY)
   set(BUILD_SHARED_LIBS OFF)
   set(BUILD_TESTS OFF)
   set(BUILD_C_LIBRARY OFF)
-endif()
-
-if(NOT BUILD_C_LIBRARY)
-  set(BUILD_C_TESTS OFF)
-endif()
-
-if(NOT BUILD_SHARED_LIBS)
+  set(BUILD_CAGRA_HNSWLIB OFF)
+elseif(NOT BUILD_SHARED_LIBS)
   set(BUILD_TESTS OFF)
   set(BUILD_C_LIBRARY OFF)
   set(BUILD_CAGRA_HNSWLIB OFF)
@@ -771,7 +761,7 @@ endif()
 # ##################################################################################################
 # * build test executable ----------------------------------------------------
 
-if(BUILD_TESTS OR BUILD_C_TESTS)
+if(BUILD_TESTS)
   add_subdirectory(internal)
   add_subdirectory(test)
 endif()
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index f4d35e438..60007825c 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -215,7 +215,9 @@ if(BUILD_TESTS)
   )
 endif()
 
-if(BUILD_C_TESTS)
+if(TARGET cuvs::c_api)
+  enable_language(C)
+
   ConfigureTest(NAME INTEROP_TEST PATH core/interop.cu C_LIB)
   ConfigureTest(
     NAME DISTANCE_C_TEST PATH distance/run_pairwise_distance_c.c distance/pairwise_distance_c.cu
@@ -239,19 +241,37 @@ if(BUILD_C_TESTS)
     target_link_libraries(NEIGHBORS_HNSW_TEST PRIVATE hnswlib::hnswlib)
     target_compile_definitions(NEIGHBORS_HNSW_TEST PUBLIC CUVS_BUILD_CAGRA_HNSWLIB)
   endif()
-endif()
 
-# ##################################################################################################
-# Install tests ####################################################################################
-# ##################################################################################################
-rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/gtests/libcuvs)
-
-if(BUILD_C_TESTS)
-  enable_language(C)
 
   add_executable(cuvs_c_test core/c_api.c)
   target_link_libraries(cuvs_c_test PUBLIC cuvs::c_api)
 
   add_executable(cuvs_c_neighbors_test neighbors/c_api.c)
   target_link_libraries(cuvs_c_neighbors_test PUBLIC cuvs::c_api)
+
+  set_target_properties(
+    cuvs_c_test cuvs_c_neighbors_test
+    PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUVS_BINARY_DIR}/gtests>"
+               INSTALL_RPATH "\$ORIGIN/../../../lib"
+  )
+
+  rapids_test_add(
+    NAME cuvs_c_test
+    COMMAND cuvs_c_test
+    GPUS 1
+    PERCENT 100
+    INSTALL_COMPONENT_SET testing
+  )
+  rapids_test_add(
+    NAME cuvs_c_neighbors_test
+    COMMAND cuvs_c_neighbors_test
+    GPUS 1
+    PERCENT 100
+    INSTALL_COMPONENT_SET testing
+  )
 endif()
+
+# ##################################################################################################
+# Install tests ####################################################################################
+# ##################################################################################################
+rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/gtests/libcuvs)

From 801945fb1acfe4ca12b4d6dd30592f824166a389 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Fri, 18 Oct 2024 12:33:14 -0500
Subject: [PATCH 09/47] Use dashes in cuvs-bench package name. (#417)

This attempts to rename `cuvs_bench` to `cuvs-bench` and similarly for the CPU package. This follows from this thread: https://github.com/rapidsai/docker/pull/715#discussion_r1805232894

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - James Lamb (https://github.com/jameslamb)
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: https://github.com/rapidsai/cuvs/pull/417
---
 build.sh                                             |  2 +-
 ci/build_python.sh                                   |  8 ++++----
 .../{cuvs_bench_cpu => cuvs-bench-cpu}/build.sh      |  0
 .../conda_build_config.yaml                          |  0
 .../{cuvs_bench_cpu => cuvs-bench-cpu}/meta.yaml     |  2 +-
 conda/recipes/{cuvs_bench => cuvs-bench}/build.sh    |  0
 .../conda_build_config.yaml                          |  0
 conda/recipes/{cuvs_bench => cuvs-bench}/meta.yaml   |  2 +-
 docs/source/cuvs_bench/index.rst                     | 12 ++++++------
 9 files changed, 13 insertions(+), 13 deletions(-)
 rename conda/recipes/{cuvs_bench_cpu => cuvs-bench-cpu}/build.sh (100%)
 rename conda/recipes/{cuvs_bench_cpu => cuvs-bench-cpu}/conda_build_config.yaml (100%)
 rename conda/recipes/{cuvs_bench_cpu => cuvs-bench-cpu}/meta.yaml (98%)
 rename conda/recipes/{cuvs_bench => cuvs-bench}/build.sh (100%)
 rename conda/recipes/{cuvs_bench => cuvs-bench}/conda_build_config.yaml (100%)
 rename conda/recipes/{cuvs_bench => cuvs-bench}/meta.yaml (99%)

diff --git a/build.sh b/build.sh
index b787d3a41..29e8fe7c6 100755
--- a/build.sh
+++ b/build.sh
@@ -447,7 +447,7 @@ if (( ${NUMARGS} == 0 )) || hasArg python; then
         python -m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/cuvs
 fi
 
-# Build and (optionally) install the cuvs_bench Python package
+# Build and (optionally) install the cuvs-bench Python package
 if (( ${NUMARGS} == 0 )) || hasArg bench-ann; then
     python -m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/cuvs_bench
 fi
diff --git a/ci/build_python.sh b/ci/build_python.sh
index 7b0c639af..deb67e91c 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -31,14 +31,14 @@ rapids-conda-retry mambabuild \
   --channel "${CPP_CHANNEL}" \
   conda/recipes/cuvs
 
-# Build cuvs_bench for each cuda and python version
+# Build cuvs-bench for each cuda and python version
 rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
   --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
-  conda/recipes/cuvs_bench
+  conda/recipes/cuvs-bench
 
-# Build cuvs_bench_cpu only in CUDA 12 jobs since it only depends on python
+# Build cuvs-bench-cpu only in CUDA 12 jobs since it only depends on python
 # version
 RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}"
 if [[ ${RAPIDS_CUDA_MAJOR} == "12" ]]; then
@@ -46,7 +46,7 @@ if [[ ${RAPIDS_CUDA_MAJOR} == "12" ]]; then
   --no-test \
   --channel "${CPP_CHANNEL}" \
   --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
-  conda/recipes/cuvs_bench_cpu
+  conda/recipes/cuvs-bench-cpu
 fi
 
 rapids-upload-conda-to-s3 python
diff --git a/conda/recipes/cuvs_bench_cpu/build.sh b/conda/recipes/cuvs-bench-cpu/build.sh
similarity index 100%
rename from conda/recipes/cuvs_bench_cpu/build.sh
rename to conda/recipes/cuvs-bench-cpu/build.sh
diff --git a/conda/recipes/cuvs_bench_cpu/conda_build_config.yaml b/conda/recipes/cuvs-bench-cpu/conda_build_config.yaml
similarity index 100%
rename from conda/recipes/cuvs_bench_cpu/conda_build_config.yaml
rename to conda/recipes/cuvs-bench-cpu/conda_build_config.yaml
diff --git a/conda/recipes/cuvs_bench_cpu/meta.yaml b/conda/recipes/cuvs-bench-cpu/meta.yaml
similarity index 98%
rename from conda/recipes/cuvs_bench_cpu/meta.yaml
rename to conda/recipes/cuvs-bench-cpu/meta.yaml
index 95bea92ef..02c11346f 100644
--- a/conda/recipes/cuvs_bench_cpu/meta.yaml
+++ b/conda/recipes/cuvs-bench-cpu/meta.yaml
@@ -8,7 +8,7 @@
 {% set date_string = environ['RAPIDS_DATE_STRING'] %}
 
 package:
-  name: cuvs_bench_cpu
+  name: cuvs-bench-cpu
   version: {{ version }}
   script: build.sh
 
diff --git a/conda/recipes/cuvs_bench/build.sh b/conda/recipes/cuvs-bench/build.sh
similarity index 100%
rename from conda/recipes/cuvs_bench/build.sh
rename to conda/recipes/cuvs-bench/build.sh
diff --git a/conda/recipes/cuvs_bench/conda_build_config.yaml b/conda/recipes/cuvs-bench/conda_build_config.yaml
similarity index 100%
rename from conda/recipes/cuvs_bench/conda_build_config.yaml
rename to conda/recipes/cuvs-bench/conda_build_config.yaml
diff --git a/conda/recipes/cuvs_bench/meta.yaml b/conda/recipes/cuvs-bench/meta.yaml
similarity index 99%
rename from conda/recipes/cuvs_bench/meta.yaml
rename to conda/recipes/cuvs-bench/meta.yaml
index 67d66efce..3e81edc58 100644
--- a/conda/recipes/cuvs_bench/meta.yaml
+++ b/conda/recipes/cuvs-bench/meta.yaml
@@ -10,7 +10,7 @@
 {% set date_string = environ['RAPIDS_DATE_STRING'] %}
 
 package:
-  name: cuvs_bench
+  name: cuvs-bench
   version: {{ version }}
   script: build.sh
 
diff --git a/docs/source/cuvs_bench/index.rst b/docs/source/cuvs_bench/index.rst
index 61ac622d2..81fb7537c 100644
--- a/docs/source/cuvs_bench/index.rst
+++ b/docs/source/cuvs_bench/index.rst
@@ -93,20 +93,20 @@ We provide images for GPU enabled systems, as well as systems without a GPU. The
 - `cuvs-bench-datasets`: Contains the GPU and CPU benchmarks with million-scale datasets already included in the container. Best suited for users that want to run multiple million scale datasets already included in the image.
 - `cuvs-bench-cpu`: Contains only CPU benchmarks with minimal size. Best suited for users that want the smallest containers to reproduce benchmarks on systems without a GPU.
 
-Nightly images are located in `dockerhub <https://hub.docker.com/r/rapidsai/cuvs-ann-bench/tags>`_, meanwhile release (stable) versions are located in `NGC <https://hub.docker.com/r/rapidsai/cuvs_bench>`_, starting with release 24.10.
+Nightly images are located in `dockerhub <https://hub.docker.com/r/rapidsai/cuvs-bench/tags>`_, meanwhile release (stable) versions are located in `NGC <https://hub.docker.com/r/rapidsai/cuvs-bench>`_, starting with release 24.10.
 
-The following command pulls the nightly container for python version 10, cuda version 12, and CUVS version 23.10:
+The following command pulls the nightly container for Python version 3.10, CUDA version 12.0, and cuVS version 24.10:
 
 .. code-block:: bash
 
-   docker pull rapidsai/cuvs_bench:24.10a-cuda12.0-py3.10 #substitute cuvs_bench for the exact desired container.
+   docker pull rapidsai/cuvs-bench:24.10a-cuda12.0-py3.10 #substitute cuvs-bench for the exact desired container.
 
 The CUDA and python versions can be changed for the supported values:
 - Supported CUDA versions: 11.4 and 12.x
 - Supported Python versions: 3.9 and 3.10.
 
 You can see the exact versions as well in the dockerhub site:
-- `cuVS bench images <https://hub.docker.com/r/rapidsai/cuvs_bench/tags>`_
+- `cuVS bench images <https://hub.docker.com/r/rapidsai/cuvs-bench/tags>`_
 - `cuVS bench with datasets preloaded images <https://hub.docker.com/r/rapidsai/cuvs-bench-cpu/tags>`_
 - `cuVS bench CPU only images <https://hub.docker.com/r/rapidsai/cuvs-bench-datasets/tags>`_
 
@@ -583,7 +583,7 @@ A default `datasets.yaml` is provided by CUVS in `${CUVS_HOME}/python/cuvs-ann-b
       dims: 128
       distance: euclidean
 
-Configuration files for ANN algorithms supported by `cuvs-bench` are provided in `${CUVS_HOME}/python/cuvs-bench/src/cuvs_bench/run/conf`. `cuvs_cagra` algorithm configuration looks like:
+Configuration files for ANN algorithms supported by `cuvs-bench` are provided in `${CUVS_HOME}/python/cuvs_bench/cuvs_bench/config/algos`. `cuvs_cagra` algorithm configuration looks like:
 
 .. code-block:: yaml
 
@@ -767,4 +767,4 @@ Add a new entry to `algos.yaml` to map the name of the algorithm to its binary e
       requires_gpu: true
 
 `executable` : specifies the name of the binary that will build/search the index. It is assumed to be available in `cuvs/cpp/build/`.
-`requires_gpu` : denotes whether an algorithm requires GPU to run.
\ No newline at end of file
+`requires_gpu` : denotes whether an algorithm requires GPU to run.

From 009bb8de03ce9708d4d797166187250f77a59a36 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 21 Oct 2024 14:24:34 -0500
Subject: [PATCH 10/47] Use Python for sccache hit rate computation. (#420)

Fixes an issue in CI computations of sccache hit rates. See
https://github.com/rapidsai/cuvs/pull/414 for details.
---
 build.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/build.sh b/build.sh
index 29e8fe7c6..9503dff21 100755
--- a/build.sh
+++ b/build.sh
@@ -410,14 +410,14 @@ if (( ${NUMARGS} == 0 )) || hasArg libcuvs || hasArg docs || hasArg tests || has
           if [[ ${CACHE_TOOL} == "sccache" && -x "$(command -v sccache)" ]]; then
               COMPILE_REQUESTS=$(sccache -s | grep "Compile requests \+ [0-9]\+$" | awk '{ print $NF }')
               CACHE_HITS=$(sccache -s | grep "Cache hits \+ [0-9]\+$" | awk '{ print $NF }')
-              HIT_RATE=$(echo - | awk "{printf \"%.2f\n\", $CACHE_HITS / $COMPILE_REQUESTS * 100}")
+              HIT_RATE=$(python3 -c "print(f'{${CACHE_HITS} / ${COMPILE_REQUESTS}:.2f}' if ${COMPILE_REQUESTS} else 'nan')")
               MSG="${MSG}<br/>cache hit rate ${HIT_RATE} %"
           elif [[ ${CACHE_TOOL} == "ccache" && -x "$(command -v ccache)" ]]; then
               CACHE_STATS_LINE=$(ccache -s | grep "Hits: \+ [0-9]\+ / [0-9]\+" | tail -n1)
               if [[ ! -z "$CACHE_STATS_LINE" ]]; then
                   CACHE_HITS=$(echo "$CACHE_STATS_LINE" - | awk '{ print $2 }')
                   COMPILE_REQUESTS=$(echo "$CACHE_STATS_LINE" - | awk '{ print $4 }')
-                  HIT_RATE=$(echo - | awk "{printf \"%.2f\n\", $CACHE_HITS / $COMPILE_REQUESTS * 100}")
+                  HIT_RATE=$(python3 -c "print(f'{${CACHE_HITS} / ${COMPILE_REQUESTS}:.2f}' if ${COMPILE_REQUESTS} else 'nan')")
                   MSG="${MSG}<br/>cache hit rate ${HIT_RATE} %"
               fi
           fi

From e7f1085b71c340b9600f5f38f7f0059a5c7aa806 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 22 Oct 2024 07:57:08 -0500
Subject: [PATCH 11/47] Use environment variables in cache hit rate
 computation. (#422)

Follow-up PR to address feedback: https://github.com/rapidsai/raft/pull/2474#discussion_r1809398110

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

URL: https://github.com/rapidsai/cuvs/pull/422
---
 build.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/build.sh b/build.sh
index 9503dff21..c08c2900e 100755
--- a/build.sh
+++ b/build.sh
@@ -410,14 +410,14 @@ if (( ${NUMARGS} == 0 )) || hasArg libcuvs || hasArg docs || hasArg tests || has
           if [[ ${CACHE_TOOL} == "sccache" && -x "$(command -v sccache)" ]]; then
               COMPILE_REQUESTS=$(sccache -s | grep "Compile requests \+ [0-9]\+$" | awk '{ print $NF }')
               CACHE_HITS=$(sccache -s | grep "Cache hits \+ [0-9]\+$" | awk '{ print $NF }')
-              HIT_RATE=$(python3 -c "print(f'{${CACHE_HITS} / ${COMPILE_REQUESTS}:.2f}' if ${COMPILE_REQUESTS} else 'nan')")
+              HIT_RATE=$(COMPILE_REQUESTS="${COMPILE_REQUESTS}" CACHE_HITS="${CACHE_HITS}" python3 -c "import os; print(f'{int(os.getenv(\"CACHE_HITS\")) / int(os.getenv(\"COMPILE_REQUESTS\")):.2f}' if int(os.getenv(\"COMPILE_REQUESTS\")) else 'nan')")
               MSG="${MSG}<br/>cache hit rate ${HIT_RATE} %"
           elif [[ ${CACHE_TOOL} == "ccache" && -x "$(command -v ccache)" ]]; then
               CACHE_STATS_LINE=$(ccache -s | grep "Hits: \+ [0-9]\+ / [0-9]\+" | tail -n1)
               if [[ ! -z "$CACHE_STATS_LINE" ]]; then
                   CACHE_HITS=$(echo "$CACHE_STATS_LINE" - | awk '{ print $2 }')
                   COMPILE_REQUESTS=$(echo "$CACHE_STATS_LINE" - | awk '{ print $4 }')
-                  HIT_RATE=$(python3 -c "print(f'{${CACHE_HITS} / ${COMPILE_REQUESTS}:.2f}' if ${COMPILE_REQUESTS} else 'nan')")
+                  HIT_RATE=$(COMPILE_REQUESTS="${COMPILE_REQUESTS}" CACHE_HITS="${CACHE_HITS}" python3 -c "import os; print(f'{int(os.getenv(\"CACHE_HITS\")) / int(os.getenv(\"COMPILE_REQUESTS\")):.2f}' if int(os.getenv(\"COMPILE_REQUESTS\")) else 'nan')")
                   MSG="${MSG}<br/>cache hit rate ${HIT_RATE} %"
               fi
           fi

From 12b10e88e8ea6e944e91dee8a0380c89999b3b21 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <danteg@nvidia.com>
Date: Mon, 28 Oct 2024 11:56:53 -0500
Subject: [PATCH 12/47] Fix correct call to brute force in generate groundtruth
 of cuvs-bench (#427)

Fixes issue with helper script for generating ground truthset in cuvs-bench, which was using the old RAFT NN API.

Authors:
  - Dante Gama Dessavre (https://github.com/dantegd)

Approvers:
  - Divye Gala (https://github.com/divyegala)
  - Ben Frederickson (https://github.com/benfred)

URL: https://github.com/rapidsai/cuvs/pull/427
---
 .../cuvs_bench/generate_groundtruth/__main__.py          | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py
index 2b4213016..dbee6cd36 100644
--- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py
+++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py
@@ -24,7 +24,7 @@
 from pylibraft.common import DeviceResources
 from rmm.allocators.cupy import rmm_cupy_allocator
 
-from cuvs.neighbors.brute_force import knn
+from cuvs.neighbors.brute_force import build, search
 
 from .utils import memmap_bin_file, suffix_from_dtype, write_bin
 
@@ -49,7 +49,7 @@ def choose_random_queries(dataset, n_queries):
 
 
 def calc_truth(dataset, queries, k, metric="sqeuclidean"):
-    handle = DeviceResources()
+    resources = DeviceResources()
     n_samples = dataset.shape[0]
     n = 500000  # batch size for processing neighbors
     i = 0
@@ -63,8 +63,9 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"):
 
         X = cp.asarray(dataset[i : i + n_batch, :], cp.float32)
 
-        D, Ind = knn(X, queries, k, metric=metric, handle=handle)
-        handle.sync()
+        index = build(X, metric=metric, resources=resources)
+        D, Ind = search(index, queries, k, resources=resources)
+        resources.sync()
 
         D, Ind = cp.asarray(D), cp.asarray(Ind)
         Ind += i  # shift neighbor index by offset i

From 9f035d8e0e44c8eabdf6983049dfe58f9f1ef807 Mon Sep 17 00:00:00 2001
From: abner-ma <969023674@qq.com>
Date: Tue, 29 Oct 2024 05:25:13 +0800
Subject: [PATCH 13/47] Ivf c example (#404)

Add examples of ivf-flat and ivf-pq in C language

Authors:
  - https://github.com/abner-ma

Approvers:
  - Ben Frederickson (https://github.com/benfred)

URL: https://github.com/rapidsai/cuvs/pull/404
---
 examples/c/CMakeLists.txt           |   8 +
 examples/c/src/common.h             | 109 ++++++++++++
 examples/c/src/ivf_flat_c_example.c | 259 ++++++++++++++++++++++++++++
 examples/c/src/ivf_pq_c_example.c   | 189 ++++++++++++++++++++
 4 files changed, 565 insertions(+)
 create mode 100644 examples/c/src/common.h
 create mode 100644 examples/c/src/ivf_flat_c_example.c
 create mode 100644 examples/c/src/ivf_pq_c_example.c

diff --git a/examples/c/CMakeLists.txt b/examples/c/CMakeLists.txt
index ec8ca827a..2a7e70522 100644
--- a/examples/c/CMakeLists.txt
+++ b/examples/c/CMakeLists.txt
@@ -42,3 +42,11 @@ target_link_libraries(CAGRA_C_EXAMPLE PRIVATE cuvs::c_api $<TARGET_NAME_IF_EXIST
 add_executable(L2_C_EXAMPLE src/L2_c_example.c)
 target_include_directories(L2_C_EXAMPLE PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")
 target_link_libraries(L2_C_EXAMPLE PRIVATE cuvs::c_api $<TARGET_NAME_IF_EXISTS:conda_env>)
+
+add_executable(IVF_FLAT_C_EXAMPLE src/ivf_flat_c_example.c)
+target_include_directories(IVF_FLAT_C_EXAMPLE PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")
+target_link_libraries(IVF_FLAT_C_EXAMPLE PRIVATE cuvs::c_api $<TARGET_NAME_IF_EXISTS:conda_env>)
+
+add_executable(IVF_PQ_C_EXAMPLE src/ivf_pq_c_example.c)
+target_include_directories(IVF_PQ_C_EXAMPLE PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")
+target_link_libraries(IVF_PQ_C_EXAMPLE PRIVATE cuvs::c_api $<TARGET_NAME_IF_EXISTS:conda_env>)
diff --git a/examples/c/src/common.h b/examples/c/src/common.h
new file mode 100644
index 000000000..60b9b73cf
--- /dev/null
+++ b/examples/c/src/common.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <dlpack/dlpack.h>
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <string.h>
+
+/**
+ * @brief Initialize Tensor for kDLFloat.
+ *
+ * @param[in] t_d Pointer to a vector
+ * @param[in] t_shape[] Two-dimensional array, which stores the number of rows and columns of vectors.
+ * @param[out] t_tensor Stores the initialized DLManagedTensor.
+ */
+void float_tensor_initialize(float* t_d, int64_t t_shape[2], DLManagedTensor* t_tensor) {
+  t_tensor->dl_tensor.data = t_d;
+  t_tensor->dl_tensor.device.device_type = kDLCUDA;
+  t_tensor->dl_tensor.ndim = 2;
+  t_tensor->dl_tensor.dtype.code = kDLFloat;
+  t_tensor->dl_tensor.dtype.bits = 32;
+  t_tensor->dl_tensor.dtype.lanes = 1;
+  t_tensor->dl_tensor.shape = t_shape;
+  t_tensor->dl_tensor.strides = NULL;
+}
+
+/**
+ * @brief Initialize Tensor for kDLInt.
+ *
+ * @param[in] t_d Pointer to a vector
+ * @param[in] t_shape[] Two-dimensional array, which stores the number of rows and columns of vectors.
+ * @param[out] t_tensor Stores the initialized DLManagedTensor.
+ */
+void int_tensor_initialize(int64_t* t_d, int64_t t_shape[], DLManagedTensor* t_tensor) {
+  t_tensor->dl_tensor.data = t_d;
+  t_tensor->dl_tensor.device.device_type = kDLCUDA;
+  t_tensor->dl_tensor.ndim = 2;
+  t_tensor->dl_tensor.dtype.code = kDLInt;
+  t_tensor->dl_tensor.dtype.bits = 64;
+  t_tensor->dl_tensor.dtype.lanes = 1;
+  t_tensor->dl_tensor.shape = t_shape;
+  t_tensor->dl_tensor.strides = NULL;
+}
+
+/**
+ * @brief Fill a vector with random values.
+ *
+ * @param[out] Vec Pointer to a vector
+ * @param[in] n_rows the number of rows in the matrix.
+ * @param[in] n_cols the number of columns in the matrix.
+ * @param[in] min Minimum value among random values.
+ * @param[in] max Maximum value among random values.
+ */
+void generate_dataset(float * Vec,int n_rows, int n_cols, float min, float max) {
+    float scale;
+    float * ptr = Vec;
+    srand((unsigned int)time(NULL));
+    for (int i = 0; i < n_rows; i++) {
+        for (int j = 0; j < n_cols; j++) {
+            scale = rand()/(float)RAND_MAX;
+            ptr = Vec + i * n_cols + j;
+            *ptr = min + scale * (max - min);
+        }
+    }
+}
+
+/**
+ * @brief print the result.
+ *
+ * @param[in] neighbor Pointer to a neighbor vector
+ * @param[in] distances Pointer to a distances vector.
+ * @param[in] n_rows the number of rows in the matrix.
+ * @param[in] n_cols the number of columns in the matrix.
+ */
+void print_results(int64_t * neighbor, float* distances,int n_rows, int n_cols) {
+    int64_t * pn = neighbor;
+    float * pd = distances;
+    for (int i = 0; i < n_rows; ++i) {
+        printf("Query %d neighbor indices: =[", i);
+        for (int j = 0; j < n_cols; ++j) {
+            pn = neighbor + i * n_cols + j;
+            printf(" %ld", *pn);
+        }
+        printf("]\n");
+        printf("Query %d neighbor distances: =[", i);
+        for (int j = 0; j < n_cols; ++j) {
+            pd = distances + i * n_cols + j;
+            printf(" %f", *pd);
+        }
+        printf("]\n");
+    }
+}
+
diff --git a/examples/c/src/ivf_flat_c_example.c b/examples/c/src/ivf_flat_c_example.c
new file mode 100644
index 000000000..c068d04f8
--- /dev/null
+++ b/examples/c/src/ivf_flat_c_example.c
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuvs/core/c_api.h>
+#include <cuvs/neighbors/ivf_flat.h>
+
+#include <cuda_runtime.h>
+#include "common.h"
+
+void ivf_flat_build_search_simple(cuvsResources_t *res, DLManagedTensor * dataset_tensor, DLManagedTensor * queries_tensor) {
+    // Create default index params
+    cuvsIvfFlatIndexParams_t index_params;
+    cuvsIvfFlatIndexParamsCreate(&index_params);
+    index_params->n_lists                  = 1024; // default value
+    index_params->kmeans_n_iters = 20; // default value
+    index_params->kmeans_trainset_fraction = 0.1;
+    //index_params->metric default is L2Expanded
+
+    // Create IVF-Flat index
+    cuvsIvfFlatIndex_t index;
+    cuvsIvfFlatIndexCreate(&index);
+
+    printf("Building IVF-Flat index\n");
+    // Build the IVF-Flat Index
+    cuvsError_t build_status = cuvsIvfFlatBuild(*res, index_params, dataset_tensor, index);
+    if (build_status != CUVS_SUCCESS) {
+        printf("%s.\n", cuvsGetLastErrorText());
+        cuvsIvfFlatIndexDestroy(index);
+        cuvsIvfFlatIndexParamsDestroy(index_params);
+        return;
+    }
+
+    // Create output arrays.
+    int64_t topk      = 10;
+    int64_t n_queries = queries_tensor->dl_tensor.shape[0];
+
+    //Allocate memory for `neighbors` and `distances` output
+    int64_t *neighbors_d;
+    float *distances_d;
+    cuvsRMMAlloc(*res, (void**) &neighbors_d, sizeof(int64_t) * n_queries * topk);
+    cuvsRMMAlloc(*res, (void**) &distances_d, sizeof(float) * n_queries * topk);
+
+    DLManagedTensor neighbors_tensor;
+    int64_t neighbors_shape[2] = {n_queries, topk};
+    int_tensor_initialize(neighbors_d, neighbors_shape, &neighbors_tensor);
+    
+    DLManagedTensor distances_tensor;
+    int64_t distances_shape[2] = {n_queries, topk};
+    float_tensor_initialize(distances_d, distances_shape, &distances_tensor);
+
+    // Create default search params
+    cuvsIvfFlatSearchParams_t search_params;
+    cuvsIvfFlatSearchParamsCreate(&search_params);
+    search_params->n_probes = 50;
+
+    // Search the `index` built using `ivfFlatBuild`
+    cuvsError_t search_status = cuvsIvfFlatSearch(*res, search_params, index,
+     queries_tensor, &neighbors_tensor, &distances_tensor);
+    if (build_status != CUVS_SUCCESS) {
+        printf("%s.\n", cuvsGetLastErrorText());
+    }
+
+    int64_t *neighbors = (int64_t *)malloc(n_queries * topk * sizeof(int64_t));
+    float *distances = (float *)malloc(n_queries * topk * sizeof(float));
+    memset(neighbors, 0, n_queries * topk * sizeof(int64_t));
+    memset(distances, 0, n_queries * topk * sizeof(float));
+
+    cudaMemcpy(neighbors, neighbors_d, sizeof(int64_t) * n_queries * topk, cudaMemcpyDefault);
+    cudaMemcpy(distances, distances_d, sizeof(float) * n_queries * topk, cudaMemcpyDefault);
+
+    print_results(neighbors, distances, 2, topk);
+
+    free(distances);
+    free(neighbors);
+
+    cuvsRMMFree(*res, neighbors_d, sizeof(int64_t) * n_queries * topk);
+    cuvsRMMFree(*res, distances_d, sizeof(float) * n_queries * topk);
+
+    cuvsIvfFlatSearchParamsDestroy(search_params);
+    cuvsIvfFlatIndexDestroy(index);
+    cuvsIvfFlatIndexParamsDestroy(index_params);  
+}
+
+void ivf_flat_build_extend_search(cuvsResources_t *res, DLManagedTensor * trainset_tensor, DLManagedTensor * dataset_tensor, DLManagedTensor * queries_tensor) {
+    int64_t *data_indices_d;
+    int64_t n_dataset = dataset_tensor->dl_tensor.shape[0];
+    cuvsRMMAlloc(*res, (void**) &data_indices_d, sizeof(int64_t) * n_dataset);
+    DLManagedTensor data_indices_tensor;
+    int64_t data_indices_shape[1] = {n_dataset};
+    int_tensor_initialize(data_indices_d, data_indices_shape, &data_indices_tensor);
+    data_indices_tensor.dl_tensor.ndim = 1;
+    
+    printf("\nRun k-means clustering using the training set\n");
+
+    int64_t *data_indices = (int64_t *)malloc(n_dataset * sizeof(int64_t));
+    int64_t * ptr = data_indices;
+    for (int i = 0; i < n_dataset; i++) {
+        *ptr = i;
+        ptr++;
+    }
+    ptr = NULL;
+    cudaMemcpy(data_indices_d, data_indices, sizeof(int64_t) * n_dataset, cudaMemcpyDefault);
+
+    // Create default index params
+    cuvsIvfFlatIndexParams_t index_params;
+    cuvsIvfFlatIndexParamsCreate(&index_params);
+    index_params->n_lists                  = 100;
+    index_params->add_data_on_build = false;
+    //index_params->metric default is L2Expanded
+
+    // Create IVF-Flat index
+    cuvsIvfFlatIndex_t index;
+    cuvsIvfFlatIndexCreate(&index);
+
+    // Build the IVF-Flat Index
+    cuvsError_t build_status = cuvsIvfFlatBuild(*res, index_params, trainset_tensor, index);
+    if (build_status != CUVS_SUCCESS) {
+        printf("%s.\n", cuvsGetLastErrorText());
+        cuvsIvfFlatIndexDestroy(index);
+        cuvsIvfFlatIndexParamsDestroy(index_params);
+        return;
+    }
+
+    printf("Filling index with the dataset vectors\n");
+    cuvsError_t extend_status = cuvsIvfFlatExtend(*res, dataset_tensor, &data_indices_tensor, index);
+    if (extend_status != CUVS_SUCCESS) {
+        printf("%s.\n", cuvsGetLastErrorText());
+        return;
+    }
+
+    // Create output arrays.
+    int64_t topk      = 10;
+    int64_t n_queries = queries_tensor->dl_tensor.shape[0];
+
+    //Allocate memory for `neighbors` and `distances` output
+    int64_t *neighbors_d;
+    float *distances_d;
+    cuvsRMMAlloc(*res, (void**) &neighbors_d, sizeof(int64_t) * n_queries * topk);
+    cuvsRMMAlloc(*res, (void**) &distances_d, sizeof(float) * n_queries * topk);
+
+    DLManagedTensor neighbors_tensor;
+    int64_t neighbors_shape[2] = {n_queries, topk};
+    int_tensor_initialize(neighbors_d, neighbors_shape, &neighbors_tensor);
+    
+    DLManagedTensor distances_tensor;
+    int64_t distances_shape[2] = {n_queries, topk};
+    float_tensor_initialize(distances_d, distances_shape, &distances_tensor);
+    
+    // Create default search params
+    cuvsIvfFlatSearchParams_t search_params;
+    cuvsIvfFlatSearchParamsCreate(&search_params);
+    search_params->n_probes = 10;
+
+    // Search the `index` built using `ivfFlatBuild`
+    cuvsError_t search_status = cuvsIvfFlatSearch(*res, search_params, index,
+     queries_tensor, &neighbors_tensor, &distances_tensor);
+    if (search_status != CUVS_SUCCESS) {
+        printf("%s.\n", cuvsGetLastErrorText());
+        exit(-1);
+    }
+
+    int64_t *neighbors = (int64_t *)malloc(n_queries * topk * sizeof(int64_t));
+    float *distances = (float *)malloc(n_queries * topk * sizeof(float));
+    memset(neighbors, 0, n_queries * topk * sizeof(int64_t));
+    memset(distances, 0, n_queries * topk * sizeof(float));
+
+    cudaMemcpy(neighbors, neighbors_d, sizeof(int64_t) * n_queries * topk, cudaMemcpyDefault);
+    cudaMemcpy(distances, distances_d, sizeof(float) * n_queries * topk, cudaMemcpyDefault);
+
+    print_results(neighbors, distances, 2, topk);
+
+    free(distances);
+    free(neighbors);
+    free(data_indices);
+    cuvsRMMFree(*res, data_indices_d, sizeof(int64_t) * n_dataset);
+    cuvsRMMFree(*res, neighbors_d, sizeof(int64_t) * n_queries * topk);
+    cuvsRMMFree(*res, distances_d, sizeof(float) * n_queries * topk);
+
+    cuvsIvfFlatSearchParamsDestroy(search_params);
+    cuvsIvfFlatIndexDestroy(index);
+    cuvsIvfFlatIndexParamsDestroy(index_params);
+}
+
+int main() {
+    // Create input arrays.
+    int64_t n_samples = 10000;
+    int64_t n_dim     = 3;
+    int64_t n_queries = 10;
+    float *dataset = (float *)malloc(n_samples * n_dim * sizeof(float));
+    float *queries = (float *)malloc(n_queries * n_dim * sizeof(float));
+    generate_dataset(dataset, n_samples, n_dim, -10.0, 10.0);
+    generate_dataset(queries, n_queries, n_dim, -1.0, 1.0);
+    
+    // Create a cuvsResources_t object
+    cuvsResources_t res;
+    cuvsResourcesCreate(&res);
+
+    // Allocate memory for `queries`
+    float *dataset_d;
+    cuvsRMMAlloc(res, (void**) &dataset_d, sizeof(float) * n_samples * n_dim);
+    // Use DLPack to represent `dataset_d` as a tensor
+    cudaMemcpy(dataset_d, dataset, sizeof(float) * n_samples * n_dim, cudaMemcpyDefault);
+
+    DLManagedTensor dataset_tensor;
+    int64_t dataset_shape[2] = {n_samples,n_dim};
+    float_tensor_initialize(dataset_d, dataset_shape, &dataset_tensor);
+
+    // Allocate memory for `queries`
+    float *queries_d;
+    cuvsRMMAlloc(res, (void**) &queries_d, sizeof(float) * n_queries * n_dim);
+
+    // Use DLPack to represent `queries` as tensors
+    cudaMemcpy(queries_d, queries, sizeof(float) * n_queries * n_dim, cudaMemcpyDefault);
+
+    DLManagedTensor queries_tensor;
+    int64_t queries_shape[2] = {n_queries, n_dim};
+    float_tensor_initialize(queries_d, queries_shape, &queries_tensor);
+
+    // Simple build and search example.
+    ivf_flat_build_search_simple(&res, &dataset_tensor, &queries_tensor);
+
+    float *trainset_d;
+    int64_t n_trainset = n_samples * 0.1;
+    float *trainset = (float *)malloc(n_trainset * n_dim * sizeof(float));
+    for (int i = 0; i < n_trainset; i++) {
+        for (int j = 0; j < n_dim; j++) {
+            *(trainset + i * n_dim + j)  = *(dataset + i * n_dim + j);
+        }
+    }
+    cuvsRMMAlloc(res, (void**) &trainset_d, sizeof(float) * n_trainset * n_dim);
+    cudaMemcpy(trainset_d, trainset, sizeof(float) * n_trainset * n_dim, cudaMemcpyDefault);
+    DLManagedTensor trainset_tensor;
+    int64_t trainset_shape[2] = {n_trainset, n_dim};
+    float_tensor_initialize(trainset_d, trainset_shape, &trainset_tensor);
+    
+    // Build and extend example.
+    ivf_flat_build_extend_search(&res, &trainset_tensor, &dataset_tensor, &queries_tensor);
+
+    cuvsRMMFree(res, trainset_d, sizeof(float) * n_trainset * n_dim);
+    cuvsRMMFree(res, queries_d, sizeof(float) * n_queries * n_dim);
+    cuvsRMMFree(res, dataset_d, sizeof(float) * n_samples * n_dim);
+    cuvsResourcesDestroy(res);
+    free(trainset);
+    free(dataset);
+    free(queries);
+}
diff --git a/examples/c/src/ivf_pq_c_example.c b/examples/c/src/ivf_pq_c_example.c
new file mode 100644
index 000000000..b6d6b485b
--- /dev/null
+++ b/examples/c/src/ivf_pq_c_example.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuvs/core/c_api.h>
+#include <cuvs/neighbors/ivf_pq.h>
+#include <cuvs/neighbors/refine.h>
+
+#include <cuda_runtime.h>
+#include "common.h"
+
+void ivf_pq_build_search(cuvsResources_t *res, DLManagedTensor * dataset_tensor, DLManagedTensor * queries_tensor) {
+    // Create default index params
+    cuvsIvfPqIndexParams_t index_params;
+    cuvsIvfPqIndexParamsCreate(&index_params);
+    index_params->n_lists                  = 1024; // default value
+    index_params->kmeans_trainset_fraction = 0.1;
+    //index_params->metric default is L2Expanded
+    index_params->pq_bits = 8;
+    index_params->pq_dim = 2;
+
+    // Create IVF-PQ index
+    cuvsIvfPqIndex_t index;
+    cuvsIvfPqIndexCreate(&index);
+
+    printf("Building IVF-PQ index\n");
+
+    // Build the IVF-PQ Index
+    cuvsError_t build_status = cuvsIvfPqBuild(*res, index_params, dataset_tensor, index);
+    if (build_status != CUVS_SUCCESS) {
+        printf("%s.\n", cuvsGetLastErrorText());
+        cuvsIvfPqIndexDestroy(index);
+        cuvsIvfPqIndexParamsDestroy(index_params);
+        return;
+    }
+    
+    // Create output arrays.
+    int64_t topk      = 10;
+    int64_t n_queries = queries_tensor->dl_tensor.shape[0];
+
+    //Allocate memory for `neighbors` and `distances` output
+    int64_t *neighbors_d;
+    float *distances_d;
+    cuvsRMMAlloc(*res, (void**) &neighbors_d, sizeof(int64_t) * n_queries * topk);
+    cuvsRMMAlloc(*res, (void**) &distances_d, sizeof(float) * n_queries * topk);
+
+    DLManagedTensor neighbors_tensor;
+    int64_t neighbors_shape[2] = {n_queries, topk};
+    int_tensor_initialize(neighbors_d, neighbors_shape, &neighbors_tensor);
+
+    DLManagedTensor distances_tensor;
+    int64_t distances_shape[2] = {n_queries, topk};
+    float_tensor_initialize(distances_d, distances_shape, &distances_tensor);
+
+    // Create default search params
+    cuvsIvfPqSearchParams_t search_params;
+    cuvsIvfPqSearchParamsCreate(&search_params);
+    search_params->n_probes = 50;
+    search_params->internal_distance_dtype = CUDA_R_16F;
+    search_params->lut_dtype = CUDA_R_16F;
+
+    // Search the `index` built using `cuvsIvfPqBuild`
+    cuvsError_t search_status = cuvsIvfPqSearch(*res, search_params, index,
+     queries_tensor, &neighbors_tensor, &distances_tensor);
+    if (search_status != CUVS_SUCCESS) {
+        printf("%s.\n", cuvsGetLastErrorText());
+        exit(-1);
+    }
+
+    int64_t *neighbors = (int64_t *)malloc(n_queries * topk * sizeof(int64_t));
+    float *distances = (float *)malloc(n_queries * topk * sizeof(float));
+    memset(neighbors, 0, n_queries * topk * sizeof(int64_t));
+    memset(distances, 0, n_queries * topk * sizeof(float));
+
+    cudaMemcpy(neighbors, neighbors_d, sizeof(int64_t) * n_queries * topk, cudaMemcpyDefault);
+    cudaMemcpy(distances, distances_d, sizeof(float) * n_queries * topk, cudaMemcpyDefault);
+
+    printf("\nOriginal results:\n");
+    print_results(neighbors, distances, 2, topk);
+    
+    // Re-ranking operation: refine the initial search results by computing exact distances
+    int64_t topk_refined = 7;
+    int64_t *neighbors_refined_d;
+    float *distances_refined_d;
+    cuvsRMMAlloc(*res, (void**) &neighbors_refined_d, sizeof(int64_t) * n_queries * topk_refined);
+    cuvsRMMAlloc(*res, (void**) &distances_refined_d, sizeof(float) * n_queries * topk_refined);
+
+    DLManagedTensor neighbors_refined_tensor;
+    int64_t neighbors_refined_shape[2] = {n_queries, topk_refined};
+    int_tensor_initialize(neighbors_refined_d, neighbors_refined_shape, &neighbors_refined_tensor);
+    
+    DLManagedTensor distances_refined_tensor;
+    int64_t distances_refined_shape[2] = {n_queries, topk_refined};
+    float_tensor_initialize(distances_refined_d, distances_refined_shape, &distances_refined_tensor);
+    
+    // Note, refinement requires the original dataset and the queries.
+    // Don't forget to specify the same distance metric as used by the index.
+    cuvsError_t refine_status = cuvsRefine(*res, dataset_tensor, queries_tensor,
+                       &neighbors_tensor, index_params->metric,
+                       &neighbors_refined_tensor, &distances_refined_tensor);
+    if (refine_status != CUVS_SUCCESS) {
+        printf("%s.\n", cuvsGetLastErrorText());
+        exit(-1);
+    }
+
+    int64_t *neighbors_refine = (int64_t *)malloc(n_queries * topk_refined * sizeof(int64_t));
+    float *distances_refine = (float *)malloc(n_queries * topk_refined * sizeof(float));
+    memset(neighbors_refine, 0, n_queries * topk_refined * sizeof(int64_t));
+    memset(distances_refine, 0, n_queries * topk_refined * sizeof(float));
+
+    cudaMemcpy(neighbors_refine, neighbors_refined_d, sizeof(int64_t) * n_queries * topk_refined, cudaMemcpyDefault);
+    cudaMemcpy(distances_refine, distances_refined_d, sizeof(float) * n_queries * topk_refined, cudaMemcpyDefault);
+
+    printf("\nRefined results:\n");
+    print_results(neighbors, distances, 2, topk_refined);
+
+    free(distances_refine);
+    free(neighbors_refine);
+
+    free(distances);
+    free(neighbors);
+
+    cuvsRMMFree(*res, neighbors_refined_d, sizeof(int64_t) * n_queries * topk_refined);
+    cuvsRMMFree(*res, distances_refined_d, sizeof(float) * n_queries * topk_refined);
+
+    cuvsRMMFree(*res, neighbors_d, sizeof(int64_t) * n_queries * topk);
+    cuvsRMMFree(*res, distances_d, sizeof(float) * n_queries * topk);
+
+    cuvsIvfPqSearchParamsDestroy(search_params);
+    cuvsIvfPqIndexDestroy(index);
+    cuvsIvfPqIndexParamsDestroy(index_params);  
+}
+
+int main() {
+    // Create input arrays.
+    int64_t n_samples = 10000;
+    int64_t n_dim     = 3;
+    int64_t n_queries = 10;
+    float *dataset = (float *)malloc(n_samples * n_dim * sizeof(float));
+    float *queries = (float *)malloc(n_queries * n_dim * sizeof(float));
+    generate_dataset(dataset, n_samples, n_dim, -10.0, 10.0);
+    generate_dataset(queries, n_queries, n_dim, -1.0, 1.0);
+    
+    // Create a cuvsResources_t object
+    cuvsResources_t res;
+    cuvsResourcesCreate(&res);
+
+    // Allocate memory for `queries`
+    float *dataset_d;
+    cuvsRMMAlloc(res, (void**) &dataset_d, sizeof(float) * n_samples * n_dim);
+    // Use DLPack to represent `dataset_d` as a tensor
+    cudaMemcpy(dataset_d, dataset, sizeof(float) * n_samples * n_dim, cudaMemcpyDefault);
+
+    DLManagedTensor dataset_tensor;
+    int64_t dataset_shape[2] = {n_samples,n_dim};
+    float_tensor_initialize(dataset_d, dataset_shape, &dataset_tensor);
+    
+    // Allocate memory for `queries`
+    float *queries_d;
+    cuvsRMMAlloc(res, (void**) &queries_d, sizeof(float) * n_queries * n_dim);
+
+    // Use DLPack to represent `queries` as tensors
+    cudaMemcpy(queries_d, queries, sizeof(float) * n_queries * n_dim, cudaMemcpyDefault);
+
+    DLManagedTensor queries_tensor;
+    int64_t queries_shape[2] = {n_queries, n_dim};
+    float_tensor_initialize(queries_d, queries_shape, &queries_tensor);
+    
+    // Simple build and search example.
+    ivf_pq_build_search(&res, &dataset_tensor, &queries_tensor);
+
+    cuvsRMMFree(res, queries_d, sizeof(float) * n_queries * n_dim);
+    cuvsRMMFree(res, dataset_d, sizeof(float) * n_samples * n_dim);
+    cuvsResourcesDestroy(res);
+    free(dataset);
+    free(queries);
+}

From d296d811e3d0f9917068c9d5d2ef04fccacdcd08 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Tue, 29 Oct 2024 18:03:51 -0500
Subject: [PATCH 14/47] print sccache stats in builds (#413)

Contributes to https://github.com/rapidsai/build-planning/issues/111

Proposes some small packaging/CI changes, matching similar changes being made across RAPIDS.

* printing `sccache` stats to CI logs
* reducing `pip`'s verbosity in wheel building scripts
* updating to the latest `rapids-dependency-file-generator` (v1.16.0)
* always explicitly specifying `cpp` / `python` in calls to `rapids-upload-wheels-to-s3`
* modifying `dependencies.yaml` to match RAPIDS-wide naming conventions

## Notes for Reviewers

This originally also ran wheel builds with `--no-build-isolation`, but I reverted that based on https://github.com/rapidsai/build-planning/issues/108#issuecomment-2436764212.

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cuvs/pull/413
---
 .pre-commit-config.yaml |  2 +-
 ci/build_cpp.sh         |  4 ++++
 ci/build_python.sh      | 10 ++++++++++
 ci/build_wheel.sh       | 16 +++++++++++++---
 dependencies.yaml       | 10 +++++-----
 5 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 439b42959..f4fdf202e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -113,7 +113,7 @@ repos:
                   cpp/cmake/modules/FindAVX\.cmake|
           - id: verify-alpha-spec
       - repo: https://github.com/rapidsai/dependency-file-generator
-        rev: v1.13.11
+        rev: v1.16.0
         hooks:
             - id: rapids-dependency-file-generator
               args: ["--clean"]
diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh
index 7bc0be5a7..db4c496cc 100755
--- a/ci/build_cpp.sh
+++ b/ci/build_cpp.sh
@@ -15,6 +15,10 @@ rapids-print-env
 
 rapids-logger "Begin cpp build"
 
+sccache --zero-stats
+
 RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild conda/recipes/libcuvs
 
+sccache --show-adv-stats
+
 rapids-upload-conda-to-s3 cpp
diff --git a/ci/build_python.sh b/ci/build_python.sh
index deb67e91c..3241a2c2b 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -24,6 +24,8 @@ version=$(rapids-generate-version)
 export RAPIDS_PACKAGE_VERSION=${version}
 echo "${version}" > VERSION
 
+sccache --zero-stats
+
 # TODO: Remove `--no-test` flags once importing on a CPU
 # node works correctly
 rapids-conda-retry mambabuild \
@@ -31,6 +33,9 @@ rapids-conda-retry mambabuild \
   --channel "${CPP_CHANNEL}" \
   conda/recipes/cuvs
 
+sccache --show-adv-stats
+sccache --zero-stats
+
 # Build cuvs-bench for each cuda and python version
 rapids-conda-retry mambabuild \
   --no-test \
@@ -38,6 +43,9 @@ rapids-conda-retry mambabuild \
   --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
   conda/recipes/cuvs-bench
 
+sccache --show-adv-stats
+sccache --zero-stats
+
 # Build cuvs-bench-cpu only in CUDA 12 jobs since it only depends on python
 # version
 RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}"
@@ -47,6 +55,8 @@ if [[ ${RAPIDS_CUDA_MAJOR} == "12" ]]; then
   --channel "${CPP_CHANNEL}" \
   --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
   conda/recipes/cuvs-bench-cpu
+
+  sccache --show-adv-stats
 fi
 
 rapids-upload-conda-to-s3 python
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index d1030276f..4994374a8 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -32,10 +32,20 @@ case "${RAPIDS_CUDA_VERSION}" in
   ;;
 esac
 
-# Hardcode the output dir
-python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check
+rapids-logger "Building '${package_name}' wheel"
+
+sccache --zero-stats
+
+python -m pip wheel \
+    -w dist \
+    -v \
+    --no-deps \
+    --disable-pip-version-check \
+    .
+
+sccache --show-adv-stats
 
 mkdir -p final_dist
 python -m auditwheel repair -w final_dist "${EXCLUDE_ARGS[@]}" dist/*
 
-RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist
+RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python final_dist
diff --git a/dependencies.yaml b/dependencies.yaml
index a68a550bb..cf9b68c8a 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -74,14 +74,14 @@ files:
       - rapids_build
       - cuda
       - rust
-  py_build_py_cuvs:
+  py_build_cuvs:
     output: pyproject
     pyproject_dir: python/cuvs
     extras:
       table: build-system
     includes:
       - build
-  py_rapids_build_py_cuvs:
+  py_rapids_build_cuvs:
     output: pyproject
     pyproject_dir: python/cuvs
     extras:
@@ -90,7 +90,7 @@ files:
     includes:
       - rapids_build
       - build_py_cuvs
-  py_run_py_cuvs:
+  py_run_cuvs:
     output: pyproject
     pyproject_dir: python/cuvs
     extras:
@@ -99,7 +99,7 @@ files:
       - cuda_wheels
       - run_py_cuvs
       - depends_on_pylibraft
-  py_test_py_cuvs:
+  py_test_cuvs:
     output: pyproject
     pyproject_dir: python/cuvs
     extras:
@@ -116,7 +116,7 @@ files:
       table: build-system
     includes:
       - rapids_build_setuptools
-  py_rapids_build_py_cuvs_bench:
+  py_rapids_build_cuvs_bench:
     output: pyproject
     pyproject_dir: python/cuvs_bench
     extras:

From b422cbeec92fe925ee59f5f966ac9834440200e2 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Wed, 30 Oct 2024 11:20:31 -0400
Subject: [PATCH 15/47] Add ci run_ scripts needed for build infra (#434)

These `run_*` scripts are needed by the build infra team and bring the cuvs project in line with the rest of RAPIDS

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cuvs/pull/434
---
 ci/run_ctests.sh       | 9 +++++++++
 ci/run_cuvs_pytests.sh | 9 +++++++++
 2 files changed, 18 insertions(+)
 create mode 100755 ci/run_ctests.sh
 create mode 100755 ci/run_cuvs_pytests.sh

diff --git a/ci/run_ctests.sh b/ci/run_ctests.sh
new file mode 100755
index 000000000..6bf83961b
--- /dev/null
+++ b/ci/run_ctests.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+# Support customizing the ctests' install location
+cd "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/gtests/libcuvs/"
+
+ctest --output-on-failure --no-tests=error "$@"
diff --git a/ci/run_cuvs_pytests.sh b/ci/run_cuvs_pytests.sh
new file mode 100755
index 000000000..4de8927b1
--- /dev/null
+++ b/ci/run_cuvs_pytests.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+# Support invoking run_pytests.sh outside the script directory
+cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuvs/cuvs
+
+pytest --cache-clear --verbose "$@" tests

From 6041a81ce8e534ac79f5b27c595c9231b88d1d10 Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Thu, 31 Oct 2024 17:25:10 +0100
Subject: [PATCH 16/47] Enable NVTX in cuvs-cagra-search component (#439)

Since parts of CAGRA code have been separated into a static library component `cuvs-cagra-search` (to selectively enable CUDA separable compilation on them), the NVTX flags are not passed to the affected sources anymore. This PR fixes that.

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Micka (https://github.com/lowener)

URL: https://github.com/rapidsai/cuvs/pull/439
---
 cpp/CMakeLists.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 746245791..e56e21383 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -603,6 +603,9 @@ SECTIONS
     # This enables NVTX within the project with no option to disable it downstream.
     target_link_libraries(cuvs PUBLIC CUDA::nvtx3)
     target_compile_definitions(cuvs PUBLIC NVTX_ENABLED)
+
+    target_link_libraries(cuvs-cagra-search PUBLIC CUDA::nvtx3)
+    target_compile_definitions(cuvs-cagra-search PUBLIC NVTX_ENABLED)
   else()
     # Allow enable NVTX downstream if not set here. This creates a new option at build/install time,
     # which is set by default to OFF, but can be enabled in the dependent project.

From 71deb26c457bbf398c9af0142740aefadf83220a Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Fri, 1 Nov 2024 16:18:32 +0100
Subject: [PATCH 17/47] Fix index overflow in edge cases of CAGRA graph
 optimize (#435)

Force `input_graph_degree`, `output_graph_degree`, and `graph_size` variables to `uint64_t`.
Before the PR, they've been `uint32_t`, and the product of them could overflow. This would lead to `cudaMemsetAsync` not filling in a large fraction of the graph.

It's not known whether this bug has surfaced for anyone until now, but it's better to be safe than sorry.
The change shouldn't have any impact on performance.

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Micka (https://github.com/lowener)

URL: https://github.com/rapidsai/cuvs/pull/435
---
 cpp/src/neighbors/detail/cagra/graph_core.cuh | 23 ++++++++++---------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 43bf1ba2b..4253cb781 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -1076,11 +1076,11 @@ void optimize(
                "Each input array is expected to have the same number of rows");
   RAFT_EXPECTS(new_graph.extent(1) <= knn_graph.extent(1),
                "output graph cannot have more columns than input graph");
-  const uint32_t input_graph_degree  = knn_graph.extent(1);
-  const uint32_t output_graph_degree = new_graph.extent(1);
+  const uint64_t input_graph_degree  = knn_graph.extent(1);
+  const uint64_t output_graph_degree = new_graph.extent(1);
+  const uint64_t graph_size          = new_graph.extent(0);
   auto input_graph_ptr               = knn_graph.data_handle();
   auto output_graph_ptr              = new_graph.data_handle();
-  const IdxT graph_size              = new_graph.extent(0);
 
   // MST optimization
   auto mst_graph_num_edges     = raft::make_host_vector<uint32_t, int64_t>(graph_size);
@@ -1148,7 +1148,7 @@ void optimize(
     constexpr int MAX_DEGREE = 1024;
     if (input_graph_degree > MAX_DEGREE) {
       RAFT_FAIL(
-        "The degree of input knn graph is too large (%u). "
+        "The degree of input knn graph is too large (%zu). "
         "It must be equal to or smaller than %d.",
         input_graph_degree,
         1024);
@@ -1217,11 +1217,12 @@ void optimize(
         assert(next_num_detour != std::numeric_limits<uint32_t>::max());
         num_detour = next_num_detour;
       }
-      RAFT_EXPECTS(pk == output_graph_degree,
-                   "Couldn't find the output_graph_degree (%u) smallest detourable count nodes for "
-                   "node %lu in the rank-based node reranking process",
-                   output_graph_degree,
-                   static_cast<uint64_t>(i));
+      RAFT_EXPECTS(
+        pk == output_graph_degree,
+        "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for "
+        "node %lu in the rank-based node reranking process",
+        output_graph_degree,
+        i);
     }
 
     const double time_prune_end = cur_time();
@@ -1317,7 +1318,7 @@ void optimize(
       uint32_t kf       = 0;
       uint32_t k        = mst_graph_num_edges_ptr[i];
 
-      const uint64_t num_protected_edges = max(k, output_graph_degree / 2);
+      const auto num_protected_edges = std::max<uint64_t>(k, output_graph_degree / 2);
       assert(num_protected_edges <= output_graph_degree);
       if (num_protected_edges == output_graph_degree) continue;
 
@@ -1342,7 +1343,7 @@ void optimize(
       assert(kf <= output_graph_degree);
 
       // Replace some edges of the output graph with edges of the reverse graph.
-      uint32_t kr = std::min(rev_graph_count.data_handle()[i], output_graph_degree);
+      auto kr = std::min<uint32_t>(rev_graph_count.data_handle()[i], output_graph_degree);
       while (kr) {
         kr -= 1;
         if (my_rev_graph[kr] < graph_size) {

From 9bea21585ae121194c4df49e2ad4ce1bd16e3408 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Mon, 4 Nov 2024 14:12:05 -0500
Subject: [PATCH 18/47] call `enable_testing` in root CMakeLists.txt (#437)

Required to allow `ctest` to be called in the root of the build directory

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cuvs/pull/437
---
 cpp/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index e56e21383..c493af488 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -765,6 +765,7 @@ endif()
 # * build test executable ----------------------------------------------------
 
 if(BUILD_TESTS)
+  enable_testing()
   add_subdirectory(internal)
   add_subdirectory(test)
 endif()

From 3ac206364afdd9f413de2175763cc37fdefd58b3 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Mon, 4 Nov 2024 14:24:59 -0500
Subject: [PATCH 19/47] Don't presume pointers location infers usability.
 (#441)

Here is the results of looking at the cudaPointerGetAttributes of different allocation types on Grace + Hopper. Allocations of `malloc` are still usable on the GPU.
```
ccudaPointerGetAttributes attributes malloc ptr
  is_dev_ptr  -> 1
  is_host_ptr -> 1
  memory loc  -> unregistered

cudaPointerGetAttributes attributes cudaMalloc ptr
  is_dev_ptr  -> 1
  is_host_ptr -> 0
  memory loc  -> device

cudaPointerGetAttributes attributes cudaMallocManaged cudaMemAttachGlobal ptr
  is_dev_ptr  -> 1
  is_host_ptr -> 1
  memory loc  -> managed

```

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Ben Frederickson (https://github.com/benfred)

URL: https://github.com/rapidsai/cuvs/pull/441
---
 cpp/src/neighbors/detail/ann_utils.cuh | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/cpp/src/neighbors/detail/ann_utils.cuh b/cpp/src/neighbors/detail/ann_utils.cuh
index 29f790ec5..652d41c85 100644
--- a/cpp/src/neighbors/detail/ann_utils.cuh
+++ b/cpp/src/neighbors/detail/ann_utils.cuh
@@ -63,14 +63,9 @@ struct pointer_residency_count<Type, Types...> {
     auto [on_device, on_host] = pointer_residency_count<Types...>::run(ptrs...);
     cudaPointerAttributes attr;
     RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, ptr));
-    switch (attr.type) {
-      case cudaMemoryTypeUnregistered: return std::make_tuple(on_device, on_host + 1);
-      case cudaMemoryTypeHost:
-        return std::make_tuple(on_device + int(attr.devicePointer == ptr), on_host + 1);
-      case cudaMemoryTypeDevice: return std::make_tuple(on_device + 1, on_host);
-      case cudaMemoryTypeManaged: return std::make_tuple(on_device + 1, on_host + 1);
-      default: return std::make_tuple(on_device, on_host);
-    }
+    if (attr.devicePointer || attr.type == cudaMemoryTypeDevice) { ++on_device; }
+    if (attr.hostPointer || attr.type == cudaMemoryTypeUnregistered) { ++on_host; }
+    return std::make_tuple(on_device, on_host);
   }
 };
 

From eff2cc5ccd83ba25083436d74f8ae3a3d6836f97 Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Mon, 4 Nov 2024 20:49:04 +0100
Subject: [PATCH 20/47] BUG: CAGRA multi-cta illegal access with bad queries
 (#438)

CAGRA search kernel errors with `cudaErrorIllegalAddress` in some conditions, specified in the new test case.

According to compute-sanitizer, the illegal access happens in [compute_distance_to_child_nodes(...) function](https://github.com/rapidsai/cuvs/blob/b422cbeec92fe925ee59f5f966ac9834440200e2/cpp/src/neighbors/detail/cagra/device_common.hpp#L185) accessing the graph.
The `parent_id` variable in that function sometimes appears to be out-of-bounds (larger than the graph size / number of records); it's invalid and seems to be the same reported by multiple threads, yet it's not an `invalid_index`, neither `index_msb_1_mask`, neither any derivative of the two.
Further observations:
  - I've checked the graph just before calling the search kernel; it does not contain any invalid indices.
  - One should disable any fancy pool memory resources to make it easier to reproduce the error (so that `parent_id` does not hit other user allocations in the pool)
  - It seems important that the query yields infinite distance to the dataset.
  - Running the search with a newly created `raft::resources` seems to increase the chance to hit the error
  - Even with all conditions satisfied, the error does not reproduce every time...

### Reproducer
```
./build.sh -n tests --limit-tests=NEIGHBORS_ANN_CAGRA_TEST && ./cpp/build/gtests/NEIGHBORS_ANN_CAGRA_TEST --gtest_filter=AnnCagraBugMultiCTACrash*
```

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/438
---
 .../neighbors/detail/cagra/device_common.hpp  |   5 +-
 cpp/test/CMakeLists.txt                       |   2 +-
 .../ann_cagra/bug_multi_cta_crash.cu          | 108 ++++++++++++++++++
 3 files changed, 112 insertions(+), 3 deletions(-)
 create mode 100644 cpp/test/neighbors/ann_cagra/bug_multi_cta_crash.cu

diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp
index b7cb9c42d..7ec3d4d9e 100644
--- a/cpp/src/neighbors/detail/cagra/device_common.hpp
+++ b/cpp/src/neighbors/detail/cagra/device_common.hpp
@@ -120,7 +120,7 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes(
   for (uint32_t i = threadIdx.x >> team_size_bits; i < max_i; i += (blockDim.x >> team_size_bits)) {
     const bool valid_i = (i < num_pickup);
 
-    IndexT best_index_team_local;
+    IndexT best_index_team_local    = raft::upper_bound<IndexT>();
     DistanceT best_norm2_team_local = raft::upper_bound<DistanceT>();
     for (uint32_t j = 0; j < num_distilation; j++) {
       // Select a node randomly and compute the distance to it
@@ -145,7 +145,8 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes(
 
     const unsigned lane_id = threadIdx.x & ((1u << team_size_bits) - 1u);
     if (valid_i && lane_id == 0) {
-      if (hashmap::insert(visited_hash_ptr, hash_bitlen, best_index_team_local)) {
+      if (best_index_team_local != raft::upper_bound<IndexT>() &&
+          hashmap::insert(visited_hash_ptr, hash_bitlen, best_index_team_local)) {
         result_distances_ptr[i] = best_norm2_team_local;
         result_indices_ptr[i]   = best_index_team_local;
       } else {
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 60007825c..1ed8466b3 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -137,6 +137,7 @@ if(BUILD_TESTS)
     NAME
     NEIGHBORS_ANN_CAGRA_TEST
     PATH
+    neighbors/ann_cagra/bug_multi_cta_crash.cu
     neighbors/ann_cagra/test_float_uint32_t.cu
     neighbors/ann_cagra/test_half_uint32_t.cu
     neighbors/ann_cagra/test_int8_t_uint32_t.cu
@@ -242,7 +243,6 @@ if(TARGET cuvs::c_api)
     target_compile_definitions(NEIGHBORS_HNSW_TEST PUBLIC CUVS_BUILD_CAGRA_HNSWLIB)
   endif()
 
-
   add_executable(cuvs_c_test core/c_api.c)
   target_link_libraries(cuvs_c_test PUBLIC cuvs::c_api)
 
diff --git a/cpp/test/neighbors/ann_cagra/bug_multi_cta_crash.cu b/cpp/test/neighbors/ann_cagra/bug_multi_cta_crash.cu
new file mode 100644
index 000000000..6f4aa059e
--- /dev/null
+++ b/cpp/test/neighbors/ann_cagra/bug_multi_cta_crash.cu
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "../ann_cagra.cuh"
+
+#include <cuvs/neighbors/cagra.hpp>
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_resources.hpp>
+
+#include <cstdint>
+
+namespace cuvs::neighbors::cagra {
+
+class AnnCagraBugMultiCTACrash : public ::testing::TestWithParam<cagra::search_algo> {
+ public:
+  using data_type = half;
+
+ protected:
+  void run()
+  {
+    cagra::index_params cagra_index_params;
+    cagra_index_params.graph_degree              = 32;
+    cagra_index_params.intermediate_graph_degree = 48;
+
+    auto cagra_index =
+      cagra::build(res, cagra_index_params, raft::make_const_mdspan(dataset->view()));
+    raft::resource::sync_stream(res);
+
+    cagra::search_params cagra_search_params;
+    cagra_search_params.itopk_size        = 32;
+    cagra_search_params.thread_block_size = 256;
+    cagra_search_params.search_width      = 1;
+    cagra_search_params.max_iterations    = 0;
+    cagra_search_params.algo = ::testing::TestWithParam<cagra::search_algo>::GetParam();
+
+    // NOTE: when using one resource/stream for everything, the bug is NOT reproducible
+    raft::resources res_search;
+    cagra::search(res_search,
+                  cagra_search_params,
+                  cagra_index,
+                  raft::make_const_mdspan(queries->view()),
+                  neighbors->view(),
+                  distances->view());
+
+    raft::resource::sync_stream(res_search);
+  }
+
+  void SetUp() override
+  {
+    dataset.emplace(raft::make_device_matrix<data_type, int64_t>(res, n_samples, n_dim));
+    queries.emplace(raft::make_device_matrix<data_type, int64_t>(res, n_queries, n_dim));
+    neighbors.emplace(raft::make_device_matrix<uint32_t, int64_t>(res, n_queries, k));
+    distances.emplace(raft::make_device_matrix<float, int64_t>(res, n_queries, k));
+    raft::random::RngState r(1234ULL);
+    InitDataset(res, dataset->data_handle(), n_samples, n_dim, metric, r);
+    // NOTE: when initializing queries with "normal" data, the bug is NOT reproducible
+    raft::linalg::map(
+      res, queries->view(), raft::const_op<data_type>{raft::upper_bound<data_type>()});
+    // InitDataset(res, queries->data_handle(), n_queries, n_dim, metric, r);
+    raft::resource::sync_stream(res);
+  }
+
+  void TearDown() override
+  {
+    dataset.reset();
+    queries.reset();
+    neighbors.reset();
+    distances.reset();
+    raft::resource::sync_stream(res);
+  }
+
+ private:
+  raft::resources res;
+  std::optional<raft::device_matrix<data_type, int64_t>> dataset  = std::nullopt;
+  std::optional<raft::device_matrix<data_type, int64_t>> queries  = std::nullopt;
+  std::optional<raft::device_matrix<uint32_t, int64_t>> neighbors = std::nullopt;
+  std::optional<raft::device_matrix<float, int64_t>> distances    = std::nullopt;
+
+  constexpr static int64_t n_samples                   = 1183514;
+  constexpr static int64_t n_dim                       = 100;
+  constexpr static int64_t n_queries                   = 30;
+  constexpr static int64_t k                           = 10;
+  constexpr static cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded;
+};
+
+TEST_P(AnnCagraBugMultiCTACrash, AnnCagraBugMultiCTACrash) { this->run(); }
+
+INSTANTIATE_TEST_CASE_P(AnnCagraBugMultiCTACrashReproducer,
+                        AnnCagraBugMultiCTACrash,
+                        ::testing::Values(cagra::search_algo::MULTI_CTA));
+
+}  // namespace cuvs::neighbors::cagra

From 6b35b65923933e6396ae61322ce2e9b0772eea4a Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Wed, 6 Nov 2024 08:24:44 +0100
Subject: [PATCH 21/47] CAGRA tech debt: distance descriptor and workspace
 memory (#436)

This PR introduces two changes:

1.  Refactor `dataset_descriptor_host` to pass and cache it by value while keeping the state in a thread-safe object in a shared pointers. Before this, the descriptor host itself was kept in shared pointer in LRU cache and was passed by reference; as a result, it could in theory die due to cache eviction while still being used via references to it.
2. Adjust the temporary buffers to always use the workspace resource in all CAGRA algo implementations (as of now, only SINGLE_CTA algo does this; the PR expands the change to MULTI_CTA and MULTI_KERNEL).

Both of the changes are required for effective use of stream-ordered dynamic batching https://github.com/rapidsai/cuvs/pull/261 (1. fixes crashes and 2. fixes thread-blocking behavior).

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/436
---
 .../neighbors/detail/cagra/cagra_search.cuh   |  4 +-
 .../detail/cagra/compute_distance.hpp         | 77 +++++++++++++------
 cpp/src/neighbors/detail/cagra/factory.cuh    | 20 ++---
 .../detail/cagra/search_multi_cta.cuh         | 12 +--
 .../detail/cagra/search_multi_kernel.cuh      | 53 +++++++------
 .../neighbors/detail/cagra/search_plan.cuh    |  2 +-
 6 files changed, 100 insertions(+), 68 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
index 95c158675..5778d85a6 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh
@@ -151,7 +151,7 @@ void search_main(raft::resources const& res,
   if (auto* strided_dset = dynamic_cast<const strided_dataset<T, ds_idx_type>*>(&index.data());
       strided_dset != nullptr) {
     // Search using a plain (strided) row-major dataset
-    auto& desc = dataset_descriptor_init_with_cache<T, InternalIdxT, DistanceT>(
+    auto desc = dataset_descriptor_init_with_cache<T, InternalIdxT, DistanceT>(
       res, params, *strided_dset, index.metric());
     search_main_core<T, InternalIdxT, DistanceT, CagraSampleFilterT>(
       res, params, desc, graph_internal, queries, neighbors, distances, sample_filter);
@@ -161,7 +161,7 @@ void search_main(raft::resources const& res,
     RAFT_FAIL("FP32 VPQ dataset support is coming soon");
   } else if (auto* vpq_dset = dynamic_cast<const vpq_dataset<half, ds_idx_type>*>(&index.data());
              vpq_dset != nullptr) {
-    auto& desc = dataset_descriptor_init_with_cache<T, InternalIdxT, DistanceT>(
+    auto desc = dataset_descriptor_init_with_cache<T, InternalIdxT, DistanceT>(
       res, params, *vpq_dset, index.metric());
     search_main_core<T, InternalIdxT, DistanceT, CagraSampleFilterT>(
       res, params, desc, graph_internal, queries, neighbors, distances, sample_filter);
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp
index 297eb1f55..7eb798459 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp
+++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp
@@ -31,8 +31,10 @@
 #include <raft/util/device_loads_stores.cuh>
 #include <raft/util/vectorized.cuh>
 
+#include <atomic>
 #include <functional>
 #include <memory>
+#include <mutex>
 #include <type_traits>
 #include <variant>
 
@@ -232,52 +234,77 @@ struct alignas(device::LOAD_128BIT_T) dataset_descriptor_base_t {
  */
 template <typename DataT, typename IndexT, typename DistanceT>
 struct dataset_descriptor_host {
-  using dev_descriptor_t = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
-  using dd_ptr_t         = std::shared_ptr<dev_descriptor_t>;
-  using init_f =
-    std::tuple<std::function<void(dev_descriptor_t*, rmm::cuda_stream_view stream)>, size_t>;
+  using dev_descriptor_t         = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
   uint32_t smem_ws_size_in_bytes = 0;
   uint32_t team_size             = 0;
 
+  struct state {
+    using ready_t = std::tuple<dev_descriptor_t*, rmm::cuda_stream_view>;
+    using init_f =
+      std::tuple<std::function<void(dev_descriptor_t*, rmm::cuda_stream_view)>, size_t>;
+
+    std::mutex mutex;
+    std::atomic<bool> ready;  // Not sure if std::holds_alternative is thread-safe
+    std::variant<ready_t, init_f> value;
+
+    template <typename InitF>
+    state(InitF init, size_t size) : ready{false}, value{std::make_tuple(init, size)}
+    {
+    }
+
+    ~state() noexcept
+    {
+      if (std::holds_alternative<ready_t>(value)) {
+        auto& [ptr, stream] = std::get<ready_t>(value);
+        RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(ptr, stream));
+      }
+    }
+
+    void eval(rmm::cuda_stream_view stream)
+    {
+      std::lock_guard<std::mutex> lock(mutex);
+      if (std::holds_alternative<init_f>(value)) {
+        auto& [fun, size]     = std::get<init_f>(value);
+        dev_descriptor_t* ptr = nullptr;
+        RAFT_CUDA_TRY(cudaMallocAsync(&ptr, size, stream));
+        fun(ptr, stream);
+        value = std::make_tuple(ptr, stream);
+        ready.store(true, std::memory_order_release);
+      }
+    }
+
+    auto get(rmm::cuda_stream_view stream) -> dev_descriptor_t*
+    {
+      if (!ready.load(std::memory_order_acquire)) { eval(stream); }
+      return std::get<0>(std::get<ready_t>(value));
+    }
+  };
+
   template <typename DescriptorImpl, typename InitF>
   dataset_descriptor_host(const DescriptorImpl& dd_host, InitF init)
-    : value_{std::make_tuple(init, sizeof(DescriptorImpl))},
+    : value_{std::make_shared<state>(init, sizeof(DescriptorImpl))},
       smem_ws_size_in_bytes{dd_host.smem_ws_size_in_bytes()},
       team_size{dd_host.team_size()}
   {
   }
 
+  dataset_descriptor_host() = default;
+
   /**
    * Return the device pointer, possibly evaluating it in the given thread.
    */
   [[nodiscard]] auto dev_ptr(rmm::cuda_stream_view stream) const -> const dev_descriptor_t*
   {
-    if (std::holds_alternative<init_f>(value_)) { value_ = eval(std::get<init_f>(value_), stream); }
-    return std::get<dd_ptr_t>(value_).get();
+    return value_->get(stream);
   }
+
   [[nodiscard]] auto dev_ptr(rmm::cuda_stream_view stream) -> dev_descriptor_t*
   {
-    if (std::holds_alternative<init_f>(value_)) { value_ = eval(std::get<init_f>(value_), stream); }
-    return std::get<dd_ptr_t>(value_).get();
+    return value_->get(stream);
   }
 
  private:
-  mutable std::variant<dd_ptr_t, init_f> value_;
-
-  static auto eval(init_f init, rmm::cuda_stream_view stream) -> dd_ptr_t
-  {
-    using raft::RAFT_NAME;
-    auto& [fun, size] = init;
-    dd_ptr_t dev_ptr{
-      [stream, s = size]() {
-        dev_descriptor_t* p;
-        RAFT_CUDA_TRY(cudaMallocAsync(&p, s, stream));
-        return p;
-      }(),
-      [stream](dev_descriptor_t* p) { RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(p, stream)); }};
-    fun(dev_ptr.get(), stream);
-    return dev_ptr;
-  }
+  mutable std::shared_ptr<state> value_;
 };
 
 /**
diff --git a/cpp/src/neighbors/detail/cagra/factory.cuh b/cpp/src/neighbors/detail/cagra/factory.cuh
index abc907da5..e6e7ff64f 100644
--- a/cpp/src/neighbors/detail/cagra/factory.cuh
+++ b/cpp/src/neighbors/detail/cagra/factory.cuh
@@ -135,11 +135,9 @@ template <typename DataT, typename IndexT, typename DistanceT>
 struct store {
   /** Number of descriptors to cache. */
   static constexpr size_t kDefaultSize = 100;
-  raft::cache::lru<key,
-                   key_hash,
-                   std::equal_to<>,
-                   std::shared_ptr<dataset_descriptor_host<DataT, IndexT, DistanceT>>>
-    value{kDefaultSize};
+  raft::cache::
+    lru<key, key_hash, std::equal_to<>, dataset_descriptor_host<DataT, IndexT, DistanceT>>
+      value{kDefaultSize};
 };
 
 }  // namespace descriptor_cache
@@ -159,20 +157,18 @@ auto dataset_descriptor_init_with_cache(const raft::resources& res,
                                         const cagra::search_params& params,
                                         const DatasetT& dataset,
                                         cuvs::distance::DistanceType metric)
-  -> const dataset_descriptor_host<DataT, IndexT, DistanceT>&
+  -> dataset_descriptor_host<DataT, IndexT, DistanceT>
 {
-  using desc_t = dataset_descriptor_host<DataT, IndexT, DistanceT>;
-  auto key     = descriptor_cache::make_key(params, dataset, metric);
+  auto key = descriptor_cache::make_key(params, dataset, metric);
   auto& cache =
     raft::resource::get_custom_resource<descriptor_cache::store<DataT, IndexT, DistanceT>>(res)
       ->value;
-  std::shared_ptr<desc_t> desc{nullptr};
+  dataset_descriptor_host<DataT, IndexT, DistanceT> desc;
   if (!cache.get(key, &desc)) {
-    desc = std::make_shared<desc_t>(
-      std::move(dataset_descriptor_init<DataT, IndexT, DistanceT>(params, dataset, metric)));
+    desc = dataset_descriptor_init<DataT, IndexT, DistanceT>(params, dataset, metric);
     cache.set(key, desc);
   }
-  return *desc;
+  return desc;
 }
 
 };  // namespace cuvs::neighbors::cagra::detail
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh
index 0003f2495..ecfd856f1 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh
@@ -93,10 +93,10 @@ struct search : public search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_
   using base_type::num_seeds;
 
   uint32_t num_cta_per_query;
-  rmm::device_uvector<INDEX_T> intermediate_indices;
-  rmm::device_uvector<float> intermediate_distances;
+  lightweight_uvector<INDEX_T> intermediate_indices;
+  lightweight_uvector<float> intermediate_distances;
   size_t topk_workspace_size;
-  rmm::device_uvector<uint32_t> topk_workspace;
+  lightweight_uvector<uint32_t> topk_workspace;
 
   search(raft::resources const& res,
          search_params params,
@@ -105,9 +105,9 @@ struct search : public search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_
          int64_t graph_degree,
          uint32_t topk)
     : base_type(res, params, dataset_desc, dim, graph_degree, topk),
-      intermediate_indices(0, raft::resource::get_cuda_stream(res)),
-      intermediate_distances(0, raft::resource::get_cuda_stream(res)),
-      topk_workspace(0, raft::resource::get_cuda_stream(res))
+      intermediate_indices(res),
+      intermediate_distances(res),
+      topk_workspace(res)
 
   {
     set_params(res, params);
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
index 9c22134a6..c6fe21642 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
@@ -91,6 +91,15 @@ void get_value(T* const host_ptr, const T* const dev_ptr, cudaStream_t cuda_stre
   get_value_kernel<T><<<1, 1, 0, cuda_stream>>>(host_ptr, dev_ptr);
 }
 
+template <class T>
+auto get_value(const T* const dev_ptr, cudaStream_t stream) -> T
+{
+  T value;
+  RAFT_CUDA_TRY(cudaMemcpyAsync(&value, dev_ptr, sizeof(value), cudaMemcpyDefault, stream));
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  return value;
+}
+
 // MAX_DATASET_DIM : must equal to or greater than dataset_dim
 template <class DATASET_DESCRIPTOR_T>
 RAFT_KERNEL random_pickup_kernel(
@@ -609,18 +618,18 @@ struct search : search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_T> {
   using base_type::num_seeds;
 
   size_t result_buffer_allocation_size;
-  rmm::device_uvector<INDEX_T> result_indices;       // results_indices_buffer
-  rmm::device_uvector<DISTANCE_T> result_distances;  // result_distances_buffer
-  rmm::device_uvector<INDEX_T> parent_node_list;
-  rmm::device_uvector<uint32_t> topk_hint;
-  rmm::device_scalar<uint32_t> terminate_flag;  // dev_terminate_flag, host_terminate_flag.;
-  rmm::device_uvector<uint32_t> topk_workspace;
+  lightweight_uvector<INDEX_T> result_indices;       // results_indices_buffer
+  lightweight_uvector<DISTANCE_T> result_distances;  // result_distances_buffer
+  lightweight_uvector<INDEX_T> parent_node_list;
+  lightweight_uvector<uint32_t> topk_hint;
+  lightweight_uvector<uint32_t> terminate_flag;  // dev_terminate_flag, host_terminate_flag.;
+  lightweight_uvector<uint32_t> topk_workspace;
 
   // temporary storage for _find_topk
-  rmm::device_uvector<float> input_keys_storage;
-  rmm::device_uvector<float> output_keys_storage;
-  rmm::device_uvector<INDEX_T> input_values_storage;
-  rmm::device_uvector<INDEX_T> output_values_storage;
+  lightweight_uvector<float> input_keys_storage;
+  lightweight_uvector<float> output_keys_storage;
+  lightweight_uvector<INDEX_T> input_values_storage;
+  lightweight_uvector<INDEX_T> output_values_storage;
 
   search(raft::resources const& res,
          search_params params,
@@ -629,16 +638,16 @@ struct search : search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_T> {
          int64_t graph_degree,
          uint32_t topk)
     : base_type(res, params, dataset_desc, dim, graph_degree, topk),
-      result_indices(0, raft::resource::get_cuda_stream(res)),
-      result_distances(0, raft::resource::get_cuda_stream(res)),
-      parent_node_list(0, raft::resource::get_cuda_stream(res)),
-      topk_hint(0, raft::resource::get_cuda_stream(res)),
-      topk_workspace(0, raft::resource::get_cuda_stream(res)),
-      terminate_flag(raft::resource::get_cuda_stream(res)),
-      input_keys_storage(0, raft::resource::get_cuda_stream(res)),
-      output_keys_storage(0, raft::resource::get_cuda_stream(res)),
-      input_values_storage(0, raft::resource::get_cuda_stream(res)),
-      output_values_storage(0, raft::resource::get_cuda_stream(res))
+      result_indices(res),
+      result_distances(res),
+      parent_node_list(res),
+      topk_hint(res),
+      topk_workspace(res),
+      terminate_flag(res),
+      input_keys_storage(res),
+      output_keys_storage(res),
+      input_values_storage(res),
+      output_values_storage(res)
   {
     set_params(res);
   }
@@ -662,7 +671,7 @@ struct search : search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_T> {
       itopk_size, max_queries, result_buffer_size, utils::get_cuda_data_type<DATA_T>());
     RAFT_LOG_DEBUG("# topk_workspace_size: %lu", topk_workspace_size);
     topk_workspace.resize(topk_workspace_size, raft::resource::get_cuda_stream(res));
-
+    terminate_flag.resize(1, raft::resource::get_cuda_stream(res));
     hashmap.resize(hashmap_size, raft::resource::get_cuda_stream(res));
   }
 
@@ -847,7 +856,7 @@ struct search : search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_T> {
                           stream);
 
       // termination (2)
-      if (iter + 1 >= min_iterations && terminate_flag.value(stream)) {
+      if (iter + 1 >= min_iterations && get_value(terminate_flag.data(), stream)) {
         iter++;
         break;
       }
diff --git a/cpp/src/neighbors/detail/cagra/search_plan.cuh b/cpp/src/neighbors/detail/cagra/search_plan.cuh
index f23b96631..99254aa50 100644
--- a/cpp/src/neighbors/detail/cagra/search_plan.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_plan.cuh
@@ -151,7 +151,7 @@ struct search_plan_impl : public search_plan_impl_base {
   lightweight_uvector<INDEX_T> hashmap;
   lightweight_uvector<uint32_t> num_executed_iterations;  // device or managed?
   lightweight_uvector<INDEX_T> dev_seed;
-  const dataset_descriptor_host<DataT, IndexT, DistanceT>& dataset_desc;
+  dataset_descriptor_host<DataT, IndexT, DistanceT> dataset_desc;
 
   search_plan_impl(raft::resources const& res,
                    search_params params,

From 2d4afb515e3b509152adc652e3a9d97816b7bc3b Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Thu, 7 Nov 2024 11:23:15 -0500
Subject: [PATCH 22/47] Put a ceiling on cuda-python (#445)

This project is incompatible with newer versions of `cuda-python`. This puts ceilings of `<=11.8.3` (CUDA 11) and `<=12.6.0` (CUDA 12) on that library.

Those ceilings should be removed and replaced with `!=` constraints once new releases of `cuda-python` are up that this project is compatible with.

See https://github.com/rapidsai/build-planning/issues/116 for more information.

Authors:
  - Bradley Dice (https://github.com/bdice)
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - James Lamb (https://github.com/jameslamb)
  - Divye Gala (https://github.com/divyegala)

URL: https://github.com/rapidsai/cuvs/pull/445
---
 conda/environments/all_cuda-118_arch-aarch64.yaml       | 2 +-
 conda/environments/all_cuda-118_arch-x86_64.yaml        | 2 +-
 conda/environments/all_cuda-125_arch-aarch64.yaml       | 2 +-
 conda/environments/all_cuda-125_arch-x86_64.yaml        | 2 +-
 conda/environments/bench_ann_cuda-118_arch-aarch64.yaml | 2 +-
 conda/environments/bench_ann_cuda-118_arch-x86_64.yaml  | 2 +-
 conda/environments/bench_ann_cuda-125_arch-aarch64.yaml | 2 +-
 conda/environments/bench_ann_cuda-125_arch-x86_64.yaml  | 2 +-
 conda/recipes/cuvs/meta.yaml                            | 8 +++++---
 cpp/test/neighbors/ann_ivf_flat.cuh                     | 1 +
 dependencies.yaml                                       | 4 ++--
 python/cuvs/pyproject.toml                              | 1 +
 12 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index aa12b4ed6..80bfb0c24 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -15,7 +15,7 @@ dependencies:
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
 - cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0
+- cuda-python>=11.7.1,<12.0a0,<=11.8.3
 - cuda-version=11.8
 - cudatoolkit
 - cupy>=12.0.0
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 494ec394d..07937726c 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -15,7 +15,7 @@ dependencies:
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
 - cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0
+- cuda-python>=11.7.1,<12.0a0,<=11.8.3
 - cuda-version=11.8
 - cudatoolkit
 - cupy>=12.0.0
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml
index f4f03ccee..b7fd6fcfa 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -17,7 +17,7 @@ dependencies:
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
-- cuda-python>=12.0,<13.0a0
+- cuda-python>=12.0,<13.0a0,<=12.6.0
 - cuda-version=12.5
 - cupy>=12.0.0
 - cxx-compiler
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index a295e93f4..83a457465 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -17,7 +17,7 @@ dependencies:
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
-- cuda-python>=12.0,<13.0a0
+- cuda-python>=12.0,<13.0a0,<=12.6.0
 - cuda-version=12.5
 - cupy>=12.0.0
 - cxx-compiler
diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
index a73839457..21cb98180 100644
--- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -15,7 +15,7 @@ dependencies:
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
 - cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0
+- cuda-python>=11.7.1,<12.0a0,<=11.8.3
 - cuda-version=11.8
 - cudatoolkit
 - cxx-compiler
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index 3f869da9a..432509bcb 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -15,7 +15,7 @@ dependencies:
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
 - cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0
+- cuda-python>=11.7.1,<12.0a0,<=11.8.3
 - cuda-version=11.8
 - cudatoolkit
 - cxx-compiler
diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
index 407fb6058..0c5043ac2 100644
--- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
@@ -17,7 +17,7 @@ dependencies:
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
-- cuda-python>=12.0,<13.0a0
+- cuda-python>=12.0,<13.0a0,<=12.6.0
 - cuda-version=12.5
 - cxx-compiler
 - cython>=3.0.0
diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
index 81943b184..cbb22333c 100644
--- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
@@ -17,7 +17,7 @@ dependencies:
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
-- cuda-python>=12.0,<13.0a0
+- cuda-python>=12.0,<13.0a0,<=12.6.0
 - cuda-version=12.5
 - cxx-compiler
 - cython>=3.0.0
diff --git a/conda/recipes/cuvs/meta.yaml b/conda/recipes/cuvs/meta.yaml
index e7e2daf0c..560c95feb 100644
--- a/conda/recipes/cuvs/meta.yaml
+++ b/conda/recipes/cuvs/meta.yaml
@@ -26,6 +26,7 @@ build:
     - {{ compiler('cuda') }}
     - cuda-cudart-dev
     {% endif %}
+    - cuda-python
 
 requirements:
   build:
@@ -42,10 +43,10 @@ requirements:
     - {{ stdlib("c") }}
   host:
     {% if cuda_major == "11" %}
-    - cuda-python >=11.7.1,<12.0a0
+    - cuda-python >=11.7.1,<12.0a0,<=11.8.3
     - cudatoolkit
     {% else %}
-    - cuda-python >=12.0,<13.0a0
+    - cuda-python >=12.0,<13.0a0,<=12.6.0
     - cuda-cudart-dev
     {% endif %}
     - cuda-version ={{ cuda_version }}
@@ -60,13 +61,14 @@ requirements:
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
     {% if cuda_major == "11" %}
     - cudatoolkit
+    - cuda-python >=11.7.1,<12.0a0,<=11.8.3
     {% else %}
     - cuda-cudart
+    - cuda-python >=12.0,<13.0a0,<=12.6.0
     {% endif %}
     - pylibraft {{ minor_version }}
     - libcuvs {{ version }}
     - python x.x
-    - cuda-python
     - numpy >=1.23,<3.0a0
 
 tests:
diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh
index 8cc46b2f7..23d84ca98 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cuh
+++ b/cpp/test/neighbors/ann_ivf_flat.cuh
@@ -24,6 +24,7 @@
 #include <cuvs/neighbors/ivf_flat.hpp>
 #include <raft/linalg/normalize.cuh>
 #include <raft/stats/mean.cuh>
+#include <thrust/reduce.h>
 #include <thrust/sequence.h>
 
 #include <raft/core/resource/cuda_stream_pool.hpp>
diff --git a/dependencies.yaml b/dependencies.yaml
index cf9b68c8a..e909ad0dc 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -213,11 +213,11 @@ dependencies:
           - matrix:
               cuda: "12.*"
             packages:
-              - &cuda_python12 cuda-python>=12.0,<13.0a0
+              - &cuda_python12 cuda-python>=12.0,<13.0a0,<=12.6.0
           - matrix:
               cuda: "11.*"
             packages:
-              - &cuda_python11 cuda-python>=11.7.1,<12.0a0
+              - &cuda_python11 cuda-python>=11.7.1,<12.0a0,<=11.8.3
           - matrix:
             packages:
               - &cuda_python cuda-python
diff --git a/python/cuvs/pyproject.toml b/python/cuvs/pyproject.toml
index bf62f5adf..30d784c67 100644
--- a/python/cuvs/pyproject.toml
+++ b/python/cuvs/pyproject.toml
@@ -136,4 +136,5 @@ matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
 [tool.pytest.ini_options]
 filterwarnings = [
     "error",
+    "ignore:.*cuda..* module is deprecated.*:DeprecationWarning"
 ]

From e559d581acec030d8e71833aee1295fe442facb3 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Thu, 7 Nov 2024 21:04:57 -0500
Subject: [PATCH 23/47] Adding tech stack to docs (#448)

Authors:
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Divye Gala (https://github.com/divyegala)

URL: https://github.com/rapidsai/cuvs/pull/448
---
 README.md             |   9 ++++++
 docs/source/index.rst |  72 +++++++++++++++++++++++++++++++++++-------
 img/tech_stack.png    | Bin 0 -> 125904 bytes
 3 files changed, 70 insertions(+), 11 deletions(-)
 create mode 100644 img/tech_stack.png

diff --git a/README.md b/README.md
index c1b74a9e8..572e8d098 100755
--- a/README.md
+++ b/README.md
@@ -35,6 +35,7 @@ Finally, faster vector search enables interactions between dense vectors and gra
 
 Below are some common use-cases for vector search
 
+
 - ### Semantic search
   - Generative AI & Retrieval augmented generation (RAG)
   - Recommender systems
@@ -68,6 +69,14 @@ There are several benefits to using cuVS and GPUs for vector search, including
 
 In addition to the items above, cuVS takes on the burden of keeping non-trivial accelerated code up to date as new NVIDIA architectures and CUDA versions are released. This provides a deslightful development experimence, guaranteeing that any libraries, databases, or applications built on top of it will always be getting the best performance and scale. 
 
+## cuVS Technology Stack
+
+cuVS is built on top of the RAPIDS RAFT library of high performance machine learning primitives and provides all the necessary routines for vector search and clustering on the GPU. 
+
+![cuVS is built on top of low-level CUDA libraries and provides many important routines that enable vector search and clustering on the GPU](img/tech_stack.png "cuVS Technology Stack")
+
+
+
 ## Installing cuVS
 
 cuVS comes with pre-built packages that can be installed through [conda](https://conda.io/projects/conda/en/latest/user-guide/getting-started.html#managing-python) and [pip](https://pip.pypa.io/en/stable/). Different packages are available for the different languages supported by cuVS:
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 647061ae5..286836c18 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,19 +1,8 @@
 cuVS: Vector Search and Clustering on the GPU
 =============================================
 
-
 Welcome to cuVS, the premier library for GPU-accelerated vector search and clustering! cuVS provides several core building blocks for constructing new algorithms, as well as end-to-end vector search and clustering algorithms for use either standalone or through a growing list of :doc:`integrations <integrations>`.
 
-There are several benefits to using cuVS and GPUs for vector search, including
-
-#. Fast index build
-#. Latency critical and high throughput search
-#. Parameter tuning
-#. Cost savings
-#. Interoperability (build on GPU, deploy on CPU)
-#. Multiple language support
-#. Building blocks for composing new or accelerating existing algorithms
-
 Useful Resources
 ################
 
@@ -26,6 +15,67 @@ Useful Resources
 - `Issue tracker <https://github.com/rapidsai/cuvs/issues>`_: Report issues or request features.
 
 
+
+What is cuVS?
+#############
+
+cuVS contains state-of-the-art implementations of several algorithms for running approximate and exact nearest neighbors and clustering on the GPU. It can be used directly or through the various databases and other libraries that have integrated it. The primary goal of cuVS is to simplify the use of GPUs for vector similarity search and clustering.
+
+Vector search is an information retrieval method that has been growing in popularity over the past few  years, partly because of the rising importance of multimedia embeddings created from unstructured data and the need to perform semantic search on the embeddings to find items which are semantically similar to each other.
+
+Vector search is also used in *data mining and machine learning* tasks and comprises an important step in many *clustering* and *visualization* algorithms like `UMAP <https://arxiv.org/abs/2008.00325>`_, `t-SNE <https://lvdmaaten.github.io/tsne/>`_, K-means, and `HDBSCAN <https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html>`_.
+
+Finally, faster vector search enables interactions between dense vectors and graphs. Converting a pile of dense vectors into nearest neighbors graphs unlocks the entire world of graph analysis algorithms, such as those found in `GraphBLAS <https://graphblas.org/>`_ and `cuGraph <https://github.com/rapidsai/cugraph>`_.
+
+Below are some common use-cases for vector search
+
+Semantic search
+~~~~~~~~~~~~~~~
+- Generative AI & Retrieval augmented generation (RAG)
+- Recommender systems
+- Computer vision
+- Image search
+- Text search
+- Audio search
+- Molecular search
+- Model training
+
+
+Data mining
+~~~~~~~~~~~
+- Clustering algorithms
+- Visualization algorithms
+- Sampling algorithms
+- Class balancing
+- Ensemble methods
+- k-NN graph construction
+
+Why cuVS?
+#########
+
+There are several benefits to using cuVS and GPUs for vector search, including
+
+1. Fast index build
+2. Latency critical and high throughput search
+3. Parameter tuning
+4. Cost savings
+5. Interoperability (build on GPU, deploy on CPU)
+6. Multiple language support
+7. Building blocks for composing new or accelerating existing algorithms
+
+In addition to the items above, cuVS shoulders the responsibility of keeping non-trivial accelerated code up to date as new NVIDIA architectures and CUDA versions are released. This provides a deslightful development experimence, guaranteeing that any libraries, databases, or applications built on top of it will always be receiving the best performance and scale.
+
+cuVS Technology Stack
+#####################
+
+cuVS is built on top of the RAPIDS RAFT library of high performance machine learning primitives and provides all the necessary routines for vector search and clustering on the GPU.
+
+.. image:: ../../img/tech_stack.png
+  :width: 600
+  :alt: cuVS is built on top of low-level CUDA libraries and provides many important routines that enable vector search and clustering on the GPU
+
+
+
 Contents
 ########
 
diff --git a/img/tech_stack.png b/img/tech_stack.png
new file mode 100644
index 0000000000000000000000000000000000000000..2b3eeedba99957e985ccea233c8970c396acadb0
GIT binary patch
literal 125904
zcmc%xWmuI_7d4D-x<g7px&)*fL0UpOq`SL8x*Mdsm6k^7ZX~6pq#J3Z;amH8-sjJE
zew^$4IM?f?Sohxdy62j6%rVAXn@A-EDGXF%R0sruAtNoJ0)fB{gI_+#h~QrW9#nO~
zZ}2W+GHS@+hYzx81o%6V>w8UCRR?p|Pe#sW5DR+;J2Pe%6K69sdlyRw*Auu-A#f2h
z^dfO*Gb2|k2YU)ND?2lYhM6e^8xMtogF6KW8wVE!8^>ENe(;k*NuJ`ps@gouoIM0W
z0g;gqRrAO`TJhFZy?RDEoj2@S|B4anAU>I`!tvF(cw3s;!aYMdN|;;fX;^9GmmkAk
zqsV<R^S!wTMyuz&A9Kws%^R~+)KVg~e+aT>#RGC>Uo^8|kM{<II)&qFzhi6_8+PQy
z4U+Y{v*fvq@45({Tr@`#nxKbZiy>1&zra8|+}vUR`)A6K_^J?Cv;Xrylzk`&!~eT3
z0v9zI<v%X~UtZ(U<Nog=1$rgA|6P==lrQ(ccjEgZlJUQLI8oV;d@wUJ^Ehe0e*2dB
zf0s*#6AAkXO2!aQ9$YQr(OXzp6ny*kjkz4>fA6jFUQ26X7A^Ue;Qi6}_>t|u3-JG2
z!qk^buXlfZuUB}gSF1$$X=vcTeEIVK`hVZnxsvX-|1IeM_f?Apa{TXh8~=X^%m1|-
zL^gfL6#;?KRU93>WScond)nv<9iqV6eakJswJ3fYfh|_W1l@3r@Ui{pCFkAR<_Gyt
z)#cU2ICs5>gynK<^QQsnaIaA$HOYy#9t+p@GJlBBa^$@V(Gu_&<W1c7WNWYPsB5)z
zxTV&Ud%aC9hTI&hG;Jd`gm3Z01=B4=6+tXS<saiU@@wNNdRL0O>X15U$RC^Fs51zK
z?z1)eugdh$Gx44^a+0u8%38uAvfwk^K@*<`!WiX#<6mP&r&xdEen&EB8fY#oFwn8w
zjBT!;==%BGnW!6Clth8N+h2s{rLI$AI4+z1!2?Fz*UGm)=+zohs2sBNPw{y@1aXYt
zvIUYB;~a7;@A<p<U#7(k-fiaO<o;uSu0h|#S0?2AgyHO03V)(q&Z_x4U_&v@Xc5l2
zZ+tee#|-v6=6Afu$>*J?2romT4_PC^eLoEjg+SiUGyW%R`thilP8{7DK2Dgf_{^H0
z{CS))1^l}y-M=E9$I2N<@#lW7VD=)6B{Jl*54qLGM31eOGS)?Pjy*f>bRwal#E}+s
z3;&U!+(re1^5&Avk7geCqTy<2*&$#(9p)~H&SOZXFNt79s!)2EyIaN1$tT7^^}7vt
zY-wm|R>5L2$_s9d`%>FP<uAG4dJRtLl3tEuMTyo%<Wbl--;SI9P7MCqDZ$kx-nhlR
z#b-cP3qQP1G$d>o(r+ACHVmH>+Fdv?*s}0J8sSlYj9&BG``BB?*IBNV=6elz0d*pW
z^*2I2PHREgLNnLd*kW(-R@6bV4E}=elVa($L3-7(rd-2~l`@S>X=hEEMb=_h6jV)9
zz)eX0=hA)@ACYh(y*oIVMi)^_T1PT)+epYhU~)A0r7Si9OR$=Avm5%*PX=r6%8bua
zX73+`D0e7O5=80jc99!+IMKr~*p_JG-wzAUL%emXkr4z!Fg9s7=R~eM5UWZ=amvIt
z4qzn^`o*ritTxXy4&vK;V{TB=_?_B_!Zs1pZ6FFbh+0{Q2%AP#Y&`m(tnMdrZao;#
zE%>CVw&J-zA@guD#=SDCSnoy%rDW1wJik~@&K$nWl#s@rQ$(~<5@?*K^hl2*GGx10
zqv05$qOttPHZf!H++h8Qe`k!1gtKcDPB#L*&M!^I!G_0o0P^U>$ye5Pt6XfuXjEEC
zeiMs^Dz4d?y7k%pkCb*0@BD5xF&T1;tN&qy;GARUI;-CIxm3|VF~g|TG3J~Usk$xX
zuFG!JA@S-ab~PSn#yp<qrtW*czBE)QAU7+dFe|fR8d~1K_kVZ$I98EA&q2HULy$%;
z$;?@2)R`-;XhMxMG}%R)0DUc;Mtdw#eV4P~>U^Iw#hl|W9)41IldaS!yVZ|OTILCp
zZGtr4T^bIprev93+8Er{BOOMwDMyB+uo2C~@o6&V3MSqh+J1&~E@T!fq|F|Dxp#Ua
zby!X0-%Zf+xQ;rXs9)ub4-=ZC`tCCc1-TTyZ~pOc{yJaOjV>Ashb-Fp)@-hN*!<@v
z7n74a@idFUAcxC^R^;S{r&f7nimkLsrV$6-i&m0P@i7dqOKL~q<SP#QZm!;@3X1+T
z#lhOG%1i9;O#b;H^R3(K`Pv)5&%5%Athtq(90T9ZwH7VQsPmki4%5R)28W%ewKgpI
zlbl)ZcWC5<I1;Rc8BB8Hx`sF;(Js<lz4RZ|uNIi>vV&y!(*E|-ewTaPo{&;y(mLyL
zew|ud2%Bc+EVxT;&RzGg9VtgN-n=DGJI&<As&RXPceQk?6h;&4QfZVyHtZXoY+@oL
zoA=F>BkjEJeW~Wu()-aq!tZ;D>C3m8;Z|Ca7w(m`asn!I>AXE=Obqiq>Gj!W6Y4xW
z3-J=EyP3HM9I5Rz31tOEEZ4!Z^UkhRva9;Xsl3|$<m`2BxY^V6X6O93@_8?1`A7T(
zGWxNuly<t<Wqw3i5t!E{nja6|x@-kKGUzjYd+t{2nHy7Y&UT5FG2YP9uDY*9&Crxl
zC!UT;M!WFsl~VB+*3?+Od>4-`mRaY0Z4b?%f8zotKdgNJ16C80u8m#1eyY=0BHOYl
zJxD%fVcQ=CSdEO#vT!n%#pCzy5@O^`Zsm`M`=V06WsFV-XWBprDvb`ag9sWa`ZmK&
zGD+t32x>2#>If>(5ETYyWTOP(ZDVRlK0Cg8@v~wmeGuoq8NQ_wC7F3aZqr$zNcBSS
zq_h6pd4~Is%kx1J8jDPM`J#oP^BsnFf~Ve1`Y(u6J!hur=Gf~gC_Zxh4Rmg>&z5NF
zu*tV&Fgc_f(QzF!+nmypsb7j+f!ofw9sQv=qDDDp%PP@7bzRS_X)lvM>dY;-kr~G%
znaW?(EI7=*Fk@@(x+wQFN}#Kwp!clul|7VHHTLRI)2DN1Za3Xnf$WX90kdFnI+s!Y
z=y19_{cBCd$Bu-Q=09<|ux2BHZVId=nYR(Na?MJbxjO#jP7_jzvaGtxRnLKLa=0r&
zt}y5-w6b$V@9aw~W92k?icL9XKMKjlTA42yGpKeN?A&^NOsk1$BKQN>uTreN-1nc|
z`1MC5;|>bgxe@R*=%4JREhM|u^>7~vF@}CJ<8z57f0hjyE{n|fJY<T=t0lY395|tw
zw6-*VO>Jc@h^HX`JQ`X}x$ZyjO=6|<Jm*oW>c+-*5|boCP!Q0TSF^(G&xSU=VNSU_
zY8KaE<-?#KtwC)TE&98ULmE{p1~v-0tfraLA%Nv0#?@ibu{;OAzjSsMW7Zh`V(I&x
zGxtJ8CMmVT)AC^~9QG$OQgrTI4XYPfWFvI3gF&PUIeGjH$+$QPT(40wC+h}^SGc`Q
z#|VRy#jYhw!#?38)O^)s8%m&23B6$;wj@(WzLUKetc&N{<~9=k1X+6sm2a~%Pe+|M
zT$tO*MAM);+-Fa3&|+=F#39;ZM@`%57DU%k$+xGP;J=V!Cy<-{n%^wwRt!Pgsp!Uk
zMKyNilm3}rD=VUKEX+I;=7P;9J_JE$aUQo>e7Lp}uUbAQG<%a})qrs@N}7T1l-}2d
zz}%NnkWS&q`Dm@{7LC{B+vq^uY`=V7bS7+}TskeuLu``Ri;_U|4}!@r6g+#)CC?5~
zq^)*6Z4yil+5IlLOiA&Iv(d^XzT3+WzKO22{YuObdTh1Hmu5DGXgV(6Rrff?h1d8|
zj8_=pa0StROF2F>Kuag5Dl|QqNfN<LS8!ufO-WK3eu+VoB(86aD_4kei_u!vRi1k|
zB));eUJ1>nN{owXNtHX0GPo?;=An-xRd-FTKs{$7kt(DfvyLI7V!7G8lrfJMYX`m_
zCsR?^-JL7AOgHoRgJf~7k5o}qXv!`PXQ-@QnytH^Z<1JfsFoa^`C@LL={R?GcA}N6
z{HQ4hyJ~%?R>kL~H(GmB@T54!**QOlv#rVHkvO!b4AU}+6}o>EvFuuT<0PT}dt-5q
z%;u<4oSiuOLv}jD-H1ch;cIFsgMGaP26-gFX`0#}vl&eO2K=1$`7++Y=d+CKJhUH*
z@wv)=)YM&XA`JFo%++!1`f8qQCCXNQF?DS*B-(sUyk3tjt2O1Gwx~J3vu%u$B=zCY
zMn>v&-4EJ*CSMt1S$(}N%eU6UQ^*tCe8uTGE5Aurtxa#EKlWtWHwx~4p;KZs%hv2k
z=H%3Tf}!TjWiq)vdCd`OljK~RUG&%H_SQYSE_H^!b%(k(S*FNPS3+=lC%MMCAR#4f
zhutoFlvz=Fz>H_^QtMzu{HJl+18&CXc{%Zj_@>)ly0h(_#O7R&2nScjHWBf)*`4;t
zjLNm(7{)AzQH}-w2q8n8WZlr6I((ufx2$&h_*9y;)?%AmY^G&n8*^$cO^zxtGXVY3
zqT9P6^c_pi$%A`oS?lBxtpRBxG!*!s89R?SCC)ZR+s#(Tl|)oYnjNw>5s9IwX^R#N
zNm1Q}1=E`5PW+teL|O~~(6Q?0NmQHNSJ=!rKf63(s?>2c?#vRX*J(44rrFm&hhR@}
zS$oMNH#eknWJsPW(HUiuT+<Z|H0qHFg7-<7hD)?kZDU(YZQ|LjuwcLz@^CKTgqMFG
zy)&c-E6P2>*5pvKMVl7hhmA4f?ibIbVD6lTwCt)V?^n*D)ixqtMq^KUK{()ryaDh!
z#P>-TE?c(SS;hVk)`T||r<io-JpS)KE{$2#Bys8wQ__()cEv`8^$|tyQRh=8-kjGM
zHnTyn0eQGUON1k05|SJ@eJqqCrZOTKZ022UB&an_Ta&G=>D#`k)4Ma87yk9e!1|kK
ziH(acmgqy#Wl1Z%)^hLo{QSEvn653(i<N&mPYolow2pFD?uXSjS+HgSloDecx)(J?
z5wxwkQkSyEw=BEel}i$ZH4i+{oixtC)rAvF89cYR>$@@wFA-U|;7*KDurMRj>D1Bu
zV@sPPHJr|*J4_==viWr-ZhmiZOMpJttc7RHVc)k9A+WBWdk=9iS@s=oFi%T6d?MQw
z;(DkfS~6=+E6%)&bC;Vf3fs>u+jZY^wGq7W`)GSANJDC^+cY<mlR1a1PU1KR&lZ}&
zS8Oq9&eDBu%B@AJNGY-G3%+@CJln};E~^TOe+278)lrd~UtGE3wv=n+`L57;B|fv0
zg-z@vTbGIvKkr1N5oa|mMkTb%|4vVNl@So)cf6$h;DDNFK}YGSKs-fYlaTus(a4T4
zbFPh9=Xakv)BBzy_z7x%DH@7>Bcth)VbYcn1yX9#l2D$ipS68DVX72%HZUB;j3M%^
zdb4U@U8M--3Ke<@-xKyEqi8maiMy=yBK#tCxzk|^wf-^Hi)f}KTw!VJWzU)4<+*jS
zM`N0n`eoTLq;?oSx<BZMfu&a^+ARKO$e091%D_%+gT4gS57sv;lK&7PB-50NzxtAc
z#R>qRx-&eYd{0W?;zap*ac~jkzHC;^nN*taP&aJXS%-oWRD}q^+WbjCB88Nos25O(
zoW{yei!;zXYV0=<G0cN8y{x?51pmIjr?_X?5d%ke8dHSBK6UWiS1TKqVI+z2nh%~6
z>FR(gd4ELNZwgtp9w8M@yX5}kJ1gOg8m5I9HvQVvKEWuVv9RvukwA&hTGQ_)L|94F
z_BgTvh~6);aJHvuMup6n((6vh{6O49LhIamgMq!J%N}x3MKUgSH=uIOU(oMN(~%eH
z=1?Mjufma7{EedERvVd817*b?d?iAf-8zf+tr<AO)@T+F56bi?eH9cmEVvhybAQK>
zz$Q9*(<mR82*(oVNC0=vW6@n?Luts_vqfTiT13wev6O`6Qw^Hh?CIY|4c!%CPzk<l
z<YpG=GfzHxgD>RcTpK|;H3LtNLmxKOZRdXQ%9CO2+Yp_4AbhQp^~B&WZ5)I|o@5H6
z4$O>K90}hmA@7miWc~fFeQQUcLN+UUU5B}V_9|H6NhDHM*vOyo+JbUKyi<2XW_>j*
z*sL^U>S6@`6_!g;tsf)nE3Drhr)@97tmn5`ynn>U#)f~3l<iJS6*=))+h9YFIwOI|
z4q7F!AD`dbbTP~*dA{X~BsuqkVDk9&Uual;J*G1-g>W>JkYwLkEqQ2qKq_;LOM8cF
zVGz@NXrhU>_&X!;BS@*b>r5(-$Vtk=T7Giu1~%h_eZcj|MRI`ic!f1$gvH9Q9nZQn
z5U`<Qgg4G${KfWQpbJB+BB(cW^YiV9t~PAD{3gLEZxt@uth6G6Bx18J?2+4ICqmD?
zWC%{J3fCWhJ4C=nLbB7gBR&p@eNm%QCqkQ5^1NU@K&IS6{knujx%F*Rl8&Ndm6*w<
z)JQNXJiFSnl)TD+@7r=}Fl<SnFP|Cg1_D-@q*}{N->Ekc|9=RMmd!)H1R-_7;7bxB
zV0|O$>FL$>=Brc^RxAxrL)7Agk<@HL7jfEr+3<ld29oJz`7&J8&t-62kTFt?+Nz9L
zMi!<+CGKlAnv$+=)K}m4^mn3I-))Y}ISUM(U{#Di)UZ@N{aoJDF0kRGu<(O*)~@k?
zDuj`Im5sQ+@ny9IapccWi9hp2rM43g6HJn?Tk`V{#BX%<u_KC*5A!Q}Lc_Z^bNfji
zg89~l$G;tBXol4Y<@zVWJME>bxhe4rsWMbl!iU!g$?Ii@ar|i!$!O&t7C(e}nbNP>
zDF}DixTmzdDAHE2fX>#0nUBvRKCPt=Yt;lFFpW^w=IU>I&{V`Rl-V>3kJQ|8;P{g{
z8}(<IwW)B4c+vE`^y--c!)P)DojDUC2>j1{lFES_1{rCR*C**O13;q^o4L%ycD{;C
zkxpNmf&Zl**;a`mUpYqq`(XL#P6gwKAzYeu&Ek-*%RXi5?h4tz<~MhDzr<VkF~>sd
zNs!W3nJpVsl$k_N@qIG43Y|;oXejM&-){NpA*z1lKj$r?6q?$mw?#DVNZI!|ve%F=
zT|iGVX-yf*U`GxnlkdK>BgLng-wX=MdH}kSW%Z`l4STKOq{+m<)-{=KL@Ycd0h4i%
zCt7E$VhMR{ytAJ2Q8&JD!Fd=sK?QiV>Ho~tcaeTVAY=q-b(GDc?E|ZBr4FZQ&GSE4
zSBHx&#IG9=qjrupxDoLjw1Sgw=!ZU{DTxH@171({O_?sl6*ZI&CJG--$!lM<!dVA1
z%9gRz8Z~}bkULW1bBI&bXL|k{7f7~EjuaiWN!0Gsr>dSXT`vyYR>Ut5|Ncxd*jQII
z_dT+T)`z2WZzM!ZQc7vWSv+PnCzE&%?NBpeuHw{HjSwm!l(X#cuJ=sg<}!8pt5!)(
z^b&e^^TRbfFCuNOf?hJzO-4#*ehm{N2wS6ouu*#dBW!DEbW_%Jfx+7)z|-9cMX(A4
zVZcz;;h&eAOAnwsAeZO1YjPa>H_a+CoQ!9L{aRUsQvhW%g(D;saeg;Xg?Wav4xYPI
zBlf<zeR+ft&B1ADAf<iYQAut>;%5lLFJJA|`hvAX<3{sK3`R7AUJrS;sD{@D9qxI+
zGJ>pFBU9#3YX^sPYbW!-bK)cXdW7_4x(1og1Jf?w9j&;BA_dmbFF$dB%E;R1J2#zu
zq0!ajDhXDVU<|$_*HyrZUv4HZvEdqw8SYs1XMDu<m&OdH@?4RBm3YWf1~2KS4~<gG
zUXZ(FxKJrm?faY5?t}*oP^9wqnj%sl1Qk_%7UGxqtRP`5XM<BDHHVniofpO;lFKr-
z-MtVHG-um@usr2Z9#Y#(cUp+>wv;4Xn$i}ntJkm;UXaI<x7V@F1W97^Q-#!yQ3-3G
zisc_xksbIn4incyPhW&kQ3A~F=Iu8z;Nt6*FrpNE7Vh<RBji?M#Yi0fndy2ZMI;DW
z3X-gH9k8$r4)d2(AnmGkqXyO8nbk=N?B|Sy(b&!_1;z6r@d{MpL3HYTElx%ZQSNwL
zb4?LNgu)BGU24WeU$=T3Jlp@6)szw-YZ%7MVpp*;UoCsrQlU+U4I0nD6|n70o)K1O
zgkIh2Gz&w5@q$A{AqG*2zVgg=l1{byngvXx@b5e4P8$1fjF|}kkaw><yr3fwQ9eVF
z&dx*W-JS>bCpE2og=pH)c8}qlxyZt#`rOfM^4F~zN74ny6z{)@;oFO{&RI5hpMRZP
z`>5YkGDMJ)MJr>)bf@2(Nqcw^MV0+q(346rk-^(q@2QUIfB(x{drmhX2(yFmPNn@c
zmy!&AR{Hv-?A;^nalu;y24j9aR3qt8a(XdkRXzM09K64{xi2HB8gQ-c`K1d|loaJ9
z@4|eN_b=k{{$73`Lw@W|UH*(rS@;Mwd|iq6`7qRCcV)=#NNwUj?p<Qz36W(^<{qTd
zD1DB_N6be0p5Xhd_bYe5CMPcfn!X>#$RDk^3Pm(SOAj)<L9us>+eUS4xC>RmYmn0A
z7P1WQXyuv$NJ}OE(85=wRo(AKudx2~ikEGzdz6?+`WpQC%%2;4VUnTQDSs1vwC>TT
z|1(@eA-0q)$uNM$<qN_0KTLLhr^TFaD;Cs_IO-}%&cia#_t=%LT%IP;S4wi@mTEj<
z%*PA@=KtshU%mRCzqI=M<!rr`po5DY3+mo{m1wN&h5QqT{4(nsFLbq_{V)$a-^6Xo
zQcm$ke%yZO>co*&Qb|lxGOHX{uWdv=6CG=OD;TbYt^JVZ{Lt%Fv9O-m>XF=$0<5LQ
z5tAhIc^{)WqXon?4%1E6)!Bl``=M<orO&T6xgcZ#V415&{6iHFt{8O)kN&hu{mqJ6
zbL<|39b=+%0wXns$6Pdh6Jn|1-YgK2Z<0NhMkA1sS+4~V<mO{#F#6dSoH~;xT)Cki
z@Xv{qD+xDUXbPb#IZRNHmT0^94s^MVdCMj4kB?$$nIXyeBd3eyXz}OETr@1qh$kK%
zTltv@^x3awD|&-=Q@_mYug^U6a?r4_U<+=2*wEb}bt9!8ES7SQ4PR5q4A?5`5lT<V
zN{X)$(@Yv8AL*|6$q;0|hpa-m@A>gCv#22x{>(*hJhBp}<+c`>^u+o)S;0Z%m%qq4
zyBwSF`a}6OpAWcVh&jk?;9)S4w)^1eiO1_TBl}8Hf_`K*GCA%$pr0bm8_q4y*X)eV
z*BCxA$z>(Pj25K%p(l#&oNw9x&J4{Cv$5waaQ({_<KNu%*!Jp@`5V4NB6t&`p395X
zVrr*a7ga~HyU#Mxw=?{>&PwlcetVEry{v*0|42YF$n5yH$+;qzL!Ym+N<NPJxm<F1
z0Cw*Uqq1icp;p-@yI<{v7peH->(QbVX_z`5o>bqbCQLLsI^#$7dG2>VzF9(5N>g|e
zn=D&iEm6yim1d1NAC>xcEP`};7WJOMSU(M=*>cJ&e5FoG6qJdgwR-%+gnBa!g3$?U
zpEFA7Y|IRS><?9%L86ly&gQUU$WA?rhG!f7w&z<vkA8g-x6{hO79(NeC#J&>hG8N9
zkGS?*Imb9VKK!sH$F0!IrMi*9s6zF?EZv>h+0bO$-*>({M6}!TjLokqRAGK#<V@$Z
z60SBjN9wj9H~Y32)A=1{tc*#r9JgBcjO<9Zd&Wef316d0ssw!o<MEUb|NI4|*+Jt-
zde&c=9+xf#trnX_?XjLHf;Sm`SJ^){sENLry`N>ts$@BLTqa4vROcMS!<ZJVFjPs~
zOpVnU9L}lw_z$n_{)vX8n55YB+_CCRms!sIgM!CXvTCB%Vvn4<v002(ot95L8|hh=
zTpB-AtIKWpRlN969IMLtF=mFDTqX=yw-h5pS_78r{PA)VHpby*asp~i6}Ae>ht=ll
zZESkRM6!&ITBY2#^cvKoud*{M%bgcWSSH(TomJ?_i;ai7Kcw#bc#dTl+_qie!}9E{
z)GgQCFl}BFurEzTkyW>;dSP-DkjkJta#&PXO~T}oY{o*DQPjL;z@jpisQ<xaf9dnp
zlJ%Kyp_$c><YLZ`5#fI^)Ms}WEUyL8A``8m^(I(NXwmR{wyA*1pS@sU$fUM;A~~GS
z7~bY<c0^E>|5ekMA;V<lMK#^WkRb<q{g<m&5o~Y$|EKqxGm<!wc<1SnPr96lM1}iC
z`|CN0WvgG%7885<@`ZWDHj_(BxZy%?_+g3pet#}3BkoA<@?~&~u~DSsS&}Eyg7~lP
zv)-SY4+l6qPkX~vO8L6}g0$t_a>xIE`88t=nmTE#tIhVAV`MO~FFw)+mc+UKNX2i?
zwf03+ZGCrxc3qvGG$^NjP4Qib+)~qB2hMDOkaMSY>kD(M{gjav)9PMB;31Wg`Rpwd
z!`Da`>eRZzzvF11l=yk@Yk%wvCrflEI*({gwqar>XGCsKbc{Yy>yB|?YNj*H_oBfE
zFc75iH_X=EOOH%j0xL&totdd~ow1ntR2TCtmy&r*ihttoZ}(^~lZNUr#&i)uj3$92
zn+cfP+m|n+0W~X_c2-iD`C&*wgp$m<UGReqPR7YuTCiRQGf|ZPee{tpttRinG<_4K
zj{^_@QJGkR%mj|OM;ZPy*t3hdFO~*miQ!(>iS(im2PBr8rpNLH{0u!kx^pNR)a^=f
zUmKIY#3p#XA#A1i-&p{A1?B`%?vLP|&4zL-n|kD{R@^cBkK~f~qHFApsMOe}c1jcz
zhreWI$He?KqaTqEt}A<5dgRj7&<#53C#Jvmd`pEnMaZ4F$|_dIg7INql{6#ds7H?G
zv#aogZV0N(t0<)!U5rT0*sc2gUlM4%T{6K-^Cv3W{wwEcuSN0qtz=1n#Z1$l6E`Ks
ztd<*2_;iS-Lisq2-gtopi66qIBP?zS9<G%@vHL=H>+o8fQw1|a4K`8&7|N(;E*$X&
z+95G~iiZu2rZV~3`T=dUXG%>w-U_Jwrp)J2pl6SX;*|8?@iR*EuX7ktW}E0#w#5CQ
zaqFD|uM8SZ<5CLq7E-Uu!^uqJP*%x|<HCYV&_k(1IaE@~#sD*`ElPPkILs+a?!?sH
zFBKl78~1uCg)hXAAtHT*CY5^ovl<4W`qhI$2x!>&_-?dQ?~y4hHB?a^*EtBwJx+}=
zyT5P@6_{PAiqwZK1WW~kS+=JLNlc9p#}IWEviZBUH<$G9tV0l<kTFxl2q-TePEXnt
zipcIvBnjmOyPS|Qql2jGZr3iqm6wHpNo*LS>LW>yF{Z8~TThQ!e0bybdS>`-6PZ`=
z3p%p37BE2+!)*RxX3_E__{wmDAO1Qrh-z_~djmS~tVLWCGmWozTw)dqlAT{v>kURj
zmHa)1I+TShS*0k*h?Vwr)Dp%A@Uawe0!ph6CO0*H`yhKs!hs6Aqmq%&;{itAD!9{h
z^@(HXy-zrV-#qel2u&wZ8~v^z2O}z>C~bU%M(ZkZ*WnGzQY>-dx9MbeNs{T-N_iG$
z&O`TlIcqoP`y8_qF24JJsG{S==S;EXktxTPzFS~{a}|0qgzT^rLie`i^Cka1$l?D&
za1`iXMwkV|jPN-)ueM^65QJ5pSQ6TtT%^tWgYpb%F#pyN^@cNpx{}~5iA^S$TcEe_
zqdy5fAs<hZ2Pd24uN(@+a2Pa^We6h=REJx;9?i<qLoY@A?@^QHVi`Qtkeo8x?6lv-
zgLOYm;_GT5MQOzx)9svSjAI<@%kz_w>G<;tW3k{gitBs5vg=pSGaAx|p|7cYWiUVF
z&U23==_&`9DZVt+Qaucz|DsnOn%L3>&Xk-5SpVk;i}P60#HeI<7)>dQKsqtfehp2<
zl5)V0FL<=zP|I1&e=j<J*(EAO3wMi4knf?xgWAFJeu15u(dOWu#)Xn&$U&9D{nDc}
zyuk<08#?rV4V|NfV4u%-%X}e8gWql%SNKl=h`8JTzaNhIzjg}bf5&??U`#5$CxP=I
zg`s^pIS}aSoA}Oe(JxuQ78fJBxw)NQT!<SQQna<V3kYa|V6z1O_l!_HW)~rx3A*6K
zW_fjon2{0X@bIvbo*qW)3MM%@c}Q58@#$K($KIE4$3fChT_YoCfByWzj+DsB@d3B^
z-?<^H*8_y#%F7cnG9uFq{D}OYZc8@r!=9rgs3Ih&Y*$+FqY0RMPS<-Aa&u#s>{~H9
zIy!PPe9m9?UI<pGm#+zcJKfygPRbIAX-E8@W%1N4{jFKB{oVSBHI+p#s;5Uxx6Na6
zvkrrM9HOM55w6$jZgg|Hp3d(=XFi@C2?K|yge|^UQB{QrS^N9<bH;>Vbb;_Q`mX)i
zC#?2YXa=`PEuT&>6Qto;S&6?z#P9r{Q%+Ne7^3H$&yNmFT6OT9Pj_!1A~ZPNtE=Rn
z{&>E|Lq+hqKE^OGFvu48gt@)FeRJG;;&{2I{QIxFCKEy1<G-bw+ugic>siLjze_Lc
z>+5T@D`SPm^!4?Fh^@r3xw)?Wjpl4`Zf=dcdqQu#uI<4KBih=#kK7|5N2{IWQhq`~
zAK$1O8k*7&w6(?Gjo?n9|2GT876aJB6oqc#(O$Cjm8q7Htgd>Ww0mr)XgD5LHd3*%
zVH?*Hg8S4g{cUnwhaENNe7eYc7Ac?IxL)-~nz*<*A7Op*@+A_8>;qULxa{=&Tuf7w
z(AdO;-KYniKIH=>-!OxZZNPD7l!!_DhU5IxF+5yNr>iib2_cU${2%Y<ce+g;FM5+}
z*`A&RiiID!tZi)0RvK5of*89bB~hZJA~#snYa2bRM4-K7)eVG)dwP08KEy|?P4B^;
zs#nIOofjM+WYKR!j_!xe-E~^;K>%C$LcJUxq+@A$dGLz;C$3L69@Hm)|6<f?qC=D5
z?yhHI0*ho2x&t6dE-65g6>14TUgAzoPdgpVFnD--Q}gi=P6$5>s;H`FRg_&^UA=F#
zUtzOb<XZMRtc-|@3;$^<rl&`G>D9h;j*pN3yU~GMy;3{0^|%G?Vs~N?95)Nkedn#e
z-%C9Y`O~wRA}d-)!$-)w;)?6{a3%#3_7VjPY4-$6I95DeE$aN-S+i1o`2}w9bcrJM
zMKG3Oy}raj5W?PE<*R_Zou#brb@8{|;Z2U+E-!=r86l&icX4uBY}D48RCoPD+2_Nf
zPqI<5vt#yqe*E<CZ^?nr?EsIFk<sydOXlI}{-pHVw;po;hd`Iz@h=Sx1iKS?pWw$t
zd{;@K(YU{-1{=-0><H&|xfl5Bm(u0IESm4#rD&dzujA%t-me<f`g^nGFN1=D-rh!_
z>}!T&lJ->TwJsxiH_u5Isl<>95ReSIxvLDh6@IBcYdz_R4o1R)g}}nWC1hqsSxpyn
z6VdRt)jVEI2-mtDs@pC%y`W1%ac^z^>C4^r!z{*VwZm6hPOk5FtqHrsAHkt*=K~8b
z((>~1UbOEP)c(5)%M=NQs_Jy0N<Por`Vanb=m>n6)9EG*PD^`vtj_2Ja_eEDFw*0!
zC@sE|ll*&bPNbsn6MVaO_-3<~OsqQA^Pid*uN8A8u3y>j-AF7EF)Cz>>=a|2?wQ0X
z7s2mO0Y6|Ky>%t_`RTgzce^+52U*#(!|KkB##O)G{pk{;TFov(@$2Ii%9IZ;aBzY;
zeD9&P0RsbbdUfT-bymAz3re!IvhqFH0y#Qd#y4+j{h$4^-+9kXHZpN>;kA2T56JVb
zz=w(jg8F(1!c!cM<2<(1?DCq-_flDtDRo@oLwM6>tU!&$1exo}pZ4!bVpX^?>?C}S
z5OUwk1nQw#OD+&2*O^gI6@W`YL9i9N%{Z(6&+XLG1d$SJe3;_lnEf?Iy}vsB{oxT2
zH;xv*^SSO}pI=-ktE-2#w7i9ai<XuaJp)6xjzfDWSjV+BDbKz!p$Yc=T@cLPd^Ne(
z^+BlpN{f-%fDZJvcbEHc!QGJBM#77oiQYaLTy{2R3JIzgh!W@`jmIm>zQ;^Q6=&};
z-~f_;2FbqhommI+;#b0#iUl76C#-U1J-z8b9ontoe31-SPmpxs3MIV<v*l+X5eiyb
zhy^ElmzG|AT6RL(*w|pR{6(?szGR1tfe{)Pht+3XTcDVpmX>A+Ne;_|w5}WzMifzc
z2)XQ_W0LR%BqrkEy?nWOf6^)C<isxY{NUnxIEV4##fzKH=X>E8B2L7>z`z_Ib7ZUb
z#YJtwi&4%RO{OM*q?3&ftK$kPt$6tO>m_-9J>xk7FUZMP7k*OC2ZA864a9@y0m?*_
z24{WXl|!HK^Q~fBWgM|KXU<zURA_^N!niId#50G)#zx-b=3TPFk`fcpzQ}BTSE#>n
zc5z|<bgZkUrq*R$ztI&4=K(H>2Q5qBxGC6rr%D{aspjdq*n_5aRG!}*N*QZMNQy&c
z7<lFbTeq9lkk?Ae$}ntUryahvuKOxCw@#&1RRiC)-#E_M_SiM9vRK0Y-Nqxh#ynYB
zza18)fS^~H6gY@gVWTF?q?y%fp00daHR=vRxB)28?L-1C>HFhWth2K-kJD}>$^>ba
z&M)eFOO33E7{np3^L%h1j>pUVK3B6ELqC6tDl5N$!jmNzF**?kymlqQhaw!1=A@a3
zVpW9z*9wY@pX8r-SAC}US5P@PIcv?wh#g773kzxb1_w8R6QEgR2&0fI7%4^O2@j$b
z)NkDV^?mdy;M(h{I`-Y5`$y^6HzTmNox(!l(!lSwNHs3_p`Lp*;bB@)*y#S>UuaWN
z*493_@tv#C0sxprQEKTy>Alg1a<trxJCepam_RK>(s_S>zX_gE%Gj8S%>Tg&9v+^H
zC>a2^$#RpE5@U+AynMe!o^Ke~ei9)cLWj09L=Fy)Uj<TFsHxSs7zMh**GF<b?G}oU
zrE^p-{vEGGg97>jnhvMMXaK<WlRxXF<=@vW+R45rCDwS!BR)Lv9WAuK!j3%Mj1xAV
z@ug-r?n7oWsK*I}xk`bIxgk+(d5H#c>?iu&Zoh217#HxkUitI_8yixujZv@tIh@=N
zivH6dWu*X+K%1jnIvWfCrM?EcrTDb8@RN>v%(!>|2#Xc-VnIA5BqVg}xn4|>jasRU
zjpq_<t#t>dbC^X@%Ohcu1gMntEjPQ=nhat>L}X-8>|6ixki7H4gFa=z9fJZoB3#Gq
z&MPSm<lRlGLh9MsSt!P8wz@MnIIQX$i0{$>_`(vp$<NnO5I1}Tt$Q;VOQ;LfFucnf
zED}C!)e?pN=ZA@B!>$0BG<osf2JfqbP+Hkk5henY$Ga=Tm~~JXSY$q@qH-A=Mbjf(
z&0ZDiHjjco-35+`c+h?C{)GxMqJk#Uo%|DpEkn-ZV81N==4P{Ie%5F<3jgL{q&mKN
zAFi;Y4V&R&cm3BF4w84y@Zp$beXN7AUmnt&m+9LlC(mQq5y9v<^MKgH3#k_ZkA&1a
zIEdulN(#FB?s#r6jV7#vg9GTLr-E_N)@Cu0d;0HRRj0=HJ7+gQ1VE#Qg~<C#w$MO}
z!v#>P!lEL|*RPSFSP1}<-FZtIa8LIG`5%gTLh0-#hy$NrK{+2KH8q?=`xSxT<CO-K
zvLz%W<fQV5RlNtX$A)hL@q`v7pa)Ev$0Mwr5_Wc#YpudhH*f&tIvMwA=2^>Uqy8>6
zZl3r*3t*B99qlPD1EvM-8(ds!HTv)0BS7h7b>|Z?c&}Lgc>Y*%kjJ_HPo99{&A*`G
zf~OTRk=GZ`UuR@uLI-u|eSO>y(q2hb6|Pb}S|*7OaF^!~IjB<z>;sSl<daGrG+|+3
zh=`{rzu$8u?E3n8I-eB|B03?2m6i43VvM)eWl#Y;GSXrIghtm(Mhi3rqR450HGwy^
zCX`~nzZesw9q%r!MtydOlAjvkbE$y8avTxaz1wWC))R{6VYo$~UBMSwx@Wm%KNeb9
zg^3l@LjGDIJ3yCZP+d>|(aZ2=`!~D9`(gHcfBzU_Zq&HgSJ7GaEoBR-_#wJ>fT6Gy
z0q$WlU)BH8z$XyoE)|Hf|NYWx%M3&t9LXpA?+O3Fh3^)dphFCTh8fZy{gPK80*z8)
zFW37xP>u1unHg=sUALu}xVY2v<F!d|7zQLDAOIql#e*>zM?MHD9x-bS5kmOZ)v|>5
z)oXt{kS);IB`FG_B>jXRAOzB6vRs1|z{MKi8-V-A5_4N{nvZtHk_wPKCc(kNI?n5T
zT2Il?7qzvmSo%!_Xygr8&e?)R9wjgDORRU7*mMfn5j8bz5J$9eadGil_rKaPWbcSy
zvgihM8+N5K>*P;bR!uEX>V@X8gH8eC=jR7#aSNAY8S=|Y^%=d)=vM&+!oIyX%ov`-
zUT0M4E0}=%Ls^$GIXU6*h*9IxIuI9r97N-JjUGoTqg~LZF2x(1g@h?#*hqm+$Odtr
z*`0*~DNG<tL_~yTb}=p$$aX3~T06;vW8G}#sW>@tsgFqmOK?k!e&qzyYtre#A`BPG
z-u`V|J^lGQZ*5M;q5tmc(5W{iJu@H9!_)IW{jT%riWREo7Zw&g?)FQZmh2kc+^3MO
z*vn@R2r)W1LqbE>vm83QK`KqQdw;6Xu18&IaqIo|u^gL(B*yptpV6N-&nAF=PP;ju
zzJkbmfX;IEOB{=mn|qd>f}E0c%)ZfNzDn<xC0A@hS?i~h*g<lC66gkiRt)$eYQ9i}
zn{S5qfAtUOEdnQPxVWzs`U+)|u!)He8Y%!d>+6&8*e^${w0<f8h#eXldh>8G!R~gT
z2HgjhvYA@_UVjrHL2Q4sZSC#`vxmhA-61E@$>YF_ZdFg9bw&KXON$jx`7oN37Nqan
z<Fs{ZRH5CB<Iq-%2kOzoebM!|Ef0F<wSM}Td4+y0j<@^C0FXyr7lK~l;hTT2SH(4}
z^_4KlVL+M3$78qEx|Zq4Mj>Lb#)0&DK}Z;JJ=Y!w=)_>ax6`w3fs@W%<Q88a9}<{r
zXi{gh3Q2)H78+iRk<a2;mtEmGTBv`uIG?tb(8L#K79C$632Qc0q^7REF>m1CAA*8g
z6-j3bgo?*nAkyjGVRf%1mrA}|6~m`h-%n{?`5mBjEO%rH`wQ(%<gwlygZ?xCf{LE+
zM`I-RRt2aOAO)bW+q~SLh8)}*S=VdR(9q<A-T7T_qY>SJPfi}^|FB68G6_4mK8)z)
zzoUiVkLBv?OAej=dHzrPmS64awQGSVoSdAz)^UH_JDS1CZaqT}+AW%O0lC*6B_19g
zUvc?c?|)1|!NJviR+M@X(lkMU)Eu|m+aEnU2{Zcher;<bDN)QD2#Lh4y&U6Zzi+0o
zowpNzip|0P;%7Y?m;iS+GgER3mc9n9f35FCK<YuMiZLE4&>4<?e!volZTtP}T|TXk
zkI%BU23LK@&<I*~r?Bbc{bmt@)i1$>fZ1|&qr;cYzgV=<g=2f+umpeQPD@C)f}vAO
za+rCFR4hs0n7Pd?Mhhp9$x{BqVRY^xx^G&}r6+l^#*}p?bw9(V2uw{LxeEX)cLu5@
zh#e7>XJKKX;lWH9ApCZ-BSoMD7AB=;7NDZ)eKdj5bg?|3|DjMl0wsjS#m4<n_`f@R
z1p@B;?%(EM3b-EyNEUR1m!U*TY)JQ3HvT~cVicqgAXS?{W)z6rz6EqmU;h%-Pw0Wu
zoQAH)y56{_P^O92nd0H$p~hMsk%QN+XWsi?7wCb#C!~R|^Sn?WO*wSeSl7Cd$onkv
z{6H=$wRK!#pvT1YVel?P4&T2Ccq4dlRYbM5{Ft977p9@_i~r&UuStKMnY8yAE!0j>
zR#w(vAq~+5u3Dk&B3SVszk56Iu&(YZne~9(?YXv$D<D$ZsesQ{nd#W0lfK1+3T5?b
z{m-Bxc$$}2pPxRpUCrreu$ygFSAF}2+T7e+T2>YaViAT#j`>*<E_PEJpN4m4i2p7~
zyONFg^U%b^;8c;U)7H?7o12?8K;i~f72k!v&h>zO=XKfDe*GtUEQ{CV?_&M$PJdyK
zTq%&jJhs0Bas)n6(b6I~IXOYa3XqrUCns3tv#6~r3UJGMQW8|kzG*4=ZyY=R3EIa+
zc1M@70v7)T2#u#F5`u?KGa!q^0Idhm#i#0q_>xs$!cd7qKV1OH;J~%ClrOU90T#Xa
zkOdMz`Z;*`qng^x;1AN@=X;>dS3Hwr;o`fV%wcfm`XYuI_;mx6uc7iKQtzlI21<jp
z!Cy=pSY1=JT9xXfS-e=FFN$bu6MKO|D)VK?0id!DO6F!)_&GA=jmDHK$yVm}$;ql2
z+;$i-pMk-u`SkCP$7oyCn|+bEH$bq%%<PJPmJ9}jp`ke|XvH3QEwN=^QBhG1>*wYd
zG025T-e^=wn3@(+m;SrH9t7Pja=ka4T0r0xx!(;X_`DmT37_XVjlR#>d!MuZmw-5;
z(%?iEd<>PQ`RX56#YvTb3^>ir6EK?Y)M$Yz(-#ndT%p@QVsNq$ymfnV+Lz1#3BVso
zrM`}4Jq7T+e#_6gmeua9Y|+>Yypt)}-~n!^R0Ck#Q#dHVC5^b2e!)+})n}4{6WyPF
zK9fYx#MBpef4y0wA18o>b^MD9drIXpM6{(4EEKB#0BEs9MMa%nPAhJL=HAWRxIATr
z^$%Etdy@q>J6R4jPMea+sj0@l@<ov`F(pA4Eh&ry{_P+@Gch+e?tnY~#KBn7r7D=v
ze<!PPpbWn>HT`Dyat{Vo`2ujJ>b7%KL`;&sn?FEi!Xct}W%Ik74pGOgfeZu0;g#dl
zgJ-s|KiTc=?F}Gm(SZK;7}xU6?N1g+WlhUey#rEB`hHMsX5sJ9-KqGf*FS#O!#^12
zNkmkvFEB~pZfUv$D$i#j6s;(H3*XWv_+P_L&htP<uqxw7b6kpPfRi)t{XsPAIso!^
zs{P}I@2`u6ISYQSI(&{Dh;Ka^&@k|eOngoSKH*yoXay2;TjP;=ZoM!yGc&r_8A})V
z#Pap)S2oijY~ZCW16QeOJ5{IZX2}J(71UC(Yrt(02MXBlW*v#-tt;|7?|*$DzuC=(
z@oZN+a-c5fZ8VVlPVnEamt7RI?jL|#>T|OmuGwHq1iVzBPXa`P5smhzihxHHq+Sjs
znWvLd<VK)HYc@Gj0;+=y5G%cB5bXNo<fLZ3H9nnuW{Bs-&N>jcKvlD4je(z#1Gr)k
zzptN8{*bzp4b_VtF%K+nnL@3&Jl}H^(A&e8PMSyoN2d1U_d+)dA9g(3c%RMhOAOFk
zOiWBk*$D^iaQgXHl5_?ymR}m;DeS?;RDsE$nA`JXlgHTxzRdT|PGMMB*vy9&u%B_U
zzUWJhzWelXdRA*P&W})!K?bBXSR>TGWV4#0v0eOu@_aYn2^DM`;8&Ogl_;=QNJU-b
z<k0V~Pgak+7VH}fKv%lIKKTSc2y8q)(7>Q;1(cG8j&1|+BndS&Jigdozz*N;l|lyz
z4Xb|lV`tCcK6ZaFP0)c~3&>rN*q5%YpGjwZxQc5I{`apShQJW`-Q+}F_x(E*=h{wt
z(15<~{$VxE2KwgZ(E=`nq(FCYV8D2(!ER`Dv=8XRh?CA|vKV$LC1J-k5mdS=)kH&-
zDFPOe6a99IbDZ_b!2uvUuh-WC;nEY*kp)cFJjr|=T2QJx&lK@?UWzQfL61|+MUUNF
zZg`Co`KwZ=QGqW}lKoR3P##f9uLZbCx9%i?K%D|HBylMPB*4_%+!!PopT{W$w1NP8
z<Ch%7KrYSuB}7F9)ey*NXyS2X?;LwEc)M0tSJ}-1XRSklqz4q;h$F+0ix_l|K$F4P
zbx^W={!eWM)oIn$tRP%=jVobLUZ*H{1`izZV`=iwO|sIvsL047j*c~h;qSy2H-M;!
zea1pXwZ3o%rji#Bn0<W`z(%0v;HdK>TL@G3yT1|#APUT(8qW)JXx0Kuk+8C2P-Dc;
z_5G){3_Pr${d^i66!X#aK#;i5O&*BCe?v0X2X;3O(8EoDr4Fko6!V3^*P`6NU5M=r
z2Tdusd+kP`O_98>uaAO*V~X8TLt_rGSr5-hLw<+-qr&(@LWJS1%*jd}Ug+nE(}jHB
zDrEBof&l%;Gy?db{P82g_k|d!>TGHv1hNkxG|1Ag`NHYn!){?7A0DtlX?1mppaV4x
z^;V&Z1;{X{Wibe({A9I(FfmPNJPSSOXcK}L==M$PVdDy@x6zK6vE|>sEiNA=v4@r9
zdZCt;m64Uay*xjM9dv3stZH=xlze@rOcki<I3u2{TC))X+oc8!Xru{P^^f+vnFylS
zH#dVzOX=HgHlp{YOR#pvbD|FptO?N(EA-omLFNMFE($!Z5+C>7Ppj77w#>wu*&PfU
z4z>yWX)BhKL&+8)*+!QWe*ASSa}(I+8FGjja$%9UuQxXOqrq*3p>96Z@aOw<Oaf2@
z>W^LsdgO+WnY)Zmg8NevL<2Iz7aI&Jw#Dr*?0BWMKq`(583jdjBKKYE1Ic)Zt^;(G
zlaLSyW-h0KU|#&82r4+v)`GFzoezc#`U7C#Nc5~b9{=h5{`-#yYiIWKL|!)4#)Ig-
zvGWl_r3?=+xo+_0N=ixr$;o)7v(}f#D@!ka+~3|BO%=-AY^NF2826(>5eTRZNicH)
zxHVIJnb?ULDFLR?&I{WI2M5Gb3Q~i-D;6=LN7VLiM2wFPC~a-+ze-4{$8o3hp|*N1
z$mpJ)9_$^Cc2T(26_&lh*<T@*?+Vg!oZGEE_P3U2nn5??$-)p76+JyW>%Ikk259Z;
zSiGP`_W^f+iM|tP$lncium1gA3LhF8!dxM57U~}u*Z_S948_2w-TAyO>9?Gl4Ku=;
ztzgywOXqW3gA}TiZ3?um{5g2!c@O~D6bfbJz$5F2LInt?H&vtM|IPv!k7x6P&1OB_
zN>bdY9V83R%Oh`VYs1FF3jzMMfMj<S@GEx4vVhmASeY*!?ep(viRS&&JwR?!a=K^<
z<oyNQXWm@|LD1`&{f_IjE1`;t5+~AecTGUx?Z=QGgzr%cWv%x<Ex!Qo3ua$`J-&m9
z%E}i}XK{IM%K6}#8MPP^ew>dVKgOOAu;@nkJ>Ef`7Vm#Y-T9)yHFir4?td-Tx&k92
zBc0GK;UTYhY#|WfoEFs7U;_cCR6YwxV>cKpEY=H)5!2(LLiMnSum3inf~}8Pm#4j0
zCPVR*(D@SZIG`pOv;~1N6kvyXtxu2V|NadCRviVrg%^N<rt{blLqtlZ|06pGIhCF_
zH0Y8j+1V%0^h8Br6!L^tb{>-s#e1VOKCOBp4Pqc>R|EGR$<Nx_T1i#!Gia@bn_-_T
ziKmNxxC;MEOQrzdHh>+qA72nHge%S(xJjmLgofW2NZ=gZJ_3uO^Wj|9;&pKRo7i1X
z;4vB9o^J!rG5|^jXjaq#LQFEj2;fA59_|B@oX`D;kj(pt9opo3LQt6hgs<2S0<PzH
zx+W@@$%O)*s?%_5HZzkL0-U}Ol`?pU?Lr*_&=I@iIk>imV8%%gIFN`41bjz?b@1&c
znzQxWy^OYo5<n*jS+SRLcjp<1AqsBIMifEC!WsmPIUZD+*+}Y_uU}z7T8024K9N>-
z@!hWipwd8*LOF)ZjvUle2PQ7G#Ni8^d6~*5E#ngtgTO+-AQDKtwX%|2YYpNdgDhzp
zSQB)MF>^;47(T~6!|)TRwrsi|@8&yn_P0khK?h^X-<Q??-L^cNK-F3^NW8_L1s0G@
z$@F?OYscP^sjj-cJqwUn0s=(f!}tEr_p7x&x6V)-U-;n^?zruH2Y)sQ%h~4O|CzdM
zZJD+KD;o?MJs!>m6VlS=H+-SwmXODmuvk7T?B>P;D*x5XXYHCtxcC2nneNwuf`s(+
zh@VNXsd;$jnB(aR89-AQ1jdgO7;N>}H*W6c`40@Iym12VvJ@H>a1((dF$BKz(AZc%
zc%m0%WKfzVQZTs&Ah+_^Yt8C5jEFGX(|V2+$b3riFi0sD4*;np^}kuVySgpKZha+2
z-pq=yz0EGWrw3o(|8DjbC{$qpG9%oaiy|iqDiT1EtONA?j|h5txE)sjPFa5t*q-&~
z-CSHQQQEC)%bn2-&2}$tpr*e}??e5cK?e|0D5>x1?Ind<dldHGC41?XBJ_0K1tEub
zJ$*~hxQQ>PI>TdBZ=5WLCp)S#I@ZV0J6hL({<krsMN8A%{g&9Y+}JEID=d>`GZiJ3
zrKvfpm_KtdKT$4-ql<0#dQf+ZgV6(J^un**BZV_>B4gKuWn$NJ+;iLo@W3J927E3l
zd5tA_3bC@XN_+ea@-z?lc*gbGVBOxQ*Vns%j+%fpWEt$z<13%WcJaHunm6DCj0cRw
zhy*>@^jch@Tmd@h0i&(8Z--*%gM-u4Ng$AHz>F0X6a<c3g<fkan9XE@F<HyGe!x2v
z4KRuH02>A+t-!UWu&&RHib4e}0LW@X&`^HcHdtQ=wofeD@_-2p^dFy}9yMq`0S>G8
z?Ei3cR?aBv?BZm{)AF%INCA997FwViOcrq!PZvsTz>$T`oa;Z-5+zBIZ8R8B=<TUo
z9{^_=2gNJ>;e#=d_Dq^J0e{-PTXE94;CZq>gJCVub>b>2I695?{Yy((WbeFc99BCJ
z6J)5GbAh7B1(=(ZnfXCU3HueVeIek)vsGH1e(wNR1hc)BlogFi^$S21m9(^e0$G^B
zW&{iH{Urv%BWPM4{}wF3Kv6E|Z3^%OvOvCa0vn6RZm}=#`N2Xmj%+N0(?Y3Ct*XGe
zE0>axf|>a<Xl!gI1L(Fquc#$s)<Ai!f$&j*W1w(In5WAw6F-r$f0`+msR7gTbofnm
zlUVM^@USLW5NM1$;FddqNMyiO7_9f2E0n<n>Jf;gHNc!K0EmzX`(;;DFdcW?+XK)Y
z2Q^`nGx7jZf!+VfN9b;k+VB2vV|n5dc=^>cctDMhsapdjb#;l~Ehe0SpFCWijiHYr
zT4@K{`CttEEXC;tI|1z<mcWE5$j=Wj@PAAJ1CXgUPZzMCbV-qb4m6^4EjHLl85-sv
zu9^{`0h{W*9_?E(WYPUwl~_{Yf#YStr>kmzr-Ln-w{8c01_mC1`t1%W3U!6u39)sB
z%I83@g2EjKZp%;qrwelMKp>TN{&=5k1Jknzu(%)gD|amxS3eePv){T!0Rns0izQrR
zzw)-hb|DB%^@0&Gj6ne&t+b{AfOu*drcB_1S1p#K1i=Q%|NkNFtKza+gML*+Q9vXF
zX{7}LX(R<{B$XDBZX~5c1*DPg?(QyW>23t+2I+=x-hIw@ajwsWd;j)UWW8%WGxM)m
zl(<}cvEfqBH&HD3_`qGGu<%G7)O#3ldf+|r98c(EeV`=)N~SuVuTts-7!iT?W*s-_
zd3br(LArq1_cxGx=M$X50@<Ms5~sYWu1S#j5Nr<yrWP2l?!1FTF>EIP{{2e=#oYP&
zY;^{1H1m99S|2C1?uc=@$&(2Xh*$|64hKcgVDj6;A6cKR=LX5l-rhb*G1q9S>_w_b
z03ujZcRXd{3_TMpqOeE}N1vXacN+Hd$iR)oz==?+Fx7%Cgh+~jS1@9H^kmlzR}f&;
z-*dZM#ry(sUNQ5RaJIt3!%gRo>2rsNCMxY1m!B-7Ot+2u_b2iP1ucesV=?HnxXF?W
ziwwt1L7gx4N9#-A?T(U5eZj~`o~6!xUA==UAtCV;n<Iv~gUN->XtsnhnvkRF+GXL?
zl1j!oEn?9GB*w6CISoZG)%=y~`=o!@4HA-VH^=sMvBXo~aCL=<_(-H{!wW4ywu)qI
zybL+o9501D0Vu&@Vqz#D9X(@Z4Fjk-oGyvS^<_(}_RDqw*IDGKF&uGMMB5?avHuAr
zVe$9ZZ7}@05mfj1Sfp^A1zHbGQh``@vnAt_C5%p>E<GS=-*Qqkv6J41PCij%OGg{-
zo0XNNI_G3>|Co@l8&J)wr)WGt`DQFTh|9Y+<{J6okHASQF&d_V0E~L#>e3Q|OUr%u
z&<b2Rn$uF7O(z$Q2{7~@09gY``K23OucA0D$w1t7=QY_w81rq~2&c4K|A_o-@x8M0
zZy4SmZ;to(K@XJI(Aeqv$LD%!zp%InJq9BoA)(;O3XoL|$8}0j+xmd~h{?$O*M=8b
zeL%?n22OcNNeM^*$>3}8{QHFjCTb{HQ<}k$i3$fvt>*equ2NyHt7~U~C)_Ij+An*@
zA(UIc`vlVE2g{7!$FQ5MOyMpD0_{a7!Y|_G<$dt*VM<jMAgnTX^`QI6$a3=X{)%cx
zEc6EbxZ|`k$c83F<Lm3|;s?0&IqT+6{$?u<m#Mr5g4gC9OkNH81)S=ibe$n0-)+Ju
z17ebro<Rd^{T)osL=b=&=s-(LK>GoE`2h+_668k^sV%sYsmjkXP%?;b561#L>dUJu
zIh?j-A*Bx=WEB-fzzEeZjRxxyxG5qrbq2p)1EHs+q=YC&#KM9$%om_OdQjR3?zm>d
zrCMR?pip{)C9Y!trRxliEh27px&7G<LDW`d+|J*#A+>|tuAw0f*lQiYQ9d*1Pv9;u
zLwiYvi+icrFq7ZH3zQwYWee176)0N>s}*p=-6ZJ!b!RKFkzwU<T`)(I##Z6BgVgyA
zga7g|#BV}ELN?UULt>w<uA|<9V^DW}zL}?7+-%D;3{ThrQW_IQT1a5u_e3#$hbJ9b
zy0yJ6B_wng&O1W))djx=h@BFS7w{B>IR*17RWg!+=WHodX^UoeqPV4;Q6*g-lbF~A
z-S~}FD%`2|{jd-6e5dXyDP&xBJ3ZfZI;>1~XANWSFKlnS46=lXJ|Y$HY>6-$%flyi
zj_R6RC}qVc`=Jbud+AFyPdHI`V(+KOXT#*wQ8A_k*VR_d{L{h8DB1PYy~@Qaj0gqP
z`g>0_W;IyNrf`Jima6BVb`j5-L`Ftts?6vu6x!2QpKgKny$W0y0ztuGg~ovVkpR3#
zyL9q*ceiysx-yHUFZT!9aFgSqvl|i;63}n0z?(xjOUy75f%{RlxKnTt!`E^FBme=>
zvY+<^Asd44`S<)99@ecHht+uVT{keH9L|;_VXi`KW@!1cW)T{GD*%rm{_Ch8B#z!t
z9KiNrkh#H11H#n;$sh6pRRA~(b9g#?g-3Z?ptS<U2?6u97+$XfWE*C;0Sz79BD9Mo
zgz5v`asfEV0=zJ#738w#cfqHIPYfWpv$q#{i*A@JK5$&aDZpu`8WEYRocy!-?eRB&
z6A>>}79<?UA)EkAv!LM)1;^d+3W>1{gQBJU%A(1;*d(q|s|NLdeRHaD1W{_u-s=rD
z0{t1?e*?XKpibC2rqWPjd-m6_U&a?d8Nr6&fC~S7DC#Of!3HC7$*ZNM<vXG?g2(|{
zi2yJtP^kYx$stk$p`ESpK{z2=1LXr(1C62E+hT|E0Jv7_-~fzf#JpC_O@~40;o&g=
zB+b}Ijv^!$6Xr!oXz1SY@e0D|23MUD1|%lA$Snvo+@DbL$c4J;gMf7ucvDg_tXfAK
zqXkY3UO-MsITha`FbxP|6Yvf4H^C+EQPOnz&*aAO{P)iS1X@LtIubtTi6>mwFuR#-
z)@1SfsI&1DS`_GLX|wY3gye8QIDQQvU!_F%!M*QaAx(np4xt(!0Hw^1hlW%#3GX1R
zhM?Y`8?V?QErpPDXRNyEfhfelQ=19*?h|*p$Q;J(|NZ+fB)&vY5WWQ<Tz4BWb1aX?
zTj%D~g^e|j!++fG=Q}}L?BKU)6Qxx*w-g|&p8ScpvmKyC#!IiVrwxni>7~<oNarhd
zKPL4_BY|SS1$+W5xqjdqlO=knfU2Hxb5FBbE~llWe1i~8sFH>QGT8PFrwgc+7m-Z~
z_A8L;7_E4PM!@O~)*vW{!N%i?ii#l-5$)iA8k;$+%~Z3*a(piMVyXb`P7o!A)u<WH
zd2arr^V8E2qjF&@YiIz3<2R9EVTkk}m(7~HYJLWw)4H4M3m}}upou_deL+v34m})v
zQExc*Ltq1b*`1Yw_zyvV7i4!l?vpzI1jsu+#iFXBvVQEmB&k(832q@CgnAa>+@m~)
zd3wLNxS09Q232xen#b!>10B8#!r=@Vg9Y-Ss^cz>dufaG)r0>bGWp1&_4B{$>kfb7
z7S9PdusQ?806jy-mx7Yg5}NJ-Oz08_vmpBOg9n!C9FU;GeW~mYYy$=fo^8l&1onh*
zhI**7Ac}N67xFL2c;%>H0G``xMr8ybEe@0P>1!(rM7kDHhk$?=5Eu*O)q)454C*xl
zHMS;TpdgZra1<@WpFDp2tIpv7(X9&<j>N9rZ8k=Hpuqx4fiNG$iNmk2E|89Yry<a_
z^Qi^orCI^|A#m1fkZW#%p0$i%kI+Pt-~K51)N}_SbaKf*WT;z(bhG`{(fA(`*{ZiF
z0>C!_XzRm3E}(x5P2_T!(JdK5l~&BBRP&o|H?i0qLBnusMcM+ahzFJM;@A}l^CQ0B
zcTE$m+=;ACz2?OBD34H?pyZ$#k^^!Au8BFy{SPQj(D<eXw8bdKiJ=e?Ue7Iv_XGw7
z1#S9E8Bl$2mrE5v$b%rq*bmls--4L-O1#}{6EuS-R5|k5kRsJZ09Jr71t#6y-70nV
zOoAvd99saOsMA@=@E+j+f?NQ-=@TT%oFPAUi`8hz19}S!$m`J7Ufb=&(YZB&L)HV5
zfiy_vy)kUHE4GwpI)4`zTf8yJPeG8nbs=X6hZ20F{^c!4XjpkFEFkI*Y#t$iX&yji
z0FQ8p??Y>9f?C;wP-Xr;*G2NAsXjbDI3vM{iM!{2Vsd-?Hk)=nk|JQNrfak(Dqc;+
zlNY|t(8mxkI6c%k^i#R&<k^+<OI+0NJ9VKTxQ%bR8j!kftmD6QxiE9(Rk!a|N<ltO
z%L|f;S7oHuATxUTTZ7IhOjlR;mH7-O1gR>#aF;_62n{|}U2@w`F;|YbKIUHl7U5Wo
zHuFG2ZIkQZRK6#XpVS>@&ZsJHflt#XM4IGqb3Ev^^~#57x5@1H)QD6f7=YqXzvUM|
zR6cBO>H4)nu&n-UuY>e#yz#mXYKW08cWI?1`tKh6KLfBlL1=-@rV21!$YoO)mrpH!
z05$={!DuiA8R2LFtXM(}oPM`!7MNQ3E6$Kr=<4ofdWr_h(;ie3uv3-slcuI7p3}eI
zM|HEavMeYsxZJLJ3LDO8L8=CJh7~U1DUzzSHBk(1We_B2=90fq1bhWn?aE_u>-jSY
z2){lEsr~V2c_;bp*Yn-bLR9uxDUvgaYOVhY+6v+s8SkQ-!BGSGW#S2SMwLcbc=#UV
z{PrNer^8A>s7mCM)>Jj5a3^zK)0XXAo0E7K21kvQ{0L&8lp6~R2sG(+gKYODe?r|T
zL#wm*EwFZkNPv(AKXrys0@qwkQF{oH3Lk%e3T9@ktQ~s@MU+6a7N!oOqe6`M`CZy@
z#IP0#B<h4=@^Su;G%$}%-;bbSf}B;`HD+9X32bqB{s*Z@>fpzN)ks$9IG-8mL@v+A
z>@~La``C91PF#;;9@UK+rxz9d1W<Gf=A8gEI$a8nG&wS2J@M2um~EFJL4qm73}gW@
zL&5qZbQY0EX@crkhRV@&-a`id3Pw@Qak@A~NX<M^x9w}c8HW#@Fnm_5Ugm&K@`8>o
zci2O9^wpTx7<EWHlq!agj}HQO;^5*UZYqM8Dds5)7#e0xJIx&BmXt8d%gZAiEl!v7
zG82<viS!g;5C}}!+TK3aHOt`fjGsRd3g`o!X1*c|G%5|3BL#qUGg(-i;syMD6&1=a
zkF5i)t_YM<B*{IJQtH0BIUiysI$ttTqdt6-UP3AZK%a@iV-JJ3_4mb2eub0(nUSHc
zOc<*KLZpNM{A)Nn-4XPuFhw9Y0<#_BE6ckv;)Btg%ZTRC74`zO_t=wJryu~?@CD-U
z-d+CD4Hmnm;bhV4k74rz7!_1i#feZaG{KTny}0gxq#d+(Zo3@_8>A_!z}S8PI`AQs
z#F*Wj^8xx|N6;!nO%`GF>$Q4gUV=~T&O09|Wc31y0iV<28RU7aUKhh(pbN)1PpvW#
z0_uL!$50K>=9MeWiJ6#~-X-$Ihf>P`qy(l%1@us*R3DLJf%YSd)AnS4yRH#tm%l%n
zQgprJ(E=P_JrIo$0u-PnB~+xy86K!U=U7$&l%~nKK8@pZuitOXBDXpCR2$;H1_#&G
ze7X&k<SoFY1{Wv9uyuQL3qpdQj{O+1>~>x9E>`ghvL5)PB?r?1f$cusTSQ#?=Y}&X
zz$>G<N~KoI66W<siU48|zX2Ws>*v`x{oWWLJj1!txN;y7f!l^iZ6N>}95Cp<rO?!`
zFXnClgRhNbV}l9Db9Fe3m|q;T_KO6(#X?pGMerBzR4qP&^Vkk>D97bwJu|Tc=<>Lw
z482-~F319=vvtuR(<9=_UTFB=CQVl$1Zok`1afI@*JHs~PRFKDXF%W!k1k*iq-Zv}
zj_*C7bi)DO<*<>RiC8kAQ_OpV(ga-Q*INu;AsZW3xEMqmgjAHXiwmHPW$24rQ{{f3
zAzfeXy45znQ=okU*alIL`Y?Swe*74hm|bqwFGXaDVSYWDc&iJwBurxe_xAcz$$y83
zX#o}J0OQEuoU>1{N7h{fNP;x`PR;Ly-CY?t202HP&+h^aS*|wF-`}n<gHFW6$rW?I
zBM`)%PtGo}x4wl~uX0)~p%~FA?Ws#YVr=+#M`xd8$rI0@Q|FQbJ@7vw3dV1#;UF1k
zdo5tsOM!^);Nb95wagooXV75>*KHty3i6ejYNfdpv<HZ=v6_te13pNic?iI=;EEA6
z5=ed{C>Gd22<{1p8Wb%+EN!4zG=rG^8m@x>=8^@Zg1>)1AyfxlT?)h+Q01&69UmW`
z?L)$FF^cl_sq+7XOax+P2{dCT1Z+zQK?R<pk@tC;jU+|fcu<i#zzreb#$rBGWvmAm
z%j9xy14`}M`4%7({rP53l;^OjVyEYE1UeD`mU1jAaSFT^e3C23wh!be&;jv4*ePvo
zZI`gvvjp*M#Ab}J95D1ZC|}#O5y)<6NM0$b9AGg>8~Qsl)sUE7SY5U2d^~(}xysK2
zxq`6B$W%Cre`+}Z5HdtlaBu*$^YHWvF6E8QqgQ*%X_z$K_N}W+f={JBMoA;VuT8s}
zp`^@s)EBw1q3dUDs=7-lfKh|Q=+3dcm#URUEt0^d-tOXWRP1_rdSX6#(v}mRt6XEx
z1kVra@!$SKd=>_-V2a>N5;0Mgp*3K~$!<5-h;xVFa!~MAAn}0p4zz0-^7@5zrTNfl
zfnC%1A~I?4tiUc*Y8Qqn*sI^7U#eBE3R5x(k^nHEjf}E4DN_aw*C4C`pGQbU1dD*c
z7w8Z8Z&thhLRJB56UF4E`n@^FDxmNp%v+Es^KH%Gz#wc4l^UBD2tzmt)Zv%vHD!uc
zl;9LUz5_#w4`zs}6F+6hIo8vszX5%&fntY1)PP_h>Z0)_y|1MO2^fKf!}1egFaC*h
zZ6~2zp{&a*E6ji(lti`wp(5~HrxPUy9>_m4BuwryeAiLi(~WKRr?1p&N2NFP*ke`d
zjoA=tGX{D+1A9*sz=iF~$XA?>-hcYA*oXKp;<l$N27xEw(rff=BbhfY-9?O|pm&aU
z<E1;cR7#bPuWts6bx=|}W*D~_#J@i$*6T$74zmX;7aEfn2vy-sI_<ZPSrmhaUN?hA
zH34=DSnDeQ%$jtW5ld8&NT%hgx1Az1GT%Z()C9US3Me<Ah9U?U!rhCPrZ!-r44I3D
z-hlYPw_q1$P<76Pl>k2BZ-5pE4;L)e=bW4qW)&mQQUErzL3gSK%znC6+7FIH<4*sx
zGaWFgdm%Z6&uaJ>_yQvNf>6G}d#$4S9H){;M@I*P7mzsg3WTDXm4yPCD?ctYNF!3Q
zu!KUe9cDX!)eh`<&;m<4<t0_HLV9H}@XRKjtN8P-hT_W`Z>8&oq%pQ|MY`3#aM>4k
zbNR14z;)8Fe(mUa^mnB5eSw=&n9@H;RLMj2Z#7==U)x+HZr~s%%V&EJiSA7-1|7gc
zmvfo(Yc~?spP+ebSVvQa)I3An*&xuI;U1;JiHYRF9>u4k4|?`<riYXK<*omk=J9O~
z9K>ian+k(_<@JadUXe}-XR}-}CKu5IIXLL36%cX_EchT44@kcFtgRV>GD;IP5d4u1
z4hk4+0eBkrUlGM|d20C(74;0P8TH9=BqXHH&d&K>jyaga9`M`D27N@Z2ZztBi-OqA
z0f{gJWG=v^5P50^*4hF-@?^bJC0%zp=&>-cA$%=WKqYCMO!Kg?;hG?nG`bZQO9T%h
z=<O+<e}!2#YVl4X%j1`1{uig#*>L&oySGJ05*{O92!=EsE;9vJ3@9S2A<`v}zd?^-
z-Yo3L#=>ge{QOdT;@?xn%yi3E!=)cL*GixP-+K%Hh`qP|Op@YQVG?YkB3O$A<#YD$
zb2Bid&<NT6!3PAlsT$CTO0l+Zxd|+DAwKGZM~@l|U*r0Df)F<dWMkZ*a>fc3(ZZ?$
zolF+%+(%S0#OLO1op3vQJr9T$a}2&J@iuG~)b2EQcl!a|u!6)G$nYEu*XJgP1&i@2
zYYjS&U;3rhL>Q}Ik^u`v!C4xS2%`w*e1MKlY1C(`-!HLnWj6;J5yt+|&4pXRl19bc
z>*?}d{l#HOt4QYSZ&g`FZdF2n0CX7Ce*S(pYwJPVq30e+P;0Gdq<uq_l8B%lz9YVY
z8d<dlqT^**P^Hi?FnpnWW7LE4{A0({(VEra|IGr3iGH*q|2sb~U~EiF%w_Wzyc%0G
zaYGsawN1c^yCAl?2Ga#5w}t0D)t9&k!UwsIV%2=aYNIm{DVWK%JB3+UpJ0L@C>r2s
z!~=mO_FG6!-GbU$0xt^D907I}2qef^9mtMGDdBj+)JcE^LI5G?Kw@D2<!%0M@*4zG
z35ZZ9P{ckcvn{R*HLizPb^xqkSHvI23vPWv&x%y+>C~scuoqB$k&q0smUg+~huT%(
z0CA1K8ys)kZ-K`3^&t(9OAO3%sQgUuOHu9`JakJ>OKWwFU-Wk>65S>SY6w_Z_7n49
z4us&BG9G_}$34HiEDUav$#QjGo_}TJ%lSr!+6UW%-#aq-=DVm_C>&8FXC3`6T`mgL
z{)kgVB954dFtm1J?k~Eq?Scqz1-K?!sHotB7xSNg1piKM3l`h(fWie8^Rpq-M}$WM
z9|_1gAaz!FcIhDUGy>Om21C!@(GlvjAAys=33%<WpJvjy2kCpz-5x=4f-z?SDY3}+
zJb>T9YPpX=(rR4Z#oHA!_XK7Xd=m)0LhRq&?P^og8Dd0$s8CY}*CcLb$Y;yqcml}7
z0>cX|2_V%r5aw$l=c5Q9g)aq&ox0nYSn#K2Z!d>+%w-RmPATt`tttU*brn2t`N}J}
zy4R6malRy8xm^C>;X>+@8*l=DP-pOcU2lCbWw-XLWCRC?)RP)J)ADk{l52x&f9GPi
z_Hg5~u<#}2^>f<1<M+Z4DEnuo%|E*i9yw3MtLT+pB!~6SZmDlO0)emXA1_p%8l0=D
zmYa5VTdxjPOB#%O$o>c<mb+~L3pO?5$wF34Y4;*nX{Rd8B%PgkV2n`l@g-bcoN%1_
zMskCt3y2z_bt8755EjO$ARi@`_#Z@xBS16Ih8f#&br84qAl->bI)K0kwjG?mt)~9|
zAb3Efz!}Hu0W;o3kQIcl_W+v;@*7cLy@D9M0E0#!<a^)_39jesUXDVYQvPu30Bak7
zYy*w4$Yh)WP@E@#WWa{Ypn%2kxx|1uTMBv_V*d=pABNO0ilb%+kBM)dh5OrqL<w<x
z@qt{#27Z3I!mnV#!m=REL#HFix~_e$7=R#Tr)C4O-$HwO?YeLmObWmx9zPI)aO<s8
zCRdglugLReSJ09q0^>QQ1q8mDpxzF=q48}Ne}8+rVX}S6qZ10VbR|^gtnp+{H2-)h
zpv`anCUbg5RO+|bLvbWBC6L@70-69M(E}U+p~WDUY@o+efjRtTdy36>pcHm1{=vrw
z9|Eyfq0{cK1$PLsO;=`!)rsx{xDO$xA>3dvKoDDjMVsH#B_ploTuyHP-Q4U3cPuNb
z>aob$<qaIeA)rAB#t0t{A0HpuF<KERH?zsu%LM2B&bhNriV##zrRy}Lm(>YS<BtIa
zz3=DgMf~zv?C^23c=;bT-oobC<j@7;yKsCBfJce+iibQN;iIxXyCqXj@ceF~YlYWA
zdO8?Gi_5l60fv(~PY4^&eaWmbPXfH#K7Ls8jWGoO6MwVuZ*-=9k;$s`!cIMgq|uA4
z4-_D)TwU9!(Z6=(`9}^ge9Ua><Ap?s*oHTP1q8xV-rLA69D?khhTD}bjITBbh~P76
z--Wotjb7;=4Ms@wh>3{Y+Z@Z+L9Er~E9&<r^g{fTF<MuQEcjyr49cJ3;ljX%5Rx;1
zdO++AV`snFFxjw1;UK^Z3Zw-E?;utgL6=8H#YKdmIedNCeT2xWXFrVE0(^$Mqj=F*
zt#oPyiIflA-!;<Z=Tm7;X?!<XlvYeIFu;Nt0#{AYh6@qh-180|D3S#S9brN@<+>T1
zmr+k9CI&tJc7Jiv!G1`{+1RucTd>GzSmf?Nsew#(EhW0we@Ta_auZR|ViBPU*!4r0
zgP>pyWXh02BnVtCP(=RKbR#ybBN#RNem;MTK>3Ufr(}pv1{F%p%^eGtvN%^KRwo<T
zvuEyGTc!||Yy$6!af3n$o!rK}H_J%O1+>FBPRnn=g%TWA@Qn<`DX=<^TIXNojv6nJ
zI-;oQQIZx)SjSU78pyUbn7#Rge|3Ih0G?nS^X?PpCv&R8S5^ymHO(fxHAAT`*N^$H
z3WA;oe-JL9ktCo8YY0%tckl!e>l*v8%JLmC)-S=6(1EiI8;YKg&={!vtO*{Zu%RIh
zDBkVk<9J}ffVL!;J6d3^6E7|YeHX$VP;TJw(J(S*!mNPV1W~bG5Q2e&V?eA7ItLyi
z)oH^;>~e98;IJzoG8-QHv{Qr-Vq#!m0Lq<lwS$)@Xuo<2uCzsbZZzz6{`hfUOH0d^
zL6j_5%pCVFULZXS3A@H{npdAa$#bFd`qtu;_Wi*hq#>d4y<Jb}9CU}s3&{HskEUxM
zLtPCsC!89*q;LTG9>df!gSj%F9gk{`Vmj%9NAT9+7VyG>Svzuspabw^ZBQ@}S``Fr
zZlw<X20-2k2N<j`gpCTEkCgx&RN7Rqv49Q1hPC8#ZekE7oBuCk6&4l-EedMy8(cNG
z0dSZR(Huk~2D~i^(CC1hnjnpPfyhu|F)w%xxB37^`#$`t;cG<fSZ^Mo(?koA1+(&b
zJcgzg!;b#zZ93V`tW)87byuGlG0de3DZ<FNJGrO(rz6=EP-4@5b1CnsiS1@>5v(Tb
zLyq5y4ImQdtKGu0zc<$ygx@-_6O$bLR?G{-#auDcQeRdluMMT6LZC<s9zKu-FF0El
z(Jsiyu<jcL-9H){n+4Di*vAKY3wl8yXlY8`8LIj1VA()dgsEjXSwacB`590(HfsZ4
zq4Y6vaD*TdjSySuRm<Q_B%913Am(jK!RLMrIcYcpz;=<2t)uRv6Ht)zz*a@tJy0M6
zL|Rhcljw}*uuAkNt~#0uJ}{k)>wHgQeKidR!CGx9AZ89=uOWxcemv~951Cqi46l%N
zR!bA{_W@Y$y1(0qAOS)Adw0)i>UqedhLQm2S-o8V=mG=k<HK?NQaLc8fvkh6m^+Ff
z03bG?Jcqp0=#EnM2j~}7%s;u_metqS=jQWCQiQY%!(mvGGsW@|k{3m2W=vKTb~Cn0
z<(H4fu~rw}DrAoFKXb>^i1UO1AnE1Y&ECc(vU446fEKyPV_sd9ZDPmE0fakS7F@_~
zx&*Tw#$yKG-rWa5e7-+2Z9*u~{h(vIp@3^6i-VjlK_vqHD_9JSLE_bjk~u;kycztw
z-+{MX?+uC)U9VF|G#uf6X4Lq&&2{^f%}B6cqu!p3DY*7f&Z<dphz9($|GpkBn5=r`
z)ZWSZ21)1iAnEI+-HF>7OrVKG$fPxE3Z#So&ptDhOAP@jZqV)09Q<&ZXH<s_|DV5s
zfwN!Ee7b`E1g*B?D0&l3-@!RCEg0n7|8te_U9A3OZ{SGYR~ecJE=EUuSGf59{`EnC
ztLy*ozxC@Q#0uC%(dyuBWt7gW50J{y=>y<n-9=V=FMdPn(MFYoar;&1i6z10%Cp0u
zR?H06B6o&3tMvhXtYPn^8lU2&(Bc_)aqP}h%X!0FkIeqY2Z-Eh5yhU9J#S?CZBZ&0
z)!{{79rdzd?Ot2*ZptlO|2gwtKQ-^hH29NX2QE*47&>?f-(_ro!|J1E9Nj=C+cf!v
zvQwS}g)L-h0?;KN`DHLtj!%8};0PJe)Mr)uQrA*uSNi<r*Cmy6{@K5kD|>x9ow{nd
zZd<vJga+?nj?vw5DN87dKqpU7J5FPEIyb(PxI;(0eD7DC8<G(n^Q*0?AA`E>-@Z|a
zi3VLeE-Z9l4<EFXQm&}yimS)_XjNhk8(px_=6^%k#P9rfdUxUeOzTx&HjDStq@It5
z<_dk~=PT!odKDo-yv$Tn?UoRO=z2AouBi4Un%Ied?WH}}i3w$vLrHy_d;tz>es59@
zjtL)luz9e(50Tnn8Gdyb{bSb1c~^XKay@sDcf4{wZlwKLOMsXb2|nHML=k<EiwL$p
zDh<8unxiK+3+>mvL9UrcwQsZ0NLi!Nk#9Z3{4DhI%I2Au0U7?aJ|FHavpWK*@+dt;
zo3W=qTT#z5F#SyyyWpDIgk-mPLORxFK58}nJ7<+Yc{A4$!EBwxEfA3Bs?Q$nDS(z3
z`_B1yc}PJ5*QC1BI|~*?1kW#U2c+a{x!CRl9Y>>cJ48~RVyNBluAxxRkBn}OSUzd;
zH<(P_40hY2a>im<=W7(f3YO|_8SGEi$!1A!`wkeE3&OHnz9Kp6HHyXM*`~+!I%{#^
z>4w`*49{wfee&&OC$5VP<dBhg-xji~b(Ca<=?l*kbnJPgBSHRGf*#0p#|<fC-rm_p
z+L}ghQJvCpeDYK<1ye<m426|H_<JY*e6~Q{>JxR%m7GP~&&i>N!&a7s?)l!pa6#CS
zxZP4tBSTCxl)*}!)+RBE;|(Sa6WQ|N5l&e{n6FFhu<&_|spaX(SW7^_(97Vb-`)hg
z$;W0vM`ulv$@G4ToQxJlef*lAJA!^_Jk77CZqDFtjf=rmAM?y9d^stV_rD0~SV6kx
zdqOBKA3RVnMjs;AqNfB%U`2Nr&((i6uev2PG4jr`DoZRZ;^A}OVd1)a0^Lu8;|#Eb
zg^&4SuF9m{MOq7U_oo9seT>v^MGYWGMw7%e&+i<17yQ_?kaT<G<@}dc{K$v(-`>PN
zV=i3(nU^U2WtwECJ!dHc;r!z+wg+HC+Gh62AI1|J4ldSKUM`aODSs_Jf_6!D)t}NE
zA}ofBVo$AI3<3@+bwB-P(rzedw0gj5nMT3_fumQ^zBIJBVKK+SM_O2x&xN1P63HwT
zO)h;*cSpaD3!zJM_wTsJa@|YR^W3s$+9HZ<n9r(GFSwjO)}_Oj(^Of)Im<Kdq@CrG
zF_)3Ig5X62eQETi<D-ZHp`n9-Ef(F73Np{KC+oU9JpM*%%uMvXOFdGbW-BASa8JD6
z3{!hmF<E+WoO^P37OinEv<oHGcIq_sn7kL~E=Wp@5v!UKo7zWbjhQ7O5}1kN)IPuD
zGf_03Q)w(k<KSvvUL53)kVv46JEc^Kl8&8-43&y_9y{ZVEqoT;<A>f{XUmtGnpvD&
zbbbYn+lcC6s`h{pE6u}C>~f!Orz@dZx|A8R;;G#;BU)7TAqbo$efgU3RTYc-7yMT^
zG&I)_k<~bU8sxk9IBec4p-=J&6qs|zWHg_COI9stzCUiZDC8sYjPH<m<+G&D(M~kQ
zk)>Pw-s0%fo+KycuTeR>ZzXR#OrtH4&^CV)lr^L{7tk_f7@CS$ba+~^qRpx%tLlZR
z!WLHDDmL54GgN`!L?CpHi*%3jC&~Bb!I~}>olExEIA`(<tbw~fF!7hEh-zMuwvb?d
zRVRP&*{Heb3(^^-Rx3|Vhk8=K-ljOA*LASNT8IBi{OQw-{KWp#Mv;)(ml-0;0mzGw
zkX}0x?U#=X>ikUb{z*GkD&4R0!^(J^0ht?3cV$^e;H@kpFB1|YR)a!Rkd_Ji8M3~t
z3CaeR58sCNwtn)b$gf+xRPS(QRqqXQ2+G(OIjQvguzIz$)uLOGR5mhbH_%p?4%b$}
z<Bu$?9QNj$U^C;S8v)Wvd8znEqgE^|ne#E$g>NPr$w__^6lS@BzXVzw47T4TsAp5V
zd~4yR{oV0%cP+L_wy4|vdA}Iu*R|%OwF{F!l`6y?F=w)msyp7;somvrLv7C|e0ACH
zElljOQz03q`Mr+*p1-1J)Ox9p_3$n$l75Me0N*gX;HO~xtof^O&Lv*$X4BEfBk7Fi
zSr%oZz9pY#f&+?Ky&L4T$^}Orej%Jt!VwEXS#`nnuPtFx{kF#a3hm41XD+e9zXTS2
za<#KG<@-mHG_H<lEi0sC3sFw?qYju%d#D<JeM_;Ww)>MXcl6Fv|DRo%QzF5(IsLQG
z(|o?FM{nldzrAi}Mf>5r&P|nolD}x@YqF6}usojJ2Y2oFtWhKG%`=;NGM~H`lBwzL
zWy|<cPk8s-q*~)5yGl2Q#pt}rlgD&_q%x{)o24?X>P*O|(wj7JTgM$0!cS-Tfv#rF
zq=Iv&_K=c__Hy5GU@zgoj8t4Gl7LNBo5H~4QMIV=FMjVL6y9@LyS^=jhSHwMkYom~
zzYlFA(%tLKCYMN=<U9nED(+)`CmUeXce}Eu9lnkc_{!zqiiRYaYJ!^U(f%_m*pq=T
zWG}J4r7Ti=;=~+H!Y4Q`USvN|?H2Oz4cgO@Po?3Dmb8gvbh4&pn`W9nT(dl5E!Cwk
z42yA=CNIpA|1=DVZZgVeMxgF8_z8Zpz#vN@A2=INCSi5dzV|}cw5PqOSJ2t?XsmbI
z^jFp+YRfxm3>=>(Ffe&@Zz|vf;<C01ZRQ@KOA3%oPFoRINIkCkhjNHyJS7l0lp6VS
z`nCRpG|{SiIvU%@C|d;Lr<CE-%cjaKXK-`QHoWs*RO-}WPawG@5<Fb9C-M3&eAjDT
z9lMyLVy9lbIz-P9-_hmj^`(f1EYW<eIk_^eRa+e&%E-L7#;j+MAKHP?J8$xyksCqp
zk_>}7<rt;XG_qhe4w1DmTCJXE|1hxkx{oK_?D}~y3kvq?vB>3#ieINY$qNaR29W0|
zU=p~sJ$Xc2nj~_Hm~H}?n1y;iVaoZ7HQgSfV{HCIVN?sMWdFwfmcA?S@kcOV-8WR<
zf6-^4VB+bjIEH-2a1dw9&~XHB>c|`x>hgFcIuUUogIOadY2tAm;XA{z;L(RKT|jgE
zSoN`lVVziK;h=fDP4I-~dv!%oWy{l%!;N{h^b4V>)cIps^>Q*+f;NVrlqct_%%0h*
z@wh%)<Jt_r<(6gEvB*QL2~6rr#9sy|!Z4rFtE5AbE2Gjd97WN4Q$lw)*G8&NR#vEW
z=3Vf$q5U>HVWd3W$67uedOO7^a7^D;hy}vyWHg06_@h-aW!JY>;k9{!H|_H^3Q|uV
z8T8;_sv39Z-rW=)@}fOb45kZlB>NYHy<Q$OBR4J~6mD4|LxCZCmmXVaY9*R%hKap-
zmCK~^kz>1f{rRSFmcg2B1UU|~R*UDVrPmp~I)}_!Iopys{&RiB{K9Ttw{^bYeecES
zH#E}3K4dn<on)085ts({pG<v!t3H0tI`P&I8~suB`Sh|x?v%>Ldjq&3a4Vgv+0K*L
zX~lKfa%2zuiVPA^lpe>(8KsD!Slat&(BKhbKe=eFyFPCCSwXL|_We)U@(OLZQOS=$
z+Lj34JUz?u^Slh1Z(<yZFXq%a4Mgf>-+1sUxoid)sfX{`y@+?ApH;yNPVVD+Goon_
z%})KG-r7%clwT^!`^F;Ci8}-HFWT@ZpZtAd=a<5*r{n{i8x95`D7lmOPQ452osGx8
zvkT}dR-x8^VJYw%5{he7$5_&*@U@T%DEX$-I>gJSCyB>$-&md9qZ`)*p8-p0eA`7!
zHKlSoUPgcPbfrj)Bnw5*!1ra#yX5zDf{ke+(djB=Oi#bMJ*gFs5*j0_Hz9gK5YSyQ
zcqoHoq+nJqxiRSHp`k>CYqV%{r(9$WZ^>7_ay!}Eo8yrt>tT&Kd*0#+o8_o&Vby@!
z8uNSIar-%4-fZlwmewAky^eu;4jLswM|$GrcHbAmt%i^9QhUGSaQZA+*^8rP?15GP
zp=w0IwPe6#OTuP$rR5m!y>ReErGAF@z%vx_$I%M+7U%Vt0K^&UtornP7fvZl`4IIb
z!`mal*TsTR!ZVBJiE#V7;E6VK&djG$&APNDk?LWTSZ^_Dw8&glDqg*w^$>O%64G`P
zP8#z)WmC{-AjI-e3_u~oC2zi<_FiIBAv3WqxVU;Hy<vJ7;oBoxmU8ZD7Al6HQB3&M
zy0leGlsAIp9UcquK4qS@VZ9`;flC)=^D>Qzpp772IK|M(ME47>s23skrcS~{UsfgM
zsGCi)2&8RWcIjYEN-lUv2gu%|Qj*)E3TBrem)+d5-T9fm_9;S{i{Ty3=y3&}Pfs>i
zt=xfO%x~K=m-S-PVjKUf1Ml&iI%Xn9e5ns1zkBR>oU+B23Zh>e(qnq`;Y<b$PFX&E
zaystnQ=QKGp2855#<6&$dah?uUmY&V%+2+=2Thj@9?PFH6_*axoj8kHxfy==$zRb_
zR#NGL-tt;q;<!wzTi6<G=%jWj-0hF)kF-}}Tx@55e5_&VG`II?_168~Iany!Hg79_
zTw#V!9>En5;mF<fh+JaOFG@>`OkdIW_e>mR3C7gtg3yHG@h?~Bn{8E!fhDSRHtNkn
zm$j3}Oq#}6le_JHuaE^S>cs36uG16U=I(T)twyDjO{Y^>3r~@2mB-^}e&u;4(t0@J
z*J?K`Jht+(c(?3rfoh=D)kY8B?OgAF*?oL8LhI`;WYz-y138#eqC<pdv&Ztasmbs#
z(L~i_<b~qy2_#YktL4_mn`L9y4AAao522oi|MfsMTYr`DVC;dqVVoE{5pMbuLGR&e
zvR<aI89w<XKS~P%_Y)k*n&<fTs%@sV<UTwXS}%SX%+oa}f@(#Kr;%zGL5RPRjwHlu
zt5~daV{G9s#y*iK^C9fvdud;>wa?>t2m70&1C7`}+J<OE>&*+gyYt1;TBoS>7RNca
z$cMFp*#%4(b$Ye5xb(4*JEY)8w8;q3pimko{O0|VY__pPgU3B<q~0uSpU4?pHz}9d
z_CxqQG|-i^pi7y)7gzl1$=4KYOUaxGZE4<I55MT?jepMfLxnxQJu7QhW}c)*DGSo|
z9a^x#oEzF{RT_Ts`VlwP#y2TPd!N~B4Fs!r<U%7MMYhRS+aPz|pS#Zu<8X&)nJoBB
zw@P+|0CPU>O^uin&J)xVopKWPJeLKr&8RXP1%G9Nd=yF{>N2*ADr+V>3OU{IYfPKn
z0MJ2hia3e+1H9Gbtbx$<h!W4`Gvr)HtI{oukC2#OXp~~ekH7pRToHxu>*AH?m8n|8
zAYk}Jcr8Ppi^$zH7)K(1RCBl%PoLBz`%~r+QIvC2@LZfB$Ntrkc6M>#s4Pb6Ns%@C
zNVHHctI|ssR*hCEWz}wX+TJ2=k*RD2eQC!1xFd#+d7ES(%ol9fZcDEfZ#!z(V@5NF
z2sKx6GO?ttgnd0gR!wi9B`U2j*Jl*AED&?j$rhVlq3PHgdVk)JePOJjw)D!BNa7=k
zffFs+O3T#M>OZe^G6$6vHu0!0y_RFyYHufr1Mx~!sr9)u9sAg=$2kr{jLTGbJ7O%J
z#*aizSbgG%n;Ca9dq;NAX)&HxhDF4nP1S>QFe`<p;`9zrrQDL@>b;53r7?%uMa-XI
z)Y#=$W1HJF+)9Uk*1nyuMLQmk`&s1^={?peCv%K@fRnqiEXmVIgQZeP>3z!NLzX_Z
z;ZsDIC=uYQw(;ucs49oqa(hyNQhd1Yp=Dr2?s6Hl=Mvpw8!7We_A%cf;kpWaej2QG
z?6Sb{lbHBLLA4r_*s>V2b(^q10cHkwn1mTa0#c@qBEN(Y?^=~hQ<!EwpIgVl%#`!U
zkO@&w)XcS0q{rXLMx649xA#q`bjN7bw#4x|CFO-8ijS4hrQ1B&)i<cbBe|^%CQ}`o
zbKk}3OW7Vb(&BMpsl6DK%%#6K(kaIvWw&-aUMz3@SK^mEW2NHT@8}kY<V?%Wm{x`A
z^xsV_cT@~lzt54G!OaiECcx?7w0wtK=8w0ou=UjiwRu}krO)w2uhqMSs?w8pWO`vP
zf@{rDRbeFg>ok$RGsup~a<xJ2R?{Tr?Z)+{%5PH!vN7ka8T~Qk03WgX4)n)UFI<{X
zjPmM#cI)+cCwDFi6Xz9k^f)Ol@>SZNM|Q8Na<quRXkib2T0Xt|-$9C2F7_PdgIA{X
zx7>JBe~ix7VX>QXYWaF@<-a~oBfUU>8t85nOX3n>_C&(;VaAaBqeen&9~W#~G&Squ
ztn#%V!eb2uZfNJ9injU+!aqs+c<b1Rsc$5?-aVgAr=fpgdnwE5(>)}nDR<%hl#<5#
z8Cw)8DnsJkm51GO=6+iCz4ax$M{D1|zI9mQ+7RY8CV7!NEKZ_e{ayWW&9QqqN@(e$
zQGvTgPdG!GhQb}9a;ZlCVNhSM3yoVoLsmmyQh8>o`*Az<JpkksR*M;Tt;$96d|YJI
zNxINUi?s9MH7`o$gTbfG8ztlGC++ec__CKsn^i$vkENb{qHq3~IyiIO8)K$8SNtW}
zhD=Atc)MeKW6LSQw@29ar@CZMh^Z9^quxps$~TGqi6Se^(!{5|e>DW|Zjj;M3>oj|
zP{kInXiw<h6YjU**xyL`ft_)0k-cS<lJothH@<?)XH}QOcP<y3wxaoONeSiNRvLNx
z`wBI^)6j6u-!$RnCphd+|5>5`;(@wx*c`L>Gq<FH9?uB7ne9RY8`JHI9Pz7-cLJ&C
zcO*%SB$<CRW;+(+UZt{#6Af1dNxe{x?ee;VLKp@wPyXL5K%^|C%1^JT2dRTv6fS4?
zPJEt@s-Box7grnx<?$+pi5-4t{nSaxQAm>Ak~jIt1yi6WiQscr#Dg@vaQQHfAQZOo
zop=RFFS^m&wj(O^<#%6vnmvEy*Fc};YT3`J&$rIwm|v{bq<wFM;Lhf$`TIL;G|ELd
zSC3xsmov2rkFK#0#JlttPiAwx)cwr#NpW+aV{B_N3C;UC)zhA7G+oVt#jKV=(H~za
zNm~>RWQu;)QxcW}Tb4jF91s;vi9&yB6^Bf~WZK!0^5`vpPcep5<JT?nFWfX#X=C4+
zcxo7Z%iXukYzI>nihtfT*=uWbQ5gyn5Jrsjh`(3bFlw2_{oX*i^tt=X<YEL7(j+c1
z=Afm}vB#F#WI?gftp2m{6w5G|#m(8+Xpy)C(G;|IpPUozqUz}bMy`A)Jg4{>PhST3
z3!=n^zwv)x9r=`r<o<|59g7jqI-A;<cB?M3pCzdXod+q4c9}a?UT*YGX5dpM;!<}m
zwD`UNiT9fW$aJ!wT9t1pu(@Mdr8hbl`ATfjc8$-SH2+X!aR6;*I!bC*09%+6_oX6g
zSk$YU&MAAjf+Qj-G3D*m4<-*DDkl9B61e?W3|H&5Zc*Y@EjMXuHoWFq!*y^k=5xg>
zEKNy@gRjDQ2_upIcyJ##2SwL?d;+o~bbs3qcq6|ui|ReSj4{(qeuhy*(8;g=_4Y%F
zEz^*WrB~afXK1_Uk7{m1ksPk+XnSXe+oxhC(-__gZIc&88v53DCo5ca@)g_8d4{S|
zeecK-k%>aXco|v_WkBJ<1=oovvbqgf=b#UTTJKXkJ)H^tX9lrfd-7lBxl;ZZ?H*RH
zp$u)6{bky!jB<B#hT8VQ7*?8B-5qRX#R8gQl3T|4F4~#2M*}QJ+lAxA<$<p_!jnhg
zCzxg2!9EUS>f8Lo)A;?{XQj{@s?e<~k$0KIH7&1iuM}v1v#olc%a;8)emzl~N~_ym
zevVKoD4eS~l}Lw&{hmu?yLkq__{D&`iqGH4Mh~S65g!7XkCArDWqjc_cH;FTyaS_|
z^xs=%;zn~Gr=j2Rk3wY}voNQ5QPha^{X0sIWM#eLZeGra()|%71LPosS;^c^7fYo6
zM?*}EtRYc`yyt6Ay+p^QamMUrWiP)9qNdx%xfJ0{&#_4S+WjX>&vv)w;^Xa)eGyP@
zcW{EQ&JEk$WZyF6PVPj7b6)&PcVpI$BKZ@$I}yK2jXj`T%4ACSW}T&%PYZ1=<<LXx
zR}m#f3VG7NFRysqwG@LYmZotgauGV3CasYjfh6>bX`J>&k)$sB1`iTs8fET)!C{tH
zC*s;IB~`gal$%B^UQ^Uv@|C)w>LFrr{7YuN?Rh>FB{EOHe*aeFtp9A9;D?6Ub0-sa
z{Ts0hUCNoiH76?~1vDJS&0<kFKH8nn&4dFNS;&p((3Hbau&%Kz=jA1{Tg1UsR(>NO
z5Wru`$a`;n@W&)NlHa1^vP7mt9_~xr3r3BXM-2<e%GpPE<O?JVTv5-Qj<o~?;4QD1
zDhCeK4E_xmJd(zN_-0iJQTIhc|3%)#5;@9u!d<0%nHkfffvi-|>5q00Vc>&C+Osr1
z+^MwbO~f8Exc7j9C&Im9g%&5tN22{_q;OZ<-9uGuuW&#6FGCiNWxK6^4VXW+f5;vk
zB=5<Xx=Q|qCg3tCDxC732tDl)xt0_|HGU^W;KT1XdjI<9hMy7-`Xv3^dpR{IhSB?Q
z3^UDr`8VJGruLR{z?T-m{M;kK<0Cthh!3Aj9Rtgamp^W1i<jT~<Ng#YL6S;Ooq<Y+
zU9w~C@991L#wg>&voF%mXnZd^AGM7$M79a8D?C_Klf)Yl?H{Ra)UY#_Nt@tT-0IiK
ze<8ZPdLPxnjr5)Jh`qjB97a=cYiof1r6!hG6zfy~%gUQ=4ZD%FSzGu?rV@R8%#l45
z*X+oP5g%6gSrVGmzpt@c@p*ldyi|T$N*6vtj9au4%9U}S*HPgNRpwdhtyhF%R}<<V
zJ-$s3J1`7I_*%akX&@>(+2mRr!NVhTQ>ccctjhec`cYPFpFD~igI}GQQBQ1+s6nUx
z(R67vPYI(yjTjYUzX?x=oq{m2uANuj*%E0^H#)ArsK5I+h9%!joJ|4sLwB78yjOV9
zCYx~Hzt$hF-Cv*K{=?=|R2Zjfh4xg!{bn-to6YybDyuI)HU@vZxmH9vbJvUgx`j0$
z+)+j4MUT5xqd8CJB5iq8z2VXlPh!iL^VMPaJ=2|tKe-MDR|+(Fj?+a1hjK1*5>~Ia
z?*8s5>h)$8lS#WX|D8<sVM*sChVnsY--|cv?<qqzM8@hlURw!Q2d5H8ip@ygQ*?{-
znombLyntS+CBX9X>t3<~!y!f)U3>%+#b*uQYb*Lql=p10^rHqto4%^w7YZlB1JrIh
zzg0h#rL%oxR72ZykC`3U9ZTR{m^w=|e{9Fr5wf8dJta4JagAI}i_pfj7gv0wbNX_X
z;ICY*rjNe$8aTgVPHrY^Pu8+_E@m%28%egH5u{;8<OnENc=*$5O}gKCfOU_sIs;>Z
zFVJ$B{j`<3ATE!pbzNM%B<An25pkSN^@O^VMab}*p~bt@ws*|6kz`D*U$lO^C;F6-
zyjcp?TCzh1<MBlZwpW9ALyl)`M8)=3%Ol2ywl+UAT<rg8?2e?@bn#uhCS$+M3CY*$
zT~M8L{lwBA??3B%aBf9Q!kD?RC%fTU&+V|XZZL7wkFoDw8<Nw37rJwNOIo6#%FiW{
zaK~B+L-^#g?&cL?a%Ol;>c!;IC5!1<leP1iE5CK%uI`m?H%F|Th)QuNZ{e;%|84%w
zF|<7sR~-%~<FBz@FS^ka+46Qx(>kVvx@h!TDmh9v+Zc;(eq%}>efYa=5;AN2dBJ3O
zy=4xmD!!$Lp76mcN*_z=MZJd9bymjNt?NGAOWwaWPF{SU$IMr|=VoL!k2WVQ?eA}H
zm%rw9%5vSu4{TL;Fibt~nQ_Q-u|YK<{iB`!kgF(^nBU&lK*QO;cHD`x+^uCz?a4<r
z*W=4g>klWKowfP?Dav-2uchsmIYnKo;unL=0}X3<N_K~KF^I9ngS&s2=PmNc`gIaN
z6kH^BV_&rG^<`*jjYF#I9xFAPbY#DspLtTc>bW{`e)y%r?f7=w?0Zw<jTxSKUmCi2
zBvF%z(p3~Sb2?&YOE<l&2Tm{a(nHg6UyX!&^U%5$1xdBNrRTq!IjW~~(H~TF6P4IT
z9V8nN>07&1-r)B_bACBHamV1siSPxNg~u2LBgPEdfXC$e7nRlPzKK;=Hv?4IBM+Kd
za<RJ4-8j)4xqE(RnqO03oxV1;wj8a4W=E^uE?>`bNaq7dzSo&zWJJhOy5jfF^yM(h
zZS9-+d+e`IN^gj8TzFHao5Ofce2v4Xx%*XV`GW87ZZB0M<rBHN;&tqm9M9ixj*a6y
zps9Xfn3dtkDU-2+SN!b3<*333QTEjhp1y)zGlPrS4S|F^{mEIsk$BlYD0Fak+bCM+
zjA85`<Dd_Y7;LFd-kj=e*#{NfM;Yx19Pka@m*IP=U_EScd0G@(9PouZe>Erd!ui}J
zEBoj>OHkeB*D>0_#Cs3N<uIcg2Q}OFD!p7E?H)2@T+a7yIqRXC{BaFztopIYD{CW(
zNpd+-XKQRDa}dh2^rs>)U~aazp#O<L3&|If1plKu!g+34Xa+J!+zn?q-|a|E|5j61
zzW=kBcF0Ilc8)xSw#iMnUd*mBJKdtQxtnHsb=-7;b;6syySu9@B*!p7lvy|0k-c+!
z^)lt!wV*(jUQKezU{Z;Y|06kPzyH|@*Q}$ooHct51)fR%N}ogV)V_#gZb**wzuBWt
z_CL`qtUkqk8zeX|SUD~^okcgDERXGoc)dqD*_Lq9u9%UV;(#};nRR>jyXl|W*guWF
zdzA(lv19gs53No4AMY9YAJ#e-a;1<ye?!Msj+d|cdj|8axb*ZM-qB=*)lx@)(N)Jq
z_Bwu}iDH;s98!DNd*#0KLNlHI#)1w>bZ*T&qD1td(xD%;%3B^Aq^9~#+kd+i-F6>*
z8Tqza@Q{Q^%HoTH{R`SSoj-o{Y$W*&abbloIwq}yb5ddRr0QW~X57BK!349OUHJTZ
zcFO)g%Rr*+2LV2x)G`czTJy_V*AQ69f54%uw5dx8FWu`r!l9q~b^N)oW4i7KBmZUV
z<MB&-5lf6%HED<B#Eq8l52OWYzGM<g9(}~K%w4|88~<F5AMLK%%LI?kaVKP3?L0y5
z3QY{%Z&dErCYZQ7FqpX3W?SMD6JByB68BSa+SUH2RB}}-nqnTKs(Hq2SK#`STH}E_
zrS*8p7awftUEi8)LwhZjCz{S49U5!3#CBc@c^IsFx2cy;W&=2~!>{;f#h1z}=RJwH
zjF5L@%EQ!qfBeqxaM<xParn#KvF9{ivfSx=eILa|s!olYoG;+yOYz=xvfr012U){_
zr@Pjy(+{M$D!!{{l4lUlwp{C5L`1w_xfdc>jvgN@bACm1lSw@>_*gDsJaOS`I%?Gg
zH<Mtx?7{2IVPl5XkG32pEJ-V#1`|&l?(AgGM<40UT%Bzv`za&UHrP|6M!0_El^geO
zpvtn0Ns+$efbqGQfxU6E2ML3>Wl#A8^_t*|izPjw>#V!=ph&{`$3tOL`?*P!pz^(t
z;J}+x*TBn5riROUdyL{sA_qtA3+|@U9RCoC+&a6$7kAt!xm}5vEFR`>ovo7?JLWA;
zBK>5&FOxC6F-<TSlYG>czm-EQ5G`tvfIqowd*_g~Xf-pK&7N(`VUP1Vji>7~VLQ6U
z+Uc>K2rJ`O_DXc)v;<qLp(Cd=`FRi5$-%@17B0`>$C0xb3ThHOri*Qvdu;rF`V#bi
zU$i=QEmp|x@BhjiTS)eb=MQ{3LN{wy&CzE!S)CZqDC8{B>onmqfsr8J6&aZHPu#Ei
zLSyrE`g+lu>i25J!Nh9gFLD`GJ;Fj0#U5t=C8Ae?qx_CPOH#NUwvN^WIp3=P{Mff=
zUdSwsYxb$zz}JF(cOLbGuh}|N$K|JMj@#ig3}+Yq`~)kK3;i|)d;`6*t{rtr4SBi?
zH=M=Gi6TGIuHBX3T_Mffg}s^b2&>A(#rqW{7s_;fL;G2aeWfXJA!&+%byeY4nN6h&
zJOA{o#-Ra+)=pTmYWH80Z`ogSx~^*PG}zhLX3fg*wH){344%jKez^Q|Ef@Dz6b^q`
z)V_klY0&m+bp3_VQH0#A=66jYhjpS$(c^=jue6urk)>bc499TzP9D<mj)Yk>&6IQ*
zE!e0!OppYHN|xy*Rwqw%T`69#DO#^yOO$T><FoDveX_EE_O8yPMLz8G_;Z=nH%G}{
zbeY{(cDs{C=1$J_;WK=xGc!~C{w|sY0zoR*>1Hc|t6!qRZO0DHPSlKYaNUgkst+u}
zRcuWYs}I&6nh?(ZtMhkRPp*^e+`2y)xzVPgTW~dV&-(3+rf0RQRCe_0qx8-f9}UcT
zaru5VGP1_;SDW<q_V2RLVeGQl@zpoIY<$uG@#asjqOa%Xc(htj95Z8Ct6L)ZtTau4
z=&V_~zTBRp^})eKPtKykO2aC9tcms87o^9Bly|BgOZfa{lH{-KopGrenmNfeTUn7|
zV6X3Hk5(f^Pft1RGB4fO5*{8e?w?U!W3>cy*;p|;b7qyU;*bV^{!-C;QGLu#OGeUW
zqgL^engQY)mDc9#xs~_MR8jTYEU@}Dloj$*HhB(|%<4T^i7>E>y$Jje6MkVA%iVx~
zE;UouG{YNxfH$(u%JFhP%H7AVxcZnSwx?I-Gh3yrlVA1vpsl`9LAYacX;$-$3U;wM
z4R4s;pf(HRK6h;={$6}EIp0{`2RdoG6Mc_{vXAL9B2-#FV<m2(;hFfE(Um#PGpD`z
zD;4krxD)>9n_sPnSYMerWMIFu8u!d3H=Z(5oLOSE#Esr0nbX?gR@74@o;5RfDH}PY
z(sDGPHdUPT{X!eY&|aF>JX2#b_iH!FKE9ey!!e8Vg`$@mqGH(!uB}B9RIV*NRWJ`e
zf3KztQ(`rDq*>K2J**%ry_B5p`6HUslS5wnKg4}?RNc?B8153>C3u3nTkznJ;O_43
z?hq`vyF+ky=i=_}?hd`=`+KkbqwQ&5U)#Co+{58~?(WR&=+1t2M$0o*E!<)mcuVUS
zrWg^g-cr9EMx=@CJr$PU*E}!M8ueLf)f`Ng?-eYUX-r{^|4c4C@&Gs0#k7n{N8|YV
z>q?xjCOjMXs;~VeC41_!SKYXp(>Rj-$o}_o{dY8OsBv>|+ME0`iLXrT1bBLj?dURR
z?K2*puJPf3zVNH=j*;=c2fF7eq1D<24UcKPnytfhqutA|HAlOad;LGPjx}Cb)rYG#
z_vG6fr^^QGyyDC|s%KZ4j2oJ+TT%gafpeS&t2J|ld|LBmNru>)oI1k$W!u?2lTEvO
z{nkMFC#t9&e(RCmYyNhuvgBY<)M_;!eIDO*8mFXXpQcrFgN6r?mQ|aN8r5FbsZigl
z+cS($!PAc7`&OamR;04;+a9lkgRy6|%xN+}Sst(HdT+(l@H*T~zwp{Ky4S3C{4|@0
z?Beq6K22h-uHQ?RG3$Jx8rXU@a^{}x&YA_Qg>i_Mo&C)wQ`NO~xNN0nbOrs|gKxo+
zOo`BNnuYt@V{Y6P08ZU)bh98B_ma2FG^L#vqkw(L)NMtH&iuJ@slmMM@G$khc(vYD
z@n(fvn(Eh8u&{PU>u38<+AGFZk28AH$F*SMon%A$n5%V@E0;<U@M|NfqmJP`b0Q$e
z0sERtx<DQE;&!`K#siQzuisfb8haK32(;DA_gqQt8YueT`cbLSE)xR|0fWp~>vd}@
z538Pgfl2clm^D|a=Fk47oeo;sW>zkD@rUr_rS}0cSDx(Pf7bU0>$ly#G!H0phRdFG
zsceqa$6FT49yW_F(maiwlkgH;diXn@*;AiWq^3R0OPlW8s5|Y0=u+BixJRY$v62am
zAB?vEw5-(anm8!p4*<n+E0%dox9-GSI<|Rl`%G8H6Ot7@C9jqo{;ZRCdpAJey)~0_
zMtUzhm!}67U{zKrX}9lgz4mt11#V<CjM91Ftvmp1x0U{sjk=@7>);%~BIZyQshnkz
zQgB$hW@_ctxI!}=@{sY(_4mA|X{zcXx_)2E>SPC=Gf+{I%Z+~iD7jPItCzifttjW0
z1saXxOHAtETKhjuP5hk%JaTNKo0-O<ckYp-Yn-H8_Y;ZWY+vQh-~l`sdt)h{*{RL#
z=kum}<%<Sq@GZGGFR$7wHIp<QYdCMo-i8gX*~={Sd=2gIty`WB>qTI@D4s?Hz-9QA
zkSdZ9GTx(TP~qt?kOzoc!{Xsev}D}oAwg1n#5ms6JB%v?RN<X$w~R_VBVKAQSrt&<
zTeHnC@+%h3)4Wf~Tz_*u*@Z@omp*Q7yIjku(jQJ%Cv1+o!6`IXO8e^|r|jqY64_Ia
zpTWLA+qN6DHF1jCa6W!;95_zpF#L1WQ&fK4FDusjKF$u!XMsf)>2*l^{q+iFtYA&7
zw1PsQbupa}JUE{JRMt9}L>9RJqrUV-P5xk^Y8UgDjqQ6v(R$$ZlY;$g+%pMSk37V1
zf$7uQl3qfv-;!lE69T^+o0t=_c}n)mkS!sS1_G~1kA6isK$#D6yluKq1$-A`nq_oz
zmKXV<pS@iWcs2V+A0v278tlU(CX^Q??~*w}N_)HO3na#@gpTZ>6eIqz2%f90UAqH|
ztOA^=wzUHkVRtptqBe2KUvW3b!E5UQSL|~3XCz+x_*Yg}LC8x}OJVDWu-^QaYy092
z(}jh<gY2<PKc{oDyKYBmEInzlVyTxqz`5StS@U|SsdvSpaOT+@aNnta;kl|HYMs<L
zc4yVFxb<H?(q~tpCVF>gondM4o9;<lX<axskG*bSv$<>wuiY{jyLEMec1NS<6MuSl
ziMpZxX}5VerR!~+O%QpYg2(Trw%#(>H>dQ%af#q`vgF7ncdZ5gmglgygLJv3W?tlF
z?b4B~``d+MKe=7=*T%hD-EHD>s+iV}EeAfY7w*{X#MHLIm^8{jT*q&v4zzI&e1o1h
zfZGd8tFn;fd-d8SQH-ZT(O&2jDxrIBnx$h)@gVkAi^ba5D(9~%?@7#(D2qH8{_;9j
zjT47hKCJ;K^MePx6K-5!D8)m{2^|sr+Ioz0dTCdFn>pRJjq|j`RebYcT2!8$k9N$?
zCt+v*m}*{RnuNc9okVd&cqe3kmd^9>R_A@kyC?82!y)arUF|3rPsm{sk{I9W5Gn9U
z->a-A@ZHfL1dF{eK{%)=F%l+azSU6IsK8!pK=>n{MMLrz^l}p}Nk|%gRPDAHHd^6m
zmcXGTN9<MIRW@vxj@Br&AogkY2U%MimKR2FDz{mXkT5NNk;6n8qAlFA-<@wpCt<4K
zp?T|t!bEnqHAXS#&&?PjA<>hcHveZL6ub|l{sJ3Zya=#677#4ZsltIhOMtLI1sv-y
z{I?hhsQ&X@uPBh1e;^Xf@PBte3EYw4(a}*BR@O-6e+?*H(ZmQjybf#^I8ciJFSO#i
zfkpxNt|dX0*}vgOkqd105M|c(p&qt$|3Y;zKEJE=AH4n_5%Axv(f^bqsd<0OR%SW0
z`ab=`A{?SGs`~2QQodKI{J!v&5#=ENB6ZtzO;fJf_U5?su<k$3*Z)<t{{OA~e*i;?
z`Q{HU5vf7#@W8CaAE=(c;~%ApwYRN6e`dCG7c6o|!sXg7+lR8oF~AaBLX3_@LSn}r
znrjy1hgj;IhdHgAut7f>7qAKFLqkDH{`yzKHK+xN@!-xvGJdNe-*(_{-r4DPWePT<
zFuwAnEfeZLI#9X5gq?vjm`9&Bcl`!6eN+<AtRnO<&U#wUYBmcnL<f!pHglsUEWCqT
zvJTyjJ<ZGy<=<EFCy(|Q3K{rla29#a)4VP7$|HeFvrz~!Cz3K~ya%{axj%Nx$8V?p
z!9u7DO-`AJca_R7?FPkS&?k+Yu>g8w)gNWLS{HcX(<pY=-S!D{3s^!zq%Aj)32_lA
zT0o)KkOVC%4)Eg-O*->y8rcA>=~_|Z?d^%lB4zk2^<S7aAqzT{5%YkI25+v$P0Ydg
zj&w@d7^}0cG)Z=$G(~|vBKI`SXLB+Ds>4D;r0o7`f5o3Y@CiF_hnmBPU%vYE2$XLQ
z-ZV}%$NfDh(k+Ug&-EGECXjw=`yX914l6^}(RU3osoY|S|FS4a2WS5+a6PGu{zF*U
z08Z}1d`)Li6IF+Gro+-vLbQCgfnD-Kjx7AlFmo|VCq5E&R=HCNtzCxH|5+3&uRuf!
zLBL(JH)OKz;!IVX9zlCX9%0d{D3V>RPI_z#+cR*veCeo)X%(Gx0#Z#YjDT$U@$2T`
z_~tL7ur=j5J%%VVG7BI_2|qYGcw^vmPuTd#cVRT3O|t-YOWzZkDX5;7HLn6z$}Ymr
z&G+75bEEQNY0M8seIAmufB^jlc62)1D_Fz}(HDpUT7+k%0#*mTeGcw}KwYOuw$4}-
zj!-?V&yoq{fhcrBnzu{NVUNZ^Cuj`rFehY8P&-d8j)@X;DD+_trTB5=uCUQDklKo3
zkT|`e7O6LS<K#S_j<Dzx#=s{Yz6?ecosOy@Mfv-)cR11d1F$EpU)!;}%qLPXRY}ug
z&86{}7A%8H27=YZfvhx}E}91#^|rj8DQiS}@p|U!>>XRGFh=6J02AMo?a4R-_|N7c
zH`0ql;gcuLgk$LuT5Jm-?bwqxOSn|3Y%AR#i);O{xCUxNuRQ2G!m`M7J;_8}=CYmU
z>YFdijApWmx|`a4gy`lZGejXVikui$Dm;VVi*1B#Z+7a3IdbK+efsyXr#0m+?rp}`
zg(e9$oRBL4KtY_2`4GgxTHW~0Zj8xkf8&jf`RuMa?S?yYm-kBD-J=jo^iEpm&TLQR
zx*|qMt>7o4!C!;cX>|w@!waK11S7<x)%~0>j+On(uxYL+S45%m@Q8V|HD(EteiN$j
zw8mM*xG6H<>?1tJ|IYT|W0KkI<^+C-n0y+0$ppSOCcDa3qz=Vdyg6N??4!SaF~t-~
zO*BIo_~PD8+47xu1cebqMf|S%psUHxIJ=(i&1%UD@+Um7FN@%?o)7o>(}*hCl2tAS
z!#t7y()Wum)*lO~_?KI5HeHDD`cjpFDK6j+bv-SWs$Du0(4a&Rub)gHJh;qB{{1Wf
zV??dIvS3f?L_O5h$!r^r()H0(Nl4)8pHxkD0-yS>VL~6~^?*)e-9s}Qgqq@}7|1o6
zi#1sCQ@IALwI|M61X9o6At8f5az4CB0i-^#@u{fneR}YnpkM!zx9?J^Qi(hVyklK7
zCS#F`5)Ssy&C$xpzG#fc3{Rz!;?Iv|_gWoTVRM3S1D&kBS-QFk(4z2_ZeU{wG5){-
z4Zcq5XaOaN7Z9jot2O_uNYi0WRcL3Byh=X#B}<RB1qna6Ibrjh&IU^6h4e%Xxx))g
zrX2#Y?i2oAu#rK}l()}H+2*+-aHML*$PfO&SIjK<E8aRtyU6#Kof5O~!f3gpry+7)
zXK$WPm}3EGL$w;-I;ozOp0ggSZs!|bZ}bLfz|Kdb<smAtn^Hijt#5_o-eu>rv>SkA
z|IGS_Y*Lv|^W6C*aYDG+XAmx=P|L$=NC9Amx!Bdyq;uv$Df0ux2z(}-Mz86hHbQr1
zjk{>kC{5a(KaCK>oZ3Cp?&wKK>=27T9H$g~7st=!&`UExN??l?4?taTuDS8_cPKj^
zV=tQKV%L3nlZfxVJThF_URW<~q%&;&N#}k5NM+M?pp>CW{Z(T4_PYho7~?$6Hc#9k
z|8eKM2(M+|??u1L(no!p7JWb$8ATr|xEaFU7NQoU=eA3N(K;g9GUFq#(SR{j0WoXa
zhdtQzyg*48<0Kz|gV=_Qj=~?j_+|}K;qa{z87y6CW0ihvcx=%>egL$_8ry19{~#bG
zH@ansZPceMUrrNp?IdK^99qJERyj{&kS?ee|Crv9u90~Ipqzbl+BIKy&8k8)g#8r&
zoR5O_BZy<7g`gLZ<ZdLLBIkrFo6N!6Vi_HC`=qpn!Z-<CSsuL9ZUlH+_+zIg_(Hrw
z<$J<7SH(scFgCl{<-J!P4VNb>Tud9BWf*c~FfgF=*fTkOP2_XbP!5gQvpF0}Xo=V9
zZJLA!3}KRY{<^#ZXrfkkrv4imYB3wB;%KZ~So;eEmgSs&9kd61i0q2|Xf%BtY7Mzo
zKBwlA7?=#7=H?62elV3PjvO|25LW2mHLZPJlc>wiPI)M5qA-FNQ6{^roi%J`un7kv
z?7tI%{k7(`^7p6?{V*o>M7}R+A)!cv-Yia8>K({jr)apF3S}$3hjLdK7fbcJ1_GD!
z*^NGbC~c6wNgMXo{uqoDhsP)jT6)s8s3epjh~lNz=UutBRB=Hz*b#=<LK1Jmfg|)~
zeMjJAG#dt|%&Kt3HvAy)AD><jse_SL6tGK8Vr0-&?Hz=NB&0|X4V7#dO@PA#-qo;K
z?umj1esRV)sEyXvS_*OY(lMb5y(;jXjsw9w{TlkSw_`6H>B++K)z{h;<ioq8wGc}d
zLVT~9!)78uOU4}|k#Tzr6;Llm0UI4MTm7OETcRKDE)Rb(IQfTig=d1KbVQLy6y}xa
zHYk#TPPl0pvCOd5`35g6mKD%76~SEa6_jUlcUT#=Zw8DEqS(~)f}^V99h_q?GDWVI
z5X<I4xw1an=RpTTmI$N*P{+QV9i0-aEkNls6_&09It4dVpsU&F{S#B?ao2_z+|DJA
zw#W5QdERHWL{MCViu8{6z@;Dm*>&gfH&7a%ciah%NT$URKf6F7y;zstdD4{kfbVA?
zncLiKV6y@BhY7Y=^uvs0Vsbn-T)YufZD<YBA+QEVp$ClB$hAP1Xa{kC>)#)H;AmL|
z#mWlF2s(_~R6!m&eVc{ffcKlUvScpPn?ot2@xLJO>z5UBGB~5odfTA#Up3-*<VgGC
z<2TFD05Me)Tyct2FEAqEau1SLiT}j>5!HpEN=>tfwOqv%Zgf*`tKy@e#FTa-;)p~i
zpqR<v`v@@Cruo&Q%=O>d?g%J>Hco*MJ*)-xPk+Kb2GN?2j)5z7RkAWn<o<C@5a5Fh
zMmzCHSIeSxtN`?&2$ugAng1oOL&k&;YMYbFK8udaary_AMQ{SzCvzWb;DpF%xAOnj
zQl|ev;QTkN`yVEC63ugvf3uP%OoeLV<2iFpI24?Ue)jBATfIA0UEgXMVG>g2nNWd&
z_`w;w&EH-7$4Qtt-kOnfw=5Yg8MRToD9O0ZxSRZM^SvkL-C&04A;O|YMQB|E@8`Y0
zMTGmU)-*4Ul()y{8EKi`g4`8(XZ&eUn(p>iS^r$_1k6$S11IQVJg?~5UzSDXDiEgV
z$c{2~m#&U7jk%n3R!om&bx<5-_hzyk<rFb+2x?A<YJfe@_1Gy6UCSMuiH&7_K@(KY
zjrtNZM0Y&5Ni-d4eFQ|1UbUfwH8&(8LlkI-|1YJl|Gl-KI8?(D5EiBZ8b56asEJ2)
z4(}(m#J^!2LpAaJFWs}z=oDBaXb8`VM?RJgq<!t6vCcZWd?|9c_(I<sjOYe@Ci~9A
z`T|Jxtk?UCGrl5Td&?0@rbC?Qch1Joq<1Ge><>rThYfP~Y!H<Sj8h4n-Opw3sr1xx
zm^g8tqs)DD5*^W2<zR2?(3F@^aEx(|bsh^lPs1|y$3V}SsGcCw9x#6#FMaLF6L9@H
z5|pnU=+}*XT}@p6$?nV-lD;e3J8-Pj(V3nJ7={I$q;(~Jd0;*$3z*Lc+8diZ2kx)w
zn8MC@V`*`7Je`INZUotDfTQ!m(7JC<+*7=%9kwGY@x%BERXHEex__c%(#$XS|CtFX
zwjSVQ!U3yksbe6Z#Mr2z6lp*74(}Uj)BdKGpi%$_Ijs2eE6po3Rn;oU!<c|QEd+x2
z7Z7VC9f2oL8?jG!Z{{cPHvPOdLTqx-Kiz}>oLf+}4{&{EeL+3K8`wq+l_G3cJ2--C
zV<6@PpZZp(aK@x^=%4roD>~~(jd7LW(__gBj}{Y(;-mqNy#au7F1hu7;;m|h^?E%=
zQ=tO4Q}(6C2$4slAQ2k$BR_`r+Zp@Lal=x(KxAYrFr+W{117Jhn?HD|@K)}7P=SDJ
z*cG1%d@Ogi8+;lEuGd82Iyf~yycKlbD!7RIrh(>7y-}r4PT(gVILbRD&n<@5=nkY6
z!HsAz!(=bq%)t4tJ%xrZIAv}N&DbqU0%yOnZ0Yg}z`7><OZF}tBWMQ4296wFKp%KE
zv_Mp|=};>?230{<>n1*JFF@F=B5?N2@j5IIb)|Z>0ONkp#uWj|+VtRlv`NsmQV^!<
zGCSvtfCMKS`hc|5^85Kawwd$^=>RGEKT+oR(a*|NFEYo5{X~NO^jVxRqDTh&YSS&p
zS=n>-8F;M?->N}tg*<mIW8oGhb?-D75MZ?YBq_U7om;TJxnb^%HE7`-&OH*lz#diV
zVIjvHlAX$21XUEXbpIJ7#>v7dX@@-X2HECe!LOkk;p#V_0*W*V)=nj^%i;z84}wUo
zvR(2AdU~2K)m90qtUkVjMXVhlx5xrkUpIVe@v>g^tC(gA8?O^DrtZ7xTUC*}hiB*B
zX~+G~x1%jfMPxe0)jCesH}i=C&npYD?Ak#_&i~LIcrM*_w|j$TTBgLnOTOb?!RL$+
z6#2)gXFmm4cHASxm)_aBIDethM*EYQxVbUS5QMo$5~mGfrX8@wO*{%R)n%#TTit@7
z#AiV}S5KYuQ;Cgc5nXMcr}aj}1b!61tkPhAQ*IuSXom-aoE;fpAK!U}x?#Cc$M}Mc
z6|+zs2K{Eujc|KOrxs~*Ku?(`&J+o(Orjg03y7K>1R|ShM|N2Bu(_C$;=5zw{<x-Q
zVng|riKi8mFu`)KAl|^9@6?G1FZ2$bwMVSM1#3wQ9u9Go^=N5g6|y!PL0)AMIjb?#
zE(1*DzSPhrAyRj+ej<Y&6FTK9;<v<@j{zt5M((R8Zk~67KGQ-RQSv|Msa-VdomT4k
z_Ut8G@I@b!<0%96QEV|k$6B4ds<j&~*8f)S8TH4x-;!KpjXNX_@@@XjtUyg0xkR`l
zp9x1{3SEpYV`n<&MfK}S9d($LY>9kESha;_^3kTkybipWB(?94!c)hhp|DL~i{(h$
zD)<yx0pb~t+K&1!ls6r^tfZsK1SxCq?e>$SvLXfG_i%&Xk=ri1@^P~u&18h0Mg`b*
zFjNN+eB>ciRp;Kj;^f>xJ+y-hI)Z*?FRkol_V_01pVNDOPV)wfQUXJwdc&kanB_C1
z1<##oP9AfGsTSJ3A_!`eMVJvbe8(tL18>YS;Z1vgK2R%(K;^hb<ZyDhcny&fs{m7Z
zIP^6yv5qLOoJv$-odVg<^wYNvHo}&>V7HmJFt1`r1g%rgwqo_zsw_7zsf5#dr6jN>
z=`r%rbr&GG&~C{YyaAd>I3|tytXbq$(xwuwtc}VnV(R|ZZz_+@_4AGw#~lH8`Bg{o
zf7lbRRjZJx)ZQIl@VsM+Nj&RVzd90$u^G_gW7G61!#ZZGtt}~MbIdfbw;)8wjmkd_
zWOXdhbj7geBH;U%w7<oJa-c2!0&Qga_FAej*Qo;nvH3FIbFbI|ucjo_3>p;}eHwI)
zGCBoTslJVe(42{qplQG~yb8g2M^vMGOLR7mK<N}wE}FdN>kT}obfxX{6t@B2nM^6L
zh%3^sJmJ*I6<evvY2c=i#RCZmJ{MUE8jasPaV(=`u#!jXU(0a~W-YY)j?<WYM-cQB
zOw7J&=9S(=fa_-?q@A^j#2Nz*gD5l~c!77PQ**lnIfuw6P+v}D0J1~^4PhQoPbZFi
zTk2$t=2*DqXodZZl8i(GzRd?}!Uto0iv7G158MUY^;TJBSdr9SR4{)|ubXheU-QH`
z+D2HRBq2oyStoxow^->Bhoode1)7W;(dEc8ECqrIww0q2khF(&n<=N7=y%rAV#6vp
zt!m^%A_(nIE;vq$Q1HxQI=2+&@5XPKGIIz3`1)UOmZ4IZU6`d%McN=Qh!b+K##%@O
zSe{&)FXqwi$kB+LA`=ss-^ZaIqQ0$CiT1*25%zx@s`sU6{*1lttNK3eM0z)Uw~<sb
zE?f|m;CbPtO>7}B_8N_8Mlp4BY&*PR1BK)Pp5TRRcO^89zd={^c6A=yeBPD4nWu*y
z^O-db^4qp=9-V)!Hxaf9eFXUTjDVZ_?7LlGTX12hT%a_;Wz+<<J2(0MXw-$BA&6=#
zdnRnuubaKOUMW?=O%jlNVDimq_wJm>-|j_8SsOTjqeTnQ5qsc?Im|laW}F&C$kC;*
zE&G$gVVvmOf}$XVoO^Vq*%xk`Se|ceM@3)5d%@ZngZjh|!^b-qi~Og4ek0(t(aNDa
z{a)8m;)$pAIsf4`j?_E{(K&6&w9^>@_!cnK)Cip~0Bn$^UO{lPs+k4@-=DeJDJ(*D
zq!hfIBLYn2iqhs{J-4!D5kpQ6M)b1_e3qqub=Tko%hBzR8KiwJBA4W6h8+W8dZV50
zDd^ATAaH!};R&)Tup)dKX61ryox_o8RU9)0O-N=kNY!g9%dG6(`R&l!bb~Zu3mzzW
z_pGB<zLL0)T6x+%zcxJ%k{GTwbG-?@#G#s@-<%DRBjiD%)tDHd1kte?Ys%zL&Anj<
z-eB)uL`-zrCteoWg)0lJkVtan;D5S9gXy35RPCK-o7CNAeKlsKc`f)IHVI<R^NZ+<
zc*3T9<Q4kQdzzU<0jICgh$9`~b?(`l2oJt9B#oxhMb;#rJn+^gh}B7y#U|K_l+QYK
zTxo;LJ<4K6?j%NuwsllNIh;RTz4y8$KYuCWg+`<mXl6~ZULW;X7BrcO^d0n(;kYCn
ze8VfHmVbJcyTMK4u^vU5QTN|&0@43W(~4Gt<Yx{UUlH|(7aa4=o(e;qw6z|^%(AXu
zcQu7hM5cC(pcu(y()KG83F7jm3NB4sPjf*5t$(Z`BPPPPS0@g(Fk7LW9tq#8A3u>R
zUrEcJ@Td!`V~`$?KlAtJbX{L8H9%dpiJHAY;`73=Ae{&Cs(}BLRhONopZ)bj5F$l=
zC)kF7rF6D~%{jKbPJ|sX?ulHcZp1pf@=Tw*Y<)exVZexvr8YWt_VU@G-iMTQ1|1J3
zEhv|Jh6s4)(K&HiBX|NWxQ5(|YteJb>;R(*q~onzG$IG_JM|oqzzCg5BXTR0PC&{&
zWcD2~I=T-je8M|O*l{@keJDA4F0ua<I}I7}0HdsCN+cggfbZo}wo@6o((@;)7D*4L
z&-FV@VpO-^FQGdRb*r0+mwGgtJ!6}_(?)fa&>W!V!2jr~<VP0qMtU59rt5ya#>@61
zN_%<S@b|1E@UM?KX@orb!A*-g##e*|ET%FNk~zM4%Xy|^EcjteK=q?`Idz9t3N9CB
zyI|zyBSi!dSOoV2w^0U2u`gtIg!W#X8cjbNG<G)PB&<L`JAr~0J&D<PLfSovP9q9j
zYq^{b$e|9)ukoiieOhwr@&%#MfL^l@ui<S&su$WGk-II!0QRedk~s-9yjMZajQnB4
z+DDiqD-1Pn_eqNzrjeKyFLo6er_BorAZHk$qf^N`j>@&oKG_=xeuu&qJg^F4Lj>08
zcm#`{5Pn43GRvsT6hHaf+EF^Ik&b5$`J!VvunVdeW!m=K>kQm)p<iO3Jb7=OEI`+-
z*Su+PM#q}1#y8CORfrJY0^NS>CgF#7dfv9!Lco6lblUEELvsn`E)a#)$XrWSos*a5
zE1O{g7&gAc*Uk^{5gylndmSfc1$SG{W*)O6VUprA;r1?v|2*vrwrff@Ve%R}*lSM{
zxK%w7?#5wnbFu5`XbX18*sQjU@yLKTYKvjvn~;#52u|(o8ct{x9wPPiBjpqw0qxTc
zZmKEwZF+hVo_$5YodHM!(@=w91z>4_3F<7BxF;c_{1MAW%Jbo)>{!6^v4@q5;bXs?
zMGJmVFFIRR*)|vWQ4fvz^UrTap#?SJ|2~`8Ypx0MPhuA{B>W#JLh{Cc)Ym+U3|HYa
zBmY)Olcij;(j1S4&*crwB!+MouRfB|27c^K#u9xsgRS3It#jm`Vup;m!FT!H>~ij@
zm^||h4wF!mWvZ<LX$YoC(Dq)a-8jgZervcGZhhp=!0O&)sJH<c6gP?k>k;rtv$ZFQ
z7uR&I0+EWaes*YUh1WURjq*_Hs1T{oumz?mxtfoIifDMV|0-xA7_MpRA1h_c1Gj1Z
z!he2QHkQK97<Oz!_)O-X846&ht)&*)DOluW+Ao=c=J{hj>Qo;~@TH=GHJO@{2d$p}
zV{h{>{O6a1|9d(0|51(kKWRWR&0{d2ybKCU^u{Fy)Vwn`NepnWI^0T4y>cKj?c-v5
zR#rloy3vc<tPFo4Awf+ARw`nKx(zpy)3N7f3h(WZ%W3zBsmV70-}Q{v7h4+=qrwH7
zEA<!+T_S49rdD8er<>^=r#JBo@_U7iMj6G=+)vLpr=a~k-$~=+r8Wl?hn#}6?wzjZ
z?$tS$*}Ph?$6vASHR4~2!{ggwrTOmOtK#O?Y-vhELc|e1)Q<v|ls|C!Je^U_9csch
zFn4*W_<lh3%cRa*8{wXDkk)gBje|M&?Y5TC!)9EX(S`$vuNGZHGHF?#6mbnrWVOg9
zZ(8%XQ`I|`SMydmn>iHGW2TkSeYw)~P&ohfVOKDa84!venw&hE(lwet#6k!>bvYfn
zlhl<HtT<!~DxJ9a-cw;)+$WX6AIG2O9avJcts==o(pKBC9%(OnU|0q1y@87Xlxzdk
zzvB#zewEN&OD#`(-B3&1=^=eAz;V%3)*pf5Y#7h0F9?^M&zb8eOjR6jusgy%0hU~W
zb&Ze8SX?@3@*}nJ2ELmF4i>CSI6nAmz(8`$;np6i{sAO#{GAkeVAWzU&j1yHOoyXP
zdCU(T?i#FBo;ir3WsIyk+<0SDg2YgXWGIwkHNDM2qqV!mT5D5bK<*gN1wP%y9>Hkz
zd#??dKo-_JH1EqjC;xfcFJ9csvhL)au|sz{^C`EI@aBF9o^S;!8Y#U3G`hC6E<8%o
zIT<Mfr%i;tJ#uDhT9iU?x7RiYR>$i%3I5wie{W1tHKs*)qv&MI=#*G-J=7bg)CYBU
z6VH7($HJDwYIg37U7a^n=&@d5CVNYNn^7sFwXf`}ZJ#V!x))2%<Zp25y)$7A=p;Yy
zmpdIn;VqcM^gAM1wJvr!dPfTn10YZtFP$Mw%P+6ThB!#2C@Es>Q_*C#)2yIe--~`p
z%6o30*{pH+yY?Yfw2W1GQ4<LAah&gWr>;IQ4$?W=Y98E6RHZJhYh>$J?H#twIVoB>
z*y;cAKdzAMWa#oPPJQP)+)j6P3tn&Q)ZJ*C6>+jcEoS!hk#N`dd18NILEw5w{~4ru
zoxTH*?2pR`=}87@qwtL)CO`zmm4P%d3$KYR!=X`$Ug3`txPRu8$%Nf%ofO%O#c?qb
zyhtCx=lulp9Lcc{ATFHfa!iIAn{b}RV>o=u3Pz@TDCRB&eE^m`{+?cl`!OxSUxUdh
zApwDkXw|wZa>|`3HmS@8I!?k)UYh2j_e0WnEVHZCQ1A$;R`KU78mWo|v7sQG<ojUX
zN&LZW)1;tNXn6N^BIn(fVT#A*F_(QBK$}U<e`&@kc@A${#n#x)ySzLGg^$bO>XweL
znl13;xVsP>FqSGf7K^U7ez4J@)$~nyW<N%JZ}`V|z67c7Wu+^(Lk3ta4?VwEH69$H
zt<w?P^qrZSh#VQItIL0W2mXcl`=*WfJc2^*>@@q#fP|Ag`-j*UW~x@&<R@j($$Oq5
zV&hpFt1lAu`PxTu5e*YVFV!F&??1T<*TZ2RP!adMdNJ5=>)u@qveLxwG)+AWBotiA
zpVADN;L~>2?UHlfQOFtqfeq)^r{A-R?Lq{Sg<G8q{@moEEVuC`pQvl@5L)ir2>alS
z1~^bk64}I|T`ni3?-Mul_64q%x7u<!3C8X>tK1BH*)b4#uXy5bS$1o>5vA<lwh4?1
z2}TrsLbysCJ)rl#FSZ)fc2c;bWWK!NXQ^u6wgNF`x}FE<31$nHu|d>WJ47oo#ii0e
zxo-E#9QE4=*rTA}pndp{<aksDHIJ9KmCnc-Mq>(iD+1TJRaHraFY0K7WX=tchYwka
zI@&y&41-P7k@<}EnLO*GmiM9%LvLEmTqmT0lp6q9-NiOK>Z)_&DFPge#`G191pEfm
z8b<cO-AtQc$RR=P=%|8?&Q8TpSeNfcw#!6f>@j!o1q$^%B;sd`hj+=~=U5Cz<jn8e
z#VdCDWm%@J<Xil!BI&feQ)TLkP@>dCZ2oC2z&CDtG0E;z9{6FVbW0&wr|@YW;%enB
zh<HX6RB>_0mdUjF<MK-hHU-^~g`i%B*b7CWhh9-HXj1!fy0hEi{U7f|fd9aUx;~T1
zXAmV$OWQZ<%WT<(7-EwM1!%<%hy5CGF_|LZdvevQ7CietFA$9{KeysO)ksbvjs*3s
zL&SY?k3;ndmVHq<U%NFVBqGS{!@L*o7{oOTB@U%0cAt>>K(rb%UuK+Teb#C&5}$8+
z7(9*geH-N!G;?mS-HM-;;^G|X6C<>VONSz(i1ieM<21IE_Ct$H>gdM7d~s@99JBWF
z46Gm&&)|;R^-k$NnLvuY)&z`SUiWj?r!X?NCRm7OhqGWoa5tN}btGz$`U<$4B<C5t
z*+iClyX#E3)us=>BW#>QY7C*TCHDKZLJ<4LtFNwjG#h3~xW)Rr65*KON-Fy((mUDJ
zQ_Rrte2G~MU!iHU54z2WjZIQM+Ue%zYv#MSZqatwZXA;sO0VbJq*Zp&&&(WY^Wh(~
z;eTQX-5~Bfrv?XTqQRXJ3;{Y;B~IyIde#cA=g=`}s{%DGfsY$QW8q2@`>MjP(6lTM
zmE}zsI~ImNWj<~d=7#O02yRyITy&oKQ@7q8)2$W5$GWT{?5!UbR1sH?zsRv>Lp6<{
zN?084efyn>ywv5I8~50-WZs81L}$Cja~l5cs=@A`CLAYO>3p$wtb71NX16-}WyS6p
z;W*;)kN3Zw1>p6+ZA#$ec<=u0yJR|#V5N#iIN`+p<oeuccy#1<(;Pdfme!p!<qHim
z_V~qPCl?fnur9E+Z4((H0)g^+3bm7HM?n)=vkM|EJ)2?&XpF{*V1`@P?=M?cHD~T;
zaa&>sn@E4hiew9Ltx9Yv5ZwmHFG~v>Fnyf2VN%6`g-G>ieGQ7k@RUxRHnJIE;+KXF
zo;G0j1)yTpSau1{4~@M6_i9@v2hNcw;#{?hI8DR+SJV-XyztcYl$eUgw3<C($te6|
zuy@jPm7ktFoI1|80fEUh>JwO>umNMt8wAUvUc%H1@y7@+``pDZ$VBBR#i6m*b}^Zz
zrI+S6^0QcE;1A85Z4Kc7u@wU@A7TGARA}q@0KXhniM%DVcK`EdXg>p=46P40seTx`
zQ+T^|e$=-<C8<a93C8Zw<5TVc%G=@{M3Vy_!`+Qs=)|oNmN6E2jZV}pVch<>f#4fP
z$ry6(kY$db)ewSuWSPWkc02K++p29z9J^htDhe(Q&7&w5``D%J(bh#+(o3*R^+{a{
z@9N82pc5U7D!sq7GRG|u#aM?pDZ8c1a_w+NNDaHdtn6};C9E{JW8hrIjqf^}jsZFI
z@@6id)^LgNMdryyo${041~FoX<v?cI*eP|~6Zp%HtIS?=g&dyS26JhzNK{*^rn1ZS
zI(N~ryWhm5&}+8SODLgI{GBDG@1ms9mS>xP&|9W`?|r;g^psPeKL2SJU5<5a%<o>Q
zW$=PI#Y7-mK(vt^D;*f9$svC&hIh0jj*RLsIGh5tUc>WmP{$fqFidw8t}rA6Dy}mp
zrjT?$$1o*APO&*!Gmn^}!9_%%<E#yHA*chjHX)!`V^2W#K4%O`V#ycf<y?hQC`H!*
z@kd)G?MtZt`pLb6{oU^aVjR>)I{sD(r+IGQ1FuR;P~0~>rpg8_3ZV0yw(Wzfa3k^i
zji*6Fb+UP5`m7^Caa&odz(3g1Z06?9@to;p^)lT~air8-1b^-@$@N2Iiw1g)X8Ab7
z3O5JLB2E&k`7#k(DbhN}*EyrBIJ^h84YkqAO2Ng_ZWY5Ul?3w|y3wXOcx9KgCD6n(
z*PK`!>37}XD0OWDpm!Qejtf;H8MGHZ-rawu&5OK*$o@Fhi|%*R&`IS_89gT_u)2Ne
zQti8!3$?4L22XIo>T;e$ul@Zgg=Hrr^6gQHtj}k}ZgV|h!0Log$hrMp4!Yg?dCpzV
zDfH^}Xt8!}|6={eA-cbuZK<GceO<PyRtmJb`!Fw-P>vm~OfGSn!Chq`Y8~<XQkw})
zC?n()TebJAqYM{hf{F$k3Gtm}Wd=t1<Qu^7P!B9LbmPT8gfBBx=myjhL2ez@T$Ci^
zxSKl=SCv_%%1m@7_6uEUF(U{w0?(n`I*h|D@a}eomP+di=H$owaia`+aWNs}a>P}Z
zHngm4I_z0zw;Tv_I{TxvxO_;Z$4DBi2AV7ThubcHRZ?osvY^;C;Z{Q{BN0CFB%HNz
zo?YZvh7ev7*G34V{6O!w#8fvuq>9-~FJ!M+(dcTL!F+he?-Nr!yMz|fInbhRzlP>}
zpksX9P29ww4GQk1#mVx|-}A_L<`=QpOg;I>64z&i%<&BF=&d(~q<ma_?PS>z_w~l%
z+ljKu2T+dKKt&7UVr99T^2>wg`U%yJTTDw3Cmd`cay@|u!U8_hE+$Ba*0sJsJ=ymS
zsj+T!xOJEd(N$vVCRE#iK3-S=ofD&bQF=ah9@#f<o~`djYs2V!^x_nT__3VM&b^2G
zsqx}hfD4H+JWW7{Z3rR-#g>$`k0_d33+HPlUn`L_;*BaKLZUC_cxN2D|B6BxU;h3}
zTJcRFdvN-vwi$wEDKD$<MXDZP$%bDHI#y*~bw*o^DFToKcEw4abA`0~<5ojCbPv3S
zLt)9pJUUhr^`ULMM%o<iQT1m|Io#YYo%<fr+&^=~QeE0j98nXRKcn!M?BIk7d{RJ2
zlRz19>U|v3x1{VNE_oNotk5}O(Ey?KT6>|Vq)rH)pQ0=zk4a7tHu8;oRPnsUlYQdw
zyn!pvJput9Qd0K+C`u>8xixdX;;AFrQEASU^<KoJ^^iqfk!2<c=&h%>jdbNePpkYu
zMIpOSdf;|DG=tx|kEEGEfZC$xzhZ0yMYUNV89b{=$M<TFYwj{TN`tBiS5&z(MGoK7
z_LtO+`_-^s`<u=Xian8BAA!luK~JyyfG=}LI5zkeQ@<7><o;)5PKVRV`k{F5vxi$K
zo#J{h#x_TYHYW2CFC_)iRSN<_d%0w=+T(be7hT`TNCfMXo0I9+25+w4X*Z4Gp&!ZL
zM^c48=~)my&5DG%8<=u3mOez=x60Bd$#cx!lRA0_#o3Y@sQAu%)9#nems6A2lx$3-
zL2ZX_sxvq2aE7bvJ)7hv4Te%@pNOQjb#WeqqAPMEeb=$Qz0mO}J|@+l%I^<olF&%}
z-3zD<(V33vZDqB3YpF<of-N6qHhq%Daug}v34kI>t)mKjEPPSgz#y$UMO5*9?+jT}
z=6Lc)!VNBoIVIT9{oZT$&hKvhywdMT;r7;yw`!6F3PuJVLoymo0v?(zlH4O+O(Pa&
z5!pX~>B5t2ux4k^Tnw0*nVBnZu5&aufB$IZyEo#Iv^qLkP+jAiOtyXL!mHI+Z!u6!
z!%3qSk0V}7#Z*f*1tS=(1CDsJ>;)nYMl25I5wnx$4eZM2E1KD`&$-V7{BX{m9aO6=
zM{2>%nR}cwhRHp<eUTR^vs-jh8s}6#_Js-iDrb)&fF501rZ>H^#Oiij;wbUUsF9`F
zZ?1g8Ry@0|M{LI@0JEq}y?Aka0`m9HucKdnXPZFy<Lweu4eW?8PzX6-^fZdkt0|ac
zJ}&~vpkTP~I5NbI&8fo*S4))z)vz{ME)ZYh=PP)$1Q*J^Yvk`!ayrS|8zt?u7|7J4
zffTk7n4Y=85=X%-5%&|>@e9qg6N|w6m-nZ(sT8SwV@M<GRb#9Zjf`H5xTtGG<MGT|
ziM?V@uH|qf)!^F}&sVUp;|f}{lZ>B<JGW2B_1_0AcnZtvB@xqP*Mz!JrQz~QXeBa*
z6&qi`vGkh^y7w#{2=vU?!voaWIh{|gvqh-a<6cL#ECO2&rd=rtuBAYTv+9Q&5FrnV
zw2*Zuou5sW5es-E7lG||w|%#pwA_JL^m~L=og3m1(KF+ddNr5$+n4Jl(|9OYA%qCG
zn5pRGh}Wu9EPc)X`a81}V=h`!u#yQEJ-OnmzMJvcbl*MwN;ylIfBIFkZex&-lUzZ*
z^f$yAhN+*S2c&SC;8#%kV`E8x_7*@L^{#qhcTmmb?BOX^GD`Z;jvL;PEx#q6ej<iM
zO5U}h_p?VYpE{{^itq?2U-u}9ozTc8U9tOb_2DVwpImh%10H<M{Mz=~&hPRDOYZCm
zsz(aLXKPUS;WtNhstFQTR!xL3wKq!Z;F`i`w`kNIgrz!HLpNhsa(Pw4l9}HAQ9j{}
z+B#da<o&~>iQ!;oAi%~L$iIR?ti}E^Losk<MO}4oOLs{U%}inu5N)4C!0vhzU0tt_
zK{yklt*w^>;Fu2TPIEPS#o6v>!qOhr=h9I+QGyHLHq1K?%%Lu!nsog9!<C<w7z*OM
zZagkU9F^8}4d$0d=FVJj*NPPt@6;L0jo$gX+vjX7`_@<Qvgld}y>S+MH<hor$5Es4
zO@o8Nb=jm@LBq#@zQOoc)%42{;c%!!1ovNtu`blI^UAZan-s*BY)n{i04WX!0(DK!
zPlp@?d4}I9>*i!g2xP@L9UX3Q5}TA2H3*BXm49SXMRSo>&hnRZcU}wLBvu2j*^fDB
zo)B)eENwWU1hdqhG0D`@wPNK;Cb8cc`r6~XwsWRt;F!255B_>_^jm|N0FKF_R_?R1
z@C!nQ;@36(6(%zc(x1>dzkIDl59wDEBQwOFh(@G-jzm-CK#dqo+D$meW&rglcpzgN
zXBgN6g)ozi32J*Dtat3GezlFY-_Z(r<QVPF2e9F7L2*~gQ;Z>pOyz)oSDyJylO(0_
zWmWN5g$BUgtADKCyd;Mv8w^i#Ffmg;fr?TUM^PLtHBz#<hM{vomWN4eM1kAtZ`!rd
za|}9$P${N&<U%(*eaT~=$?@rG^3j!lvo75_v1z1f>ZcN2F*3YCFj$|aiG0rfuarqE
z43&q-NK^1?;fJhm#;s+0nAd>#Q{1Iy={4QaqHprpFC#FMrqIIS#me@1cingm0j|`N
zTlJaQ?Wax^ntK%n5`;)7)T;JVUBt%iw9NK30&|$K{5cboZEIRiQ^dBpz5u$N#@{gU
zI7?ZCQyka)kc<w{U?P>^_G{fH*Ya(Pt@}Tq8K@g4a5pYH?il%Xv(-r^`2tn!ce@8p
z#4EPmET3f({@}qCM*Uu+2y@_>2L$?p4M9Tt@#$s8!6vE?CG+SfvaKy@U`6!-q{AP&
zV%6LB_WyK*P4Vf*YS~p0MEjvm67my3&wCqr#2Z_UYaH?d8)EP{`5ROH)p$4wZFOBY
zhVSQ2@^-Z0msrdu-h>6_vq>=GwaCz>m0tBa%(r$yp}-=ahEe67xTg%Yy*UQV{i7=P
zSu!(@w=2%J13YcMyRW>*O&ap_^(EYpD59Z8Y$Yu-Zd93AE~ZG%D`?IuUZ%9B>Xw@7
zjmfiqx_0~S_K<zPeP)>OiHVQOt?bD4yS3YH`lKY}-dR(vLj|6;Lg_bI>MT5fmVuaj
zCi?)eZ7M$RYN_#fgU;uHI`24+S25qZmf(D&<P1eT&Fs^B9-@~rkA@~f4I_HGIFqOe
zZ6z-cFta=pgC%&VT<hi=@6m>L=*E)uOcrd(uLwv(4hXYO?{n4$*ezPC0!THLbj&3z
zj0@UV9G@hsR~jw}?i744GYyj#1F;16)Pnv*Q3(Zy2S#@@5%e$-r1Ds=jWAwa6X5QB
zh&7>m)W2AX`vtu+{K%#KPIUwh^IGG%Li#E~^hG||=C(;|<TG!w@fH>r^Yu@w;cL~3
z7xW-Hh{KSdXG@392>sgA$r8;rrh#{JpES#l$CEg*@XW-%tig@wT@{8KM48_pKbJbe
zhnwO#!%JySHo1;eT*W&{w6*jecfc~(e9vzaD#1x#99zt@+7o{v5vyLpwzHtnQvLk$
zEu=!laLH+IHAeEWer1Hot7rT$`&8Bi4~@F@kvERi*#G9{>FYTR)ZilE^{d!cIezg9
z(v@CP#QQYK;Q;Fs-#IBR3JR$G5Xqx_R4BFU)ad80bF_C3n^_1=c4CWTYJk=h^3q_O
z7e9q2aEk@Nt6d<nmwb_@ajhZaV~vQyK$zXElibQijP5b!HPN292AlYK-r3hFVDkV~
ziKt(XrlGU#Ya<#g$#^3%F7&bmlaAznjj^YYji&Q}{MWTtYX;XpZ~MPVaq^t#<8wdk
ziNBD5r8=vZ2B1uO4-HR$SyK#@b1%0}Rk)U*I#VT&nRV<f0nBf4gbl~jI!AnDlrM)C
zwHiTYO)#@&aK_!ah6FWA!Xzbx3ar3Cre0NT;b}ONq^}t+7=mn|Ph2&xp%MAscLNQ!
zLDHm4!JSSuFLrEgMYR=DK|9I#O-fTcPr}Icm7D9(Q>FndQ@n9nY(Z8&YTqIP3PD;T
zE(AYX?iFkSNpx7xR5ugz8^{-86uNxMFmlzmND(7{LRX-Zb(!gw7}?<uv3l1DU4y^q
z^<H&#TZ{S|bDX2o^|ubKcx-nt$5Gf1m1i<ymhpCg&uP?()1BFrmTyEi9ak|UFX?IG
zAkNQv_Sy}4v`$}H`jO<YN<xs5s7)Adx@B**(hF3rDsK+y6k3*m=bUpV2pKh3ZC5Wh
zOCODSCHJvTjV$IBo@=}PmEM*f=J>xBfrlr|1XSi-GvuyL#}uhXCWYE?zZ3TP(7Vjz
zX(Kj`#>LqD?{i+I7YK$Qiv;wUB+Xqj<;16ABNbV0_1KO)$qs^XA$_mHBi)+I@K9)x
z!gF`PhNc%659k~J_yFLP<QhiaS{S!#=%Iv2pLkm@P2InYFNAYRoPQ_%JLWnQkI;u7
z27-RN2IG<AkxTT0d|<*t|G~swti+WvC+hj0ggYbGKiI%u{yR3`X;Pc0arP=eN`xDK
z3jg%+3y)cA3He7!36ZHYvQIt&JYa~&86>?|(bf=C_YdKOcD<)Wmwyrcx37oQu779C
zhq(m8@V}G&mv$ia{}s`H(Cx`Z7?-<Sa9?^s(14j-rsX+E;~|6Tl?lS;I;K1^O&dU(
zM1sa?{*=eNvVkl@#48(!ji11-aj6(eS%?hz2YSMrFNB3(WRQ`+%~%K_1Ac<AI6M5l
z_|0hr+T{c(Q|kQjlfO_${-w3|a(_rLgY_n8WH#liiTdv8XFx+R)~%bphgVR0@nnI;
zYf>D*_K}kF<~gnu7iJLD#*Xd|d_;&`p(e9^+iNOZm7?&t^&1)H#*H#qfNhc&qqLrv
z_qB^Qiip}X_zfE@Kyxh4_RM(8otB7%mtL6D?o)LrLLz-K-a7U0&AUiaMZ{Qs;#HD+
z?3sUFJ6Am6B*%5I*9&BLD!;0cndvt3l|YicG5J}0ZSTXXXvI9rg>&%@lF+7TLX&7W
zayaU#5GdE#BAKXJ?fjBGvVzP#(rpaLC7Rb#iw6WNC~TEuYJM_UM|*<d)pl^HPV84=
zHM)YIPc1XvMhUnF{CFZL(YB26_l%>>s^|J8wg}L?rcB%^h9#e07a?OnDBDqTG;Jrb
zXiq|U0>5gOse1o(Y;|{<gg$|<TORttfi1~)c*vv34rkzncea2@zVr(^#M6L@&+OES
zX4SyzP=!u(NpF+Xf9RN^AE~AI=q-<q?^ZuRz=2r&CF$FUAgnYkol5!Lv*Nr%0bj)l
zeK#d(8g2sHLP6B79*r8A_wVhy#yU3d7^GmAC(}T{EO|x5Ohxv^B}`#dI2nBbPMc;z
z1~5MSDt>PlggkO26*lOQiEy>Zmn<$UfZD}|Uwsb;h%iWQG^}jsGhzz`FW8K3N;N(*
zi`r4^ApXRSrTf4j)e%o&Tj|v1$h!{c6b!#Sr)9og&MJNOF{ZK(f-$QJFj5Ds)w7>+
zoXqU4uVQhrj(M&Q%<vZ^7cnxvI}m@ugPR~kVT?H-j7@fN3?F-%4$^17^k278&->G5
zPQL*C7@bO1YcwGfz`+!=W^xglB}(2MB7Skm9)Dae)FED_`b|V{<Ng)!N;V#3EA|_P
zJ)CNprqi;Q{C1Coz70HzJ;~Bf;A9AfvYjedjjdS1KU5YUA7Tca$+8VJ2<iForH@H@
zjgbhc>E;X1U4wK{B4(pUTannE7|z>khp|oBD-#5y+gs6Z-xa3Hc8$(<dQ&#hKfi3n
zSE;I6+5Vu*b)(XP3oiY7GW9kLrJld{j*o6O(EFN4M{9Sus^E6Le7M!mwZ@CAO;q(K
z9U|UGylPbTrRw`j?NBfENQ5*G3+C(?6?W_Wy{oV>GG^MyegYb1=`mI8SCq4xX(HuA
z&RRe!{|I|KsA4Yvq9Wmrh7q#|D(|?MOPMy<sve&%L!6V1LCgkd^4&NjHecDq+Yzd=
z=)Aft9Oi)Dt4w=$WBiX8lfq2j9<SzRrq)+8Q+#D|zfsfz@9Q#g!5Etw;ew~+s&D3{
zBU6#LwqksFmpkx+YG8Cv4!(y0fiYOo>K?TNKcU)>3b$Vk>9lFkr`Lt8K0(eH5fM-d
zIq)7Cp9D)v0kD>}rpRiQuzsTYs9g~g2ZwXfxQ_{^Zs80s$BbQXjx4^abE5gWt+fr=
z9|;n5r8k<-q*}F&vt6~Dp$BG+OD;GoiHG3&i-Fa)nKG%58|Wn7Dzmq|v}rCF`K+G2
zwv;^+j0`x@sO9}6x`YQc>c}K-)Ak2mRAPy2QiMd1c+)EMU=z{>ye}n`7vPcG0-^qA
z-tC&9c6yS5a%^}I9%|mJwHu(f+bdvpcfF7qbjZ2fi-n2R#MC~5a4wRpZkPl573&8+
zmOVW}_UP6A^`xDE&!MNdJ-w}kmnUP7!O!-*dXU0*CgB0j$f#bXnmQ*7=jlwP_g+fk
zO+A|5uQuG8Wj_a6bXnJLZ$R(WvCQk^1Y88R;k`c*zYU-~$fx{Sb^Cg9&ts74ywFeR
zewu&n@$h|iMd3Hji)j4ACb8xpo=}6=M{u)y%E*`YsQvo%5ti8g3X~9D4BbZUZyj~r
z?)1Cj%D2#V#y*PPxH;P%Ne@yfp@GBain-JgWF-cQzi}SIVP1uBFOlIRJX^UJf45+$
z38#i0DvXN!kZJvtCsLjCGQ^bI@YUQwN)h{Inqq692VqrxXE?HYxQv|L$Eh}yp7*@)
zJt8yNo2@<p^jFXY!Y=POg8#zTTX4l0blbW>f&>Tzch}(V?iSqL-Q8V-LvU%_-CcsE
zp>cP2cZbv8-e;e4?-+O7_XqT=)vIdGXV!vqdC87pZUF7mo|0wEl{>va3*Ok}v>MQN
z<jYk1$7LhT+%G3ZNA4F)?q~R#ghj!Jp-1tf>`ZT}SVo}(#^mtGH>NVA;qQDYMuloa
z$vf^oOVz#8zSq1b%UdT0HPyKK`G<c>ka@EXH2e66Pm~X4%35s#nIC>ntgWz=3O$t#
zPiduSiUkIXzNn|x!%)GV1j=+CumG6SMM`=V@=Fa&-J2SyEV=Ra2bChC0N7;~B6kEy
z34RO*!N9p<jA3<Ny!pu}2dhvjb!@FG);n=Q`fcM6<<<t>-gGiE2&p@NdXcp>6H_tc
zt1-xexc>cse@qrZb0sWK#l!@9s1+jhzF@rqd=FZV{fC90Hosh95n|p2X+Z}ItJSB^
z)L(Kqk_C0ljrEHjD^Yf~p~yB3NUWmneA!1)mbX?`8T=-{9$TU%(nZY0V)%R22m11X
zrliBlC=E|BS<_OVw_Z9P|329LwEsz@TnPwr5$^EzErVYQAD<}_-nr+|d#Qqo8e=fv
zA~HSl%$3z>0f@|Hnftxe|5}|-NfjD%jFgPP(AFL*SWUK1?#S33;B+a9sXncvBsG=P
zkSa&<1}vm!OfaRb?k%%_)s_$UbX`)AZ4BIN!$zQN)NmWFsqK6g2gKYwnJ_)?vY`QY
zF8@lhhs#$su^}Y1O8b?ux{r-ax%kJ7S=IbGP55%M{euepinO7NGI{i9BrF2a4rQH>
z+m0H&>_1-NRe%j2KZ$2wn<NYX@pbCfrO!~amnY<uGVSVb*-tIu?k-`-=fqTmX^EGn
zPpttQb!7f9xkxkf^!d;-y$YD=?NzM)mHj`<snG<_C7bC`8SUAUAkT*Qh`RR3USf#}
zN<*5+)up(h=q<>yxlUM~2a117|Cu8?INnUtdV1bj5vvebrklOw#P|LnmeP<^3Xge)
zLi;obkBZm(TUwAUurG=ReO9XTbc01t>EAYJsuIb_Uzwn1*DD|E!dTXfWfQ|kwy9?{
zK{F~kcAdn>Sv%S&?o2lZ>u@+tpU@$i?Illv3R&J8Wr4(9w4uGHEm0|lCWLMkKq~f`
zCsNdINtYRo=^^PlT$@=OrRit3E_>z`Qf?KyWJFZWoZj78frrFtK=;FkK}@4@VDM2T
zW*u6Uh~I*32avY7e`lW?)vBJp#^Ny4@ZB{kP-vB?>1yAFL*=t$(AJ|!25WJ-Dy~-7
z!%A+EGu5J`v;jE*3B!!y{RNe^8dsDG{VyaWV%8pb8@|Pxqe{I%%Gpm?5^o>BvL54C
zxt{VuqFAP>4jztkHQIL5VGU0zWQ72Fq02NVxp$S^2f?VRRaL{YS6VfMZK(QNrSNzz
zxew1MpyHQ(W#2pB@+v<kW-L1Mnkv<WV>BhgPyUE3L12FFH#9+Vl4ypPnEoYM+9D)N
zL_rK<Otb`e4&W&d8J28khk)Qf09t{Yci5r<Fm2;!-aRZ1U-b+oT6FYJAe2h)lJIBq
zZ==Mn(;xBQ%K97h`glg$v;tV_`?L)R;$NqrM4b5K_S!y%Pjt%$TTDV#+0%YjE)pI5
z=*2GhbSY-$G-0`o#t@+PP}{V@i7C4yN;U*;g`{!f0wE68@fY=7tsb>N^QUSS>2hmz
zkwky2{KP`zJsQPGfAEr%(c_R~$piopX9yjoyPKz_V1yW%9&X@g>i&tu)>mPAla8Ee
z33d~(C;^mk(fA&qCKWfxV#HF*3&#=pQAh=pw3~09xtvLaKpKX7GRIXVq^wW#l-}&o
z<sK})39cw)@O~Cw5dDAky$Pmej0!mw(~TahkY=1}N!YJ0{bKPeh01<%LHUpj55wa&
z7sc%+KytY}lw!~^<y49Dp2ZNJCjT43CV$sXUTfag4W-TZudkkXz$;qAfCdSDh9p*H
z5*m(S59d`Y&Q+hB=ZEQ0lVX4z6j(t@kEz0Xx~IoV=5y%)#F@}+pYB(hL;WMw7Av^)
zB6{e?FJ$EYk#x*WK>Q(s_V9>_1AxVc<R(SAKyWIUJh<3LY;A!0B^iv*jl1*b$nkn;
zra)Xs1mX2qkZNabg1O!l6LxE1r$!valHn<8Vc#zt_ssn^1VQzo9QtXKkr6X={s`~t
zN8h*A4h7VJPY>kzVSH5<=|nNG*VU<Bsx767@Q37uqG|Gp+r6K{&m_zy4O%;beMi)&
z=SNcl_~8L8q^Jd6da#QQMIz_UDN63VS7+9B1xWqn-^|bm3V?ffV{3Yj39G6vfH_Y-
zO6O-ETJlz24JvqSG8|ONOdMZQxl6H3U>JRFE$4X0y-9RSB`dubF{s=E{VWRKBrVzv
z?MyX_2-%6|=;~z>KK6LYdY+4tsK<Wl0V8E(0++6<BPFX=Xsn&uA~)e*oCQw1rEw%`
z{fMW))5-s^0KtQRXw*A;XRkq+uVQLq2qC&B&Bda)r2vFHhJ*I!2H}+?OF@*KYX&R|
zQX`Fak5#1<Gz)?xGMtiBn^QIjfecAN3NYq0=IyVzwFdW1+vt^s-reP&6(AMs)gj7@
z3HVE>X5z3ju@UHyw?cfYN4_tOE@sshU#z(z6NVv!DnI4E^THD&`eN{HmZ&+Sqv7vT
zFhZu-t75_j7WvZ$=`Q=Es+~xdh!5-1tc21bM}JS6i8W=PVylP`UeQGm`j6?q49;$)
z#gShJNMEe^If%1tUf%JsiOkv`a-&giUG|y){fxYHz7}C1*SM$!V$l(D7S2Ywbr-2)
zA)9)Di~6h{0~7mBL2BnP`_I7$GqDhMEIzxoC@My3bbE{Et)!?6c(GS@82K>n*nv}r
z^rdpXR!K3K+?zsQ<AkRvQXgc?8X2^phg&5=L57+!3exHkwz%y0A8Kiyx79n>>E5Fq
z$A4##>MK1e(NXB>X6`A7`uF+J&@d5b1F2Cylc_^;kX#nku{%vfHA_xa6Y>Sq6nlEb
zj8(jxwAykhjK1RVP&V9E()H$gNHGR`7Aa%V-n3{DELNXprC68{K@8Q`+>52FcXF8@
z#r+Fnw7erkzSW%ByD9rpAa;3+0)y0Ev|(W6Ix5V)MfKr})<TgV^}6d}wK2iWUCFYv
zUCNrYe-Sg#$DNWpz|**2)Bjs;u~49CpUwqAsBFTSWa`>BbQ@BtyVk{Q=5L~36JYMh
zujLzaD_BAQNuVVZ{dD9<8Oofzo3=<WCkjbOfW&kM?kGgwz?5V(`h(k1xA4(3GUym;
z(Y{^$aUoKQsV&#+*)fu_Cy#Xe06Sn=42yy%9`^Y2H}XCJi*~YXA)0!6hEiz4bUS%|
zb5TRWr14|$KeH6@eQ<&FXK_9Ug$1yQ{aFT+QQGbb@hfdcR^w3E4_N*|X)j4mx)=>Y
zhg_#MqJ!&Dv6rkn<uBtPzeM5Z1d(W{j6{4F9ou|d55y8&zvmO?KkF6YlOsX$kR~09
zXRzW#=Mm)8=;<qaRjd%wA=4Uoa+R)c{n6iAT0O&W+{*-3VBf@S{0@c8c@(r^@w0#$
z8eCTzMlXzC)8Cblp2Z?Pv&KNOw5LISqymFx$;-?+iItVJhfqHO@fHl!u6N($VmpRX
z8jQYKkPhspX{=M<kbjRtn54Pmqq%ud3sHCpxpU_#3P`EM;j@hOo^|`ZpQH_K-kYWG
z@t)!Ojp+B(?9bVqvE-(&JmIQ;m&o<C%?r?dX}w1Net5Hrj=L>f_Cx5wLf<9g--a3I
zme{YjelNdkGQ(MA(!x~2N^5^|_OF&dP#0tH9a30K0F5^^jw`c*`~tkr5zo))(K2<g
zx$yI7vn^P5jaP>{5=SQiBvbw@QgmQ(x{zX69k$CVxrpS3DEATVlb?vyK`Y(E)&IE%
zl#xtA_=}E(r<BQYBHovAY`?XY+oC!OTTp?SM}Ks)K<rFMSi1wi=KYcubTo8h#ukQP
z;Wx+wU1Hzv)@o*6km?JTZ>(KIqs{Q>X>~?_v|nY*eAeRwA5S%f-E1iXatlxKgbDp*
z%sv3Iza~`K6U@7vM-S{Y478D|BTQ9CjPEdVfb%h%Gzr>sM>9g0oRtIT%nrvfROs^f
z2(tjqwY77ms+%J$U#9NA`)SU_dZ@{J1bD|fN;{=cgEfoVW>`nq>eDhWt+-_-_A|yu
zt4iKRA*WgBP8shP${F&!p`C@NJl!w8LW6Bx|57R+v~!L=%qQ?Qj*sW1DRuWgApBsV
zXD%6TH*_j_VJLw<d49ZNkGnKztu9A#(Yt|bBuQ_*Sc`f~q>xBW-;N4teI3p86wImU
z8cH$XvvSKTV&`a3&`Ee5>~@_G5^_DZ5~NfrA!4z2-Nwwn{VKSM9=h*iY=vJsho=&t
zav_E)_wg#RP=$$?#`gG5DH}czUe<_{Tuy5yOaB|x#~J(&I(m?{a^pMH_KzaL3FiBx
z2{~XiiPwnm>F*hzL4`fLWYuBQgA%#Tv^S}8s9Cb1+eT%#fc2^wL2@-91^MMtWcBzR
zGVD>Cbq#F5a!r*EeF3|SP=2FqVAs&!3c6K>NP&Vd#M-obQRiNXuB?J=eeJZ_b{~e7
zR?$HP+O>~mb4b|e7Y9rK@VF>!R+!}7V!iF^n^tK8G^DxOQ}!_x+VuiZ=od_f01v+)
z7k6Ruhic;Lb(R@R!p>jCwK{!WDcd>G#W(VM(g%fnevnJ8Pn1XqjYxc?C~@CR?f0Z}
zQ=`6@l`Xp1bkN+RcaJJ-AhIAy-KUNnilW&mbKdCs*mcVvVv1|`U=%ePY==ZOhzpVL
zJ+nFVD4n=a5)}9;c_TfizfSM0XbRTKHu>=jHI2cxT1!P7k$s3`{F-#xA35#ZZDeIw
z*$h46DY@D{w}aAxG3$tfT>vOi6!81=WD=C<>n$3c&=s~z2aqCvjS_?^mzCAkBA`Np
zZs^C9M=o=1-p6J<R$4b141Y<0OFb|fy4llGHZ?x<YjQ=X+xnY3mM6qTNwS^P-nv*1
zPu_tvCbd*#;c+ZVc#DS6SyUp+a+nM!Nlj=lb}6hbE0yDKj8a!R&To+ZrZ>X+r?qf-
za_crLXTu}?MJXt89{7~he>91a>#x6C--tST3B=}1#i18ONCPq7gv9owA2ft$!G}U}
zD5uFqLzS0{3=K}U^$f}`YJ6H7mRC-=E&?W9T78{&mrT=w$|4cBZl_l$g-R5iskDa7
z{bz#h+vDWEH6cGo4?o1t0~=&-B!WPSfpH`m*FN*ZC9*f>W;y99&@-ukhc5!EAU6{`
zl1Wf^LsX^7Eih)hPdLi5vu}l^!o<m=iw<L2vPw2Irr$Ot$i!dOu>S_lM@(YqZy&||
z13cO~$_|zY_M@pc6y>g9`R<y1zp;J8*}E(UjNxPkMoryvOh{~KQ^)G8rA{iD%wPwF
z<|wm@8pp1BZWsc5y*<&)>pgUNBiLB?+nmf$0c$P@DHc}lN#=!St5fdxB`G8SxQ1+)
z<zJ=WNfmftw~`3;Nc1F^TZfGf+;S#Fjl3se+h4m^E>@IV<)U|zI@Ati;0!a?x=qjo
zYkouvH>~8{K3m47+|{n#HyW~})E)a33{wyRMN;H>O*J$ZkDDjy5B$|OUKq-)5O&)D
zN5VwpblRO}rX!SIjWkea%FKfG92uC!Y~?P2Lr@+TV=H8bhI{$Nj8d2R7?hWP_I@!Y
zv@Ln28!G*Tj;kb7wY|pqre(*JC}B3sQh9=M*Yw4h`S;7j4@*I!lHTuN<1Y<*#wEkY
zCaV2pgaFnA+7iX8sLeS|WDr(1IZUSA)%Kvo?j`;;k^nuuUFF5sV3cNt2VICJ9<}m0
z(yxc>TCS%=BS*nqt4)m%LYkUBd|B?jLPj3!bB~M4mWos|1{kcN<1OBw+}OtXQz=w*
zF<{!c!Ulir;0p^-qoM05d%$B1XZe#!-o(g_)X218Fq9~<ETI#dFtL_Te{c)(SGVuT
zlR?=@n$86SpCZ&)ntrv{$x-^Jniudai{lOCAJymui8?pPVd6*FVqWQhQev^(S12!q
z6zRpsYyEpEU5H^0s`+mBlM{#V(>_}5CRfm-sP6!^NPg!@?cF?>^#d%aWL5X`06F!C
zV;4rotJyT!fEsP6`#2OlM`?N0(bPl+7-iMBA%7=9Q^jixEL8+Nb#gIdQJQM){Ze<9
zNwJhlm!|}ouJW@lTBIw7rG+YeQb%%UH@1FlBa%z?ZQU<~?bz#?T_NpgcwOE>(2(tF
zC5;9owp!%=+c^pwHhiL9ya!;az__Mmi`&q&f^v74Hs*od<s=G%4p-OTZnvVqo4T*>
zTZ(39=m}QBs5Xhtiy$^bj+fSQ=dIp-QK$Zo)*ElA(zbuy7qcV<FoEJ01MD!v4Jrm=
z%Yh;ax}m0VroyCa9WTL9z%YnzNhsGYBlWN<52lLADeWO?PRMV)|6T@-hsv`PAiUNg
zeTTYgl^vNzEyxNYalg*s1@va7qW@78|83SPXWx^w-^a-Yqpg=zYqvk45q_>r4alo5
zG2?08o%-`U%Qlya!I(HD5?JND<B(GEEimxp%TL|4oJz*P9NCi`#Mx`=?n4pRH7)ZN
z@06x~Hawcu@Rmx!>brT2Vctw^u8qOj0S50GQuds~i|--J-YX8gEHEYx&r$qnTT8oM
z4Bd@R9Qt!EL8-G!x!hXMIYp7g!I(A8w^}p3VX-f1&#_^8ryL+Vw+^ul;o8j8<&2#?
z2H0q{5kt+da(5>f&?Ei-Ivnf%^v<_v)dN#Y_5QIwxP)A>(u4#|{}q}rw;J9{UWJ+E
zl)HoZj#av4_@$xaeJ6hp3lhr*cJYCYJS#O?X&R-&hKFH#DejqOrJiX(6jj1F!$zaB
z<gy~ZhAMc7br%I?nX0PQF307uS))2m@c2Z&j{@-XK7T)X%8#11Kx|Qkq+?W&HYY?u
zccz_N->!$tv~n1Roi31i`f)yW^btZP(AePeOXe3uz+<C6dD`6(i^;sOzq{~zM^EO#
z@QrPFhMP5mZOsFeHBfCqv)nt+u+Snfyb#>FxUM95$k2(Qaq;%(orWO(MXJjzs9<WD
z%2*zEty&vnsYX{Z2JeR;L0(}c_NkD1%gGnrY->Hoz^+wAbl;!K**56v?)9sl3Wqz7
zqj71^Uz<3>E~wbUKR}Z~8Z$OZ>v*&#6xMBJ0TTSYE&nw<m?}@_s}WzCiC8qL(OFAX
zb*>A<UZrC_ZT3gp|C%?vjZ%Cq5Y3l~S1R@7pF7jR8Q!ElyE)NF)NDmg(n_jiWLe*t
zM(Ne$yO=G5o7Xg~|DFYsi%axyEx9<?1MDjef=kqZ6BFk7OQC)=)aZ$IDQVtIIh-){
zW7yLS;Hx657o~q9Me`PV%DZ6#72sN<%Yw0-cb&f=mlbuzFt7#U2b|-9gN+?8wd?mt
zP>d=Jd4r9fkuoh$;lBNo=fE-tgvajhdh$O#Wkc3gSvQlaPUkU&RZ&uBJ1cc|rgjP|
zEEdeS!ubz-LJn6*Bf-FkTAU`oUuIX?A~*BDPO9VB@ECMrTPiu}5t9dzl|inwuD%lr
zRb!GP*FW?|LdZ9@4{ka;dkf@<^hl;RL0jcNu62%M4~qtO$SGY$3^~+xq2tDGM5fnW
zShKt17*1^GvVOVc509{)f|j4!{mBM~R=HOMFH3h5<ZRLa9|JVk$uMMsAg0+s+T3nM
zw!si)B9?_zi@${clzyjwwkoyrXJQ<WsO8m`ARq$w#_MB6<%n3+hF~1a+<8(*uG!r$
z1b&1#vZ;HyP|xv3Q5_2Pw_0_b8b9?7$cvFR5HrsE>s@%+qRwZ@f&~%~59U}q%*l2J
zX^0E8LeWWH@8Nu^Z!@ODt^2L9Te@Q@yLqgk7t1!P$vtvQvq&}w;+z+z5rO6!G9`74
z)>PoIbp6utahUS`UL8Dv`)r6}ST;VWkT+f!rf%b+M)cPsYdPFsmXr}b82-d)>a{62
zH4RL~qb-8!q}vt95p?*|ohF<H3XY8hgFauQ8s^nTOwV|GjeOMr6oT=ehmNrw&Kb|k
zP>Uy8KU{uKM(WsBM=KwH9H@1xnIYk$|BJ~y<p?!N)(U=zj4uPoTn2fU=^HMyOcsCI
zRAphIB*<)X=lXU3%+2o_35^&)d}s&tq>A%ZGe5}%?g`fB(xOBXs7#Ax4MvY)^jq&O
zjCw`n^DaFpin7C;hJz#23a$Rx1)b3}1?uMl6~A^4yxV^t6~30mn5L8K1M7`;Mee2_
zZ4;K1Qm!nuN{Q9QL+JcUo`UuZI_GGV>yGR-dq~S*qC2IBQ*)A4-UQm}ZJ!2rc1Dz!
zkuoZno?>K}GEDAj{%&0}g{VZwY%Is;TBO+>b75=sSuj89yH6HmXSQl!<7sfYu?-eR
zA-WIAdv#mEyPm1b<1qlD1#&Emc_zOOrM!;xpq3mqNj%1~CA(c5+2#^uMJ+bC`imBR
z`UaX7IH#;rx6fxiBBEY$7R+5_emslQ@DT;0SoyrK!UtHTRaadU89I^t17Y4e?ERJ~
z!OcHmK&>wWq&CLJrTo(Z#cG0LAqs@XfRyt5#Vn39h7ZPH5HNsHz+&R=bsGNa;c!}^
zg~&2jH-tLXtqb3I4>c%CS~EzShhk(q2yyc*?$~t8L{F_vZ0T;b;!>TyH|hc2+CN^*
zQ34(AOrv;X;_X%JA78%|3%Gjd&_6<pG7ht9eyFuPsPR^U4%=?lB`m7GADE4F^uSu-
z7591fS1|fgAZHVJYRrYq5#*JM_Vs9J#&eye%f)8g<*Gc{mj3&T0__DfG>6?S_9hpS
zT%x$wPKglLWawzUKx?^3@F2#oNc?ACD?#&TxK7dOqkfeI^Uipx@w?x>Uyp>Xpy^bX
z3bC4Eq@RG$YiGu2puYODW+xUQ8V>B2sZ75t+fX>Rq2JMU-anO_=ZdEaxIph6woJ`*
z;OW!O01N*Bs>-D;%czxqxV~b&qy{$8CU7`9x<{AK9NiWpFBPGEhinD$$bCRxw^!D)
zfBGRwGccc(<)IowqSpL*s~YOg3X#YFU|;Sp(>Z6>i$>uc_fx8>b1{>ZQWjIp=g;p(
zGcfSN_{j*^K6Z&6_dog7YEPGS=kPnISetC$K&t897Qk69TU_})Mn}t@@&>d~@sjq>
z*=I(Wr)<Wh?k$a!p1((Cu~I%G63V-QCP6{2gkJZ46$ynd3$%tA(qMKC<cIw`^sjCI
zA3wYjGz(F!emv>ahic?T<oKX?4v>>ojxp*^0Ii^Vnl-_#p|ZvG_oBiNrK7VdZ7X#%
zZvfoXMn^J+W!kbV*cc-3n1n4Wdw`i)YtCNCQU|izY-twLEhBhQ3Oe^up|4*)20!C3
z23V00&R}zqtIElPGN5Erg_;(7w-K${1clzpbYE@D&E^*b0dk9n9_=5;w<xFORh^Ed
z$QWs$F6g!*b2(e3oxXgUWIkO!3#$NQ^<4TY5rL!9&vJ<K{W$VCS?fV!HHhYgudcLo
z$x)Tiasatc7u@<xBpQL8NvlEhw07F|nG7eaMxkkFPE~#L#D|Mf@3mWjCi|~E453l2
z!Z+hLF2GZ%{vCTuqxPGn@HMQwLbrk4RpSkIAlKjUYgnZnq1LbA($)_~o57A+>Ge$F
zQ4a(2vXZ3dBO!OLm(EJ%knUBdK8fx#B9g=_KQ;bnoIh%uY!Q++Lvvm01~PHXUhdIg
zX0vkM6v3m~jRd>i!<nKE$6?y-3pp`YTK=daow|WeiG+)MMTg`vX{h_^TncEpkwAFj
zzBe%Ly1Iv0DG}VjJ<b!MgDRqDnXn%_O3JMN?dRrDTqHT!az*uNgD~LtXffhtVCo1Y
zb{Yte95tD&2(>}-gVL^*^Gpji+#F}^{p#!xxUn?MlZ<RN*a<ClkU_7vximvoc<?KU
zb>FX8_U0WQ7%VhaP6_jBZ7P=-C5TnfszX_Iv35#!s}$5k21XZV8jR|Nh`mXLB4A!Z
zx1k|cp&TT=ctu$F6~1zMd~?RftidC{6?g&6M=pESym#$ka(4WUh{(d>K`RO9@U(Bn
zcA6it04PfJI0%cCoc@vU*%0$UpZg-fHY9Ao)aFOq7X=xMrqE74Y<Ct3+IAW06rmDm
zg2>0F7QI8kQ`Ag%`ZuY?t{#pn)p~&;5wYx)EQ$Pc5HPLjKWkV{jLKVr@);A|T`XHD
zw3<NlJqw7Jb&lH(GMFqx{(8h!!sNfG`>M;0x>Fy9VgE7uH;(2`u(*Jm18uPqs%44A
zZ_*0eVNa?CPAop`=n|F50S{!`u5!c@Fi-5qC3V?o#PXTDe-Rx05INd6>p!O(be(8S
zQTc8}$V-{z?+}&cq1R`6Rv>&f3B~8$)Ax65RjkK09cHCL=+|?)kDz3Q0l1MK(~FIk
zxj8|RdB#Q~$&MaFvWvv^?>GtI9*$3?5ZHt{hp@3oT;rLwt+LIUSLn7X2YjE}JeO2a
zPj9m`(uA_^6!{cl#$K{c=y-9pJ>`(q(>J<tQQCQD!5G*m=&!zpD2=nq?$~mEe}~eN
zC4YG)4zYMEW2FVfWwEP&EZ*0CB8#S{-O}-ro*PUZxt^_9{5dU#sAmk1%Hm;dWN_99
z%r_#U!;wn~p2XSJDp8O)=EpF1Y{!m?E)#iF5nl1*P`R|jgzWy~BV7V6@f#e8^&a1q
zd!0zAQSjhVRb-NRA=vveK2b^C6%OYwEn3e2cZpyhnCWkniLm@<Gw3Xq&&6ekHSPp5
zY@%?eEbq8-EGDL=RrsJLEC?_$5(28hHl<5UHNov3N1`;g;`U-RZB*Cm<bn%*RW<&J
z-~ue_uu+Z=2aDcDRNpUZo{o+f@mdHXK()VE0Gg(VmhrGhgdXX7D3QZo5D}y4)?KEN
zWW-FMZl$qSsdE!mkYA3E+@~8Ch65YHSd3U8+B@!mLPtzI<`f1|&WPwBa))Wn!9wB9
ze{|;0aZH#M_?WjxijoH_tVVp+M6(mq(5G^^foDWCPI|c^$=B;2Ap@u{I?yF0k0jf;
zZrL&ys*4`+{K*y^IK%=!b_Rnd&NFsM@A{;_`9+PgW_~}?MM+%PL?BMFg_KT$YZ+^L
zd{L~==x*!scNwI?2`QJ3MU(tunF@SbnlL|JR-i-lZ9;~Q9LF>DMLi#3+aciixh_d}
zQmBR|b@1S+G{TbMyIFJ?4Cog`{F?%$C0(%V6pJ)(b|FC^yc9en;ArPh3|FR<@!%iE
z#G#TVi_(Yp(<JtTyn8`|lQ~<*=!URNkLF7+l@a4!`ifayg}0R$Oz{V0&l`r|@H|^>
zZ%c|rqU=p9qRIF##D@qko&$@Q0Vo87V*g&Xl4gzkU6gOB@G%CoXK!UeU@U$;@7yb;
z{}Y>n$C`=r)bRCdCeOPok;)W)^q|f1j1y9nW?CqJ*1G5Vfk<4UhFQ;372s(xGm7E{
zB$ssr4SCOcv!a=TGtSg7kD<KE3@|&Z_<O7Znfq4qZ|)un!3s+?altb%cL0?RH|E{=
zaRlU_b&a@x4o5pUX}{gNy+mGevvkcQ$)jg(u&@}*1ZI{C<PSMT@x3_ytv$LaYpADt
zlFPlRh9Z<gj8O1rZm-~R<ODFVPHhgd!h%=p@wA|>hS0N6uRBjP3waC!E5;1+54Vgc
z@<3VZHDWi_!gt+J_c7Buq8YgbY?l-HN8bA^x8Zxl-WYafHvGLg&gfMKunpVmyKa0H
z29e}|X2M-HbIXl#&?q-<yCDs3!)lHNH5gy!s!Jo@=#F2q!7?Q;>3NVb^NKLk-{xI9
z^r1;rzh`lzBRn@_y7<I3(yY-M4{y0S+y2JL<SWp36`+h$a`)=vC1C(Uul)x9iu2+p
zcp?&KleWl<`x-Z~RQC6H`-iB9Tn$Bwbd@k}Vt(cgGhlyI!RW|0p}+PSM|b=&3Ai_g
zGwi<2Qe$91!aM~s^iW$}%rJj9TO~Es(z9xp>u!*{TK?g8Nam`i`CcFGv}kx&pue(R
zW-!uHWSHbhCONVQWr>0Cj{!_<JB`D$deVW~cZ=f*(m?b%w4b@Ft_9Y*laBPb2lb3!
zu|FCi&~1|z-D+ZAxo`g>eK$+o2_8`kdLn9O(p;ghPlJ!??P|DKF;9<q_r&R`ntl~w
zy`C9yA2z*7K{RP?NhJZJ>Nd4-h&kUKcTUZ89>d4iI>&q!ZJg`QL)Xn8_Qf}{r?I&%
zN_V3ggcMVw1vH4SJ0`}(it)1E6DOA3h(t$%vp?K_I8HlQ;CRWJK2-*>KMwOJj-TKw
z;K)&6?7Wjq+;xI_+WMN|mr>zbo`3VkGN<I%s-AQ6aG-4}FuQY3F5$s%--4OWV+*)C
zEQB>*vy5_~$bYvq@0-@e_r||lHz!$B;HI2a_Xii%?h@H*H}m03xjr<FUKBX5@0Mpq
zp#KBJ3V6C;p;f*fyXi9FygI?;P9O0T6<erHYV4n_`>jX-?W%SGz1A0+iypCNEFa1+
z&swoHrI|;4L+Ol6&!h0ZKV{h3-1y??Q7AelQjRlsjZct{#Zv1l!Kf#U1Rcyh%Zu7x
z2A>LVmjEdD9}CBc$x-@nM(->HmdT3Yo`LIGvVgXHMfS&}0P3cmsk?==?eC8dRE<3*
z(NZ?~A=%fF@bNP|*|!^S0t_srqksw;YLy!7KUQ}#slfQ0@G&m^Lcw+xdzOb}vnS`s
z3yZ=#h)S3I#Lre&yn*++ukO?#s}HQu$x%e~Ndo25f_tTpRmxq2C=gg=R?Ohfa&VsS
zST8(;0~Kcj+|oVwn`#_m9C$@zvNzsfsBM(-rMiyIej(f8gfrF4PicReD#R7+5OURZ
zR9_fV{-QKkc)og3zvejUr+AjgY-J@O<UgkJ-{|E~SE-ZMGuR%GZNC>XA!J9<U0lDe
zdRqU4Nya)2(C1DPwtj?43kh7KLqQVp?-$`56<F*8wx<Y}szy0BMyl2U>E<G#$|Egt
z(NkLxI5!PD?h9nZvc#F_W};u4d45hg_hh(?kX(18?>>H=Q;*8qN&2;dy?;v}zJK?L
zPx&y-^7)X=*FuA0hha2C+C69a@!PEgI$A@*^e$p(y)LL_O+;WXG8B`hMbEsyo3Zlu
zYH7QNv$n=FdZJ@xC|$>%3Ws;~dPIktE!LCz?RJ7*6Z6RhO|-@pSG)1^UOt?GzdGgf
z4OCj~rHH;C)OI>DumMPJBME>`V2g50hJf~gWocSjfRgVQHmKvAt7V%SWR?8I@Ik$}
zR3wZM#gDTOl1`k%i`MOEaGN0iMT);(16Ya>P(@eF692b^-MMb2Hn_81ATh6*4>v^p
zKJ48NzF1{P>oK2uUL6YZ5n%;;TR`{}RQ9t?#%d*Y+|U1E0Y2+MXMXPY$xuTN7~q;4
z9P33iv)L5q0iO5<k`mVva3|orWl=j(jht?3X+>i+fCn&yS6<ZTRj|m-5Oht<rM<4+
zyq?c8sfs&V6#5SoyHd@CcM<d{W7H32$MW~2%8s!%@c`1WXusxeTaB*Wp2liK_`zJO
zgXt6s20Qm~5444s>)?<?Cz_Y8ZZUYNP@UiJ((d^Zo7FO6f@H=Y!*qC59Q{-U%`#75
z9!_9_b@4znE2bHMncXD!5%NN&`k)_3^>I2ed+Yv(2#(>PK-{{1_^?^s9nt3$VIJYW
z<CQ(#z8jn4n7{YRk5@May0YhW2QlkzJ?EfS)8hEM+e4mJ9(`*9vI~&1+cjX-{*~=f
zUjCHEpr6FWAq%C@o?sOkO#AnIB+BB!T`whwy{wxJFLptQ@w}Rnk5D$R+K|$vfx6F2
zfi3pQ8&03~S~|6)QwxU!>0=xMMx5C@T6Ot9>XS)!JW?~3c2;VQxm#w#{EkEkuuP2N
zUF*Q{10B(D_>1tK8Qq4{%31HbLixEik*8!RS>!Fg?3_mF!;1g|jm9{<xqH3c<S4<?
zX?vWQt}cH%5uT&cZXA%9mg>g(k#2VxOMI?+0v4jZ-L9W9a@KK8&Z=*a!=$6`ofo*C
zz}YFRo2?ZH*0XXPufvqB{fzyWH!11ES;WXuLw0|^+c`2}$6Cu~J43XXfjlM*@OaP*
zCmi&>Y=s2l_y3hPC}3Qowu5{zh%E3+pvpm;BD|g@z?yO}%~+usy)1=R8f>p*^>&F!
zomY;V=0u;+Rm6T%r@f|tWzE>VnS7WYpEp7Ks#@5$l%)eOcwB;|Ls&<RB0#tnA0GkL
z+GPjIz2*TN`%T}hmzT%aPu4KOKG=>Q%k%&Z!8nr2XMvA!sK17#7|fT1_Dm8lz=VR2
zr)?NGBjhXN=p+X#T6?&miN(6RX5hhr9RE7*8|UG_H4`91;Vwo&8QgBndOHlyS&Mmc
z+v2(8WP9L_2KxFhhR*3a7m!QFlDSndnT5_r_6)g|B)=ymcx3w~zr*r6ikhY(?7a)G
zHxjMUW>)H=+gYHIOSbVPXBDx1S>chHXu>&YFq(&xicj>&BLc>h$%KXdWy|?wTPe;%
z;(v!N{yW34n}6UBP<psf`tRVz|KuU4|8Gj<KUoOy@&6r%*v*&Z#Wy_?NqD&8Kz=dx
zo{Yf8znTBQ3t9sH5Vd#@CCb~A&0X+1V|6=~V=su$EfqloTa2$M!<T$`uOzh$0V7LH
z9P{xjScc%=h&!+&jCsas`vG19E@=@G5%)H~TrtJISBpPKKAt<35SFkAH1XlPD8;{A
z73g2GR{+p9=OQO8yk5l%um3d)KrX``2<Rn!9E<jC7SHe^hT&-)v~+w?{9`FdNc@lp
zd@2j;-gZb__<-pC1?7@<Tg?0Vbq|7z+hg-W=MQql9YFo~=q}%?SX#E9<TeVpZo|Wg
z_VBTJ{}c!Zh;g*m4Bp&pdzi&P{yrTJy}~HwRW-I30%nOG%X?{x<h^d5ab8L!vUXTL
zy)&n^J)$^Zx$a9L2NNk6dN(6O+45NDBmYkj0z^Mr7tKZprM~Zc%KdxnY>3UI^|6g{
zV+xtSt>gbcUa=YMB>kT}L~9q8UeRF~wIq@VN8<KPF>VL(|0AjqyUwl4&%u7B(yUVa
z*ap$+3}Ud~ApGw*2#c&1rJ&m4*_k?#`-1yxK7z7X$*XDL<O5KQ)YvI;&3PJ&E{W4c
zt}Bh|HgK4)9K7!Ggka4w*?C~tKubIpz@bY~`hO)C?h`-`Co`{p1>X7wE(|Bvbzmsn
zCziN?LpIcfS7&HGPV(u!5YxfmqBvY>yPg6%<ncow{>aw2h>HVF@acQ2IFkl!DHsto
zcx1khx1zL0jOi3VL2^2<fh;H)0;Nw328SHOl>wBF_zp@xX<4Q<W^@VtsKb-^CE;a4
zvnVVVu=(`2&MWc8euk4bMaI5C+_WCf(JGLZjiY8lbt^=qzq*CDpQy0|z)6N+!$Q9(
z*^_Z`eQ#L&%cP)8^nGP907JrxE9pg^#bbY8**r2o^_w`3N+R}?pP-8%&_Q=iE!wS^
z8ksc2Cst^ZsF@`}=U_e!K}UXTpYmHZP7@kQoLmtnt^{eBQWO&@Dw{dA1E;#z*>8g8
zb;8r&6|9}PTgd;5`@=Ht6*ep6AKIejT+?az=Ow@Wd6>N+iKQ}tQwKj+`|D{!4V<S6
zK-^<myYIEIfre)9#w9G?!9DUU_EtT8ZvI=|k09Jm8+uqjsk6j`IXRQ)Nj$F+|2o*`
zhS+K<LihX|pZmP{t)BKCIM*D-sNa}TL?t+ky;MI!qL87oKPrEmgIN>%s)VZfv-<_$
zyKSiiSfrOuTsr>IjV3F{!146@UKY|l6^qM8qcHqAx$*Ab>a-b=8!(9ogKY?bO8zDJ
zKN|r2`p*Ucve7McIxV(pYI=)qXGNTLEv>7`<Hzl#7J##5*vB)X`pJj8v*p?I@n5q(
zt3`I19Rf9{Pp$N{sfDu9ETT~;6yccSxD?+|zcTk3nm<4aIYB^?KZ*W6^VL5*3hIMA
zID8zj=;^h`$eTC3DiXiBuhw?-DBC(Wfb80Q9^${NV$hd!F)~i;zapF{WFcBv^gF+6
z(~Y0&l08%W)9;^!fCk=l=&j?nQ7G`>6+OMi<+T{<Cz;S@22-P;CRuZQ%F6j@1;O&=
z>C0)4!mAPIqkZp;h)VkQ;Q$*gG-W^=XsP-4rk7~pyDb@+C`ve>#p`z7_a2MhqZ6q$
z*$(f!gnr~y6-N3?1XyED6&yD)Pv{r4%)8W#$=8w1?$zCoxLmkxoOSKh;VoNq(1BU2
zyuaHNLZ#k>Ug@V+;Kw7sKVV^{rxYa<Ha3E$i`d&o#M=Cqs3eLQKqVtt2vyM^C5UXK
zP;+x&!;Rl}7Rz+iCahB#P6$DzXZUf03+U;cc9T4fy_s7kF6W#riXX&2n$`5TNo<|u
z{jqTDHC)unDFjmey&iCID;JP%WZdvaS1q<D4fpt)w(eVB;iGHlf;z_d{XdYVt{4A6
z?4CrivduSUM;Z|e4+t{2xcxdlGv@1(+D=ub!G9idAlgdb6ESQrRmtAm1A~u^zry*?
zntxy;LfAMlIw`m_pn&!40$z-gB@KqrUvZwy|7?WddiM?J*R@|?(3qeV?;?j{f7t&?
zJyzTBW!IoUn>l)xAoHu|!wRSifA-A|BK-dA{^&EF{;o<>H)ERnjXnMO&}>+cwpQr$
z3$uLAjC3zx=dGB>nL=$SlE#p?2(v<UIIThV?Ub*Gpz7~M&F8iX3Quo^5pJ;$PnyX|
zwHoDOykF(%Cbscb23(FFu6f(tyJ>EQ3(l}#n7-blmn;M8<=$2?9C9v4`lnAc$lZV$
z$`!zN{F>Qw-L(N?e!iel`FKS|oO=J`z{__Q2E}@)fz#PEUCjL9nJr1=+<ueH7j;<R
z%F!<OVVGE-b$>a=g>8Nn#bozrb<}VC*l;Ho2lQY8kz;fitiAE+JvrcDwpQeg*D|-@
zo0(D%_c5kozuLa$Rp)l)dG;{f7osNvcPzXm)B7#{x;9xI)yR6U?98^>UidJ}oK_<W
z`VFO->oOsU<8YfLSSn4y6CR&x4eO6HX?WeX$fhY8e1u3$a^f8$n6`T=nOu1c4C+O@
z1SFax#ZHtK4Edm|Oz9%~wD+x9hS%Rd5I9s@`m;>@LrDV$)G^7cXP>TC#yroAElfs9
zdI_lRLgVB$<01-R<aetKz(IMX&AG+_TUkI=hg3m71~%EF45VGQ)g$v&V4_NP?yJ3e
zY0B$3Iz|hor6yS^S!b;d8dmjC!VV-0__**3+RHr!tDv)%<5|p(k@}8enT~w)>+zL2
z>4pP35(8|t+~+%M`xYi6*D?0@Twr2j^X@qvAM(a_I>95TNj#HwFN5pw`sx@?bi}!M
zcUfES*{r!?bnf``oSL5>GA1yB9bf_G0y4;A5t~%K)S)1~bK_?M%9;R8<Dl0{U*+W#
zo%tia*}gs$f?4X+`ddK*v%4<=9ZHe+lAc$dkz$qMHNnYFnq3XUOT3unGvUp*N?`}v
z%32!@`l-Bh`oNoGUM<W=vY<?17=zz4HOn!dT;m{j{yul7OZPNPnxEu^-)1o%q<ZQb
zxTRnm3Cs-)w*Pj8`>pp7kUL}`o%}p?*NlL0RFdXXuSm<pxSz?K#}6AMLs|Jt7=3Av
zlmz(?tiqaM*lC*e3yS5{Gzb;H;yEFmVq$GLAAg*~<(v;S%x6xrK06%8@=VyzzKhj)
zuOAWFdG{<!DNv7u^#bcWUva6)bK_Xw*du_wT1FLjZ|f7Kooq1>Tw@%KpJW3|5gh7y
z>fJp;izO_1J#3m9d?doRJxM~(HHuSs?Aj8>O<d6SZ*0GY)!HwOl5<U%31wEYJ5>!2
z$PjjcaCSB0u`b*TslPJ-8(8~MwGrPa=_&Q)lSj)33E^fxEx%A1({|n7cw>=37b=kn
zm?lN?$OZhdFxv<Voak%CJyvJ~z1ybLTfvSXPI;}FZk<BL1n3$!u^}l$Tc(f0h)#-l
z(+z|@@&}*8L`eQc4EVwl&@97(%odLQTnMwGmQ~)W-qyZV(29E}Xn$iVvcL8yyPBUj
z!e;y^iEXQ@+4YBl)c}Mzz70bth(;b@7pt(*0`!Ex_D>GlWtQK%Ine3B<WS4sbwxic
z8Cn`cl2<K#6-)0q#H&k7O;2aEH({#~BW1S_ffrU8h@r9cq|=>X!DBzR^2m7Ql65Zb
zTWb*6Bs1zDnrXku=#7pO84-%q1j{$4ZU-VYW1~s1u)+A=r2lqapN)miOz8h$F}WFh
z=1pvI;vO+0tUVErQ%NH?GfQ128^QFJ8-51Ru1PsKz0X5@VH`e+=UwCK9bF2BEs<DN
z4E?}jd+JBsZFP4BwAQx`AB~X66LR9<|8w#8jkYPXzUxM}s2ec5_)BB5NDz7KN*K>{
zM<2U|oSKpT2od&RfrmpFI26yC4WQXNfm|VEKK5#XWbYgt0^Vc6(3r~=-<Crprr;+h
zLu0CY)YX<Skr~9#WKTHl$0E|=OliAz#E0QL!z=})l)vy*+b#ai_XhQ37}d~9Ogf0C
zKMR|WrN+P9KtazdGMk`(L4#l&9N3JUsljNwo`7$Z{lSZFbTmZl9v!YUJ$gC#T_V?2
z8eEo5pg}!wUnI#858=BWob4@+ovEG`GzVhBjs%N$^v77*ZIhiaP%UtAaFnI=L5@x<
z{ogDHkt2?MI#34mDpraEKB@C6(_?YFCt)4!!6e;s*u9HiKIGx6Y|Ny}h>J3QL~kPU
z{1ocJAd$E<x-I4AC{ycJ0hI{&g@FDMm0uZXq)WT-Dnt{o;T+#f`TsdsIl+9agqd%C
zzWWuufc2fwd&YR1-6y(c5PrOLC)-!f#LhQ1x^`8wLSiQ$&Tn!k>T4>OM}5=C0yx!b
zk17i`j8A9~aG)D!x2_5)<0bev-m=cl+es@@_1rFHNu0d1EcURtYxGocJbA>fT%w~C
z5USQ!F8)$$V$I6o+Z$iiWAi8fEWvD9>e+Z}XsA+V_s(@N-O3x;8L5ofaQwD`RdD$X
zlN}shu8!2JXeoeG$Zf^^F^cO&CUl90(lIHf)tO;T+FoyVWEm4=n_IrvXsT$1$8qf6
zfWqGC1c(X3rUD@6uQ#MWNoT66_n&=Zy^@Ma{sr&wS?3C{-Ya|&irKXY;Y!vTp1`>6
zmWmMGtfw9!V{rIrli@tbfjhbDofF#BfR}r-bbBSnareJm=xr%%QWPf^8WH|ph0mJS
zBIOkZ)rdvQ`nqzQx6x2@!6PPJ#yFZ_;<7ug+<li`tIwldn63V$iC~Txa5y|fKf!Fp
z{F3>(GG~BUM)LiZmLyI9c7buR?TR(tkaTbxjRYoO(QLa`U5oCFVtuhD>txz^H$;6}
zxl&velrHJmy{MVoU%2@@k$Nw~n|9!DrpTKO%8NWZ>TH_V;XCfVx0?%&q6cmjRc@Kv
z=3tFfp05Tvnm?WHR`;cysWKQOi4u&Rnkg=fJ&@8LoqdoB<ybUwpQDL0Q)KrhLN+;Y
z)&++~Gn7w-ZoG4w4Oyw;#Ezb+HL8kIhIxmw2qr$$*cyvD^IsB~x?$0kiE6GIJr*}P
zR>>WLo^z=6;&J8l$LyPK!qm=gWwQ+ezWA^!NI2~5F^6oxvXFy?yhv#u(DpELnuCR{
z+$x<mjYTf&kzgjpbN!c^h|}PWnj~<~ljsZ)I{5O*hBZ&_0Xg^gE?v+B7k`8(TaU%7
z_O5Eo9(-+YgZ|b*x{ez6t#M>cAx|>zMaxNIZqrkjpol`J><QNL&K^O<7^tgX2q4^<
zXkp&%$|>69@pFl@VF=zj=QZ6gAZ0m@`aGW9+5Ml96VK$|Us-!i<2zkC@9C}GAB1c$
ziW;`&BP_aamfhHsjo5IG4{`|)`eAs5`sd)8(?O4V80d%wkiFUF-k1!%ApaF;GxS?)
z{m}~#moS>BC_@0h^icKB!zpbmU{dCVB)Ny-_vGm+-sI^TJ@PHD(u@4p_J>`3cjCZu
z9y<LAr|nqVWBQ8mwIrEafW*-e@(s}Rr<5a~;+Mf&tBt#6%hUJ!^RaWd8Cy?KZ>qU*
zmn#9K3uwa``v_8mJ-76{44zKM_ok<x)#5K=?4IZA0B+)q{7o?YL5Pv|ulONO^)^j-
znnOyvDy&Rw=|B~w8tYiX#!a1#GDn5>!HykQk0Ul%=bqH+^V#AR1BZ{@Z7v}bi1Txb
zjSt7}QrW-@`R_?KG2QjdBx`44xpyMzOWO4f^;wF`d_M|U6${0gRSX|G0!VQ&Ov+!{
z-I0!Uy~e}s0BB48rm-TcWGJZH#vORK#hpdPH@uS(N3Q@VP?rw1`i}0DHoAH5Awoh4
zpD`?6<y!5^StZUeRd5V`?6hG0pl0>AJYyJS0sO#YwTbSo#EYawf+oS(Sr0zB9p(Cz
zK})52fz}M(H1h&iLg1-ab}FU>QrM9th^1a^ydhe%t;eq&*KU=c%qlB`mZw`GYzp&M
z&Y;Srs^FCAewHpL0Drr`wDkm~@JyiBmi_2l8T{}6ymdb_Xqe2M)Xp`T4au=jK{G6J
ztwne6!HpThW1#>m<w_&>r2TzdDEIMWLgO+WW_PlniGV#a)?5lL?@|noEN%C%;L}Zt
z72cTT&Y)uY21E4pseY`lYX12*f&qG_7whoy`)C*T$(<zW(r+E`3-+fMkg>U&4u)F-
z5x{_zwOJckx6Mv)L@Eq-1TP9wIwCAb9$$@?;OpJX7*AO(l9DmZ`LFvKbA#?CYg}+u
z!2Rj&FgYLK?Y(Z+|I%-@w`s8RHWYj7xDp)0m*-+YM#peYvJPuK7dv;p;G@~PQE|ca
z9D*?{v%6(ci+Nj;OGbivsGd=&RLfX+LHvCIN8um1TZJSgWp6agq6^&%>jXta41%>~
zG|(GFY*%eKxtC5qw#}cE%#n(_)l+k`YWeFhy_X=Ay2<tUs7CZgMm()*pASfGS<N&4
z0sOfJ#VL62a9q!&*C#@<fD55UHeKBBoSX3*YOo{Igrs(A4P(iY-YKY<kJF;NVKdvu
zBjfSI``$f4cWq%v8yXR#@GK2{vVAm>rT?HBhAq%EE$d$vjSie4sql*$19m%b{YX%S
z)7tBCO`@!|>bO2NelqwP*0&W*`BKY$9>d7{mxgV=-g1$}M?|Xi)T~^*d?9bteOpC8
z6r>Zx^%)wHAi;j%N1Lw+SQr|`=)8WwpX0btqnsQ(EoK<u?Vel(uKK=&i&J76U#Zp&
z@9DwQ!@40Z8eL#vOrDD5wtG$S1~+{T3?AQ$SLyQWgwku+Gi=Wbr=3k1`8%IoThJ8Z
zXNFIQjw*_>qQl|8m@&cfpzc1&iebS}lJSI?883NkQ(iyKa1(rgYs>dT$b~?l_d-xM
zZF!8HQcL=v{n)|JNpC?tC{NpI6TFb=1O*KH>2xq;s3B{M+<iaBK0O|COZW0fxgO)q
zV)qOYxxL0Fo8(med=%SfJxT0R-qpRN-4}kn&Mt#Q&7mo>GpC)&c^~6LfR0?RO*`}g
z>JGg-YwF~WzOVcq17+cu+r{S2qGZ~1Ni24_W`+<(YtVK)Vc6R1dk|xiCsKn4bFK45
ztxAjr2tQZLNmy-q{Dw_i-~HUsz~c<(r?Y)E^GkimiUo^jkwmi<@^i7j5LJP9QB4|n
zuavQ6Tr_?RfcR&N`{<CkFoN0~EX^swF<-QF9)%)k71vE$yDtN%<~sW*rB7>EIK5V$
z;-{Y*@2zRo`Xq7+4s3bu-p(D-m=K*AsT(wNrFvZDjl`y<d9Z)+(|fc}ewh(b>rl3J
za@9Q>y|w>7{izxHG2-`0$SM^&##t!`%gb<jQZD*OwK7e6bzb&f*~X@3lzX8mx*^Yy
z^)dS9PnL9h9;Qaxl1oReHfMEmMdHyMagN1=;HO-_`59yK+a9BP-S3gO*)4|MZ}qe@
z1-@9zHuE|^TH)E6Bw`cby%qj!zPL2L#alC{yW+C58GQbr)$0LB$wAw)l>a93(5qrc
zMk>QLGVDf8a{;P0J#ZfzhnGpBv#AZmKy~#013=SGNcY19V$MdE;RymO2DHt>t-l`m
z(kbIt&Xg6v{)yG#%+khNA0IKGhVcoFUM$Py9J>GKn`!nD1)~isl3qA;VSWD2v(hjW
zTdrqXrXG~xF$?NKz{VY47rr>1vl7HFx&^|l@MuJ@TQ9V3Uc9K-^0Oj^<6tvOHCbVk
zK>siH-ZCn#pxYKr2oAwrgNESl?hYZiySux)ySr;}hsNFA=>%vTI=I8@@0@$?c)#Df
zKi-d5V~^h5V|4GXy=qnMs#SB%Ig^pSMw?J{{e0vS`?gH(zH0anH3xs3mws;D*MG+N
zxk-<R^tbgL@Q4>l&WCkqT|9ALs)U~#Fl?DN!4aFOVk___$(Dy5fE4~i@ZoczG7;*!
z2f4hDg;Q=T$#!+3qgD;P>gi5~i(j0QAR@Cf{os{Qj3E=YW@ckncea_gGiiFCBXRk=
zUEHh?=ZO%p`8~KfC;#aACU~Z-8C^Ykz+Pf1^~NiR1DAlV&(I6RAZ*{rVs%rqa<OE#
z^EBsORN&D#skm$QT!;M7Yq{vfsd%)+c%s8OaYEEbFw@v^77Av0*u*g5p!RgSu^*9x
z#;swUubU3J-{<+M77L%~%$2}FcL7;;;n5O~X*|4bLP&4UchI7mjp7Bf_b*U6h2)Yl
zqr*jSqPQ(ce|+TNsEWX_mG{@}{fj&_N)eL^X6LFoCL3vw?`f_&uTghjX|LzIi>;-*
zi#)lxi>$Jh*^=?sx>74yhFQ|CdlcIxOM&H_X*xI7xCGV+^2}>jH|_qE^$0R<DB{2>
z(6d?DVJA&Yacb0MHm(Ni+s#V-BCTWH-R<vhgw4mlAE!j3S3K(Q+Awza$2Fr4L}VBv
zJmxDkN~mXcUkPwUw5uREwz&a4Y!I0+Y3Kwu#QSI{5L*?|Zc%`tvb(CN^lUh|eQ7n{
zx4U7e%juH=tbMG+{U}piyr*dGs2;d?Dz^3ONIz8h$GpEMZ#roy|7)1sgzDLPSyW!n
zXe!d)X(2f~5dd8R+QlbkAsqsovyP3B+V}0;H2=hArAF(i(>S;@PdJwV=2u!m4O7X1
z+QdV(FOQDlVG035Eq-^VSFJ|$cC=p=fq+2*+cK!p(<*Mk1JQJ~-wm60j&l*Wo}aqE
z#u?eem`0LwE>0%nJ_s$3L5upDjF%T{5%=tDu!CyK$ccPvPdq)!1M2p}C^smi%%0;6
z`=0lA-!37jDq|4ugeV1E-`MdddCtf2+u**1Sk8lv*IUpuh`rumYVp1m?E=nCn^)|X
zl=avLf(-`Rxqeig39ikU-Os3*cM}}*nq1d_S7;`5GY-sRR9oSe-J;XT*oOa+a?DrZ
zW<Jpj4cCUGJvbt;KH8;--|c|y4LYWKllBax+<2eGTHIi)E>V}<mkYtx;Vwe6mtG9w
z7J^g9b7Hx^^n-trh;XfA3u?bJt((RKS}BRVD=wP#J|(n2xj?@3)HjVD@J-yuA@PMb
z<dz@5_$Ub~<)i1NHU5Gl!?iW)zS?QvhpoqNH~#N#=hfWTMWaq>_Z}5pA!d>qS;r2>
z|2$IxH9zcC?F(_fW5^n`XX@3u`;yQhm3@1f;QmgEYN$8Idgb7?NH=nMcjy?KRH6pZ
zfrOcQ>-ipHx7AP4m!1_u27ZYo*RMSG>K$Z^G<83NFP^A$TU|BmG}LZJ3Hl%n3=sx^
zP_ethSV3x9%qINONRu~4P6^ceyB*P;wO~N8u%9gJJ^Z2o8(`!D*)FN+3cfVpol{e`
zfw8;W?MB=x53@#8ypj-=3rzo3>gTYPJ4N?8H9Y!}qr~ZSt(gX_>=EH(I6JKyx=24u
z)DLeP{d9#^>dyOj17$M!;+$tkql)lt^PKW)bhjiF$R{_&eY_J*{wV9!&|@E^*0yoc
zm|fHhGb$SWXLKFbdn{wif^oZI3>J`R(}pDe%&6{j5R{9M&X(T4b>sP+GQ$tn8MyjG
zPFig1z*%dhvTCe^l>(P)!~wJhl^)5LF^VUd_~|6(i#GzcPP7^8Ai4~X<&&;;@7I-2
z1EA>NJZIN1_KG-o$M9HV>~&k0&n5HT%l)dv)c#E>xvn+h4jV==ygG)L#zCDifz|V`
z(qG03IPpA(jqksDfFC-uUE>=cZYXC7kn_e_e)0@xm2;z5fN*WhQWjNN<3p_yu*Y`v
z|0x|}E&0`{>(qD{oSnnu5R$HMKL+ku-nG2_4;H|7V~&$~+hH+ad~a~L%>Woawg*Xs
zLLE8rVwHVKKzamntQ6qd+FLca=(BvyvhAsZYcaQTm?~LGV)V%USXObAQ80xZ_)-JM
zagMJK6t{V?j{Dx`b4;{tts(rDQ%47<?tHX*36ar1H*_cpjY9)LDs<y{k?DF76I`pu
zG<WvWimC0YI6B?z=Q=o;qz+CCUp@o8DFfSVuO@_kvwFOA{(@I=BDSgYF*s}Uk!<@2
zW7t?z$0tN}ew9vz`@YEi-ZQo1=WnGo!d|SQD2`9ls(s;FTa#d4of-jEcC>!i;PR&y
zM~7hM_%K2yhIYjH`>%Z%UHk+HO#oz+Pr=d=CiP~Kd_YL~J@up9-S-LtJk#5Ib<0co
zC=0p-Mm1*Y9*(Lw88p{dWbO>q+j)VlNd{M~imF9}a@L5w!`*ljthfJVDEyAt@6Jeb
z5nDDU+BMQ@w<9)AU`Pmqa#=$08{*v=BH$9NbW)o}6Jx^TE~$x4=zh__vE|L@%eh=j
zYu&4_cu)c7H~D%;OmSRLewB^6+WBsh3YMq_kCr0Q(+BP17q=P{#kltueq;v7D|3*<
zuyMv$I{^oxDb)#pAYc281UGnSY1a&mHWT8{$oz{pFpfx+4qfwjr6ncBD)z^>{5a)x
zW@k<q+NGI~H*_F9h1*@`3$-!JYIcu|1n?rSD?v!249>lu7oG*@)HMGvs?;$de7{0W
z7BAEfp$p;6BnNh8=goC~6#P$V>Gz+M7pUC3&rnAuo6d}%Ea=oZx)N}@-Kw@qZQq0=
z-z$b%o(hQ9Os<MYj7efEyO<t2(;-V<%q28ZmM7~D-}C1|`Oi8lmVByOl$9;SG4nfW
zM8E1HV)VvYAyvE1CgrqL{j9b@9RhJ%#6~f6SW;A@=YV-yW|xUk(I1Ro+DKQ&oN~xu
z<KZR~0@l2Oq-eErPkK6+0&)?fg1(uz)-T(2yLQIBRwP#~y_u%wzr2lI8tF|r-vB#_
zm3O_i^3DVWzK1~-0G-LyiBc{rJm8|>Agp)9`WQBXQ)LCX%Bb1MM5>dte)sbEpkUFS
z?=4pLk(vRS0E)By3bosB4Tq^rcWZdX-R-VHZB=)WL9qKi)^i%rJ87$4a^{Nn@iNnU
zkm24W3>)Sdq3<pe`>yOo(u>rnn3wsn9k=Hxje}Tv-QP5`=j<dHF0O^^5O!!!{3RnS
zi(ybqt>gkdC8z{sHKaApawmriCF?{ppx6l+i5X3uV*%Mz5C-8#AEUi)``wMbT|2Xw
z7G)cuiG*fKhOpuN#yyW}7S~HYWuRE+e$(%G-r?QUS(f^F-=3g>UD_k`Wr#$StWt*J
z^d}3U$zWLz@||vywy&Xd0K#P%qJMCcFgW&|XiHW$cVL4OaKJkv(;N7!>oXgJTTGx$
zy_NFzPsldgFWqx~G^F{fKt-2`@WV}cztk~KG2~vuMStNT2*E}KIsdb9YS2De8(m24
z6-RkSH0ByI3N-T&;$aZlmpLg*B<|_gT4U<*9?`uCJilH7NlhyEaZ^j~PY_~ZJiUa@
zgl76fj;{Q;g*DuMX0_=TF=W7U|9pzkCPAAZ_0bKK71A_-S&x`N(j>6Wghf%XuE}>Q
zBvF$8FR8~_(Vc{srREqi9Uh!0vY?$|7fN%6yU@=VJ8X+9d1PoSfKcE`4zjWL_NW!d
z@d<=Z%HbM=VtV2u_fMHnRH{ZK-JBEeR2l+qFWqF7kiDTHx42G*;>{cHh13x8TLd>M
zF=VB0*2%e#gmc#MUgg!kZhWJes3}k79uOG>*#ey2Fx6$gL^8Bz37=ST<t2?`eGS&q
zmD*5c!Q|ahl8}ZLz_CE&esL_YAr-z-cq8Y%zz(YwXdI2B98Tb3^{wsTEM6#H@@q><
zVng@7)Nl-Jor@We*;39FPl-;NKM~v+AIej7vQ^<1j;XTU=+XrO_$?6@Z%kv8mSCgQ
zzYeKJxrK7STB4OG8yb~s230-rUkEpXhDP|Y$S;NN1rA_Nlo)Vs5bcyWnVAchw$V#p
zTq#hA|J`>S?%kg_k3A_-Nl`hmPyYdr9D~=QMpR?VKiLia?Aq$@(#nNMnwEShVfNe>
z5T5L^mp6B58g%#?=S2tjy@68jU{D?0e%}dw-_l!pN}yNyEjicB(gj|dha|@V;3?)p
znVY<*xftl0fziE$<;jS4Z`gI}ayS}e#JBn>`xeOAIjXUmfo)O*-0!-n??J*=T4|zA
zZ;zoOK;cgm<qN(vgX3J}mCZ21RIA$AImPMT4{BgGW9%_?0yxnYj}!Lq?pMKdc_!yO
z{PuE@ZA}?n-E&NNSwzh;%&^6=M5sx}^HzK~`@@(7S&%0aFnGuC-@g;InK(r66va8M
zl+}i@Jx?px-Qbzl1iN$1$cTKT6w99r3UR}Ycw%A%vld$Aza18y!QOZVUzY;PgZ`R`
z>?zPn%2Vv(mInxjnB|yS74y{f1s#3NL0c%E@mb?H*IRTr&Pw(IO&<?t0XpS}y<3ww
zRz>My^tk%1(lxwqK-<G*q@U06;g&*J#V$0{{mpT?PfAX(UEesmMsGQp{(9)H+UZ2B
z!y#!g0@5i;V2@wSG3R>F${c9KUjjwuQdkuI+mi<L!)`6JW6HE@9&~Y3yC_a}O^jxP
z3=;trqQg1t#>8aL0l?C-Kk0T{AB_^#g$p;ju>irm$Ip3}ODxY|jd9%>77LLTZ$A*@
zLpJOPF_m6DnCkb?jGUMEEcD5XWG+C*rswGmxoQDh5Ples7ya<D;I~GYK&=jsq=8Od
z=k%;y`pkNQ`X3JkZ+*++`rF2owk&Usne~-u=*%gWD;C!ef3}b^F5_9}{3{K0IRw&7
zgZ)>b!TA&T5Fj#3$9v8c=db!w{4K_x7A~>9*p%+P!}A59cu=`z{2Kate<itP_itd|
z0yagb&gA4%Y4y1LvrGP}QdrETvv&3jouARg0=Mm8Rch(!Nm#7-R-|<X%x{@g<kL<I
zvshxD#@sf0`PnczXghRPeDop+nJUJau)a5+)Z_7;mm*CEbPQG;$u<Z7#&ven{TR8a
zk>4dP0eezd#DDoMWz9Yl)=tC=D1!jY`!~L5?Y`8U?u_KkvFduN868hBQvmP7G}4E*
zBj|ja07OIj;2ZVBX5B8dy@P#hwt+*tQUBENW@>?oeb<62?zY8inb`vZlakNj@!7er
zO4NUh6m)c7E0AX?K4+7%a9{PoWIpbyTrhgOUDCZZv*VJpz(yFGnM|(M52bmnUwTy+
zXb_G0H3Z#9AMQ@u-vs~1jTZ3yM+-l~xZi$DJ!_{oX6SkMVeas2C=<7MFM$+q3S`ye
z7Xq}yfSf$9#*smHb9xEOD{7J-3>tzYu5ir_qY0U+iV=T~blozuX*irZ7`b*^nU$6W
z*Shjkcxg9h>yVG$5yF)s@ZS?)hx#vGUQ=BC<Pt8_cFp(d=r4fC8a-&YnSORzNMU#(
zQYN|DLP~2CejGzj0eS^l5bBH><S1N)t^$W*!hYvWFN-jwwJ)(PW0rh7(_q)yfAuA;
zD8;jWD=)F8F58*#OTNQ~2V1aNF+cpDH)qG@%o*!gS>}fgH6+4(+Mmn@%wIw5&&||%
zN?h~t|Ftu{qUK4f$lsKs<@K2#XSTj+ofvdbz=d_Qao04s7RL2<#d`FGB|Kh==gDGW
zF9Kqq4E))*Gwn2lrdM72o-WaY4VvBx!V}#&v8x7q?N~s5)#g{fiYcCQL3y^-h2eP$
z7Lu+kTEhAsOb+mlx^;QjW^EL^0~6>(2*2n?IdkB%!P=)OGL96p+anu!10X=hXUA;k
zK_hy@K)|iTFR`;{Fh>p&eb)DiCvjgiKTSg}(5JOMnDb#kqP?Kn>HB#m6~iP+jmkdn
zQ5$3t|M~aSo>Wb&Kgr(%lgW#52)_C6w$RXc7Bi%i*qy&z)8ejWc-5%ae+(SvG_<qR
zTvCqQRL=~n6rg@EWqS9%UxY*~S=xED46SDb<E%zP>#e5KjSbP@F%Y1CIZqYiZNAZ|
z$RcQjg}FvK35y15a6Er>WCg*-xO^yO^I#cAebOEv%d5@HI*ZeGvJF#Pf6duKs2;{3
z$Oa|q|GS{5#IkT}rJZYZ{TfVSBLin6M4%o_TaNGqiY`MT7|MOMPaOx%r2Vif3~H9L
z)&DSRy95<Sujxyu37lg`xOO@3W#{P!!1T&c-~?@p2E~G2wxPMsR#;%I-?*^0d*sXq
zsyera@-J@~t&HL{btT;N7Licg(_ZSbM(lFaL^n8A25GB*dFOO1y@;wY>#&Q8IIa;b
z%BlSpt2?qXxDaHTM~1mBvncs7SS7?3rK^9~s&T5T;a;=9L9J(>^|9=7UglGWqq~|;
zR6FTo6VCJ0cdGjmgAGv+T~I4}a>cBO9~geX#qU!a>s-CbUG8+~l^Kc+Y@Xu(h)1j(
z)@FBYbrq_{23H4X!K10@Ey@^lsbI%4uV?YrD+s(4$6JIN>*Gv!n>Kk|dSCMPA=4gv
zj`+&0;J@1E3UW>51hjTlOg63J6|x|9N>to?)eE(MPS{S~r~7p(aDV4>u(F;Yx@U`t
z`kV2_z0Rr5KoaoUukWTM*X@>|bidVBOLzVorJV`t^unSdpVg|~Ok@SPb>v#l1rvQ&
zGcs7NH9Ym>n1O($q>1s650HKLjOe2f()W2gRq-;VPTwixjOR<j8)-MNpoIn)l3Od*
zWnFyRfOF4DvNpk8dU~;?@=qGLDTv&XL98;>J$-o1c#@+g8{4C?&;Q{6wVGJW{8xGP
z;=5^Ps5P5^W7CL8Q7cy<M|)cdM_SC9Kirzt)*09c5{%QWzV_OA%dmbMHTUV7>pn0g
zR@ta~&^qhl-N;izKHb2zcRER7U~*VFvg*5FCT<}>7BWM7U7^LDbwE~&Bt}`R^rqk4
z+~RG%Bbkf8F31Mf`;|%>B(5KMrGqW!4tQr<^N&hr{Wj!aRNPAE3}S-iy!~?s`p%NF
z+PMsn)@n3iZAtGX{GMui&(CSVyK%&zjL&(1Bl%5499)BqWB+Pm_=Q#I`_SG8t-A~)
zdiym@qxw0!T`E7DGlbpEu5WkG4w^UXf*`o9bhos+9k<oL_YwnCY~WSDcu1eE{@8sl
zfAKz({A0v~RmsWnaj{>cRbSuWU>32?gh7M!*W%O_6(-6f^!QXw#xK*n$@w+0dT)1{
z*&C#ofQe;XK+m(j@M-5eA<u$wlCI;=98p)21+pwN<JT!=CpG3;eiQJyV{Q&wpM4lq
zR<P`zW$O3oPi%e8jzhe5A!Np0Iadkyi(C677~!{4R>?~bnse}Ep_L*~W{m@3uh}7G
zyLtAl50n(bFNGr0Zkc|3Gn%b;q<RsZF&=*qAy5Sm*<JH<aQZ|p{LRX5G(EV_Q)XVS
zX-HrTtydLS@O89Y=ynj$%?_9D%Vox2zu*oyuW3Nnbms`XfS(9&0(QovR?e*RU*GU7
z>nS9E^fEj9WugSed-+o|(Q`ih68cayQAl;r_VV4^$7rUmmH+G0e_j%xC;$KV{|D`$
z^vC{kIzL`Hlk4)3G3>vsF6ukrwYp=s-&+&ZWMSo*?#|`$<9{`X=y{}TFV6q8fG?%;
z|MT~kr`O6szmc`K0Xf}RkSkvDbSmxfZ3=pF#~E{b99u&EG18~N!s?)Y_;U08(z>vg
zfU)|&T7Njc(n$x(4B{Vs3){;(I|cuzs`UTncK*NoTd2OvFKByHliX;OBw8N@5gZmN
zldrr9(hxuDSi*{3{e-^r@UyLdFZjWRub2t7vrTkc0Wt&?RS%<B9=E<<darw-+H4Fk
zF29wu2<?n3uRVVV9O`TR*{1q24QZzBAypVQ^lz)B#c<f{{6td}&ANs>Cwm=Reg<VP
z8{A*uI0c=l)gLoRj~p+LgFavrZRm#esDhf%ASsCfL{yCz!qBKb8)R-vhlif&Su$`h
z_C!F=mIg^m#12CSp7kv?<Nx>qE8^8Vvw##NoDIty%)!}AgcFc<q=DNkZ{R`6jVbC|
zB~wT*zJql;`Y8%>1TmxTHa*_nsg>KS+fdy%9NOWu#wlIOx7(!shF}hOD+6uV?gJ_^
zQ!jTQzIR&M8j9x&31UbQFRem$sUim1A))xZ@qafO&GgzH?|5KZwGwbn+->WRKfLnq
zdeIXH^)H1G|3iV8sZs9vb4ZcUa#UZ_^hng?!zoF#LuqF35*#|x!a~*dNiW3Aj~I(6
zp*#?x0E|P$zACh{1m~uoplzVJw0~AGC><>ijsqFnQSa8rYEx&;=(_KXZ|!yLz<aTY
zX_%NCVyGPM?Z+!t4g0kBnqYhDVKC|<Ct7juiiRi*M^}MOzib6w{brM|MV0~50)3Sx
zJlU{fm>B=9aqG7YiW{x`ryn`XO)(+C3wVWP-=)vq2mtXamT=^;xeWgfm7IOGj6siU
zqQl^_KOMm-8U31(-4fO7qN;z*^;o9>{T>r!cUt*`M{Mk*4u2kh4eV{12O0(fmW<5(
z8zi6a9}U9}i{ObB&gyzePOgx-4!OW`Hp^C|@JW_ymdt9)pE}wi(r6R2XIa>cQYKyi
zh1M-X1mz=vngxKLRzKA?6~$Q)12qf}^<q*3dYY=y0{RQz3Ke@~s4QvzcphjV((BJT
znQV_%b82bXL<R-Ly{$v&S9RTl)vN?D%3l%{#Z*6!QL$wl9ZJM(T$ukswx)y?b#*-*
z-b3tyY6B(9Y);4NOwT&p_^Fk33W<-2<Y|^fqeHjfzU5?`Jp#}~UnPG@^II+MXkIyu
zn(u+IN9!H(&yBCYz?5(ej=Ll^{n<R`Iw755<Ml%Ybyo*36r)GcS_YbQ)-rHwK3xEC
z3*Ulv)Z_eKT>G^d6BWUKv!PXu?FhmT8{KP^^-<R;W|l2qqoWT!HRVuFrcvVADlz0N
z@kAWe^Y)+}UOX&Cw|_wf4r3{PoVj{OLA;}b{h+OZl2xPmwKUM)-QA!hnIKxxoY2|b
zAOY7T6nENi%2#BeY7=$2sE4x_QUsSrnVhP%#aM;(XmUu1?=SOC<0O+MWqg>-C#>tz
zIjvGzn3`2<=2zd;S@Bl;O<ub%Mkod2EcYqwzuG5}<m-NjgRkZ<v<5DmawZI_h#<J+
zXE4)8@8L^`D@s`B)g&wKSPgF8VG+$xpIu_4onX4SL*(T?HKd_&^=Ebk-<wK$g{Fu4
zY!-K$o?d3)#9T?(Wa7%7HqLlhSsSy^L~YH(kNJwz0@9h7uTZCSZsV^Tew%SWP$Z-P
zIj!^y{>iCX31A85w4I@3wgB+@?sOx!T*5nHs6Tw7qi9WnT7zI`-`ySe%NlHMP6D*d
zY4tkGf;Vg&AOAAj=@Cepb)I^g9^OpHn`iZynmkRyy;**-$p3LEhWY~dPi*5}<M=N6
zNa(>KMi-B%Pte;Y60mQ?>*Fe4;+rwsG;<E3$2@W_t?7wejUeE5ne?!)q9^qg66=~e
zcy#%@1u}geqS6oORwfd9Bi*}gw@VFpnqY@uySdKXgf0Fl5uDg-e)Lm3ElJE8+|Tn3
zA`4=cH`iJOM!2z!4d>Lz`>FF8u4ao8`BRH}C>5S>@@7HZ;IhWP(<kg0IMu5&%mz55
zneNbq!#g=732Q|X-)Qm|7#&)cFlnwx6CoDr)%O*n1=I#%CY+h=tkZa|b5@eU)GZs=
zjkrDoc0UQpv$W-K-N%thzJHVK7HVMh)Wk=fQC*Ph3|z%BK9*>BKSOMOM;JIa=BmFF
z*7{oIDoD74MG3^2KNcjUa<j>*D!UfK|MHdp$`{^c%H9sKF^J~7f+sjaO9cR>e#Y$e
zib~G@LQA2zYGj9fJGw_`uNwB~82eo{fy-b9h{=|ML^j+z%x#-knYqF1;+00{hD_#c
zZ}xP1Btr5UJv3N<J*_vDvVFn}$QEU;wWXgrrmbw93=0e<QTGkHmVD|EDSMNrKqW<5
z)<6BJ#*7+z%!gL&@@*(AKz9pIhgg|qaAk^pW_`U~U)@GMk~HS(c&X@=sr$!0-kGm%
z#aj?Pu7As<t%M`EEA6NpBR0(Q@kx5dX+OIt9p=UwgbcF3(&cLV$y`kPJiz?j01*?&
zUeX!7V8nb|_vuim9IqSjq;Cy3nER-nD(<;dFhBk2qJqHkRW=*xL*I$GkN)jy#DN|m
zHEcbuXXe~3<FbUXtaYQ>#PHWz=~B=)0o0e^1Nr=`42>*Z1AB_XMSd)^1J@sw$k4}L
z(}vc(a^;3>PuHx?WRT?o1z{R-d;jtj{-N_H`zj7~GN)~Nq~tsut=TurG*ejS5M;la
zp!`*pfmH~R``=H^;OF9B8BKo-Y+{(v)EfnxGoLBvWJStA(wX`CKtgK@=?BBWA0Oah
z<Z%BjQc53FYF1s}E%L1J$1mux0pFdwzpZx5hf7}fyct=6Kj#h2h5Ly#Nu9nT9_>!x
z>c9^2zA>@0Ix2t0nlAZClB;RlB3r}+d5(2rW%GRjF<I$FY)st&j_;%ad3zUpPL|!K
zC?lLO6N;l`y$8m8g<Q~)9Cpy9k{J`@S{6zsH8T_}in%Cu$V0{J!MQyrhH&hA1}jl%
zlXP_)(?bRxnlCxFPl=Vm5lDm&q8gTh=8J&+iIrE*dbheUQDov89Lg8yFSmJ|olm`L
zIByL7pYwu_re{Q&&Wsqkxj$pGl^c>ovAEV4k1=l5cQXtN$U`Q5?jsawuAdAL@K++C
zzx`6eEzt~wH4r)8ZsCMTZU;~8jaiLmt9lda)yBrO9^e2wx=A2{LMMTu7Fu<q6l@R>
z4QW*k0Rw}fnE?CO+i&#xlUO09f%5rEQg&Ul_O5BsJM|{Y|I|omp%i{eyx68ayuV1=
zfX~HAUJ`Qfj@La?h-{cX@*anbHs4_E7WW3HHnw1#t;vVN3P-b!R=d6E*ipl+-ZyAt
z7>`jyPwqU33+9}i1f*fdH|2Y>$*y)RuRuZ73p_*zOJ_V*YB-{iZq*Pn$>KY9oo`xZ
zC~-{CY9%f-$4JQnI6|6-As!b^J;37_=e;wdy|V231=6Ay-S{ZV^^A0{r|95Sd{7py
z*tS3E8Dvgk7d;VB!QI`WM~gZ{2Alr_HMwp=ZTF?(ToU(8GxD!c^nxA%bt%w+)scOy
z^QXaqCcZc<WHUiE2K3rW<nnPD{Cm$A(2NVP9|23oi$~&<oJ`>h_GVgmDUi8y5B}LF
zGk*WWgMLtxhZqCEogPhs7r-8B%Pk!lDX3*lx%~vwJZF0F0GT#b9?voj<K{vp>gUOJ
z*xi&sMx0>T3r2%9MC|>*J_7T?zW(9ReY$l4a|7T$#xMJzsE@YsVf7q(9?gF5?cWm7
zQ^uMcKq$VEsKHkzXx#|HeHauE+m>XTpKkjes&V(1*XPuCF}6=SHvQS-tP6jQaH6p>
z_%!;}K5G<}X)tvIvQBy&u3M5p5QRyxz4yI<yIHg-!}txqTxbb+GQhklHih=phV0)M
zzKRfnZ#WNcK)1(N-1~Ce(pA&&D<lE|29nk2nATI<R1+A~?mvGw(>_;4m~?7l>y1ng
zIC|AM447ZX**<X}Kz7R8#ajGK1pcW@6LCc9X>aMe?+D^zvz;@)OtKy^fq__B*+u^e
z+xaj#v=my*)+hel<9C@9f|f7o)C)U1dr`Ln*AtPe8(yc~eZ%jkVC$TGYG<G2C#0{3
z>&yYdz+XxHXYSWsTCM7;cXP^w#tMUjQ^@o4xvyyH!I(`zE8<o+XgOEGnRjkGHl)`f
z#gGOs>eCetwm@YeOti^h%jhSw-p1u7_v@J^7{Pkh_!)@g<P~;h8E^~+>NA1%t4?>R
zV7+teHD^k_y3L1LRC_9E+A4wZpOg7=h8f4JuUwzV;yf{@{eSAnhN2k*t1Q!6`+6eY
z!s0A@t}lBb@B~bw!*_-$<2K*!&i0I~t($i&j2&#fwG^*I4lB&}|G@&Jb2(T6#~Urf
zEc<p~_@r|>CB2u#O#xtC0~1%$5YtbH5YixrOY?j{uE`ZbUMSdWz62t)I@Ir~y-Dj$
zI;|rQ+|-3@V6^CbGEKY`a3jRs+x;)Qm~yIyrld;&cB8{>*sc`Q+v+2to>Yn;!AjEQ
zNK)j5H#|cg`uD}X)6VmIImVUZYD$u*>pzX(Ads5u-A0afZyOi;m{#JMXnbisR3AII
zBTVH53J@v2o)NHikE4=SG6bqgHx*5E`Cj6ZMTaKwc{?P85R}7%23jUdkzX{wQV&|5
z;h#8zQQ^~qtx6XTeAJ+wTnv=la<p2BVkPMM1<-p9gyP7EyOw^tMyZw2qe%R&^DQqZ
z)FF$R;SXPXX1LasATOIirN1vs&InJ!<*r|)1-w&2B4jq8_tVOru3n2fCTw%3s=DF3
zh8H*jB}7TeQm9b9z5=pBk&=%mMP#{~n~GJuyiMzqJ#ujhPq56<?z^a!0{p@5_P(GD
z+*Q!1rmb7%%ll*EamO8IxbyZm3~i4qC?*f?)kX67lvB+9(u{G5RQuLFz0Z@n(2xY)
zm;`>2hv+kZtrvux1m~B~T+*f;dv!Hl8m0~vGVI*)omp&n-TP&-{yNCXZa~XTtH#|y
z&#~>a73U|{*B{LqHuX8SWqtx(kOj|wZezz1@oGge!01oo>&79=&#lJtX5B}99D$;x
zRYLLw^ujU@;nJT!P?19*sbF!}Pt%Cad^Fpw`Y9p-iMo!NesnX(m2qmcM6JJR`tB~6
zJzCmT0mV99hy#U#<xzvyP6F51VNZKoF+C4Ayw(tulLBNs-9m)mQTx}TrAs8~g;!e7
z`rvaRXze#l<N^-+dV!v-;01!xZVu^(Ax_NtT4sMcE(h2)&B<yMZ|N}YqH?6CS7Msx
z+)ajco!+NaC|h0)j~M;G@pPcNN9X^{((+U|x*)A^k9t0X`?$j7E>u3QxH(N}XD<7(
zX3SeBHF!&kif&NgzqEYvSUq^Y;*CTeb_nO{D~GZ#Uw?IzKnLah%d7^Q;$C8vDkbD_
z%i*sYxkA4yD4cfYjhEVRi&1fw;?3(b9k`)wFUa9gL7;f=;K7S115MQTq_Z@c(e;u1
z8-=i3#Grid)X9h|FCUqg+kV+uUNLNHmp7IfwT71iuUs*(uvw|1j7_+9baOnD_pJ{Y
zc=v|&i_d=Gj&p|gVp>NRpJ}<ZC!M2N%@N@C$Ge>;Y!P`AASJQ0XMQXKv7!Ovf0nhF
z1D|#*2t6z7U$Jctox@M8_XFCDEr<Jx(QW=o$8VXL_~*9FoJVk4Wple&3uMvejGox5
zBo)V5J~?gTw-+O;TYYGc(5G3S0R3@RVR8;?rH(Ed<sgXVkmC|;Ij#{={zqR|6XT@d
zne~qxV^(mZ<5JLW9A>QR@vv%lad&sH*EivDH!U5f#-?j`9eA=yFM9p<#t2bBk3orF
z<u2$J3uDYAmT2kQ8MDWT@nQ1F`OK!Cb9)&r^xbK1?DR28aD9*1wMzIw*`CuW0|asW
zS(`VQ(%WsX9Ndna2QbPzSv%7c>Kb-yY*M}t_gXpJ^g5er$NF2{e3Bd6(Z;f&fe*t5
zh$wd_yA=74J)%TQ1vJ|Z>&+`O5%TWqVWyC@`?bceC%XB5SVFR+a=2M^Pcbfdw;M7#
zxV5xIl$nb`JKc2*=9A(K3K&g2SA1=1g8Ji@D^ZZ01u7OS<Ds9hq?MJSc-C5Ux9^Uf
zJoy$49wo-*LTE*ObaU&s&sWtgLimh1eoI+XEIPbLx1l3@Q8Y7UeKy2KBt+wB3`Iq)
z;rLH&oeP_Nsi59IT`I2#9ezm6G-f!fsUr(zfq3h2n%k7sbNp~`r4E>Usz)6nR%SlZ
zAi0$&L;ksn)VA?_q5KPt;(G^rWsP`t&>St-vh;S}R@Nv#1k|+wITq+4(=$JT*+Pe=
zA8hEtW)VTo14d>GA)|4?h3qfGP`29F|B#sW8<NA!f$Q}9fr|1oDb+n|=84089(Tu~
z9sB=T+hV4*{rcLn(pc7<x}O4(6U*4~-}*`tQ*fw7h=}>?hdb!~je5Gq3+H`j!2OdR
zh5I*Yse=08hEx;tO<ehI##I6Uf|e?8!1wr7n-mSBt8H2rE+lTL5W<~RZHnC2luuko
z0<ocrdx6pE?J|ro(k<zTZ;pv#N)ZUNxSAcd&21^awt@xnjrJ@P2YxU5Lw$Y*Y>2;R
zc&XGFx`$f|{P7{wbIkN!+tl208Q5ErGg3sUFNl+UadFB&nPPe9L|XVnY=DH`{gw_i
zxztq;HiZPqQz1#7NeV|+z-0ew$%mV`f6s^oy44HDp5vwmkYwH|>jXx&qh`MB=|wFD
zMk%xH*=uZVlrUySEwkR5TL8R3o=!UrAp~OJW<8t@m8n3xT81q-R>L5qX=puYUdW}n
znx+%7$uLkYI$vpf;-1y_v!Ca<+g(8fuvyf=t#>sDA@TUrE{c~P(RyfTri=6ao;hiK
zN4-HS_wM-eW+w!KZz!V2rG8!0By)QGumI~7Z1IA9fOFbNNSgiR@QL)4(a2)i?3CF!
zX5lzg+HJmdr@X&&Iux_px-U$d`0@mNW~aRxKOz+kTzQX1PBxmAJlqv?T4Zud-O`rT
zi1qB`nkpTail>#0hvkfZmZB$r3xc6H+?hoRB!>^C?2ob6FVMhvtUdC75tw`M%KsGC
z?H@li0@~dpm}F$s8E}`5wMPEn7t^hIcRBkNXQ2D<-+7F)zr+q5I*;b1RTLC{NK%1y
zO`PD?!97EarPx9u_+j34{7YLH3w#QN)hOzec+A}W<=tYN#|DI*<{{a)7~0DVd|k+-
z9MBQ&%{KPucrPA*avjIydrGAN>LS24)rX2)&ur07dsvZywaI$%FPc#AurjmwT+^){
z7J|n!f=9adALanBq#&D1d~;k}+ze;RdTM;H9(%JZjO(!&{rZY0$bi?5oE*a$&hTwb
zmc7MmV~f6yhpARiiACjV2Os7T8AD5GfE<)gR9Eyt6;U*$g%K|8rx7T|L|0iJ0@C})
zZCXA*sxl7g1B7?-$rvM1IbBP)xKGx+F6<pzQ*6-t9V(MI0Yms(pY2gEF!LVSSmP$6
zyH_zeeDC+MmZtuCQ)>G+htGyckj<KLdkA#<1Na@ew*mPjbf<JG*-<S$ID^#+Y+h5t
z|768cU_6^1jQmZeN6Pqr7oG2^Ahd;~xkxi<QW|U7L)5I}9gbdWhF;4gW?wgzH2OqM
zQbKhPEfdljGy9T^ZbOi-IGSuCo*q(=BSKs+n6i2fY>_d?@sa!ThP~COHXAqNSTgok
zJF&RzwKaFKC=6=L=(G>0bw;}9(~_rIZ}n2<p4E6!y$~{X6|FZZsheU0Dp|ZL<lnO)
z0u5T<yfn+2*Q82+yH*T{^yQgcH>E4Bp-2HoMn*tM18sOd76_oTR`G|rOF2+vM%(z|
zR>-q>bb_MY-yNWC{CcLsRMB3LQX!x(J>x|~BKbs)M^p-XnY=<}p|ADK$~Z!-Wd++G
z{^|IIEo)XQ#dL9?N#s7JGL9<42ZUT6u0Hj<&MT$`!Fu7G+mn0R!6VbMJn*fC5?sL6
zkjy)jfkmL744xb-ixmB6@1eZRJm&daw$SVy+sr^cXN{W2ztvI2!U@vrEiCr5^eIbk
z(}*TIlrJ}dQXu|4aDILK=RY2<9scmqM0?>1vmW~)mV5PehuhM`3U+ky4>p1%rK;Mr
z(1_vsF+$8prvjtY1U`h`F8?Yq;u(mR<T%cN4bb6q#%+#$WuA&FjCdBnXgVll%A_BS
zd7aIQeY4EJDim*!>|bP9L<&p0i^8gSCvUZiu2Z6Ok(xDr9>iF}g-(S<9q>lig5onv
zdYl#qo_k?zQTtQLyvky9&Wd&Di>43EZd$TY%RE3a{m0QK=be!QghZ~>zU{yXnBm$^
z8S_n}Qa_Ds&`&Vn`=hXtmPWR&E@Ef{6`W~C=D(A~6EnQ)3Df8Ip7IC^RuuutFth~a
z&d?D!eGr;Zcw#0zQesEt{FM$P6?nvXWpPkW`9nX~9hZe4fT&<!TW{~`@uoZ_2J6gl
z@Z@7omSTK+4jGdp0tbVmpW`;MIW&KiOPnubG^R5vJaD=*GImGX&~$h3!ynitx7%@c
z)!0zfN~Anex7+K;Ku&A~bXH00)|AzwA_Z*oHU8NqFeU?(fPaY$bKM;s9sJVA)DSJo
zKlg<R@%lmv30|w!eu$dcBd-j36E*n8&0FlxIL*kf`xws%Y4;c-nD{9t2`b2Xtn+G(
z2gr?qs>U(e?Mpo$eaOXgFqScs&?|1pU9l3r@<*#6<Rur-o&$t-0%L$HxTQV1X5W<0
zbZKPKHYck&CijWG9@)k^S^dKGOHNHG2~G=<SU`Kfh2Q7R{6vHZaZ_~U8FKt7=QG*x
z6K?jlRR2e`ij5H%G@`K$X9mEn2J^c<=&~0sSvn4N>MJbcf0b@A2sH{9Mydo4Z$e>5
z1cL76SLboD27S@1Bc|@ACe9PD6!DWzAh9T-ccnwBvMDorO6QjP`{R5%TH+byVA$gq
zO&&tO0qwfdP3TUM@*)xO;R(V}CmXSml!FX-4)4putVYx1N85b|5(a4PUE9L?X3M0#
z_)(t#jS;(JM#db&@1{+p5p?cAf2jN?2OZS8ouBSb|EqMyG+MIud4iXD(`fQ($2QPg
zlUOX6e}(N$OlnRm@ejsm6$h$(N3RUK4eh+w?V>@q*X&JkfQZ-XQEY(7MmbXTbpLl*
zEhMT1Jx=LEonFuE<D>{_8#WyDw7z-<maB>4M|4Kf0xR8}y!su#nnAqxf0Yc5V-IBH
z4GmX)o#Kl-W?gwU9y8-e^1qxy3;v|mR}nE<qf$E;rWp~0Ys&w-(oRlVQ@OMA5~O1W
z^j$k!2A(9=y$;Se`xPqF9cvh!{0FMfXn`^RyG$J+Mhurp2>LR~6$fXq$9bmzC1qfC
zo$9^X1pOWRzZnB^UATy+8oAs~M)VDEN#lqZGi5OiM0CE*zGNnAc&vFFV3x*z1lqg)
z_g~M$|88H%D`APwaD7>nK{pWH$*Gg8W6|b+?(e_O8RWVU<q&aS*SyN2UbXe@3moMK
zxMLplLY5A?T`&<z*1d-H@ud?CB4Yxqj=fKDL|X>rb6Si&JVS)=f%I2_xfl4Z-hXdK
z87~h1ec_CV94TPa)#LHgue9FXWS||hYtF$1eToJ|9&N>kVdd$V`<$XP)7;m(f{6j`
zUn@<^gXpDqb{5011D&*iGS54opzny#rlBc)G8-&g-*S|nw)0LO&4TB;(t!png_yaT
z!=X%cpd~NfA&U6Bhy+#?|1U9*cDT{xTp_)a?qG5|0|XaC(o(4uU4D&2zMthcsKG6i
z9J6`Tf>1}n(56D`_!dL_T{=oG)Li~O!v}lXT+SMaxCI#<e$GirTPrPdb}a%wiyOz3
zmWX2(2EI;eVObsDF6q-TyRgVz7kP`6C^J0}#QJ`hp>Kb#ceV)xU?N(J9^ncpuzdvX
zdUtidwf}eXP7TeLIMBotF3VJ=Puyls?C|8#6fS#N@~VhUr3i)xG>os?x}{K^X~?`^
z)TgW;+RHZ)9nLX1UW+w=LE=l_F)C9M_jv4q6LY&%34yw2WQ+S{AjiO2^td1oHp+B1
zwu+mDk2HxEE>8Q^KUdew1#88*Is1F9QZB``9FNET0@5$~4g65<3T|5w&UK;MaMlK2
z;YKlRz6zqJHi)C4Barxub!ve|gsz#NgfL>$Sp2~%l@*8>dVMD*hhti+?3fbdn&0;r
z^eDGdUzYDl$gl4fB^j$`Ra^q!xzxxz=|d_=btyJRKtCmSNWX0&vSomG$X{l^DF<kR
zVmKhAm`#+Nn!?)`?1DXBk6z;6oW;wuSUT<o#M=JRdt3U5kac95nU;*)=_&1t<wkT*
zLyX0g@}G6j2}f8xNi8Uh%h#3U0sSQj;NJArE6BT3)`Sm_Pyu@F<+sY^zM_pi?aUZB
zja^%f`dLS$tIL|3nlJYwDqdZ>piIMx@(cTT=v(UCjal`?knE27o_)6&+*4TKhpL<l
z`A~m?Ew8_UnH=piOiJh;nfKt_t;Z6jdyV3`cy@bK)0|l0Vk`6(8H)|(wt<*O#X$ct
zilqPdtzE^{DeLVWlhR)Q_+B;U<bXs|XCpZN9hy2+OF>_MFLuaN5{K~U$EWB7fRGvj
zP)i+qhm>P;_8ZzRxWX@C+=A5Blm_fhKzG|$s2NLd-toAPtQ|%AcSyt=)rZ->6puS+
z%kg(}@nLh+n`=^%(%J^U<v*8m1|g9Jqvu}ep@dCLdBQ_0lB$w$Tt&}?dPe7~>N`ie
z77{G%&7uF8@Mug~AI^?_==Mf%Gskq0Bo80+x+a^0f8(qSpeT+puqeMbWF_b?6r-Lg
zESIqIZ_q}65f@XJr0FK^L5(SjO(q#G#8rX{2aZ+qrB_n}x6{+C3?a1OtIMq%`H2Dj
z)q7bQey2C(uuB9?`K}^_L2G)U!tnJ1+Qg}!V&Pi!>%w=_V3E%g?#ws}=Hcw&OoS47
z4Z?3p1o`K8KMgWGD(3Lz4HobK>Ur$mZsz#0z0L6Nyj>-bd>ae=j+4MKDR9BcJagLK
zboWNj2N|gz6P%@0n(`Y|CW<z{)OqpfxI&C!K&HtXiR#aO@6LQCAq&>~5Q$NUKuQCg
zKJxsv1cKTRU%nKxSKH`D(7@1s$-|uLAXhT-_SM2ov1lnHn3E^T=us~&tC&Q6vu6HH
zG_YJ9VdTqd@xblJDjq;J)gd~fZQoijU`4c}y*XA-ngy_1*h+>H6(nKrn<|F%43RPg
z<nTx`@$$dPLRg`-X7zS^pvzz(HHEdbx+(M=SY*H%bCaA)MSj*twPV66!-q4OrF7>}
zQ`Ml{?86~HonYpsS$H!OGSWHP15#$O(4oqCO)EXP_!B%xkLx$M=9tn?t$RU`g@qS0
z%z76X$;b0g-@mpnoglx4jJ|rq*1(?>Vcp0^-e;&8|B^B1^xJQ2YD4#LE4?5?<A-Q4
z$QJ<=18YZKzY`2`3EcKviZR`O_GlE@Fc1UAASS_nw#xHH_{RBi;zK38q>nz~K40Fv
z{n(C`a#hTGlwlR1RaP?8tiomCBCofIM@%=oc~%0}p*@YmD$xA=96vdZDdYc4L2Q$9
z-=S9;Sx{n7PiAa^@5?uFu)TS&A|vo)D<VJdXj@Im?V@+_8QfITAsdos(aYZ(^R0MU
z2Kv`jb@5yX`Qc|76Ol%|7MNi*BoC;8-EU+1R|DeF1E^e_$w6wgRc#ff_P*J4d>Tj|
zFIKjRpWq~R2U_@PYI7<i9>te=5un0esBB{z!5`T=<O!E7fH^S&Qf0x1a8_l@$ibbQ
zndM#We1X4I9&|HfN673_-?pn3S4a==I4x#z)cs7+IVg>QY3_(LWKoU@vZbTOr?T;7
zKSCTc3ew$~Ce!2r%~(_feEc+m$|M;-nX<yv*phnfSZTEmaiGN|#J>igPrML&QHdKQ
z2>U9VI8F{;ONXL9Mgg!y8*x29qG8~d`|u7Q*)UF$*h9eh$Hn3YRi75thM@V8bcwb{
zhu7QiK4|1x)QYYUm=<T~(`gDr#i4?N2^YlHazuECa8fdUA<yv?%Cg@>AHPOJ1N97T
zX@@k$0`tW%CJ3#l!RymDfg0s6b=sS1DHEtEzC<2qV*0%K4@++xpZ5b8S+bzT|5alQ
zjHy2J676XuZ_5S|e4Tfr5q4NTSe^)8(64a3B@`hykH2araIp&dhxxgfDI9(>Cb!Sf
zFbZP<h_tk0%f^;y>^T@*C@&WfnC~F%+1EX8nYQRFA95w9Gk4k@3H5B4yzs@%RS2`x
zkAC2B8}?F6@cF$VoqECW^&F%9$o8NYF>kJ#C@xa>wHxb-C{TFGhRnO$@V$Bz<b<hn
zKkq4C_SULBJPj3M^29FZkHB$UawCEMq8{ETK~Ya#$zIDV<Y!P<B$x#nwTh$qS_+%h
zHst@j3$?#R;IhUhzgF48wK1(bgJ!@g<G}hYvTfkIh3h?_>fiDWuS-Sj*Mp<9wz$PM
zxJ47%G5v=UdehD;w!F|E3+oy~3pj8N(u1@~9y&&0fFrxT5E<%>)vAn7Bw61LQe@&S
zk^TRC<DeztuAYBM54pb(-Ssl<0-zVOLFie3M*sbRBzHHzSig9uhB~Q)gVzQ61m&zu
z{uuUYhb-vH2ERqCrud%Q*<#;!HxV`rP`@@W9bZIBsX_QSrTX*x+HFy@JEU9eHRxW#
z@RhT7;35X^C^PS%a{d}V7vFBL3EZ$7+38h@PZJTqI#@>_l`rdfoRb(^<{TlzZ%2^C
z&Q2w2R5{zESX%v2G^Zu~mCqIFfI{V{ZW~864zynPtQ|*s*0IGN_s}>iH0gW!)vRhm
ziV}J?_2+sg2K=h<-V$=kcMjt);_AoHxJ``mx4?w?7s)5R--Gx&j=dwGk2F)>l!tn%
z!}(R(U*H+$yf412cj%Eoc1us)a{Y<I6NuXt$q4tQne2utH$p^IO=6(0*oP|EJg8)r
zN_di8{DZANBQyjfDhXapc+7uigRyE6@f{R>HqW%|D;RIZy0WX4)@uKXQ7&qdCoUYy
zLC7wWCEx?oQiMfnpr4d1pO96W;58uwHCmxnZj87SL3z8yyFX=-N<lJA6>?$}H$XQ<
ztrYK-5TT*uX?2R0N#uO}bqBUe#PFF9L2@6aVjpE-Otgh#YT2i0SRrUU=~J9PWQ<^B
z(dFLf^J6O@y@2x*XC#;w_>Ex~He5y}$lKo2Y{~30Ci15Kb-_g6{%y&xhkgpuJ-x!y
zg;4$-$RRchEl$ZMr_W`v!!SUIAT^l_1Al~g=^YH8A?8J}4%ch{xhjmtxmjyQ&egA?
zV?!dp10v6FpMdro>2XlfVM6((k8rp>CaQVqt2X3%_n3WXCyu2Db~MvZ_;B<t0_Vth
z%5StR0`IAFhMCyRn2u>9oPL>@F3SOnzU7r5A=i7bETwOS3;(JQjvhIlpprzA1Wk9y
z%(V};3}pvNA@axvSTt7ZZ$j$~(cL+3^cslwb@YSGS|?_Rvg_mbBd|9&Ird6WAc`*W
zexd+-ea)(zE51Vin();UM3Z(_%eD}^kZnqgk6oytDcZ=yfTeqQom6m`?`^M^SPIba
zpj$3BgDttHD#h$Y`I<s)<f3=5LDxM`HWk*akFK&6!vZJE3T4qxT2ieal0M^Z9o3kc
zajBuEtxai4)Dndp@||N(A{wKWLU^qiG0um|6jf6uW#i0B@aY>Q&KE~V{JAS;yGVzU
z+D+15zf`bNa-e1EG)`8tCbEciWLdI0RW{B2tV)zo$0Lnpp%QL|=^%}y1pjy5Ozls6
z>92y`{&HolKk8W*A-OnjKG(miYi$YWuKLKaS5|*^TYFuqk?v-WMhI1nO3#)k9H61n
zwYWwN!)l<?tJH}88mAUCU1%9q{;evbnP<%EWS*oMBCAx8QD`vUSK_N@^3V1Da-lnJ
z#rQ##fN|knJ<^hS?2m`W=s&xvOhz5Pl8zej8F=i<`BfPcn0JlRMOUqixr8F3L@S<i
zCRSJI1K6xXMJACY)7ucx%>)V?E>6~ux81gdU&6hSO)|>Qs_7fuU)qMILxWm@L-~GC
zOTSQ5YBFFK4pKzEI8+o!Hw&Mqhb+_o0=r}u&`@+X+O?#9VGt}9;0dqCKc+U+8D=cK
z<W12wJMBfe;nLt``6M5*g^9B;q)(0>>0kjj;mVBMPobjzl+w7+tPDgid~3wo$5a;-
z(2iet;Kb+H!-wtbQx~$K-@t?Sd?Uv?Qskey*xOOtBJYI`MRLNVB4@y$WShnU!{@*b
z1rS=`eF_>KGsW66zRsZf0@97o(yDa0U;HzHQ~LA(ccz$KR3jMwP~%EE?eyfGWhU~I
ze35p*I9xMopb|LE>M<x+x)S2LGRXRM^!>+xW)&$CPM3neGW{rLr+`W-pIT^`dC@-A
zB4(i#=Zve-psj)#UjC+Wc&&KsI)2>YSa3@p#-Dp&T8527r=2H1yEwG;VdPkhKV&8u
zuiYZ_>yWI{^LMHKX6dim<*Dwzoaxu5j!~XzrL5x!zt9IS!{yQ<HOV%HJucbPLt{a~
zq~0e9?z#rgVbS=lB?(Yt-hBspvG{G>w`xSl#Yz4#bLw20-_xE>T!U+#=gDcUz-r;3
zCOOq&c1_&RJd_KLl+xT*j{7^v>g!TR&4OL}HB_~7%=w478pO+AX!YQ}+zMvwABFjF
z99Vvx^IJ*a`pxFeDnm|(;k&JLw4failffPEf6?{M(UC^ox^E+$bc~KWww;b`+eXE<
zZQD-AcE?V|wo|cf-TKbm`|NY?7^nWPF>2L#*Ids#pWk|Bz1{QH<`-aBIf}GN>$Ob;
zhYY{90=u2^zFBmC`X%7T^qxwE<*eDGu&;w!Wk9~p&;nNlMup;hQ7PDs+sTJkcnL4_
z_-l0hl+~WMPl1?)+aHKSpEOENjAO!=5aAb-4F6c14rF|&nCGJp+|1T)8AjU_2_umY
zwd#UZn9&cg+<M%}o_V?=H$P~Z?VJRp>t4-${6%!8P-0PqKI1tOk~kSKA<a<rR&GtG
z&2UH(LV@=dz@|wY$t9n$iszB5Xv9ajQC{B$#p(EK(hQE<P`kmLD^($@<^a~KBh1NB
zU6V_(U}rXzgH`FGlb`g;B<BD+ynG=J!%~OT75DKeHt|sa%r|+X8(D?JQa3YqvN%3O
zOq6(@zR}3J%p89R9`l+E8th*j=;zoc-2{C}GTs$X_H<HSi~Krp(@!FZ>r6kJhf9k+
zLjx54n+51y7WV1UjGma+h~}{^Z|uM>-Y3o6!wq~=PWi+fJphdH6HpKl%KOrcSBR;;
z61a;X3BbeHnf|U2$NQ=Q>EQB=yQrPAlY0uGr*iZW=+VZ69?2LuWpga8!*)@vULAUq
zmRrpmEM6D{`V|*pVcs@hcaewjo9B-@0`)$rWGO3upYC}z%Xr?G0%4%mePhzw#3__8
z^5FNgfB7`iPlmKJLj^aJ##p@Rl#cP?{coWX#RUeddX@R+hT+Fn;fecqqcGmS7qpC;
z5U7aay#XjalQk@&vCrlfBbPqIoA8@S6Dvo9Ys(8ktnmlC)OarlP)DD)Uxorkw8!-i
zOhU>SEngGL{!U)~z>imZ$KlM_Uk|l}Ex&12cqs}4K^-hE=I_<WR65X;*{af~j;xxI
zdPgKTGFrubfBwpdg!=V6t-?)}bzuqavRc@d!^&Tp*+B!_2A=BJuq1KEYy1WXJ=nQ-
zX<F}ah}4?oBonlI<5us~hkVzv#!fZllrcK__*s48n~OlaK;}!c;4TKJ*N!`94`^M@
z?eY3HLzF(;rpB{G1%_$i@EFtg(iNKTw&dYxt0G=RVyjwHb%9@3YmBkde-sG_e}b1|
zm@pIYLckl}Z(){a+Z}r#A2RcUN{yi`v!blIcoS*m*6E9FE2H!S62<K;75OW*vUF96
zWSOXWOAM9p1q{;rxA9!9y_<R}a~3g9`Kda(;FQkvWD$t&FdqZ|u`%F)%|Y#h+wmHG
z13APkAO>&bmHLWLw=AOEV=o5#WmEnqow3{Jxph{5iwv2VOu-aIqbO6<IJ(c{tC=N+
zI5@S^G&RX>(4x!&H8J?l5u5ub7d23&_)wS#eELsQ5yJxvl$tPW(S$e%=3H3hePW(}
z&t~L1v2x`bCb;t3x3p&1yDL>2qOF#*sBZg~3MLniwR!o#?Q>yYzHh8*)hrQ6Hu|zk
z%YluQCKjPXnMRi7X8y4X!h#)`zIsV29VVcuBa-z)K^SDb3G%y3OrtqU_xbcDNC3^6
zNH4CIcP^c_rfe`{!tU9&Aqf9*W_|}C7n%A1p7o8-i1nJ<cOGi!Lm(@ABt4aSJ>a5V
zF8|Strx+={{QV`X)k9D>$c1ObvxSL;cc<`}F+BtwKl>G}K+Fl!H;$w^rc(8ybSHT0
zwvePD$~(@)XOb+{QD4u`RPR3d)2yV{8non$vM(yJwt7_-aZWJ$%^`1?f8z)xGOL%o
z#QDKTCjV80{#xlY2{1(RU|;=4P>AJjBy>OX8rsx2OjbvYT{wsCG1)sL^@v|4EZOZy
z!RKE}LStnjOa9bC2tHErqpNgJxZYXt6vH;>|2@g2TW{5u!WUtlR~EK+ha62nBo-l1
z+5A4GM_=aBfwC%*W~@;v2Kdlv;XSNwwXEW?4gA4;4<Cv`V7?@Q{*a)xCbu}&l}<|S
zR#TQs`PDq(-<|<hI4P3JkZ5sP!&N7)P0Am9*Wm?&taGAr-*pLncal$v?CCMVq<9Us
z6fVdJ@`VuJQ&U9L#QH8C6+>(>q=I&ZAY!=u3#^E<J+nwZT{&Auj+&GUGPOuvzTzNi
zl3J0KfO(518qvDSkk_9sO#$ZH1`eN_hJ;v|z+=7YYQ|?K<hb|KzjZ{P70L=KlIr=F
zuB52QrMI)|G)P-R*7G9<+0TWnEci)S=@XBJqlqxcH6zQ;-#sra9+NAI%IU?diP}9(
zy?O;32)C*C7HrA}1$7JPRo|P9a<b(a&x1fVO02}xs_I2ljTY}Uszt~3Az<aAO$iTU
zUG;6KCc*2;W+q*EMrcgcajZElz5eJdrK~292cE%qW}wf2&P1>zReU<%AeY~Ppk;Dp
z?pv{pA&Zb93pgQKQq1pT3m%CFPv1e1`fNZP)743|=3>Z7zTjSp7o5#%Ys2(Bk~R*P
zF%I{w0NfMOe9==p;w)f(*KMV&<ll=6paNfQZD^D%AY-fCh^qM6&JYf>2UVvsq#n;$
zZi%p6dpCAB8nMYk_&Lz;gr<ktq6io3cFd>VpI9hS3Bv8&zw?kb;gz&?L)YQhE}=YP
z@bw7yFZjIzJu04T?1E;n5&fFgyB7+Xh@Zg6{HRIWnN5}b>I^AL{nN4kNj)lCcJ2*G
zm0j}`9$Ve@WO!=bPiXojB23PBL6`(7Vf36_EylJ=Q2-`vm<aRO>+N28(@jk<siTOj
z03xU(U*3Nf(D3lDDqnHz=9&xp1iw_cTQTyXb1)h9u)LIfZFI~NH)V`dxs>H<Sa?g`
zoJRArv6z*&t(NJJdJ#4DAAPR9y9Lqpo8YW3isF6b0qeT`wjXLuBo^*MJunjD%g>H2
zom8)g9%VHc@!o<8TWX@z_2~r$E%U#dd#`-;jO&#JAhy-OM?yvuY^I2ERT`B_5&wc@
zyP`@M$~e99NixkSRxs})-Q-4{rlu)jgW_j}qJm0<-T<aH2$a{8=!^OHJsN41kB&*z
z<Ez<#+$A-yszs5Ip;`FO42mnP#)pw)(V?H<u6rPsRzcC&cUKle`@8}9YDo!o00;K*
z%}LIFv%z_a=9z2;GoN^2vW9Kk0~E*g3*>KduD^z>$b%bX7DMgI<$`MgpDJZ6I*VXO
zlT_UQt=!3GM7L><*!9WW;>#$!GFmOWc^-a>`5E%h6|zs{u@)a<lUj<G@H@_VPa4CP
z#bZ_F^yU)puwBg=B@qO03|fmXZXPei0er7|jL4kSzl~4_)&>h*+_Ss6BgSwYwm-h*
zc{l~)yHWx<XGjcWEJKSLjLDDBM@Og<+LM*sOGaQ<@?@Ov6kt2?qM8OG=3Ald!cs{x
z<JuD$70c(eN^Ir&Geeg?)cNgtZOlFeoE5{_=1UCh`D_@F9v@&Qnfs?_NTRC^NfaH5
zmgJVQJdYqA<O&gb>_}ORUr}_A##eFXu}0#vM&?mkTdvcQQ9j)0J0kQ_JUN~fcs3>F
z*;E9|pp*Y>6cmRTc$2w0Ep7$X68zhe(;6^*Q=Extu|a7!nO-Q%o448(EHgA=)_x6+
z-JL3R0!*m1-wdg)g_tKiHGQYW!xm)d)@I{jqLs61fW3DMYlN0Ymq`kkz(d73FqP>s
z;5E^A9yxQDHZ*Q_6Cm!E()TvmQ+rzqA9YzZVpVsYh3lZc?o&f;+o~jd?^NdPpgP%W
zZcR04O)7yY@<00(7peR=p~HiyExSmUILF8$`<jJKi6J2u(eX_K@kKaf#&;@SdvuIq
zVUxf+v>E-Px|Hn;^{IZ+?|;-yx8(}b$>oa81&!1y!C_qbaW=xFA8Hi^<w;!N5R0*H
zp4UiMZiudflG6$7#!J$4I^Ivp05xP)o_kWy(P-~Kb*CITZxN{}z#k^)sG1MGZ^vGk
zmgsg=DpKb`h7MJS?SIi7Z=AcvTU?4@W!DrK&yzqGypDP*8L>aB^R)x+(<Kx`he~r7
zku#5z_}NcND%6axV*=ach;We8s&2-U(Z-Ocs90Ph#T}26*ElfX571Rsb8d*polQ)Z
zZuv<ZhUJt}!ydIm)>>%cV#S~8zF0@+ZGnTb{C{L;68pR$mfhvr{c~XzTSbNDL(Y5s
zhe(=Y=rOSD1PO|MGP~5P#{1?lEhH40DxU3UPu|}%a?i!5m0->RD)Fuvz9P+j*5g8o
z3%i2|bHhC_a(_-ua;!!E=|UlVAQL4LI8X^6>;42{)&;4_{2Xb#t6(vKXIYD^2L_%r
zsf|Vc88CPN55xBLdsqrQGG!i6Lb^~qehct(p&g4t7&6NkAto$%C-i8L7_v#j)J2RA
zwFEnN({y`jS)+#{d&0Ot#K-|3?o?kf4;~`kg4lgE$LrB2VWiWJ#zzd%k)YnEBXrvp
zX%nojIO3a(SW&<B$5sw-<BegfZ1zPH>D(z}xiMOqeD1>weg-}CI=$mM<6~SHC}b3y
zJ6tI{OFZKyPP%H$SmPcR#KThBSDtAsb=zEDi(_^*`_T~^;@3mI1eZVw#X&RYFdu1{
zAw6`HYKqzRy&zOPHMKQO#$~9|0b|JE!!Y6+*e}clP=(jgvibdIju=vo_n_$3)q;V}
zM5A5=J6F}bx$4F=iJBBD{xCooK6<$^a2-@qvh+QYGjaF1ttj?yIwwkTC!hGEqVA-`
zvec>6S8N{iMEok(?;eB?Xw726;gft-C6=}^z1_OT4zQ96-YkP+&L}{=ySYcS$VcBK
zLIYJ@@|LMixG{jzA4TjmNa8KMyr>&@t@d}-St83owsfPje)!2u592Pk>A=R)+iST9
zdK{Fi`ibE74P{#^{|Gdt(}|z1{N|~1H{Ng9fN=yKHKp*+HB|)_KgMLk?Z$!YCDfdc
z^5Boa0;x{$KL3y<y}@VjDRSgGE_{Zjj}MBqo7BHbyunhb)1<10jTqT~vMEJx2f`+D
z<<YD%{;&E<s}%p)o;Ul312%w*jO*cY?@_!4#inv!j=oj~<FIY&7|!Ym9l-}r+Wwio
z+T_MaEx%$*CH^QAl2YdX{YTYUW{fKbO5{%Kl$+~Eo+;MLA?jPos6U&x6YZxB7l}<V
z=F8*0?3BUFJkfWPxC<cxUG5GLox9TRO*epwiD_SzZ=-7Px@86gxf^Ic1j&{Xr;%#`
zE-cfx9l2e6RI7B;v?g@4*Y-OB2$qVbOc$i98vWO@p~tQV|GYQ`moWl_Uxl)c=>OVM
zjPz(k9@q{$IWOLvbirNvw%>$iRB~ZseExWmeLDWguqEB9F2VGwiL>Ph+-XU?Dwlkv
zHdvKS8hBPT4`91|D)NTWsc}<t9e71;dqn$qH;-6Wg$MArrx-R?#9S;UB;T#Dr22m2
zsew59$cVk@n*tj^Pg4nSa;TF?(M4EzH`9*d`%k_KU!lAuO|NoUD~a(=VkvN5`M9He
zOzpjY2nHLF@<}ow{hZzSwfD4>Je9h!k@mwb4Syz`uXOgwFidn#|Hu|4N{w1=94jhk
zV)~Ai@1#6l|I}KjJH?m)XRoZ}{qCa+-vRyF68k$bZ;US>a?T_EEdYq-e{{Jxr^Ak8
zLVJMutPzpEFTDgGvZWh9U(x7W6l1nyN^Q>YVUw^Ejc!@m!Ltd_`u<M{G>g*x6wUAs
zr4o}<%kgQ0-r46-PrN=>tNjO{Q`Bn{&yud?_kxb^`M#1o=05y5GGh{&E>}+ihr9wy
zQn|3JF)XGfCLpB*nCntIi9Q*|zg~?ow0M<l?+aK5_h`>6k{NdnPu-s>q(993d$qJA
z^#?y<t<^+S2uJcsA+h^#xC&+;GX|k9joEk=)nc*)&O$`ie)(|zCb_eZUOR2flw438
zilHV8_AEiK2*H&{2g}fE6jmdvF(4z=HD7qg9M!}WVQg1^)|kU{O*x0tNgR$QXg*Aw
zfr#)JeWZ220aijbx<p=h+8Ih6w5M0;WiIoV{T`$2W)sBl3^t)U^?!yQCpy0E?X7Md
z%1zxS&QW(`=9*wWxj;%J{|pfko+Inj&riL0OGB?yf;KD>34lkbSGQC12d60{@9BLO
zjJAUuIWa!#mmNa~nn(_-i@<P8z}K%_riBw46#y2Gm>t_GMpbe_2M%10kWZhS8P0J1
zFZGTlAJOLecbA9%iUP9jYpudQom(B6FkfGvGGH<97^5G7v%~Nx0)VD&SB-I|w|*Mp
z55bSsKO-ElMk-<lU{-M0%<<<XSzv^-b-Zh}^_hv$S-rW&bHm^=-&@<+)_pij>gS(m
zoCJAG;c1x!$D-H;t$|G){TD&r&cy@?CXFL2@LK00i3$Kldw#xQ%w4i9R4`!G&YP|O
zQt7u!uyY%b-7HR5>0-Je>v-|GsGu@Ty9xlP)w=Vr+7z+hrvg?DMXZgi*S>T4LXQT&
z0hnCK@NHYg1O|FX(QYQj9a;-K!0G+j>|?-sBVx(`WW)sX_^28lYQO{l>*bZEzg)Oh
z)i0~*PwWzWd*6eZR#aCu(M>ME;WzeJ=Dd_uvD{?M@f=kd`RPMG+My*_f@^R1wx8kc
zCquTQrdc!ikczEY@_KX?^toq0lgej!JH4Cui82Cb$u&@#B@8F?ixdb~lZ}i=32Uds
zfQ~SrMreibZC&K&2hJL<B-j_ca<=k(he2g`7eVuuf~c}0wnW1>1PWRh9leCw114!%
zYp-~ai89Dn6Wo?6SHuCCrat|nleSfhD};S0FY%*#*!4&;w)G3MV0L%Gg=SQ)Oqq4n
zX4ELtgp+^6`Q)<<Tc2US-*7@J@2RTw2F%D(Tu>=6+ibv84EToX7g0^TKHVJ${)h3f
zP0o_JaY;|fa%^_#a)qj6i40N|NVQxgF}*yzuSOUOXmu!Vz-bQTRm(`Z?{Z`)*G-{)
z_L=;<j+|py86`s0An@_$X<5n0q2jI}1`5e&hN$kJy^cl=uSU9a&e4jf`Z9v%`nsDl
zk7?ogLwlr3GkjA;NQnmtN@@ybYdbOJ>^dRw3i|DuyA6DgllbO}7iSKAMHOVdyQgx2
zZb0t7`ud<iLxIN)61bO%d`bcDLdjxn7LLdx;1TnD{CQ9x(I?QI2slzMuN|{u^bsY#
zDVALO?cDhF2)*uB3QpSX$P#AuNs^~~qhp}ET4R`Qs#C>QSzsZe45eIQwaV0M4=%*F
z146Gl|GvqNx7&W2Z-U{eX7$CQ&0_Xn`X^dNDT<|SNHsdRgIpp7@yRai)Cqh3$4Hgj
z1t_OR(0=f6u%KL;vKsC{lh4$o4*giLX{)5s_$#74Vm@ju59RZaTE+b1+-B4Z&jtc$
z-^|_L{(sVJk89&j_A7V}4YBA8#&5-3Uq9Dtm!=|pm4!NuP{1Qxh=}uidc`c21-rWj
zo)g>Ij=Nod263SUL~_=hV}H<PCUEF&EC`IwxI+w{JC+Oao5%g6&?liu*sq>O`!OE|
zm1~w5g+Uo6{B;m9M74aNjAk8v;rEyF+}%%&gT*ZxMsGw{$G`AsN}1cVC5_)Z1mkC`
z1K1&MM=>o&G{JI|AoIDqa#X9R;2Vi=e8h>tppt)rUFfJi8feIZ@k?1niW(wz$7BC^
zbM%14vbshKBbSm|Q1n97nhEn~>TBU_H|($o)kNqIF2cn3e>a(jt8zWepVp^4tz2Lx
zRK;+8vYJ}R#a%dk;%es+m-7{k2nv;m2$M2$c?FfNdJpd_F;>qAACGHy;SQ+Yi2ca%
zVCa0#W!faVi3(#)<9%H!d58DsNwkAidD?0iuHveOM5J&sc}KYc^QWt%$UVPexTXag
zYGP*jAjE@<+qCKI<GgstIc20vakX->$vP)^bz0!^4fP)$7z9%MI)t0O<Lkj9Z3u=y
zU&H+h4l#MVucPnj=wjgt!o0PNjzT;!7x#Di{)f)Hz^eA>-**hLW>oSJ-5T5{?CHc@
z2V=Yc^C|lI{{ig4cpxSC`<HAs(3RT<B0qi#$IOF8M(lQAhTesLjEvjQ(dqyC^1fpX
zS5YUBVghD{7$$lG?_0SaNkGxsUsHG+Jt=0FjgCXMZ<}l#C^B+TW}bpzeo^q@OK~7N
zTN>^JgA41UK6k(cavYDo)&59OHiFjw$$<&{o82OwS;ioE2}-pKOrsZD>lN1=-<ZXE
zB@lX3=&T#sEj*M{dGLtzY3LnZIdUYsX3QSRJUA&aXjTf*{fc$C<I^?p-@lTu2fQ4t
zZ};y_cir0}jI;+p=OMlZMys!_n?gqOVu!zVGqe&qJ}(Nd$WF<EyzV;C{(9H=Ay3(R
zGDvZr6D+!!9f37<LTZXx#+a?;z@?)}X{gZKfk5KzmTde#V90Sst@VEa6APWbty*Cy
z*5GWw-)Pe|$Lnn-3DR8RU&HM(1ubpi<I%$-eN%}(D6n0LD4bEs0Zcq53Ez=^bd-yi
z3gutt<xX;O3Xv-f9Kb~4YS9V#H*Z<zQ8x^f<?iEsm`ZDFyWsc_BPW{-ZZAS14hw>K
z1pkW+n;Sm^UxGj*{C2wEDOUXc1OFsz@@hJeAxvfHdvf<a(vRB9e#4J<eezTJ0Wr1M
zC?6#wualjTZm2F^UcrAMH#~yJaI>EY81AyB+vlit!3*#X>dD6+9${v3aXD$d`=_T-
zOsDYtzu~31&c#>_lHWIl4lg0iw_tDa$)2tP>O3&sD}}*bR1d@-N8TgbhnJgjh5UIw
zxlrLcc++DC=FhcNSq=6+XTmzANnle2m*`~}#dLF|nhscjWs_UF`(zM28E6g)R9QUU
zaA`>-srS0E@wgvY2fySrhHM_9zH}ihZjq<comJXuhU{Otm2MID%tP9^5T8F1^XcIE
z5P^ZQWAd<xd9Yb+dMS{GSO79nLyCLXH%)m({;C%#=t;j<y%ien$a=z{P;gzFJ6r=N
z8Zy3twdShr?wEVqLi4S+`b)OlYuH)5JWM$mwNV}B7^Ay_(amI_{!{Ye0<J#PX`J$5
z;cjj0o}tGh-2Rk^x0jPW{Jy9EQT}@x;@}^w_Z3ZItS=M#PuZji69|qis-llq|4Dua
zIPPZS#4lNzl{H|!UU2uf2v}U%Zx4Q<)k~5`BjjV4FvjA`G(4NzXAoZ9gmL6w#B$rp
z-(C13<0(QiXY%0@q1TZr#EbLyFf6r}O5dEU&QZ$CYohPlU!aBC4l9@{iyUWxI_4!Q
zq|zVX$K+}nP#=|gte~Q$MPWi%6%Of5#J(PXSs}bcoc{RnbyqdPUlClh8pE<B3(|Pg
z>X8VykRynqr{<?u&PkQY3DEEqx_L?&9R?}R`Y?(fD`nPff0&sk#0f)r{{b#{lSkf&
z=5@7%kb8E{vxT{1b(k>v68DoIGLL8SdUUG@If{Q#*je_yH;5_%M>G7rw<IjcfeQ>o
z+0c9V)$C07@yOC!6JquVz<SCVdfPBW4eLNHeuM^Z9$LmO^`{m3xsQ5DmJ@+{SoO9+
zkXeYODI4@_71E4;Oz&lJ7zP>stZwiH#y6HGRLJox8AGbEWsUO-SrdDWA#7!jmkx;c
zZ%63Yr`Uld4Q48`G(~w~L;(+{4q}ivpiBCjO_j_$hio;kJ8vaY_7p?ANwFzM-jl#@
zdsYXw#jOvzA<ykF-~3<tgKx^T=<pMt>@s4vrZ%(PByd?fuF6vf(n&@F1=<QwB25oD
zTvr7D2Q1rsV+{Q-u*|Cnw;XJ}=ct&iXXfr{{BKL%p~8{`JZCjjN;{<w$5(Y{#3y@Y
z4iX*Zya;d<Z|0sKa`cO-H_y+Q3p0|8SqQs76+UVaJEprD3C4e3W2_=}Wn52v`-T;&
z18-GV{&uVU+qV9A`T$)CW>+(Jcxkatt^EcKT`uu|P{hJ6I($>_N&S}yRn&_Eb8xI2
zbFLHw5sweH5D_Fw6%Uh937p!GIwh8hjwdlZvQM#qI*~7-&$qByZ%AjJ8+mx_&dW9w
zwiiYpDIx^x_@E~#rh$e5IumgcYnoVbyWH56Kp&6pBZ22o5%6AXsOu=M_St;mo(GD7
zLG3P#d3W<DKh`Q{XU{-cOG4nD;E1g*GAqUGO+aXmVoDujVk_!)^07}vipzrWO+N`f
z5O*LhHRkZ$$ycW5DOXAwT{T@eH|XUzFG9ARxV9XlokCu|8tLk<y<@-d$_(ib?vc_Z
zA)w>eRoQt4U(0w2cz%8a2^Jh^6j|(bH?}~uhneTf*y<$<=u?!alIKAcsCz%jrQR={
zQ~OWPo~JSq5ZIi?s8)tT8jCz_7T8hQOsd{XPm*bcp(3-y=8#*=)5zxRe;u}ykqiLw
zw-`4Dn=+TDd7o4hJz@x(VX@zVUQWI%CKqdz2QqQ(0e9UegV<vXUNa=tf_WN*9{)l<
z{3g;}r47}bk`BMKtb!@shQAm$kbZlq@`1dZdZ=}_08A~GX{*wj5qeE=(snyVeqne{
zw^VxD8@Ho3&9G{oKXc!)iE~$mU*Cjd=dY}OPQFv9@me8wlm}@N6VQ5YZXYh=FCsC*
z4^3K(kFN|P<G<$K&OL?_z4=NFB!~c3F#h(ArnZ_yp1Erlfp656fFLnK%NvraIDSn$
zAYZ7Ozp5mR%nE3<h@-pix$q+Nf=xM28XG-n2WDD%Pl6u*K-Rm+KbL4O#S8ye&JWww
zRVgV>_@5Z5q7kDrDR8zD-r47+OH;_3X)JmUTMAnop2<4&{jT%JP>a&|9)HZ@)w<fd
z+b(AzIk>bF^#b3W@T&IJ?ZR)#TO3jQ415SC&gPm}M7)D0e~+&f0Vt6EuLNMKh7Ej#
zQ|Dg)Yx?97IoUDSvu&m>?9AGaTQyd?*v#en&>>=~^O^-B6Ld_Ru~dnobHll|=YBW|
zt-Oyx24HTy>S`ecvO5+3-95wCsc#riRim>mgRHrD938sgdmgo9ict|%8&Gd&Jhlw)
z1kxuLMIaL{11qHF|3J%kMEr9rANE-mbFT#D{ZtB@R>6`tkGf^H+)U?U#(TkVm#KJ`
z*rN=*3S(ooFJTX!tt#br<$tQzoYb1DRk4R;4-env#m-Yu&IHP1W7(_oy%6KDxl4(T
za?=;&{?S4SBkpJ?F!Rl-!HtDM*^lUZs2g34I<Tb~ZSL+Y{L;<fgdhuzh(cEp{(R;Z
zB7WW~#vlQ^qVijY%vYbqVdh^n63SD5O?if~z)EjA`E|VJ0k_AwG<TwG5FXr*Pw!AV
z8hgMIts8|BKV9c%>*Yjy)qy|Jq5p*G|0emP$wE;io8KT7P=<;jXL51jHAGG`5W>mn
z`Oc!l7)H-E`t;u{0Ig{)Rs5?y*y}}H-x@by^*Hv_Hj(>za0r2om#AQsnD5FuluGQ$
z415cajD%qVH(@xiV<F5MX^TVe1YbmGV1K&DP14$3C=$Y5S0A@g-h6zSq3?NX6``Z|
z<;x((Ej?6E6_g_B7jLay3L>0#BjFza-`VLG39&`HaT+D4j(mIg&0C;%gVe6?&_eT0
zcIj&hu*=T*74F5%FH{l5l2m0w^lOUYEk5?c)a%Y?Zp>dJ@HtI@+XN|>WqL6lTf&*)
zc7s$D###TR_^b21)A{2Z$~aRF>i<xDKY|6l`b4dv=X`p^j=YI&msi{+)c$RPZayWM
zbnK{c?-Xm;$8#i^G)o_&0O?&uGYbe$Uz7@VHTW$4jP<OI2y*#73CA?H<QPGHZ`2!M
zs>fY1TO?tGlJVpT0;El)yy+yFS;Z6Nm^?WhTyQ)c&)|%3@bxzOC9#BU#2vf!hO*=u
zS=2J@+zpo&48Gmdd@o5|3+^2c5PxU;&f@Cv>{eMS&1Q{(Hb2pcC93!zB#_cP7V;Vz
z_Bhj^;N6F4NpavDW)|{7c9%G0wrIMPb3EomnrLoir>IKF*keL4i~lN5a*8qdh4X7Y
z`faWm+}7RnZ5!gdVZd0h?VbtW2K}qKO;Vh7=3deyxhecZherP<7dGGLiidY9e=T!Q
z;yW*qC+S}MXg*8-p)*P_qY?SMQZj{7a|y~Xb5CYzHBVjm+5gQ|hj_F6zZkp7LC((^
zS0uyF`r+Y{3ZTZmJS{!<p}~h>fdA<q3cS@A3`#|5sB@Q;I#|jfz74mi&3aBNm7p7{
z!Yknj=8_0F^BJ(}T$-gi@w$6=>bhKxxhW?45hd)&(X_lpGIM!g8&Kz#3`46@eA{AS
zX8V}uc0-kOeT%)?i=+czv0#8kC{^lH%iAEHgk^L;Lvr}3O3Y@uKPH?M_Msc*QZu<7
zihx@KvD~gET#wtBHW~fMLKcatn!E%ZKg`U^{Y*-KpWo3A{!5NDzySI3-)b8%N(~Oz
z1(zz62&hVtz~j#<MlW+*RwdSZ@c8l5{V%jtDCONqOKvgCBGK`g=(D2tbp_nj9lRmA
z&|u*I51UV%fLC?MB#?(fw>AN?4pGrdn?_;l_~j{V7kl*ao2`KXIe3yl2=@J4y_ovD
z(IUy8H0p$2&hZVJ74K+M%(o9s?}j>KBZBSwR5%Ha#PsU|S(6z-s7kEBbfVZfiQll$
zWl!jO1RU{N&hg19qo3rw230|Zt4HQHhm4Clgs7@EBGn)d+txv;;qQ^VtqV8$YlbGH
z1-G@K!@T~>0}k47CO)m4Elok$h+Bx;dF0keJdL=wjGqE2!r-JYLkba)jTc*3lC^`U
zF}0GHe~2p9O7oLd$CGDhdhWz^Vy+6(!^V?)SYue+d^#P6p3xf>FzcB3<_A_Vnt9`&
zctDq%olfnv)z2r%69W<UfhkgU<|3r?`??m9VnA+c3bCJgV%9K|i-b|VlY%jwROQoK
zlJ&L5c!6cE@IM8=zy0IEA|{f74c2$4^7I8sZxXqqOj_$CEU>ur@i|{)8qAAfQJgHP
z`mb(IY^J4lbs20-E_3j^GOHQCzsJS$R1pz3tf{9oSvzp>lWbK>!UNVsr8uTR#H)!$
zPZ|C;2e%|HgiUo@#l6dXq?pw2-o1h?BXaAK>$dB3R27-C0_h$#{cjT}tzG7Jo_$0f
z%TgrYuDRC4GspVFDP~;6MHe03fHF{lG|U;X#X>N>mK9UiQn6(?o*S|Car;A4PAJ8~
zMtbmKS~wkq(3b8vir(B{R8}ToWbc`@7=z#+AOBRQZWJVRk#e|7&@9^C({J9tT%3iM
zBof9D^DcZkVgWcczhGdXm#aO>4FRLg%7zI4e|$Uh+t>ev)Z43PG3{vZ(aO2~yEd<D
zllogW7J0(<kI?`Fu9RV8;t%t8$`EpzpQDX;P)E-pL67afC0tv|5vcD)g`p)V{F`=T
zwTbg13=JsOb8d>6evErqFn)cu`KbQeO%$9$G4mlBRt>~Cs42$GXuQ6-M)od5A4`%S
zq>XA1k#ImJqV;Lxa`fW1NFJxfZG1Y>`|B5^Su=f$O%loq$!mA@hZ5)LcJ3A3Y>SR2
zM@^gjMr<_ZK9Z!S2&)5NurTU$_j>#q^7E6=^AR6+(0l9iqqX~QysBViP#phHBz5%!
zT9|=8bV~N2K?L>>EOHl}N6sowP*#m6X!9>yCmXMX>|ElVWy?tCNZH(zC&=e?UUq$;
z+joF*emf1qYVVt9b5Oj1-xDDXiE5>4#=_*qCZ9>LXhrh!0%Fh8Jv)%xz{Q0(H~oFw
zZ1dLLm-n_?$f!|;p_o94iI^F_x}z{o(K7D4R-e;j`_K6UrT26q7;hJJ5Jh^kSW9Y~
z7Hk>x0Q-|F{=^3Kl_!N>irmBmo*M|2mdB_ONHf_UwN^M4+&w+k_Z!CEHM6l}b}DZN
zV8A5R*iAgVX;WOnqq~PnkqaB>GJZ5EbIxP0pqwbTu}_JtP$ThZW6F<N!Z$J4TRK`;
z-J93A9vFYxxBJBUs-zY}is{5lh`s8<lEa%LQ>HjBtwD`2@Cz=+4WLNWx?wNd=*FZo
zaNc+LohfU*hr(%>(4WI%^%x>v9_PyN2{*C|!o+FMLx}677Y~TFJ|i}^jy12t7}!LO
z64XxPfgr;Ek~2~#J#-H@#7O?MN?bTy9mui5$YsMO%8L{gi>GHb$VmKvFWx_q&d;%m
zFSmA^;O^^2)$R&#ef_fCM{%prMi4qH!l6tvX(Xb8;BP8d=h?lR_7GidOOo}5dz%0Y
zN1q)1Orv{NR686!b&Bod7H0z2frZ_wSNGtvxv^8U6X8|;;=ntjgua2R!1)MoS4ztP
z**z>&{o5-N<|*-W2ftOI1vd&9!R;pQJI(}hNlXmfcu)d|7J+b9QA{4He5h&=<=thn
zrmrb<LWyuTaq|yptjCoSr0&bP4^&s>n!Gu7vwTf?=$R}tpM!0<|550oZNDT_-jCP^
z{(cLlW9M!4PK?dxa95tRN8aS<2^a7aK=45-bV%>*tM|!>MXWk9yny-6$N13Yl+ih|
zpAAKm@(%D5u`9NIp&u(8&&@l$pF5K-K5=apVYJPW5TYCy@`%}gQ0%0_AV$eQ#u6JE
z&Zj75OjH@WQSaz6c6}vjC|TDqE|zAJqM0|jQIO${JQ#v*MzE`0;!Syu*@$r#W?#$}
zb9Sa1)GFxj6cTAwR&EQ!W8lDs)QiqN`GK_VlhLw;d14<i>(1@}Ne^49o813#I3&K2
zZRWBpEJJo%%s}4&he;jJs|ANKSV)y*5Tcsi3;ID<@GAIbpmd|&EOHb9q27*=Hq7IP
z2Dto9R>X!$)6tX`p`_5pMbpL!d2A7j0vCPlxz`%r<s8wni9JaH^Lhr&hM|X2TtVt@
zD-mK{Fq~%+9O|YOW?hpuvbpk8r#RI4)KT3nJ+q2ywh6qgsFWPE=_J(-lyWc%&%@_f
zaz}I<d~Zck4;9;JCDr}_H_E1gXG|D78{QqSi&y=nM%L)>6%Lqp|Ma6C(w^N@WjrC)
z@owa@P3EYD4y}SVaqx?lJScUy11=z>#XLKPDdl;g3^&m8-RMLbIw>hfpkq2$`(bO~
z-`8aBvj^wV{;r6RM*G>?e^uK`2}O>Wet7lqXxoKDAk&xS&F$i4yB!5AEBtZp#{mV&
zk>>RY<@Nn|W$HL0E&S)yJ8EYUWn@5>$y#{kT^<_qgJZmtBz0Dj-m!)v?<T^lZIgSV
z`P?+!i;OWmX@=6v;?SOyI#Ko}bnLIAhsClvH!eBk@$y$%XXbw|2ioY`=C!V$J?Y9N
zX~~!i#iHwHZ9Ft>q&@XNpgkV$8eUz|cYMNtI?B3w!p;OTGj;PuNF}%T1l=3wg#~1(
zH$ON~m-ZC8=JP!C(yXW8v)uDg&$0KWKkm8@^bbkEiF?}kgQWbnQFBJzm=W2S+f}(D
zKeZFa)(1JptH%vyGPKdF8qGWz*|QXLM{~y___76xrV5P2(mH{LX@St7J$c{M+}Zhi
zW)6BL^GqAAG=!#1oKuNgckbTfe7v{ti%c)LR&cqKA_Q*$Hm7&#>HAHBe1QA7&Ozb6
zn?vi8ltdY<V4xGnr7Bb*&iZd7g~G$}KamG+*8@gg^B3H|c5ysyks2m@*i@Y(ypodm
z&-#bM#wo?8o3@DBXi`Cvh>eJRB02I8NqlSLNtDAVzvZ|kfnAsj(H!8%{>Y+biqY4a
zZg7(wxQR~^5qU#}FUmdCyO@+7*ba5f{dEwHBgu`}saUE%OW$QVNQUTksDWS(E-k;F
zX$qZ7D<s7>Cf3U}Am^bM84;)N5`MaA6}TQHi8ZnQ+t6`|AhMdFRQs&Zt1R}W$esV&
z?Gpji)^W*sXOhI9U39T6{B+8B$Kk#XqSQVn`bPB;LUU8wUpmRW5srC>+uLENCXRCS
z^ygSFOK!eW#m_i)o)lGg{APZ#rod3NqGwYb7V=My9fW|d?ZgFc_PmkhFYF9X-airR
zUFY0*_^YF0uhVbob|2F6Dy)sUm;>!W|At4JVmF+cfRX9n_8oWT3GkhtDRV4q^ZK}%
z>3)5&H0FFL^?YyCPNHMIYBxIb-eI8yhVoq}XN~2V5urGp<CA!})9sJJy}d%tmRPm&
z&~((r-6*}YeEJ~WHfE#Ppf0l=$v(T=5HR*`Zb0mSXV4;unDt*|zZPo2YWy3W3V00Y
zH%cX^Y)Z9dzyan2!HB*kk9gb!vmbnkvz_AQxw#j%iPFwv6a8$IK8iczp_7<iNbu>1
zZKW5S;-^dgfp@df-+nUE;%S*3x-=~J5%ZltytfZrc<@7>`YU4a;9F8hX=*fQE-0Wr
zk@mOoYk@zia(g4N`B<f-Gh%>Y1BB!(;%oTzIJw#RJurca-vvF19;A<qvh!54wl}{9
zctV-=2G_CYR@jV|csTPyt@ZyMV`rwDFtW+tm|NV-LamrKm7`$$n%*eG%ppwdv2J|H
zg~d|caGptq)5^ZLUt7Bhuk+mr-h+Zhf{qR9_+-*(rpSGKgNY?IXl)YjI3_Wp7M0T+
z*>SxO+np{9cr15(9_DR4gCAPJSiYNXU>cR?UC3NwXwa_Br8QFL^=(CI#kd&G+G$q9
z&=lE=Pw;BgYc$2}x=}2=l}DNG8U!tJ^fWD-%?hKl{ZRXKJE&aj#YaxB;Y$D4n+pWd
z&`)LqTA@rg?IFQ1iKlgGhNC6sFfUBU9zz`Z8S0m0V6zhDEN13O>wc-YSLoy)J4^N2
z>pKz6k%WOL=K#Y`BFj#S5FHE>-rXK+IZZdu9L68LIn6u@oHXZ!TE~0)!EbkW50Hct
zHT$4X>^p@&xuhw!ArQ3{ONxES#POi#eB*3nspq|9vWc}XD4W7UAvTISwu;4|{xNze
zxDM*kLPp=rlDdES7VwD$QqO=5Dq#stAYy_^iqby2&X}yzut9`PJ=ivII=KrbX|wfv
zCwC~-I5QnAq1}8DIw2vTlUzbo)H@_z1QDT!9?ekXyCL`ob6{Sk^*>(PAU0sQ^+ZiO
zj(_atBDaLzP{Tz`{KRM11^haVvD+5ngROB|p_~xHAa{jCBnZi=oN5eQWOe|@DBCYm
z3&@3voH^h(V7QxcFnVFZR#KXrC)-^eE1ltB+w$nHqzsAycWKJ&JXzXe5PbKv(u4@0
z1jWt>TtH<DM)u5)t5`G@QH~6p-0k&=CF5cQ7>3!Dy&h$;^@8k3*qf^0;97O`{7Hx$
z!}b!BtGnUf+FKy*gA@jjtV;?ynAE+xT9zUI^B7%-xdJ1(q&Dn+pCgp|Bi<i_$*oEI
zB9qb(klX1-@;dQ(x9|Darz;K$>T3M`RQNgaL7leIGv}bkx+|sDe~3Z2iEbnekBD}6
z#=L_gYIa4iN#5=c_?=hghYAh~Z<K*VY}+nHx%{TmSj`hL0v=U3-y$6tH?c60<uDfa
z6fT;M+~_0HkGm-4yC|McZz;yU-`uFb?{~0R?#XK$78CDhpIkS*f9z^|F<Y!1H}i?6
zCz?lS0E(s6QkLUv5{-bjrCL_<XVJ4NNn`6AbbVYC@xSf|cAYv+QKZ$*-J%o<vvIf>
z^3Q4|((GI$hp-q%y5|x{(4}^;R`s!!p7sAM{w{KqOPb_)Z%o>=-v^J^`zYf+=t3*n
zm7dx=lCExa9{VdJEuy;al>2>-=1n(D<`s&fW&&W79GftzZIObhT*I>RoJ+_#BxN*t
z(D91d42%p1*S!HN=oyx^36*F_wT;t#^#+N<YxY3&z40p)^|=0Z5#1=as2JMs4^~}9
zl1SXhp;Yv@za5V}fB{7K$!gD(vBJ<mM+Otm>%#$Hmh)3beSz1NWLggf427Q3z^Vu`
zr)FW$3yPbG8)X$^n$`}PPh)Oz^a!+koAm0#!HoPa`0@SKc!o!kwOa&=_*Q;M^79;%
zotfD|6a*dh>^@I3JeWHtw&nvn=U|X@D6e$rzxlPT?JMX(3MAE`tPKjZ%i3fF8j&7d
zMD=#yl-Ep>-o=Q}Y)Oe2@schMcmFhz;tN%URCii{Q-!;;-)6^5hIAg|8EAuIccX~!
zI@>KA<F4x``A3o~ataIm-ke*w0sVFhcd~e32nRP?&-{7}%frgNY-dS#5JQluhfn>3
zXDFWyO5IJ27uGf-yw23mD3nI_F_3&o#z2G)R&7raGuD3F+R-@HaU2Nx{g$Q$d3554
ztggu~QxtjRlNf_EKoc1Uy}#g2V;1gMIVF<E8QX?t;}J`KHFb?d;cQ=EdA1dhfP-Fj
zt(b5LdgpO@z4_s#teYNapn-E1Cd57Ij~hs%ojX>a`F_Qfm@}uZyr=j`{+!ef#qsWA
z?B>u?xA$7Ao$h+z9}B!fDjm<a%7Kz&mmIt82OqHbqHIM{w`p33WQWmTWALJp@7ti2
z_Zw<)#=Qd}XnUlQA_E!8Q(IwCf?jDo`05$16}xC+XMd3q)`Z>^`IH|H9YK3i!n;vk
zw-0&OxR*z*76a;QNMkEfURo`50(dUrW^G%c!S8b2oEU|Lu|~8r<Bu}DAM3j4+86ys
zqARZ_Jx5>0&H>jiUXt`|6zdyuVGw}|J$8wwCc=_dKJlglft_X~Uu{x(5QBGZGfI}3
zFDgt8y<Ke6yY=rRbH2SAXtP*B)h4>fVZzG%VFqB=4}cgsA2hW+s$xeaF-!n-{>YTq
z`XT3aD<xX5E1CED6XAeRK-S-Vmz6~PVqJW)IQK+n$Z=FA8Llsn`z2o6G7oOD+jtX7
zdxed&6WgK4K+p5g!8Vx~+*>8x{h>=!w{`Q2Be<_}U2D9EFXuTx>9rtw9P$u0>y)hl
zT`88qLw;@!ZXyn`v9Qp(Uu9eB=k`4XjwK?8vYy$XWafV-*nw@yWWXUenUlTA_L20M
z2e&$<Jg-TTBagyVwwQe-M-@7-#?`Oh9ghvBY1@Q#)CYda-Z1rLTDw)nx4wlb>naa(
zR{MGl(YryI(L7N4uU|}O0<Jojxo7b+*}8AB6`6~%rx{3FzVxKgIFM(uafLHfS34Mf
z?%mN%D|j?6@w;SqLDx%_khQ|3)<wqQT0d&CMX=+wy&;Mmk94wEO)jy!%WrTAU7^*+
z%>CPK>Bxo5&YsdErr0A~y_pWeLN21bcSr67A3pVo_;f>6Y%>RQX_q0SWz83my@$(P
z-D{zjDqCB~)W2SeiK(m*_#ZUt`1(8YD`Jq#&zW{N0umYZxtQ;2j4s^c&4&hqP41OX
zj)7=GYn*rnkFfd7zs#iP-GHnvB;$)EhT6@Xxubv|NqRd^3^@ez2KBFq1_*!0hjF=3
zOl)oG#=4Sn&%9e6JCB=;`FW7?-K7wMmk#RBK8FEVNL;MLP&GLR`nq35*s$ll%7PMP
zc&tuk^;)La2s3VA%k)%-EyOwvI~5Q*P(Xou)VY1_a<@>ngk!4v_6w;YA08yJTbAC$
z9_c{kB%uk`m@HCi{}oGnLovSBO(tM&<7iav>of1a$*H221I0sjpI&e8v&}SV9{eyn
zJ6pWL)ZZ-Y0HjT<y>8vp68Dx=fYf{CN5oe3BVl6pRA_kBonptV{(_$8%IjrkdUpn0
zVRQQSvSlZWM({j6?H;obkS&Lxk$zmOTShi5@wjUQm{T&FR}VMN^X^s4w(;r!wWAm~
ziyav3_+rj$fy|xmCrM8#GR>TXyMM2saoWKufpiwpi$Xt`OvldZL~F00nHrdDU#-xo
z;Nd|iA1`Ds8#Kx6E2oW0-)L%eah4X~16Kah=`l2Qc9uao3qO){6CyQsf`1usPIjEo
z3Al!eIDr*9DW0b2c*0rn8_f4YYC&NBJXUh33<;N2l6qF!k1lRDa8QtxoCaM{aL@F0
z8W;;_0lx<@J~%QJ=%5>TK~Q<Qxgh3Zh|19IyGv6rP-^S;l#3#Q=Pvj>ie`A?_lsTA
zU<iY<Tt3dPJt%dKK--%;-N0ZpT{x7xtXe%nu(}F!5k5D{NZ3Er&>p^TIW+9THNO45
zlC7oWfiOVZ77Hw^zE#jYC^(gd3JpEs%anMp6v+6vX8|g9ry6+PO^g8SqcgX)zB?gg
zFI_|MaLIV^W(#R#ggQ2l<hmv@;+|g{mheAk)!PzfJ_Hj2Wp<<yd0n%HkJ)$ZUbfkv
z4k&ZHnJ1vW2eN#MA$pp6P>hF)w}`O27p=bHZ=5{}dG%cQpS*}*Kq+?fnbn`?vj{BZ
z=;|)-KDm10*Ss5@=!Zb?)I46Dol-{bE`~{<Oj4m1ohK7m*ZgCU<lg&@bQz1x$@;>o
z>5@&0(BTmAJ#!s3#qDZfH>llESnTaO+L2)6;~S5bhnq{Jk$_uVN{Of0NL|aJBYD`}
zz)!@eH(Eh1Pxq<?As^YS58ZZ~UdB-ZZt>eGxq8>)r4JpC%DuF_;nFR}MO=eNUtu%4
zJ9+kgku#?#d5XY#-_60zB*TejhPBm~>xsMVsZezWQHmm&qEFhowLZRRE7-bD!p+s=
z{dzdaN8#9vGoMl8mZf0d;?Q**jO$+Z9VJFjI4#a_cHo~M$IB;#@}LmjNd3!UM!)=x
z68EY>8G|yYSS;dso}@NRP}6sB%!~YzLZdS^?xDJ+K%6yDpB{*i&9w3%Iqxug&dSI1
zAFk*ePIn_DamRfi4L&r`=uGArzEBL(?4{)^w+~|QS2i1J+0Qb}gD^f#drsYx{974B
zj=*c?^Tyo;B^=MW58bmy*7XqG%bk5|oAo+l6T#xcAL&gpuefCboVxg&z6s8<t(C^j
zwl~)Y1y&k#K884Mww0kN(6VRYNY!IT7qd8;F$fNoq@w^8j(M+YzoX`#CA&7GeETIt
zc)uA%Iqte7<b=NNTbjt5-p{R^8j8yBrcO#A#W0_!*qf>=Jaqpma?qjkbqW+YoA@a(
zCz7`~RS&>36zjt=%MD1{adS#Yr(6Mqi^zB`ZqK&iv~huiIf4I*5YnA+ZW=(dUg$5s
z?ySP2jx-`K&JVXoiOOrJ3`IzU*Pzg6aOn5dQ02S(gq|h@5bz|s&Gije3<{eKX6CZ<
zI*4Ll0O+1;yKW|j^6%ipaS0l-c=x%vWCO@?7N<WzP9mJs(ZaTBzmN-tS1yp-0p@(w
z9nwWG@~tAad!<jF-|Vh|u4jSoT_#OJfbFAWZE;+1v`8f_Bga!OL9c#58Zfw<B{Tv6
z=@^M}*Rf(d#m$G>0KbYzS6*nq8AE!da!_X#jNu`JK&FSEVV`;QXQi7yHiVN8eu-px
z4TV^B&uGNdfsrP%`H{9KxwtCET`BIFBW@;M&JL7q+`IuFJaWeRbN;z3{g9}GT+p-)
zGTp#4%JYn-UIbv1(UN(WCC%RzItzGCIv@#U5O{fDU%9nGskeyzMj_AZ3%MIac59>h
z1w~j+qhAh`J;tY6qa&g3qNQ=?HVBQI+t+P|Wkvr7i8x8*#o}cP5M>=|2my3w(?0;L
zofD=RwC>(`Q0jf@mC?X49rGDfRh5~FLgwb2be!vD&o4ATVMrV9=p10KQcF3v!yf2k
z4A{BOB^L8ED)cY_zUW%%J21QCR#|_0c&%stmjbA_y<t=_tGz;D%Q`yL2QK@Yp2^=d
ziKJqWdmF{Y@DOHv3x1m^xlz9!<i?DeX(E?alA^1ybs-UQMV!8i2t~7mx3EEw|0M^$
z>BfI6Wsm*|=>2y)6|{N0PYXVzs;TtvQxj{F!pMJmW16hlz5=lVGqXobij8wf7sccv
zC8qX$M){P4ThRcIavs2+eZO$5vHo^(Vi?)Qby8l+n=(k=)(F{k=87yqs<S6)0EUjh
zO-<L9TyJgRhZO@#5aWaGamn3~7u6Ef*+{f>WMHP?S+^J2*tg|hzhtRy$ucw`$DL{m
z8aq(?S~lhOS((!^y_g65)n2zxzWBdgZ>h&G@&{#qB9q@?@kOTWDRd8T;PZcLpdR&C
z>+9$y2dM6g2X$Wd^kqhr$$d!IZ3<}W9{)zbX14h?LeuR=m<n!@FQ~y=(~oXq$jvPR
zmH<v#@Ci0)?8f#8Z=NQ*&W6B#D^FzX#eD@|PlqT+Zh@;#UKcXGXcmiMk&tsn7ZT~Y
z3c8f4k-{B1F}&Mpd&#a;&&vPxy!4T+;_>xrhYK6_i2+^_w|k1mm!?`-f=b@JHbWxB
zMPa;YwBd9-3zR&+#~an2eA4{A>!0g*8^0T8qP?+axN29Rzr;?p2O!ez8uA!vKiwo~
z$otu9Apc%`J@P2ZwDuI;EO3LvfLrZk86Q7n)_X|&)TOWKOrYCX>&+ggxh4zCipU=6
z$%yeD;bb<4X8iKsEWqHkj$Q^0quq|9aroCbqgX!Nv*;3I6`3e<`t)n$*k31j<iC$-
z?x!fYXmDF#eZy0=ZL>bA)80=?>P|8QTq0`Bj1MNTHm{lWi&bJ^@u9i;f#-!EHQ$th
zP|I(&-lllVZYL&o@Ba^bZy6O=)BKC#E=h1tAVGq=O9Fx57A$!1;6B)30TSGTYp~$%
z&IETG++BvjVHo60p67knI^XYF_rK0Pw?FMQ)4Qd*>Q~)WyQ+881s=Y>1evmM`0}pY
zUO={<A1F^Ztk8Zv;V5(n9c;)B-T(8!w3sZ0gkgsN8ie?)`P)XWQz6xO4)4Wq^V&k)
zEZbOMWb^4y05hLPqi}TO(w3TELaN(Jv0ijT_2Sk{h9^<tJD;{^y}=UnP|MYuc550w
zpx@?Bv7DZaEba&4CLa-jjF*(**-Pm}&zoPuj&N@CENFw4RHt1S(HE6_ztCj?BD1yz
zEi5otD+C6Fm*uAQLG9)t$-8RXDif~DB0f?{&X$Ab%8a~kBG{_Ci!;l35mGJ0wSdW;
zFn<FGa?;_;TeJF=Dkz0GGHdo7<=tUO?j#M4>5TBg_fiuSVM^amqC}=P3=`R}zj$0*
zaeGP!?H7}GYwF#5Xa`N4)h5R}r*oNoi;l^rNb=Ipj8+;%n{==p$vUn>7;4OBJ@fya
zrrtn_I1aNKX`HJK*ZXPJu{#H*=P)!Z&;9|H;k?~lq(@H%G)#lry`7qz#SgdbKmr2%
zCrQNB8|xMuhxFP%Q{3G9oySKC&Ryu-@T(d>czU=5*0A*?rB4;0-&e0G_HU4%iP7y`
zH4_ua+AM4gd~<63L5E)pM{UQEDwG$1s3V~d(bLnrNMw+&6RN)X-8__YsS&IpC<FiE
zW_Pd^OcM+ePY6~|d#X)7MR;O32Q1!^nDia2tjJA6F{e7{<+gib*XK*y?T!Ym)ZGbw
z>NItQPy1U36gKv_WjN(;_L;ezWQ20PAqalpR`Mh*K6Qy3`X!E&Bc&%JBQut;@0_r^
z$Mf9{q)uQLUg*CRFc88<o9F8DeGIZ!io#7Gwvy*6XSdfb4s27}7r#b~^l$lV+WPm2
zo&YED1EM8Fp(bXI`2J53LXh+MKgC%shwa~roB#HRsQ2exV9mevz6YaN{9Eri=Kq-u
zw(Zq$9esPZ{K<}ko<2TmKiAiwx%$Eo|M~N{3&!VX8Pn%uHaXOUci%Dn_G&&NevN6@
zEpqNCeMwJ${^Qq);s4O=0Zezj0I_J>T^k%*V%vXLZ6Ji<c~!tBw!eQ*V9gSw@Arvi
zchMcALDRo$HSj5dhSQQK@I1dIm`MGK@joy1eu%61&tihUX;vb;ppZ`f`K0w9mM}k$
zoZ8RtfGWIxf#d$~Z`;32^+=7Nf8ZfZB+PX8;%J1bO?Lm1A&!MCg%I}%u(_7~>F^5r
zI7yK7pLRHihfmTBa>@qV{^7L$Pw`r}PX{Z49eCLB=1+~)3=?}j*{XV;iJ&m0=R$<7
zZBtEPM!FATWx8B<dC}1BT2+gt%pb>clweZ8WvSW~mz}O;vwhSlvX~cBMqQy(Izrm^
zQvZrovo(e5E4BEC8_QYgS#RrG3^c*+KF9Y*9mfqD_5mcNuVQy=gG6)JS@wsW^odpo
znQT33j4Ym|{B3TcC><No!Y%XFXBioD>Ol9?-nE{?zT*xZSkVE(XY9x@qiRw07SWfc
z!e3z?SV-(zL}Z4&tED@X6dl{cm|2k^At6HW`$3K!%k$^3?J5&j20O{LN3%RU1)cW~
z#@*+kiG+KgG|qQRF=IxN)dXLMsd1I8S5WGstGh28g3sxs_hkC6KUY;Fr$t+w{WKRY
zzv)?96YRqez;swdzZz>Hq^SjM+39R0e303u&-)8!66$QzSyil=9jMp-ka6j`X{VLL
zxa?$ORCGxZ`26PKQnwR_b%<d0x1lRFuDUYPB(U^e1vVs9{47Pd8fqH#e2LPGE!$cT
zF)cbxg?0pl$y>ai9)HO%aH%b|ZWog>oCAame8^1CapkScc5=Y`+L^-yM${&w{|CKn
zbmek6>wRXC*7WF?y4clrJRPtWjnH~L?ykPOJAP9@MX-5u*FV@O4{Ali(Fopjz={&2
zox`nfEWCWt*psCVe`;vYh;I<5JFH9%Ukpc3_LIh61dSgG2<0q@<i!XQeI;a+(}tE*
zsDuKB(Pt<D`xo)-nGhMx{930f5#qwZ_YWY40!+bQn@)-+iM&kTU2yhiuQ%W0s+~l7
zvs-(uo<X_l94KL#LAE&p({n065ZwsphRq`|X>cQ?-myE)ZL)%_t$Cehi*swR-$Z}<
zcFE|jInkmu=c;Sq1A??`E71#;iaC$6)<F6^adM6$)8CQik3X;d0}Ia%IS~}G`a8=w
zRpLrXxFnE?E{v&D*qikD`I7y&#WxJeG_X;p#j9twtaUV@I~QTeS}O$Tsu>UUnkqX-
zxndDl<$M3A5_5&(IYe6rx=t>vb{#~t4~KevjHp`10RgUosqr2V-PMeEAjJOL9n1zy
z@)vR6izBqs`QVpAI;&&l%;r+6)q5U3x?z)a;O{59Lk!bd+$xvID><&56OZ{v5Z=%W
z#7j0s|FDiQ+x$Gz5@8YBP0!(4&*#0!SqrBXNhx@{mFf=xu(bQazCEe=q@oy<G39D&
zvUt%(uj&Zd`e@W?i!@^~)yoipOA-hFj9Rzq;VG~sId^%o*1$=$%#YT{*XF)*ZdKw=
zOGB?Ttl+B3OUJU*vmLv<7`YPxol4kI>IhatE@Pa(z1`m#?!=wlYVWY&n5mRsAx~Mc
zf^pUh&g0e_r);YrY7Zo;#UKRkqXt~M%JyM^qF4`&ssl9!$o^0^7pR}hV=>}<$_?12
z+Uj&oebHn)H}KLI9Hn+;XiewnZI9r;RFA{Iy-`fzr*{f^E4i%G>eh;k-WmPwbidTx
z1MiK!c+E8Y707F`;QL!uCmPXXhUPyEMSYhQD%hCMcKIFz;^K6F>B2t4uBz8S_?gGo
z*m>z9&!@Q5QKuytF+y13UyA-23@UW&i8~$WT@`8CIARE%Kj)P4QuyR^ZXpyuI5Hd1
zjJ?<@$HA(LS-%za12Lrkti)?A+_vle{mX-Mcy6@>j1X0OIi)5Dy?oqKU4@gc=-Mb*
z%J0wv!RXr=|4K$*oPVexrUvPq>8NBk8z+D&*IRvPuY6)E%~05WJcgFY&>ff9T+jIL
z<V7lE=;KoMlj%B<2_AuOOBoRCg(y72!zV%MPwty*I)91#Px&9%nhY>aztDwq@l}Yf
zkqkzwW8xF`xUU5s!uDDPct~)--JxN*9+P_7d^sC%rxo$N{Lf@DB5(soUB3=i9_K(R
zh=7K;=hp@ucS$?IGfS^=!&C&032qtCe~=k_jw*0Z_KD^j6ExH$O{?u_eKqXWrRHsK
z9?LKgqZ9DDmA;)s&7UrX$rN6ATq<@1Y}%@nHI`u~dJoTi@Ur%%961&gZ-6;1Ip!`B
zcB7g+U@Q`-S|DdrTLC+b<^jBQ*M-KF>ncrq&@S8V-Ek>bz~PZb)eFiKRHGm;>Q>o)
z;2}KcP;35PNLSPNCpWu49+)k&!;CAlEdsLTbN-CaUOWt)snT}#F>!}9rG26v)#CxT
zj!qO_h(npXVZ`N+ZBJ1q$-z0&N<#TI2ZHIuvk)8kqN*iqX*My5h5np7AHj2Dl7z%;
zL3*Ts_WT&6@RwCsDg554p4w?wMX8>oBcIW_*(ZQA)oob2W#8%rd#1EbZ|8CxONI>4
zwLUm)LpQ4FR3>F5b3=iW0Xu^4kBAQDN}n`b#Bn1p9>{p{IzO9ZLiTt*VsO1Zc7?Rj
z1!Gqkh0u*+vbsd$su<0oFxGV(0<3pKf_l|_BxlwLs3ZxO()-tD;-Ed!gn|5hsF`I5
zWU#b84-)=VjL~#sF~5Mo8&&DQizH14lJugdpEfUh=|@fTM8GMLT~mbx?m#Mq25NPN
zMlBT;vN={f)K37~2Z-h!`j!UFmlhJU4_Y)0PoCkb&<2!<zB-jjQwkN)ZJ|~<p&}?L
zDm$*FFiY2l>V>C8w`AZ>>p{&4PrO*4xa@!?O?w_65o>j$u6yy4Z_O(S^J^i)@6kS8
z2(y@rb2koF{_1m;I;H1v5#JE|h~_3h?)5uexD{6^-Uw*o?x~vq_2d@kxDB^K#M_ZA
z=g_ulgZ29E@;zMhbW`w^&?MEK(fqZdwE@9T*1%k;W~?MhtWpR0B#ZY|*bRP!X`$dV
zkbP%ZDrfgD*7CI$S3{RAsX_aYQNf+*?VYE|T(aqf$0>NpEXh{qT_7En^j>-uWND(!
z@g5>^`DVU~X15KE^EjA__M4Y0gm?rTw%*H>^JuE;%8GVx($JM|90E~}mhM0U`%)dw
zN#!s%8@HPDuYs;}A9&Ua&4tI0zu~V1mVCHwVQTE^!_0wvGfS6-*HPl&RS`%ZJ%vU|
zx}g!LQjgbKuc6@INcqv&*;ByuV3}&PfrdV^m_rm0SB-VjrW4297Vu-<(i~1JZO|xZ
z!Kc1}P&w1lsB5S-P4Mm$b|-~GtqLt@sQh$ar+er+g#(dyGxV;05OVW=j}dU8ubKWi
z-9|W-78F`!@)Ma;0%Pt0Yl!`}SA+SMqKqZLsso+%5&MwpI5M%ra{k&M-x1`4zYI4M
zx*9(%?GJnMyVsv+d+l|S9Q(LDe(oI~wSF;i_mZzHKp3Mg)eLI8U+UDK4T=^u@xe{>
z$3MBn1zd121D<Icn%!JuHiG&%rTZ<Mraf~@2--}o{ZWkW`%XbQp-d*TVt}*SX8>Yw
zMQzMV+MLObI$+n;M0m*V^E&zNd0rxo9*U<pYDi^|*;>`oVH?~OV*Jj62lbaDU7Z99
zRkjn_{)H(-QtII=R^n^_CvEG;cWJKDGTUN*4}oMq&pU&cIxzgw*L(ZB*+O^xaRqPQ
zn{eD}fH~)psFdbm_VC@NL)L3RA3hYsNt-S6A=LYe*x=LE`>g~gL)5wlGyw;Uw>gi<
z^QU;^bA)S7RIa9Ag(e4NH!<vP&y4}OQ%g>}CMuEaFmY(l8S1=XF{hLkT7y6C;`IU%
zu>4N634*@<*;+#2+T$!h9itZ9gI{X0wgwX5WP1|*{Zv8oY2-L$^ai=U>iVeanl2Yo
zJ4nkTX>jwJFL9yU*cKPC$HYYK^I}9Y)UaPR=>q0H@yI!G`5a9G*A^O@?!6&*^TzQP
zSvOgB^-b}?(!?abt3zNPRLX`{>{`64Mks3wyB$aFB(T7<2ka;+!DZ4LW#SO%L6jug
znRp<9+85<CZnkb8G<qXFfl&5=vrhzlnWt3!x3@!>x#xcK_mW*~L7`7nNs@R)Y4Pvc
zFmZOJa{x{D7Km$HmLR!cROPiGu%8KA2){q3aapkOJ$9XH@BEh1@Zcu7H|n$B0kdlu
zwh5V(zC%G$NGOWycmDPM{Z@3yd8A-pnhH3#TUjUrqZ(2m%!EV;xC&E)k^HVRvEt#X
zxGSC912)Vhl<7&GoQ}@x!oQ*&pP#!Vu<kCv>&~A-CIkRy^2|*b;8k2IAWApS{w>Av
zAx%eN`3lZ>2O6N8IZP_m(4k**;xh(5{=~JW53~G)678CKDp){hCu~vQ6wJWj({JV1
zouymlWy|yNkzo0PA4>vpiKU;>?-oo-x<Dz8`%Ugv0$YEA(mhZG-NGEZrSz3?*I_nt
zcF1TeqmR=E8Adt8Da1esktamQ^yP|GvZYfukIBx;x;S2EGv;Ev;G@0h#_E&8)6W1p
zAQ0pfJ8_5nsRQ|^ZI|c|?1V$dx+5R4g^RDg1lr}HV{|>0k3c}Hy{PG~w(eEqgd1TB
zchwgYJ}LN~Y@t1-)l#5~b-K7Zxzts#nifi{Ls*pbyE$!ZXb2{4h?YO{=o)I@Vg<79
zx*?iRMNpynXU{M?S^#+o>gmABAZ-KUR(f<Q%G@bJCZ9m`b#a{X&TnKiL?!Oz*EhCv
z-4*?i^Xe{reD*mZZ%4s*2cLC2UcAK@WiG$7{ml^w$qtDc+?u=N0W?a1GS#k7_iyb&
z`!DPyXI{<M(g-Z$XstC4W5SW&L42*@0x8+bFqK01aXp<bnnaNC3b&Z4?vX%BGtD71
zbTnG3c^wt7{kS3GN!*3H7FN<X*}f9{z+Ikc+nBH>v?wpfx%`Ax)<LyMC%)7z(i%BD
zcu9_ugg%04JT!m*Hh_m5fCaqLb=gofVGmgXOV9cW)x>a*z75Rwx7=<vz6#x1&hEN;
zO|^>A8V%Oo=jvaZ^D?so&7;Qni7q?}c4xR(-BVh-8W1k3NL>wc6+Qnrt?6^FW!>3J
zkQgJ0*&08G*yI-&S+AniH-e_#RC)a__cA^3Uaxu(b~T0gb!h|6J2tXl*X-A@rCUL@
ztJihop(Wl}5(J&F$y3<UoxaeOJ6F_5Oy2haH}${h6AkmI+%WHuv=T_+j5(&MO*2&`
zN)(KKRB*y|6Y`lNkQxZm`;bba*Fs9kN{xHZwI}l1KA>aIRNp0OWvE`wq`SgOGYINe
zRY|e?COA!?N=FEXT{&{x7E_%%7@WA^JE{XohnC1Pzf7$VTlLVF9m}!Ls?s5JpiK>;
z75kcOdmCs3_u*XS@pj$i7F{1^m51fbo>+*1f;8m<VICtV%S8fKF;&=9RHkO<*|Vv%
z<9L0#0+RT)bb&iZIpP!_XI2T&`%UH;PYW@IQco`IewWlQnWei%9wP(F>@IJXjA<$6
z)59Qs7nz8A8eNoh%rJRbAo<WY`+)h!W%&N<umej?l?F&(P*Tx!qRo5#A!_6zf+cr~
z`Y7Uqu?J0U9(-y>H*xt=w4_zRb7Yg%0yQ7pk?<~Xm_{E{Ons$IoukXWC27VXMtIJs
z^L@GWMDlXwAYQM759Qs-w?loJKo{{D@cUD4cwnSw)Wfs#t%Af{R>nHJz&`k~>n}gL
zZ{DV>BGye9X$`NB+))~UW<eyx&r`o3juzWJ)wr||zIWB#t5gu$;xoYTv9tH?kIC#z
zF|_>VYJSydaYrr2A!wk>9t!Rqfz;8B$wf?9KN{&sz2M?0?r+tFQ1zfp?Q~qc(Gef|
zVD2d+W@hC0gH#5YoJz8`H)`wd%im+_S4@URy+<Ci1FIR|xlrc>*_uEe)ar?G!n=<4
z@hSP=X-{<6NP-lue`OAe*K58Pwv((PSM2NO4VGH)9Xk=O=s=B3?qUnRb9XvlgQPzl
zy6sq1jjy6#SF6s^PJ<b)Oe{oN-3$9?2pjpMRxRIg0QMFVgbP%Je*R<|9xPe;mJ7|}
zvyjc<<hNKB(adztge+$dT)i%bRo%f^w2ng(meza2Omx16pXtKWLVp&HPKfqa7Emo-
zYQwKA@Q-ruU+veKtdbp*J-2%(FuW{lzCJ6x7ZE$1W9WolnKj0Fb#ct!Dl<R4GPn^d
zgM^ORd#ra}NJ3^D5vOA1zu2f&X2dm(K9ssF^rq5Uy9J3sU@$x?7e)&Za0ee%>t36`
z>(4uo6wRVMHm{P00(8!{C1P~nOs}&__lY#!V7K15!=)#x2{G;hNKUP$A8<<TqW1tj
zRYw-TJBbeW-XBG&?Sw&83ydGyPA<$Zr7rv<x$8D$6sp8Wcg+3p`oA_#3xjSx8LE18
zGx*2%N`GgOkSmJB_A#pO7oD4ZZ>7%zX`#5rv+#c%voOd$Q6o7jz$OF>eDDue?Pf^N
zxg-7Ej`3Ef`;aR4ta77wT)KAkIp@E2u4ZdpCP@0{Jae~jQ<Fzn$<K1m1y=X-X70Xp
zf<YHMBV$1|l#t@~z-A9lny`5BfoZq;%9f>*SIc+F3`}d>JQSK)(fnf)!oZox7^~1l
ze{3db;38GG59<V)Xg~Q{+JS`rV2+b^8sCxY9c59j*;lD01oxRDE<^uX)Uu)!j0^L2
z>7a(a$!H@iaSRp|l)j=DIv?G*sDq~XWng&pfR*~%Jf)Y+uKfX!g&OEBN7;j|^+O-}
zU6+)M1o!e@D+<Dhwu|I;^}bt!*>lyNGq8YlT)!M?f1aR_tVB}quJ#aa0z1S;!(yz1
zqV2wSS9q*;#veHY4boV7Ngll>Ax0+sMj%A$Z3<htenhx&GNaLH%#Jm;hEFp=YQ}u{
z{jOwn0@6EV$DT}{_uj`dH<33mayM!Wvp1ha`zeyU$OyZFIg6B1_zc;s*N{gV`^RF`
z(D03e58v!HWUx32PH&)eWMv4JdqACG{A&KJXgyVX%^NYJ_7gUX3E?F9zV2Z52|!W%
zvMxbsr>*LuXDGTH&@zUK^W%+^yv#PW=3n@Y;X3vv`5jCJM4OFLbI0{Pb3UaN>D1!Q
z4mM<0dlYn<_Pf^Y2zpa>ALg_Ks;Y7R(#~Z6^d*MG?v|U)vCgM+XbxomgoM6Q1+miX
zU}d0QLnW`eJi3+`{rat5wQDMe#+uZ?a-1zIsv-Nm6&fWFCF&^GSD5WdqRi&oy5jX8
zT}_}E`#j7x=N0T^+j;7Mch!o><DPh=*=Z$DZ3%0q)D)7Zrf*&p6+;;zs=Nrqk?X0t
z@4x0|XAYLidw5%jPx#)wccBsax(r;}$$i%uYFoo<C9*i0*cwpDq&a%rn-vN1NSVV5
ztme^Yvj!5BdFx&Xa|sf50to_8s~|R_^^ehErmOTm5{QHFMs#*p+7cw}WfHO?G%TG~
zu|uvI4+WtSMX#&EqU{frN#ib{?*W#q^gI@e9oSU3CZm~7b1No@qlJIyXmXFC=A^BF
zy6HRs^HkdUw&MY4faIME2|G>4j<&D&=uR*WO79>Pd?0mD;D9cncfy7@blR1Y86<-s
zthr~%O;5;OL1X<JsgCa;yj9nv02B24`xxG}!g}1SwmAYoB^4!;-Mc-_Q^Y++L?p_f
zGdu}=2w_i#@&s5M9lmVRiF2R>oIe#j?Y#oI+dT-h-O60MXH;3<j!OpPuQkRTxDWP%
zO|4t#1RMrUsEmF;V9XT4_O}_TY$L=O+!%MvUGc_?aId|2dPRWvatP%Cm8!??)WhlT
z*A={_cJn1F89Jue(&RnX?<5$PU2kdKScS*qvJ1g@?hkDTm7*Z0BdAWGV5g<wXimb+
z&X`y9F00Ki5RTwq#=v_Fp<8%yS@ib-V(zX1<?BSG`j{Rv$FQhjQ@B^w>pp#y+TGx}
zpagppmB|!!v*&0E$2r^Pp}g#>BxZ3BMvEPO&)h~5?GpoMGc!WG=$8z?be6EXx2jCL
zi;aJ7w7Xuw4g4$$xikQC)0hEWJPwV7njP=FPk30YTK>xmeU48?yBn88Z!Jdy^k%O|
zHo8%-+3sz06dRCP@w)vqKQ(;EsygOqf*CVC`XS85F6za=XOvVq2S(O3ha_x4=|I9^
z_FJzlJ{8N?^kgK4stChk2O||TE5a1d@zyXb=aR~4dRPrc{ViZ9J;y_$&QDpeo<3gq
zpNh>03*^9uH@9eFtC+SOXtVwfo%Y3!l6D<b4V`WswoLSNOf6a8zx%qdApD&XV}pAe
z3(l78q|XNvE12l%vz5HX6aQZ|f2|JsdUexDn>_w+9wXAStmPZSA2RgZ{96C|g`UH1
z(?7+W=dM0n%Bm=98Oku)9UQYF&0kE4?lm(a;8Sr<Fb+3;Og#>FZUQF!JOL-M(Zg=@
z^Z+$s;oCs|Xm<7Yg4)V;H7L;N@l<E8RRr<M*Zr8b3P*afLi7(2EJK-t;ArJTsTa)w
zc)eIe1^-XnOyu?5_5b}A;D5k|`0w)lAL4*bjTL)4RIzjD#aeut1^V(Z47+TzZLVp$
z9r-17Mkyf<{mgNZ4e~ZO(XKH{-pz)l{hgipp26y>pAxalK@evZ8zGx3StZ@_2QG-W
z5$RZK1S_se?F}L~muReIREA<xnJ)!@2njKT*EX^`tBmK<D*_MbdZ&*>4`nH3)&g5r
zd;D+&DHm&HEDE8F8R|F8Xcw-gbWh2<SQFoE9m}69J@vd1*w<|%wsd}cslhMFxcoO<
zl750~b4&z~jcFx*;o)6RtTbldNG$eYZY1`l)MnU{v0Ek8CktLXZpnO!#Bb%?m{tUH
z#pJcML-`5%y1gVK%}qZS9q`6DpMK(nYYtNns~J`aEnHDodWO|3dM+nBUB&Y!qJp~!
zfIj=;JzHfboG|Ka_ROecWCGfUH;z_%%S{F$Y;3Jrs_>XE?$Gu96sC&>Jen<J39)m<
z$5zMt`u4{iUvD6O9FZIhnnlajmPl+oI$L`&FL^7vitx75YmO+&56&7GruAdSt)0Dl
z3en(<!y~TP*bP_YngUvz_QDavJKGpEJcg>aKMGf*|B!Bb*f^q&jMCIzRj4p_5=MRr
zIJ7Vljk|~Yy)uN#a1awUNzx|um&Id9XlY1$0l9&dgqx@`^mLOHi^C^dqC%83t@~xr
zqwx$-7{F)kXb}DuCZwK|F|=a_j-(_dL-{0iE1h+q@HE$FtOnXl70(b$yZez`BkbvX
zaX*~8gS?4~#vp##`%<tmsP_?_jl-~4t9#BX7vfG(n2{(uS-<Q%SkE?gg+f}mAPqGk
zj%iG(*~EDpuH&+^d;{=X<Gh7R94VzXdnH#j0R)&Zjk1Rr=*e(y>^MsAG%Gkus~lI$
zo-r{L{aDX?`SgN=z3rRg@SM}-ZKc=UgwMAylrK;_(y_8!yHh^L)_`gh3mxtY6n-|3
zZlk#go_^RfsxBqq+|d{)eMGL=<&hoD$MdxHI@#-qty@hGMp?mDfn*X0Wrkk2jSW^G
z^2*W-TP?c3KG_-gaRXRFM86O3m2FN@7?~hlFu;A&Vl#d&8`MUw=lXAXwrf(KW(xPc
zy5sqsN3z&w3tt0K3W-d%3(K*o_+Ii<W?WBFwN%-@jUPHHHF0*vm+k2Z{~(uT%Rv{~
z-{E)nskfe5{LVH@qRmt2JB>44HBzY+6=rK6Fh!T!2S5957Jirfh5MKR`q<o+$~Fpr
zHGC^SgR;W#9K9k#9V>tY@`_woc_dR#ePbt)#Bhx6r{MWhqi1EedM0LC|I>>ib-@<?
zaz)(w$o!AGC_f_D*z7juv4`+Dlt_%Ktoo|PRJxOD>s4lzv^4ZvMYCXBN5`M+y-FCh
za=#`hWGJ^aB-ER3zI6(Dz5boMru)_Vmx1Q58HB&89WAzaDEg^E!(^RizOHn?gu%4V
z0=2WJ9qA4-E+qEaXAOPQi3}(4^>EAPR5L)PitP$X#OON*-;_t3cs?|@vYdA+Pvfbk
z{3@{WOXwWpWaJk6XbbN=Pd~bHTa(zr%NSEAInOQ}XijJ+47-KxOgDmp;tYC?Ng;pC
zQf_@zoqpA)C9P*p&pIX6h&u`{X(!jXJ^)7|B#Rs_EBtJ|Ox|c}IK_%naA4ItcR<Kz
z4ec9)S<eOmJf7#5i#TEna9HUX-FfZ!(#aQoo7k4pr$1*#!?gCP+XW8!=E`r<HCY!l
z4_~#PxtCxT7C-AFt9i46n(Iv&8o?sZZss~AufmZ(%fR<>Li?kQC8L#OcHzypEp>$i
z{GePUk0lfdP|rt}!k2^6!{NP<y$US8zT@umm*+Qgo6yd+f@gZ)x{WXL?hxjbq;jXY
z<m%v(dJ=$((fYx<E%iZUOG*?bIe{qi05d~%DWiw2lkcm7gVbj@&+3#9T8PfjjiEBM
zG7%+oR3XR9r^-V*6lh`N>MPzsEsx!z?yu9sUX0iJnkjRn;}Z6aLS5T##-cp9ue|jZ
z`DDFBwnj6c`+p4dWM>yQZCZ?bW7Xtm7r(rq$ls}@I|uK96R^AfKqW`xLOrd?QGFEN
z?2PG;vu=vn4I}Ky0i74@Krp~%>H|_2-i+qTtq2nw#e~(ebjg*hqfBi7*rpXG8;Uz3
zP*QmxOO%-ui^nDLy>ZKM^}a6jq$+Uff%wBsOAYa1em(qk|It&3@#^Up%4R#xW%+Kt
zn6=|EIhF>H)0!}FZuTP=N5Zd>%@FZ&+=fX3jFUPXv1ze_D|5R)%1o=d2bu+Ov5IkU
z`G(4sK}}&*jjW^y*C{P-ipJ8!^E$g@sV`Hq;RXc&?Y>1lM)cG;Kl|~DoGxzV0?%sg
zU5)Lx1uwb#9zjxVxT@Y3`P5lB;V%-3+Ss{^Q|NWL9jM5!-wW&=oBM=XO}RT<o1w11
zT+vgA8<wVzP0{FX$apJI#Hf0gq1!*K>t*${B>t1QpsDadp9|4)qTV3l{|2A83&%rm
z)IB+yvmLId7SA~3+U!$=m_M!!B(k9N7%jq4$yqECRa}L+GW3~MxX$oZ4$?gO4O>=!
zu<89>)ot-hwjG2YXJeY$9*}$ij>UU+J~eSCy?WJP+SkmicfmaN!`1%pik@GlXQ)uS
z`S^cWIZS3AUmn#IAI&%&UzJEc0VblLjyxuK_EGfmXb(dvF<~HG*463(U{s>qLWa52
z@2tbrKKc+*;irY8OY-(Q0^4U(=gCSxL^ZH4-0#DACs%&31w9hAc{XQF=33tKM*V0_
z?+LD}Yad=nOkT*+?QV!ns9XO2*s4^>T$>`}ExfieavJ+i@ps}29rO?OQG6q<2TO{s
z#vC6`G!!)bqXh4kGpq}+vkJZH5`$QlL<jWY^gcDX41uh8%PlNvc~PJI?_9?S(w=1x
zo1w4eZ-b@7+A_!nK=}nQx?796KOIU=VuQv>$Aw*#SgxK^E|9SMnyr&DG83WFc}?As
z-%xjgMn096g>+Ur)A3y^7H3@!NHenU+}bW#Igz0>eoal5l`-QCe|A?E++aI*C`Vqp
zG1&wV-H-8dtarD~v4%Z2i)fU=&Zh~*irUTCu`pLsYAkJUA$cR`;84;j-}YC2+yiuo
zBN7ejOvJm|g%paZ;(9!7Ru`#EERh;i&snvk`SHsqDK6>J40~Ykp#(Z=vwvu48(v(*
zbdr(=wP$D>(x242oBG(Jw*`D4(872gs4DnZWYp_u9~ej~)}EuVdj2SLV5;HwUA4NB
zmp9{tqTy1dO=d%f>88DZ*pna<%|m7*6$7beWpaV)J1-Gu4B4jCL$E6`u}+mR)aH<r
z79_eQb#IOnzY-v?KVD$Nh}>!h`0X_{n3eSTI}z6Plk;`OYMy;v$?#P&DE$3-JU*6>
z&2d`pEOG!+$Kx(%zHUOKPw~-t4Lf&o#>~~cV}{w3nur{_eg*+K?AslXs3&UX>RMxy
z!_w882cPD`cby26Mm+hnm6c_Tc{jfks9_RWHx*F%@IqYnAaN8@VKtwi`Ek=)oT=iG
z)g3iBh1Gx<kY%Q-!mWj_RR_-O5?mdCa<t3(Jg(Kax_>WB0)4tkM|+>78sS<kTp6Dg
zJqL@cj5|3qKVKfNWvWaG3M*kg^iD72+|T{saIF%07jp*w`FXLw>!55iHHhiVTylI4
z!>D!T=kYE8%PIB*82@KuI%D6?V&K@ESuDoPwe*E)XyEN^qNNq@v8b%OKWF#k0n=M0
z;|qEpR~&BHG3Pdzkh?T8msq0MC7bA+*kG1BYGZ3T#n@YqUE+8y1CUR4X1)!xMp9cH
z&8Ho$s`hn?MccK4od|E{*Dv05pU`7-jlK*0)%v-QBIVIFE-dIEX4B?Ir9ygMKqKw2
zi@L19rYzQe+HZiq(vx0s<;!8etP8iWz{~lM(_L)H9&HI=`JXqvvqu&OntSznXz*y)
z8lPuoT5nYM0G7a-XV5+y!I#U5H*Naf_)|1-C*k4@bNuLG%W<(3HeS4<2fzTjXgqu$
z3KTpaMjQ2V|Ev)nu`|Yw6ITETiLpHM$}J4vh3D-@V?+S9Af`mF4cZGGNtf1M!7sml
z7TPaj;>w50`xx`O`-}9!0`(265ZHBIL=<<{8T|!^CeDQb(+7LCWH~L@s3o`^PZh^<
zaN>qTZjtJM09g>zrrkLRmmy%|)gJ9tSJqG$ZOIP66~DY2X%`T1vT8=%xq%+fGF0W}
zN{A|SN<P}Q<`#Lea>WN@*zh<(NFXuxkTY#YUjOZmRgukpagjyKN%mPtq7~T}I>)V%
zBB<<po`8db+$)+fyp(xORu<JCi8HB*5k#l%+4-hUT<3b0oaG&!UeH>ij05Xezgtt5
znfgj_vc_2YUP}3>njPbukIEkUV0L%Ju@@)UJ{!Aa-=;)fY_X~g;N10Qs;OhI0NTwP
zzDBWJ7ooYxboW+AU@O7QFR@Z2W&3_>mkz1PG^+)$UkWe7bo*SiIPFUrCuI4#?=i~{
z?+aUAg!kf5u_8HO00LZjc1NXN50r6L=~>MgsgEQ@Dm5*Pgm5S}7Ua6};Yki^%Hw9t
zupSpwv__rR_a^2uX$nrREKMm)KSQ#LRBjdZ?~T(H|J~D$7lllKDc{gMM!BS@=y<g4
zsd8-f6SeiMcU0o&#<+(}>-j#yyDfVB%I_zprvaoKlY|IbSs-4*?_t`{C}r!(Fs(e`
zqyL4w8R6ZRrTkTyx1o}DZ&ae;GAQpZ-I<)L_WF10bSd*ex>%{V=bU_r6<jQ?*X2`H
zX;Z9xkvw%-l|}r|{T^DYO{nmgqi_54aaVph>lCFyVsl)AUwz!gYmS8YOR-2bhAZ*l
zYR*sE5lsl?<sMN?x1=M}!0#B%wyx6F<lWBZhco2Ltvx(AK0LtAY^e`?qnvD8m^#Yw
zU6{ZG)Df`YJgN_`E1#(Rz}6=nr|`M0JDBU>+%1xXAW~?(P=3G#IB!ToT>i}jicVQ3
zFS#r)?a8~8fWzWWrYSxz1WniV)M1BY8MsWbeuRE!<9BcUS;#u2z-!n$nWd}XwJYoL
zf>*wN<@3qgu~5H`B4eShiNv#yxl4#hf%W)jk0y~E<9Vof$Op3Kcm?iWT>e#pyWH6~
z)z~+ibeRv0R<E1riSp+a7wRRF_i3kDwPi7E3Kw<E*5Ay#2OaVeYK<ve+ZC@<F_hQc
zG)^3M$|Q8Uo`3V8zX{r)XZ1S&1_*uL48|wS$h+&4%YdGZ-Np`3-+v3;P-rMhQa3Lx
z;6N*Tn<q4A(sV?v?Amt9{+AOI*$aWxEffc(;?EoQteg>Sc$YY^JFw;zd01!QA>yg=
zb76qf4^?xG_@q}7<8_mh2sPLE0@Fjc(MzHtP;mFXbTFG{o*!OFx4mnELvWIpAgiiU
z+n?p2YZQ=NPszxDZu_K)w*wUPLEh3F4B+)*!W{NgT4f#!L?$m~Wv=0$;)=1DM<Q>R
zk$zG+*G^WC2gDz4{LoHQj5~>*HrRYIowX%dR@y~DmgAFy?&y!(ogSC&tbLb2JmQrG
z$Nb@Cd+eNenz`m*m%gQ0$%88{uxrp#*Q+{m3hjDsyX%B;ice^wA_gG8ak(DqoXFA}
z1lL7vzai78D~$ccFS<Zsw|!ji`TIoEQ>Zx%!+-QheOOv}1yfj8P3isp%TYhxFmt3H
zBg&Sg`6T_1@}UteN9}QXC0mHy$86*vHLDqJ+kf~lCYs;{+OY?rjdOkD-dsw;2QUIv
zr?uQGdNMN{#pMtBR*1`1l;zj-xmS6mj?_3+-?R=Wv-YiOiC*CI<;MO(qhm)XESaP*
zta$k^Us;-2DXp?{7&#K!W(7p<+P-S(oj5d1NuZ$1)8b4SWVA~dl67MFxNz&`T7G7_
z!%lsLg2bwAnqBPfx>HUn$IJE9Su8|WP$7n=)P_3h`FKJ^XQ3F5Y{}UnRVKwdU_Bo!
znwv6uz(MISf=DiGzcw@b@n>md{H4Lzw+P4jsTq|u1tQ}CqGl4IEgzfPOD?Fp{_xL{
zhQtEOM11YCD(-@G&$d8(6EQxYolPUU^z^`w6d0;!yFXoPG4TRUUmC4pw{;Lb%Z?Ov
z7w@Hr(ah2wxpwE|ayygD$exatIU(@h;~vZ3)A)HutZFrPVcF^lb}el!O}fr7IG-B&
zcvG*QmW0hvRGe+^k}FT@)J~*#PO2j*fVp(K%9huquH?MQ?JLdemlWvZ{z?#&ia}9|
zy~K!fII!?tYD^p@&<_-0!@5J|Ma5lMx0(WIrB^LIE-eKN2~V?0a0u*mc6CfaYdBe_
zRwgd}-%m5go?@w%rP^$Ap?Y3l9J4=Cb!h2|-0){v*tE=kSorgj(`Rqwh+#>g*!%V+
zywNuTLKNYhHg`aQuDu<fEb?Hi<^7r4#@U3RaZ+L&n=^x=)r)O5K#jqFKLe{3!z<OI
zX)9;x21Bw`k{Z5d<!??NoDo@ru6#&xTv}j*ICVx57m#d49~GQwvnnge3g}^XSu?n|
zt8c6fyg9;6_dPmsT`V#Z#0z7sshiTJd#DvhYVPmvqdv%&#0zCeFAG`r>%%;|%0wLQ
zKy7pUShBPqc#W(+oCG>CS?Q~8wcD(pi48aW5v~)CoYk143;6RTe5ye$Ho(FK)SP~E
zrD9H@`{_aGlVZChl5=XJ<R@V>anMJ<w3QE-UN&2`+r6MKM^`nMEX?B(KPn$|GU&MF
zm6k@qx_iSDHo&z7D&bdiG~30q6~#=K;YtEzwB|*u6hZa(xqI4G2u%c)th6FW9hb&j
zNal5is5!0c)n@zIx>m1Uy5yP>5bY1d??HvS7fgw?ld7$xXGEGC$q{+{5Ic!cY_4S;
zP{<D!-Tw6EiG$wPsqxEJED9+t{yv8)S%YJ}Cb5Oli_M&l&!E3v?!AO}YADbL*ZBgE
zRc>q;rW8g1RQQ?G<9%PR{ongNi~PeQz{Hu8Fbv=&^Y-pmAJU|tB`@CTQ7Y%#&^P<N
zMe|xoq?3Ak=aafQ(mef^i(@+Z7T?pRUtC57`?iY6Hw@v@2xpWP)Y0jYa+B&DGhV`O
zMLvRbleLe{tip$31m`Zsn`(oub}FxHj=?RR8<NTG&-jNO|Huelq^yj7IH}+G3iJd;
zG}9+TjJ<Gf%JyUe3ddwax5Vwn^kQ7wqV7)6%c>>T5SwwAs^53c0ilwa`Lk&7!<GBq
z7KLT*63QQAZyz@|n4J4Dt*>S$L?%tDQV_b%>(E==c+38iv8Y^ksKW9ODWMion}8%+
zN2i`^daVkBhRP?7H^EVL_;=)#m}UiGZL=Xw_868`SzA&4*3~l?p{=VMD`=wQ;-5Zt
zFDBn=Tk_gJ9=WBz(nY9~`jj9of~Ey|!wafUgevXt7@Iw6#<`V!3HUEItoSde@?PDn
z0jOq`p|PLt#nOpb7izs2VqldM5<DNDQOQRexMVg9>2x`*aj)8EQG58!trdvCUTT%H
z&Nm9jfIBSP0^f}QFvk$Qn30P390_r1+K_Q9;}#-tnG+k&x<uSa#T9A3{G(aOz6-#q
zJh!?oK0H$S!-j--q3d$a#k9~=e9L~u(`O-RE0?b}_An3@JuZ0W0p1)g>B%(Ng!?9O
ze?-g%#OO6h<yF!ZF**Cg#ymdRNM8OugH?+r;4K4%>Z0uVdwVxLd@eCkB!g{Bh-Lbu
zJlFh3!>2YJVN3#-+M8NV*R(H$V5eo#Y03r06-(OIi0qv$s;gV2;F%eP2CL>SyIo(o
zrQ;7N(CSoZP~3*fm_IN&ep4YFr=zz+i+_gD<4a%bfV@w}xf@~MfX!X7R8w+vfV_q*
zg{Y!kKAdCnIi5+MuoGi)HTyJ)4a>gf$5?pxpCj=xuQ^3K#fr^?V_6_r;^JJX@QH|0
zZkV}|$?Q5aANWqSdCv<0iHxLLlC;93K02BJhm{1tRQ8VA*Ei)4qqmGFEb_g?`7KpM
zT}zJ`a+UgYU)U>*2&FQ@*pE8oH~7}dTQ6uz%)ss2Kut9xq}nGmCea9(;tii6YhQbY
z<GlnVRZA>ROZJ@$OL~GL)J}4@V9+Oxu)Y4aaOMmJu;l+I29wo`zBPVx=6!e#4olkY
z*{D(53cv4tQ34<O7oBI2==b4;M>ISFJt0DFW`^NG_0*l)1q+(JW)!^u1CnJJ5g&Fm
zpG(KqYr4Bd^61BhXdmGvK#Ie-N|ux3YtFoQu@gH{F@y1iBc|e$G>5XxK;C-0^T?7d
z<WRXK>%-<hAyoWzx=*xQJPYx9-`@okRxIfM_RFjX)QSU0MT+M>vR_H7Zm8H5bl4_o
zry+1qQdAZU5@k_X^JBeF$Ge$xx*y{HtWBV?zx;#g0{yyZ<`f)aJh`YRt$62vqw%&_
zA@JlV%x1rHNsu4aO5vcc7My0VdF!U%M<s;>&0Z}OhoG0xR<w!Ju29?Wj#xR4+fze*
zW0Va?eLG*jd#VJ~+Qb&kX=}t@9B0}t0g#Nxv4Q@O*X50U*|0_bvFDF8#`>Bg*Y~1x
zYCVjPft1mWwCPW4;R7o!wW+-J{KO<Wn+VLl?f_+^9()m8Rq|71+F2N^a~sdwQ>lX)
zKxLAmdS_$8;L&H6hP*q3;_|S=_vcf*(3#*K<oky`A=d}uJ@}V0jOyRs;?I&J`?V;y
zFrUk#K9%ec(5X3ZI03rj4xYeIjuLdY#|}wWUOWc*O9}GdV@9x!sWRozMOOo-<K#DS
zxJHfcP=nLjSceu9D^H<{Pek?)>}?CDAh=-$XJj=`LLC|=1|#Sl-Q&cvKKK|rHUCY%
zln@uxKAA9BCJeI(UpCmNrCd?0SM(#F2_JD&?TDr?&jA7Et}YbSzYR~H(S`ssbRojg
z{8d8u$i+NcT?Q+!e61)~AL=b?CZ4M3yNJM*4aU*kfKwz;pvpbrb>(`+9rWiDZore?
zh4G22yv*yQ<l!f(V(e03-^c8II!NnUFU$p@&gNfEZuHsknt<V|Wus*3mC2klGP1hN
z_00ayM~qs+NQI0{#RB}B?5C!P>5+DInL4UN?5U_Be?;#r9Di_85Y%x~A3X}tnh4#^
zM-xtoxA7&>^GIjjlFm+vjGNU?sHR@5RmpR$_m;LGJWpNpNBxW-WXEHl<qQF1nRcV3
zL`>hWl>a&@<2aLo{u?_G^8Y`5|KDg(of0Kz;b5V;3vSgSjYF4L*2NE)KKbIgVZ6LI
z_5J%YRZGVYu@uI`SH^z(<9}EC&)3amgWpX~Lew3K{|yjDYz$|A`JSjgh3%dkBHH;6
z)jzN*`G<#xJ$iR9adI&GKk}ye06S9*|8;|YUv`hc9!eWg{5NXw&--_OgI*Ek|GKB1
zrZx0uh&a5SKMV5m2CCts(a$lLFL}wEWXE++fyqh;n$W()K}1j0lnHCxoX;<!+I_5i
z&BTH$w>)$6O}26aBKiSzmF)zEKS$lL@gVIC7FWGbQD$YY46%jfSklp9>}sSW3+uvY
z#K1g}v^!*09mBI{Kgco?CC}wNnk^S}gudI{VMngySncCL+KZZ&y>N`J33gKAGTq<s
z-_KI+C1#`fuI<8;SFI-LULm5m0yO_dknIds2TQWTkVR|ewo-P=8@~CT*AT=dyMLGT
zzh&+$_D@~&@c08LN=Vd#<tyV1(FpB|#IihsufaKOJwMo6_OW64<!M_Ifo*uq<4>a3
zk)9LYTbLPN@2ofo*sFdn&-a=MTjam_bDO${bVRkb_bVPD!fdXTd5Dm^;b^lfPr$*C
z1To&QSnAQFx@4<{*>erxLLw5pD`K2}{NgQUr$=fs7?N`1iRolcV<KG^i&y1BroW=8
zpN0rfN}?kF4F@10;ke7oy!(WXU}Qa`c%$9k@VR*|Lj^qdb*i$072^oUreL))2e7lA
z9kX~($8njD-jolML^l3W%td4RO*~H}e(ijQxMr6}jfB;j*bNI}jV^y5)g#79=X?aa
zOPc%A@SWh2c2`h2OnAxNVv1Z}|2;+ex$(yM#m#1ez^ne93*A9U*XE-x)QMT4fgN1i
zTbtGb(JLG#mQsOrz4l;y+iT6NIJ#LBhyF}BBPQxk>c=_{hpliLIJ0VF136~1cY7$c
zBA}u9x_0m^p*d<0A$xkiW`6ypQ78jk>v7p4#+(0pr8`}v->tnwB+ejG?&Rk&!IQ0e
z!Y2kYxnG`5$uwR^P}3iK3?HqR&pBt_sXnoAG<5WEO5EW*ZEZoF-TAz!M7=`7V4u&I
zC0?<^k-Wfi`&;}o{O!H`NAGywE*uUTFY6#g-m8lR5oA8V6E|+w+*;^HMR3(Ibd(&;
z!!-=f?an^!3-RysY-RO`4eX}$?kiGhxOz#s?;AqrcaKv(sfrTW5Bu6(vCt{K>`gnE
zCB`8-<C7R6E!Q+}mPtMmZ8Zv9=C&jxL(Gn2=5OF@QrI^`yOgOk->8f1&mic?`;|)P
zHtglRqNXXY@awLr&&I+tPvWZ%z))y~zpuu+@}S5c?=3%$G+w(5mw-`~qchwo?=37;
zQ^{n%A}os)>?MIciH1Von5_rTCs71MiDCUq#h_6wd6i2;`)%UA);%H*(Wcu2Pni7@
zT#@6o>S_H$0PAG=5G(6KmB`(z9&@j(_HAALAET6qBJ_&p_-@oYCH&_7KdT2XZVg!~
z5%G7yAOFH!Qb=O{eep^Kro=D5`wrs0M+(v2qR0K%><`+9X$Orz*1Ey`3H@Kf?xJ0s
zzw}g0G8i0RtKK#Hj3S?Zig@-s&O)J42tf&q<ECTHWEV;-1GB=_zTpJAwbLk;N=DIN
zR=pjdq>k*XA7GszgLdaXJQYi-hI04`j4dk5z2Lf3{o=nO@b(|JC$(cOta!9d^K;cF
zwb8?Vfu#%)aM0&A{9Ruwu}3jg#+h2w1jsX+YDJ-Oba9{R9<fTW>W@#^N5GPK2l3zh
z0v#iEt@9be&HNHGeu*vn1%-X9MHRL#H%EzOsAW1D2&Az1&#Wg#yAtQ{mJT^hBZ@K;
zf0h=tQDXHv7<x7lR+;|2A6rK>n(NB?uGx2MW_wQO<V)#N^T0VG&{hmkCx-a=i4FCg
zH!6+PW4?~&F^v55%@MF2!_(yw0C|bZF!!ZRFk)1A(pGh8kn@hg|H&z`vA;aJL`X<4
zxjiQFI(wu=1tFIM8ox;Y3OgxAD2FPxZNp6eU^X<9V^u<$)h%`+kv}!FEaS+`3dC(;
z>CgvtiB+BNZb{rHb6h{VQT2nC_$3y}#TYhhIc+C2yzP?=a~-O7&WBn}&unZULKPL$
z{_ZW<aC28<tYfk^7tiq03(>k$=NGEZc%F3K<H;fjs~xm18~jox5$QlNbT*eIKyhtZ
ziA}%B`bDsmu-9<+#t>-|_t$BKdUse9W#4_iHq_l5{B3UQ@pueHL~QH5|4VRlFLF@{
zi*n<EU-uiy&T?VXHuD;tpo*|_=hr7W_ZxH<F0=TLkJX*6M2!8FliV+8gfh#9Glq}H
zRRqi-DO10w$?q6CT1gTUIzX=IF{&(zs{Q%~QAVmF4&pK9a-D)7tw&T%`uw|plRAlJ
z9Y45A^!=S%VBbH&J=qN#kEvokdIrww7nmz;YsyUm;TL^TQnT2YAMQBU8K{<cAMN@D
z*Rb1D)ZD|Veh<eyQF)Q?4V<LI147&h$zJ^G<8zZ*S#7I)`9}v>;hx#TYFKGM7zS`&
zDlLmp8O+u6J=KdG|0v!Cp4s1Q8?t@6Zr2O)kudh(<nb}^{5)wt=4YpTN867{X5fp6
zOvMcU{Ff#RSg?+McqT#^JzQ|2FIK{J780p#LoJz3j&;0j;OSyX#IqqHZsCM!(JDCe
z63}#ms1u;Q*cdN~9DtG4|Ld@e0jU?v@w`!=FphNPP{idMVP!wLs2E}NwxL+{hS1Yh
zsifs}V#Ox#b0sjHFs#TSd2fMg7Kb|js&7)b&cS$VGiq_LTeKolneeA_V<3Xn;`9Dt
zHHn_N+uP0IvQpoKb1g94y!`N;D1oJ4SYYmFUui+CNYp;c<zkB_qAyY0C+p`w9WT1o
zW`;pwFy*CJ=Qv<hg5GGud%a#OF>V7z_&Ps^L(pggMq@4B&Coo&bN&`5IZxVC$pdWP
z)F(-*44kzb?%@VywiP{CtXg;QTw$Jx&bL@TY|r)C7Pr!FwO3#JY$<=9lN=Kn-txFf
zL*3>FPf@P)>LY|1TjL+4Zn4b{JRAIK7t=wS7c#4&FtwHcdbB2{D!Vkw2WDeiwobl%
z_!G4)+dEwRKvVxW%@2z*#f)<IS=FN=Oh9Y9S<=mlseUSd41exJT_f#gQF8PYk*)Co
ze4*~^x$Uc`=Nx_3UePw&SnlGL)f^xfU!t2DehA6u)<P%{k*1^O)4%FdH1`qI_UJh7
zbbt3Vwf{~3qJUF4;h1DkOa+kdVSeN6vxHD%u(7X^DD;6>d$^y>!>P$RiF+ECm|Seg
zaa_FThLg5{$x`sE=n<WF=pgII4=slMIJ4F6PG_>0O?$oqUb{TSsekcUmHqF=N-QdC
zv(bOPYhe}+`+D*UZ1pLAE5(wc$SIcg32xB*NF-tHlbzM<7A9Z|yX|2xa1i~Y)9LbE
znPS?#8Uu@G!WgteCAk3tH(Jx_?<c2yK2aM`g$`KwK580tI452h>R4t`k2EBgQ|Kia
zQ&r}k+|_&eU3Nv~8LU)A;3uWYgu{`JqHdanicYo>vPM{e*j~4-e_#PR?H=*n+&^-V
z${}K^SDnk=f-)0T7l&r&wWS7*8MwSdJf27}NNG-v^I!i&SU#9~ce_XhtDkId?`#j6
zsWkNwi9iN2K8an<9dp7Df*N%q5cuS4gC_gchNigSD(hH$pld=756{m{^%_BZdSmK;
z!rlLF->cRBn<nP}x~K2BVW17TA=dvIv=_tP3Myd0KnGmkftVDr{@EL^7PhGJd=>*d
PnTo;F)z4*}Q$iB}_Z^$>

literal 0
HcmV?d00001


From 29c0f5b8693bc8ca6e534e054cc91102f2bcf8f9 Mon Sep 17 00:00:00 2001
From: Divye Gala <divyegala@gmail.com>
Date: Fri, 8 Nov 2024 00:59:05 -0500
Subject: [PATCH 24/47] Migrate feature diff for NN Descent from RAFT to cuVS
 (#421)

This PR is an amalgamation of the diff of 3 PRs in RAFT:

1. https://github.com/rapidsai/raft/pull/2345
2. https://github.com/rapidsai/raft/pull/2380
3. https://github.com/rapidsai/raft/pull/2403

This PR also addresses part 1 and 2 of #419, closes https://github.com/rapidsai/cuvs/issues/391 and makes CAGRA use the compiled headers of NN Descent, which seemed to have been a pending TODO https://github.com/rapidsai/cuvs/blob/009bb8de03ce9708d4d797166187250f77a59a36/cpp/src/neighbors/detail/cagra/cagra_build.cuh#L36-L37

Also, batch tests are disabled in this PR due to issue https://github.com/rapidsai/raft/issues/2450. PR https://github.com/rapidsai/cuvs/pull/424 will attempt to re-enable them.

Authors:
  - Divye Gala (https://github.com/divyegala)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/421
---
 cpp/include/cuvs/neighbors/nn_descent.hpp     |  92 ++-
 .../neighbors/detail/cagra/cagra_build.cuh    |   8 +-
 cpp/src/neighbors/detail/nn_descent.cuh       | 294 ++++---
 cpp/src/neighbors/detail/nn_descent_batch.cuh | 736 ++++++++++++++++++
 cpp/src/neighbors/nn_descent.cuh              |  45 +-
 cpp/src/neighbors/nn_descent_float.cu         |  47 +-
 cpp/src/neighbors/nn_descent_half.cu          |  48 +-
 cpp/src/neighbors/nn_descent_int8.cu          |  48 +-
 cpp/src/neighbors/nn_descent_uint8.cu         |  48 +-
 cpp/test/neighbors/ann_nn_descent.cuh         | 191 ++++-
 .../ann_nn_descent/test_float_uint32_t.cu     |   6 +
 cpp/test/neighbors/ann_utils.cuh              |  20 +-
 python/cuvs/cuvs/test/test_hnsw.py            |   2 +-
 13 files changed, 1361 insertions(+), 224 deletions(-)
 create mode 100644 cpp/src/neighbors/detail/nn_descent_batch.cuh

diff --git a/cpp/include/cuvs/neighbors/nn_descent.hpp b/cpp/include/cuvs/neighbors/nn_descent.hpp
index 347ccf889..bd41d1ff7 100644
--- a/cpp/include/cuvs/neighbors/nn_descent.hpp
+++ b/cpp/include/cuvs/neighbors/nn_descent.hpp
@@ -55,6 +55,8 @@ struct index_params : cuvs::neighbors::index_params {
   size_t intermediate_graph_degree = 128;     // Degree of input graph for pruning.
   size_t max_iterations            = 20;      // Number of nn-descent iterations.
   float termination_threshold      = 0.0001;  // Termination threshold of nn-descent.
+  bool return_distances            = true;    // return distances if true
+  size_t n_clusters                = 1;       // defaults to not using any batching
 
   /** @brief Construct NN descent parameters for a specific kNN graph degree
    *
@@ -100,14 +102,20 @@ struct index : cuvs::neighbors::index {
    * @param res raft::resources is an object mangaging resources
    * @param n_rows number of rows in knn-graph
    * @param n_cols number of cols in knn-graph
+   * @param return_distances whether to return distances
    */
-  index(raft::resources const& res, int64_t n_rows, int64_t n_cols)
+  index(raft::resources const& res, int64_t n_rows, int64_t n_cols, bool return_distances = false)
     : cuvs::neighbors::index(),
       res_{res},
       metric_{cuvs::distance::DistanceType::L2Expanded},
       graph_{raft::make_host_matrix<IdxT, int64_t, raft::row_major>(n_rows, n_cols)},
-      graph_view_{graph_.view()}
+      graph_view_{graph_.view()},
+      return_distances_{return_distances}
   {
+    if (return_distances) {
+      distances_      = raft::make_device_matrix<float, int64_t>(res_, n_rows, n_cols);
+      distances_view_ = distances_.value().view();
+    }
   }
 
   /**
@@ -119,14 +127,20 @@ struct index : cuvs::neighbors::index {
    *
    * @param res raft::resources is an object mangaging resources
    * @param graph_view raft::host_matrix_view<IdxT, int64_t, raft::row_major> for storing knn-graph
+   * @param distances_view optional raft::device_matrix_view<float, int64_t, row_major> for storing
+   * distances
    */
   index(raft::resources const& res,
-        raft::host_matrix_view<IdxT, int64_t, raft::row_major> graph_view)
+        raft::host_matrix_view<IdxT, int64_t, raft::row_major> graph_view,
+        std::optional<raft::device_matrix_view<float, int64_t, row_major>> distances_view =
+          std::nullopt)
     : cuvs::neighbors::index(),
       res_{res},
       metric_{cuvs::distance::DistanceType::L2Expanded},
       graph_{raft::make_host_matrix<IdxT, int64_t, raft::row_major>(0, 0)},
-      graph_view_{graph_view}
+      graph_view_{graph_view},
+      distances_view_{distances_view},
+      return_distances_{distances_view.has_value()}
   {
   }
 
@@ -155,6 +169,13 @@ struct index : cuvs::neighbors::index {
     return graph_view_;
   }
 
+  /** neighborhood graph distances [size, graph-degree] */
+  [[nodiscard]] inline auto distances() noexcept
+    -> std::optional<device_matrix_view<float, int64_t, row_major>>
+  {
+    return distances_view_;
+  }
+
   // Don't allow copying the index for performance reasons (try avoiding copying data)
   index(const index&)                    = delete;
   index(index&&)                         = default;
@@ -166,8 +187,11 @@ struct index : cuvs::neighbors::index {
   raft::resources const& res_;
   cuvs::distance::DistanceType metric_;
   raft::host_matrix<IdxT, int64_t, raft::row_major> graph_;  // graph to return for non-int IdxT
+  std::optional<raft::device_matrix<float, int64_t, row_major>> distances_;
   raft::host_matrix_view<IdxT, int64_t, raft::row_major>
     graph_view_;  // view of graph for user provided matrix
+  std::optional<raft::device_matrix_view<float, int64_t, row_major>> distances_view_;
+  bool return_distances_;
 };
 
 /** @} */
@@ -200,12 +224,15 @@ struct index : cuvs::neighbors::index {
  *               to run the nn-descent algorithm
  * @param[in] dataset raft::device_matrix_view input dataset expected to be located
  *                in device memory
+ * @param[in] graph optional raft::host_matrix_view<uint32_t, int64_t, raft::row_major> for owning
+ * the output graph
  * @return index<IdxT> index containing all-neighbors knn graph in host memory
  */
 auto build(raft::resources const& res,
            index_params const& params,
-           raft::device_matrix_view<const float, int64_t, raft::row_major> dataset)
-  -> cuvs::neighbors::nn_descent::index<uint32_t>;
+           raft::device_matrix_view<const float, int64_t, raft::row_major> dataset,
+           std::optional<raft::host_matrix_view<uint32_t, int64_t, raft::row_major>> graph =
+             std::nullopt) -> cuvs::neighbors::nn_descent::index<uint32_t>;
 
 /**
  * @brief Build nn-descent Index with dataset in host memory
@@ -232,12 +259,15 @@ auto build(raft::resources const& res,
  *               to run the nn-descent algorithm
  * @param[in] dataset raft::host_matrix_view input dataset expected to be located
  *                in host memory
+ * @param[in] graph optional raft::host_matrix_view<uint32_t, int64_t, raft::row_major> for owning
+ * the output graph
  * @return index<IdxT> index containing all-neighbors knn graph in host memory
  */
 auto build(raft::resources const& res,
            index_params const& params,
-           raft::host_matrix_view<const float, int64_t, raft::row_major> dataset)
-  -> cuvs::neighbors::nn_descent::index<uint32_t>;
+           raft::host_matrix_view<const float, int64_t, raft::row_major> dataset,
+           std::optional<raft::host_matrix_view<uint32_t, int64_t, raft::row_major>> graph =
+             std::nullopt) -> cuvs::neighbors::nn_descent::index<uint32_t>;
 
 /**
  * @brief Build nn-descent Index with dataset in device memory
@@ -262,12 +292,15 @@ auto build(raft::resources const& res,
  *               to run the nn-descent algorithm
  * @param[in] dataset raft::device_matrix_view input dataset expected to be located
  *                in device memory
+ * @param[in] graph optional raft::host_matrix_view<uint32_t, int64_t, raft::row_major> for owning
+ * the output graph
  * @return index<IdxT> index containing all-neighbors knn graph in host memory
  */
 auto build(raft::resources const& res,
            index_params const& params,
-           raft::device_matrix_view<const half, int64_t, raft::row_major> dataset)
-  -> cuvs::neighbors::nn_descent::index<uint32_t>;
+           raft::device_matrix_view<const half, int64_t, raft::row_major> dataset,
+           std::optional<raft::host_matrix_view<uint32_t, int64_t, raft::row_major>> graph =
+             std::nullopt) -> cuvs::neighbors::nn_descent::index<uint32_t>;
 
 /**
  * @brief Build nn-descent Index with dataset in host memory
@@ -294,12 +327,15 @@ auto build(raft::resources const& res,
  *               to run the nn-descent algorithm
  * @param[in] dataset raft::host_matrix_view input dataset expected to be located
  *                in host memory
+ * @param[in] graph optional raft::host_matrix_view<uint32_t, int64_t, raft::row_major> for owning
+ * the output graph
  * @return index<IdxT> index containing all-neighbors knn graph in host memory
  */
 auto build(raft::resources const& res,
            index_params const& params,
-           raft::host_matrix_view<const half, int64_t, raft::row_major> dataset)
-  -> cuvs::neighbors::nn_descent::index<uint32_t>;
+           raft::host_matrix_view<const half, int64_t, raft::row_major> dataset,
+           std::optional<raft::host_matrix_view<uint32_t, int64_t, raft::row_major>> graph =
+             std::nullopt) -> cuvs::neighbors::nn_descent::index<uint32_t>;
 
 /**
  * @brief Build nn-descent Index with dataset in device memory
@@ -324,12 +360,15 @@ auto build(raft::resources const& res,
  *               to run the nn-descent algorithm
  * @param[in] dataset raft::device_matrix_view input dataset expected to be located
  *                in device memory
+ * @param[in] graph optional raft::host_matrix_view<uint32_t, int64_t, raft::row_major> for owning
+ * the output graph
  * @return index<IdxT> index containing all-neighbors knn graph in host memory
  */
 auto build(raft::resources const& res,
            index_params const& params,
-           raft::device_matrix_view<const int8_t, int64_t, raft::row_major> dataset)
-  -> cuvs::neighbors::nn_descent::index<uint32_t>;
+           raft::device_matrix_view<const int8_t, int64_t, raft::row_major> dataset,
+           std::optional<raft::host_matrix_view<uint32_t, int64_t, raft::row_major>> graph =
+             std::nullopt) -> cuvs::neighbors::nn_descent::index<uint32_t>;
 
 /**
  * @brief Build nn-descent Index with dataset in host memory
@@ -356,12 +395,15 @@ auto build(raft::resources const& res,
  *               to run the nn-descent algorithm
  * @param[in] dataset raft::host_matrix_view input dataset expected to be located
  *                in host memory
+ * @param[in] graph optional raft::host_matrix_view<uint32_t, int64_t, raft::row_major> for owning
+ * the output graph
  * @return index<IdxT> index containing all-neighbors knn graph in host memory
  */
 auto build(raft::resources const& res,
            index_params const& params,
-           raft::host_matrix_view<const int8_t, int64_t, raft::row_major> dataset)
-  -> cuvs::neighbors::nn_descent::index<uint32_t>;
+           raft::host_matrix_view<const int8_t, int64_t, raft::row_major> dataset,
+           std::optional<raft::host_matrix_view<uint32_t, int64_t, raft::row_major>> graph =
+             std::nullopt) -> cuvs::neighbors::nn_descent::index<uint32_t>;
 
 /**
  * @brief Build nn-descent Index with dataset in device memory
@@ -386,14 +428,15 @@ auto build(raft::resources const& res,
  *               to run the nn-descent algorithm
  * @param[in] dataset raft::device_matrix_view input dataset expected to be located
  *                in device memory
+ * @param[in] graph optional raft::host_matrix_view<uint32_t, int64_t, raft::row_major> for owning
+ * the output graph
  * @return index<IdxT> index containing all-neighbors knn graph in host memory
  */
 auto build(raft::resources const& res,
            index_params const& params,
-           raft::device_matrix_view<const uint8_t, int64_t, raft::row_major> dataset)
-  -> cuvs::neighbors::nn_descent::index<uint32_t>;
-
-/** @} */
+           raft::device_matrix_view<const uint8_t, int64_t, raft::row_major> dataset,
+           std::optional<raft::host_matrix_view<uint32_t, int64_t, raft::row_major>> graph =
+             std::nullopt) -> cuvs::neighbors::nn_descent::index<uint32_t>;
 
 /**
  * @brief Build nn-descent Index with dataset in host memory
@@ -420,12 +463,17 @@ auto build(raft::resources const& res,
  *               to run the nn-descent algorithm
  * @param[in] dataset raft::host_matrix_view input dataset expected to be located
  *                in host memory
+ * @param[in] graph optional raft::host_matrix_view<uint32_t, int64_t, raft::row_major> for owning
+ * the output graph
  * @return index<IdxT> index containing all-neighbors knn graph in host memory
  */
 auto build(raft::resources const& res,
            index_params const& params,
-           raft::host_matrix_view<const uint8_t, int64_t, raft::row_major> dataset)
-  -> cuvs::neighbors::nn_descent::index<uint32_t>;
+           raft::host_matrix_view<const uint8_t, int64_t, raft::row_major> dataset,
+           std::optional<raft::host_matrix_view<uint32_t, int64_t, raft::row_major>> graph =
+             std::nullopt) -> cuvs::neighbors::nn_descent::index<uint32_t>;
+
+/** @} */
 
 /**
  * @brief Test if we have enough GPU memory to run NN descent algorithm.
diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 9e4d453e3..6209ff819 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -33,8 +33,7 @@
 #include <cuvs/neighbors/ivf_pq.hpp>
 #include <cuvs/neighbors/refine.hpp>
 
-// TODO: Fixme- this needs to be migrated
-#include "../../nn_descent.cuh"
+#include <cuvs/neighbors/nn_descent.hpp>
 
 // TODO: This shouldn't be calling spatial/knn APIs
 #include "../ann_utils.cuh"
@@ -356,8 +355,8 @@ void build_knn_graph(
   raft::host_matrix_view<IdxT, int64_t, raft::row_major> knn_graph,
   cuvs::neighbors::nn_descent::index_params build_params)
 {
-  auto nn_descent_idx = cuvs::neighbors::nn_descent::index<IdxT>(res, knn_graph);
-  cuvs::neighbors::nn_descent::build<DataT, IdxT>(res, build_params, dataset, nn_descent_idx);
+  std::optional<raft::host_matrix_view<IdxT, int64_t, row_major>> graph_view = knn_graph;
+  auto nn_descent_idx = cuvs::neighbors::nn_descent::build(res, build_params, dataset, graph_view);
 
   using internal_IdxT = typename std::make_unsigned<IdxT>::type;
   using g_accessor    = typename decltype(nn_descent_idx.graph())::accessor_type;
@@ -471,6 +470,7 @@ index<T, IdxT> build(
     }
 
     // Use nn-descent to build CAGRA knn graph
+    nn_descent_params.return_distances = false;
     build_knn_graph<T, IdxT>(res, dataset, knn_graph->view(), nn_descent_params);
   }
 
diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh
index 8c5767c50..883d82d76 100644
--- a/cpp/src/neighbors/detail/nn_descent.cuh
+++ b/cpp/src/neighbors/detail/nn_descent.cuh
@@ -16,42 +16,41 @@
 
 #pragma once
 
-#include <cuvs/neighbors/nn_descent.hpp>
-
 #include "ann_utils.cuh"
 #include "cagra/device_common.hpp"
+
+#include <cuvs/neighbors/nn_descent.hpp>
+
 #include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
 #include <raft/core/error.hpp>
 #include <raft/core/host_mdarray.hpp>
+#include <raft/core/mdspan.hpp>
+#include <raft/core/operators.hpp>
+#include <raft/core/pinned_mdarray.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
-
+#include <raft/matrix/init.cuh>
+#include <raft/matrix/slice.cuh>
 #include <raft/util/arch.cuh>  // raft::util::arch::SM_*
 #include <raft/util/cuda_dev_essentials.cuh>
 #include <raft/util/cuda_rt_essentials.hpp>
 #include <raft/util/cudart_utils.hpp>
 #include <raft/util/pow2_utils.cuh>
 
-#include <cub/cub.cuh>
+#include <rmm/device_uvector.hpp>
+
 #include <cuda_runtime.h>
-#include <thrust/execution_policy.h>
-#include <thrust/fill.h>
-#include <thrust/host_vector.h>
-#include <thrust/mr/allocator.h>
-#include <thrust/mr/device_memory_resource.h>
 
 #include <mma.h>
 #include <omp.h>
 
 #include <limits>
+#include <optional>
 #include <queue>
 #include <random>
 
 namespace cuvs::neighbors::nn_descent::detail {
-static const std::string RAFT_NAME = "raft";
-using pinned_memory_resource       = thrust::universal_host_pinned_memory_resource;
-template <typename T>
-using pinned_memory_allocator = thrust::mr::stateless_resource_allocator<T, pinned_memory_resource>;
 
 using DistData_t = float;
 constexpr int DEGREE_ON_DEVICE{32};
@@ -216,6 +215,7 @@ struct BuildConfig {
   // If internal_node_degree == 0, the value of node_degree will be assigned to it
   size_t max_iterations{50};
   float termination_threshold{0.0001};
+  size_t output_graph_degree{32};
 };
 
 template <typename Index_t>
@@ -300,6 +300,7 @@ class BloomFilter {
 
 template <typename Index_t>
 struct GnndGraph {
+  raft::resources const& res;
   static constexpr int segment_size = 32;
   InternalID_t<Index_t>* h_graph;
 
@@ -310,16 +311,17 @@ struct GnndGraph {
 
   raft::host_matrix<DistData_t, size_t, raft::row_major> h_dists;
 
-  thrust::host_vector<Index_t, pinned_memory_allocator<Index_t>> h_graph_new;
-  thrust::host_vector<int2, pinned_memory_allocator<int2>> h_list_sizes_new;
+  raft::pinned_matrix<Index_t, size_t> h_graph_new;
+  raft::pinned_vector<int2, size_t> h_list_sizes_new;
 
-  thrust::host_vector<Index_t, pinned_memory_allocator<Index_t>> h_graph_old;
-  thrust::host_vector<int2, pinned_memory_allocator<int2>> h_list_sizes_old;
+  raft::pinned_matrix<Index_t, size_t> h_graph_old;
+  raft::pinned_vector<int2, size_t> h_list_sizes_old;
   BloomFilter<Index_t> bloom_filter;
 
   GnndGraph(const GnndGraph&)            = delete;
   GnndGraph& operator=(const GnndGraph&) = delete;
-  GnndGraph(const size_t nrow,
+  GnndGraph(raft::resources const& res,
+            const size_t nrow,
             const size_t node_degree,
             const size_t internal_node_degree,
             const size_t num_samples);
@@ -344,9 +346,14 @@ class GNND {
   GNND(const GNND&)            = delete;
   GNND& operator=(const GNND&) = delete;
 
-  void build(Data_t* data, const Index_t nrow, Index_t* output_graph);
+  void build(Data_t* data,
+             const Index_t nrow,
+             Index_t* output_graph,
+             bool return_distances,
+             DistData_t* output_distances);
   ~GNND()    = default;
   using ID_t = InternalID_t<Index_t>;
+  void reset(raft::resources const& res);
 
  private:
   void add_reverse_edges(Index_t* graph_ptr,
@@ -371,15 +378,14 @@ class GNND {
   raft::device_matrix<ID_t, size_t, raft::row_major> graph_buffer_;
   raft::device_matrix<DistData_t, size_t, raft::row_major> dists_buffer_;
 
-  // TODO: Investigate using RMM/RAFT types https://github.com/rapidsai/raft/issues/1827
-  thrust::host_vector<ID_t, pinned_memory_allocator<ID_t>> graph_host_buffer_;
-  thrust::host_vector<DistData_t, pinned_memory_allocator<DistData_t>> dists_host_buffer_;
+  raft::pinned_matrix<ID_t, size_t> graph_host_buffer_;
+  raft::pinned_matrix<DistData_t, size_t> dists_host_buffer_;
 
   raft::device_vector<int, size_t> d_locks_;
 
-  thrust::host_vector<Index_t, pinned_memory_allocator<Index_t>> h_rev_graph_new_;
-  thrust::host_vector<Index_t, pinned_memory_allocator<Index_t>> h_graph_old_;
-  thrust::host_vector<Index_t, pinned_memory_allocator<Index_t>> h_rev_graph_old_;
+  raft::pinned_matrix<Index_t, size_t> h_rev_graph_new_;
+  raft::pinned_matrix<Index_t, size_t> h_graph_old_;
+  raft::pinned_matrix<Index_t, size_t> h_rev_graph_old_;
   // int2.x is the number of forward edges, int2.y is the number of reverse edges
 
   raft::device_vector<int2, size_t> d_list_sizes_new_;
@@ -971,19 +977,21 @@ int insert_to_ordered_list(InternalID_t<Index_t>* list,
 }  // namespace
 
 template <typename Index_t>
-GnndGraph<Index_t>::GnndGraph(const size_t nrow,
+GnndGraph<Index_t>::GnndGraph(raft::resources const& res,
+                              const size_t nrow,
                               const size_t node_degree,
                               const size_t internal_node_degree,
                               const size_t num_samples)
-  : nrow(nrow),
+  : res(res),
+    nrow(nrow),
     node_degree(node_degree),
     num_samples(num_samples),
     bloom_filter(nrow, internal_node_degree / segment_size, 3),
     h_dists{raft::make_host_matrix<DistData_t, size_t, raft::row_major>(nrow, node_degree)},
-    h_graph_new(nrow * num_samples),
-    h_list_sizes_new(nrow),
-    h_graph_old(nrow * num_samples),
-    h_list_sizes_old{nrow}
+    h_graph_new{raft::make_pinned_matrix<Index_t, size_t, raft::row_major>(res, nrow, num_samples)},
+    h_list_sizes_new{raft::make_pinned_vector<int2, size_t>(res, nrow)},
+    h_graph_old{raft::make_pinned_matrix<Index_t, size_t, raft::row_major>(res, nrow, num_samples)},
+    h_list_sizes_old{raft::make_pinned_vector<int2, size_t>(res, nrow)}
 {
   // node_degree must be a multiple of segment_size;
   assert(node_degree % segment_size == 0);
@@ -1001,9 +1009,9 @@ void GnndGraph<Index_t>::sample_graph_new(InternalID_t<Index_t>* new_neighbors,
 {
 #pragma omp parallel for
   for (size_t i = 0; i < nrow; i++) {
-    auto list_new         = h_graph_new.data() + i * num_samples;
-    h_list_sizes_new[i].x = 0;
-    h_list_sizes_new[i].y = 0;
+    auto list_new                       = h_graph_new.data_handle() + i * num_samples;
+    h_list_sizes_new.data_handle()[i].x = 0;
+    h_list_sizes_new.data_handle()[i].y = 0;
 
     for (size_t j = 0; j < width; j++) {
       auto new_neighb_id = new_neighbors[i * width + j].id();
@@ -1011,8 +1019,8 @@ void GnndGraph<Index_t>::sample_graph_new(InternalID_t<Index_t>* new_neighbors,
       if (bloom_filter.check(i, new_neighb_id)) { continue; }
       bloom_filter.add(i, new_neighb_id);
       new_neighbors[i * width + j].mark_old();
-      list_new[h_list_sizes_new[i].x++] = new_neighb_id;
-      if (h_list_sizes_new[i].x == num_samples) break;
+      list_new[h_list_sizes_new.data_handle()[i].x++] = new_neighb_id;
+      if (h_list_sizes_new.data_handle()[i].x == num_samples) break;
     }
   }
 }
@@ -1051,31 +1059,37 @@ void GnndGraph<Index_t>::sample_graph(bool sample_new)
 {
 #pragma omp parallel for
   for (size_t i = 0; i < nrow; i++) {
-    h_list_sizes_old[i].x = 0;
-    h_list_sizes_old[i].y = 0;
-    h_list_sizes_new[i].x = 0;
-    h_list_sizes_new[i].y = 0;
+    h_list_sizes_old.data_handle()[i].x = 0;
+    h_list_sizes_old.data_handle()[i].y = 0;
+    h_list_sizes_new.data_handle()[i].x = 0;
+    h_list_sizes_new.data_handle()[i].y = 0;
 
     auto list     = h_graph + i * node_degree;
-    auto list_old = h_graph_old.data() + i * num_samples;
-    auto list_new = h_graph_new.data() + i * num_samples;
+    auto list_old = h_graph_old.data_handle() + i * num_samples;
+    auto list_new = h_graph_new.data_handle() + i * num_samples;
     for (int j = 0; j < segment_size; j++) {
       for (int k = 0; k < num_segments; k++) {
         auto neighbor = list[k * segment_size + j];
         if ((size_t)neighbor.id() >= nrow) continue;
         if (!neighbor.is_new()) {
-          if (h_list_sizes_old[i].x < num_samples) {
-            list_old[h_list_sizes_old[i].x++] = neighbor.id();
+          if (h_list_sizes_old.data_handle()[i].x < num_samples) {
+            list_old[h_list_sizes_old.data_handle()[i].x++] = neighbor.id();
           }
         } else if (sample_new) {
-          if (h_list_sizes_new[i].x < num_samples) {
+          if (h_list_sizes_new.data_handle()[i].x < num_samples) {
             list[k * segment_size + j].mark_old();
-            list_new[h_list_sizes_new[i].x++] = neighbor.id();
+            list_new[h_list_sizes_new.data_handle()[i].x++] = neighbor.id();
           }
         }
-        if (h_list_sizes_old[i].x == num_samples && h_list_sizes_new[i].x == num_samples) { break; }
+        if (h_list_sizes_old.data_handle()[i].x == num_samples &&
+            h_list_sizes_new.data_handle()[i].x == num_samples) {
+          break;
+        }
+      }
+      if (h_list_sizes_old.data_handle()[i].x == num_samples &&
+          h_list_sizes_new.data_handle()[i].x == num_samples) {
+        break;
       }
-      if (h_list_sizes_old[i].x == num_samples && h_list_sizes_new[i].x == num_samples) { break; }
     }
   }
 }
@@ -1137,7 +1151,8 @@ template <typename Data_t, typename Index_t>
 GNND<Data_t, Index_t>::GNND(raft::resources const& res, const BuildConfig& build_config)
   : res(res),
     build_config_(build_config),
-    graph_(build_config.max_dataset_size,
+    graph_(res,
+           build_config.max_dataset_size,
            align32::roundUp(build_config.node_degree),
            align32::roundUp(build_config.internal_node_degree ? build_config.internal_node_degree
                                                               : build_config.node_degree),
@@ -1151,28 +1166,38 @@ GNND<Data_t, Index_t>::GNND(raft::resources const& res, const BuildConfig& build
       raft::make_device_matrix<ID_t, size_t, raft::row_major>(res, nrow_, DEGREE_ON_DEVICE)},
     dists_buffer_{
       raft::make_device_matrix<DistData_t, size_t, raft::row_major>(res, nrow_, DEGREE_ON_DEVICE)},
-    graph_host_buffer_(nrow_ * DEGREE_ON_DEVICE),
-    dists_host_buffer_(nrow_ * DEGREE_ON_DEVICE),
+    graph_host_buffer_{
+      raft::make_pinned_matrix<ID_t, size_t, raft::row_major>(res, nrow_, DEGREE_ON_DEVICE)},
+    dists_host_buffer_{
+      raft::make_pinned_matrix<DistData_t, size_t, raft::row_major>(res, nrow_, DEGREE_ON_DEVICE)},
     d_locks_{raft::make_device_vector<int, size_t>(res, nrow_)},
-    h_rev_graph_new_(nrow_ * NUM_SAMPLES),
-    h_graph_old_(nrow_ * NUM_SAMPLES),
-    h_rev_graph_old_(nrow_ * NUM_SAMPLES),
+    h_rev_graph_new_{
+      raft::make_pinned_matrix<Index_t, size_t, raft::row_major>(res, nrow_, NUM_SAMPLES)},
+    h_graph_old_(
+      raft::make_pinned_matrix<Index_t, size_t, raft::row_major>(res, nrow_, NUM_SAMPLES)),
+    h_rev_graph_old_{
+      raft::make_pinned_matrix<Index_t, size_t, raft::row_major>(res, nrow_, NUM_SAMPLES)},
     d_list_sizes_new_{raft::make_device_vector<int2, size_t>(res, nrow_)},
     d_list_sizes_old_{raft::make_device_vector<int2, size_t>(res, nrow_)}
 {
   static_assert(NUM_SAMPLES <= 32);
-
-  thrust::fill(thrust::device,
-               dists_buffer_.data_handle(),
-               dists_buffer_.data_handle() + dists_buffer_.size(),
-               std::numeric_limits<float>::max());
-  thrust::fill(thrust::device,
-               reinterpret_cast<Index_t*>(graph_buffer_.data_handle()),
-               reinterpret_cast<Index_t*>(graph_buffer_.data_handle()) + graph_buffer_.size(),
-               std::numeric_limits<Index_t>::max());
-  thrust::fill(thrust::device, d_locks_.data_handle(), d_locks_.data_handle() + d_locks_.size(), 0);
+  raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits<float>::max());
+  auto graph_buffer_view = raft::make_device_matrix_view<Index_t, int64_t>(
+    reinterpret_cast<Index_t*>(graph_buffer_.data_handle()), nrow_, DEGREE_ON_DEVICE);
+  raft::matrix::fill(res, graph_buffer_view, std::numeric_limits<Index_t>::max());
+  raft::matrix::fill(res, d_locks_.view(), 0);
 };
 
+template <typename Data_t, typename Index_t>
+void GNND<Data_t, Index_t>::reset(raft::resources const& res)
+{
+  raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits<float>::max());
+  auto graph_buffer_view = raft::make_device_matrix_view<Index_t, int64_t>(
+    reinterpret_cast<Index_t*>(graph_buffer_.data_handle()), nrow_, DEGREE_ON_DEVICE);
+  raft::matrix::fill(res, graph_buffer_view, std::numeric_limits<Index_t>::max());
+  raft::matrix::fill(res, d_locks_.view(), 0);
+}
+
 template <typename Data_t, typename Index_t>
 void GNND<Data_t, Index_t>::add_reverse_edges(Index_t* graph_ptr,
                                               Index_t* h_rev_graph_ptr,
@@ -1189,34 +1214,35 @@ void GNND<Data_t, Index_t>::add_reverse_edges(Index_t* graph_ptr,
 template <typename Data_t, typename Index_t>
 void GNND<Data_t, Index_t>::local_join(cudaStream_t stream)
 {
-  thrust::fill(thrust::device.on(stream),
-               dists_buffer_.data_handle(),
-               dists_buffer_.data_handle() + dists_buffer_.size(),
-               std::numeric_limits<float>::max());
-  local_join_kernel<<<nrow_, BLOCK_SIZE, 0, stream>>>(
-    thrust::raw_pointer_cast(graph_.h_graph_new.data()),
-    thrust::raw_pointer_cast(h_rev_graph_new_.data()),
-    d_list_sizes_new_.data_handle(),
-    thrust::raw_pointer_cast(h_graph_old_.data()),
-    thrust::raw_pointer_cast(h_rev_graph_old_.data()),
-    d_list_sizes_old_.data_handle(),
-    NUM_SAMPLES,
-    d_data_.data_handle(),
-    ndim_,
-    graph_buffer_.data_handle(),
-    dists_buffer_.data_handle(),
-    DEGREE_ON_DEVICE,
-    d_locks_.data_handle(),
-    l2_norms_.data_handle());
+  raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits<float>::max());
+  local_join_kernel<<<nrow_, BLOCK_SIZE, 0, stream>>>(graph_.h_graph_new.data_handle(),
+                                                      h_rev_graph_new_.data_handle(),
+                                                      d_list_sizes_new_.data_handle(),
+                                                      h_graph_old_.data_handle(),
+                                                      h_rev_graph_old_.data_handle(),
+                                                      d_list_sizes_old_.data_handle(),
+                                                      NUM_SAMPLES,
+                                                      d_data_.data_handle(),
+                                                      ndim_,
+                                                      graph_buffer_.data_handle(),
+                                                      dists_buffer_.data_handle(),
+                                                      DEGREE_ON_DEVICE,
+                                                      d_locks_.data_handle(),
+                                                      l2_norms_.data_handle());
 }
 
 template <typename Data_t, typename Index_t>
-void GNND<Data_t, Index_t>::build(Data_t* data, const Index_t nrow, Index_t* output_graph)
+void GNND<Data_t, Index_t>::build(Data_t* data,
+                                  const Index_t nrow,
+                                  Index_t* output_graph,
+                                  bool return_distances,
+                                  DistData_t* output_distances)
 {
   using input_t = typename std::remove_const<Data_t>::type;
 
   cudaStream_t stream = raft::resource::get_cuda_stream(res);
   nrow_               = nrow;
+  graph_.nrow         = nrow;
   graph_.h_graph      = (InternalID_t<Index_t>*)output_graph;
 
   cudaPointerAttributes data_ptr_attr;
@@ -1226,24 +1252,18 @@ void GNND<Data_t, Index_t>::build(Data_t* data, const Index_t nrow, Index_t* out
   cuvs::spatial::knn::detail::utils::batch_load_iterator vec_batches{
     data, static_cast<size_t>(nrow_), build_config_.dataset_dim, batch_size, stream};
   for (auto const& batch : vec_batches) {
-    preprocess_data_kernel<<<batch.size(),
-                             raft::warp_size(),
-                             sizeof(Data_t) *
-                               raft::ceildiv(build_config_.dataset_dim,
-                                             static_cast<size_t>(raft::warp_size())) *
-                               raft::warp_size(),
-                             stream>>>(batch.data(),
-                                       d_data_.data_handle(),
-                                       build_config_.dataset_dim,
-                                       l2_norms_.data_handle(),
-                                       batch.offset());
+    preprocess_data_kernel<<<
+      batch.size(),
+      raft::warp_size(),
+      sizeof(Data_t) * ceildiv(build_config_.dataset_dim, static_cast<size_t>(raft::warp_size())) *
+        raft::warp_size(),
+      stream>>>(batch.data(),
+                d_data_.data_handle(),
+                build_config_.dataset_dim,
+                l2_norms_.data_handle(),
+                batch.offset());
   }
 
-  thrust::fill(thrust::device.on(stream),
-               (Index_t*)graph_buffer_.data_handle(),
-               (Index_t*)graph_buffer_.data_handle() + graph_buffer_.size(),
-               std::numeric_limits<Index_t>::max());
-
   graph_.clear();
   graph_.init_random_graph();
   graph_.sample_graph(true);
@@ -1251,8 +1271,8 @@ void GNND<Data_t, Index_t>::build(Data_t* data, const Index_t nrow, Index_t* out
   auto update_and_sample = [&](bool update_graph) {
     if (update_graph) {
       update_counter_ = 0;
-      graph_.update_graph(thrust::raw_pointer_cast(graph_host_buffer_.data()),
-                          thrust::raw_pointer_cast(dists_host_buffer_.data()),
+      graph_.update_graph(graph_host_buffer_.data_handle(),
+                          dists_host_buffer_.data_handle(),
                           DEGREE_ON_DEVICE,
                           update_counter_);
       if (update_counter_ < build_config_.termination_threshold * nrow_ *
@@ -1265,15 +1285,15 @@ void GNND<Data_t, Index_t>::build(Data_t* data, const Index_t nrow, Index_t* out
 
   for (size_t it = 0; it < build_config_.max_iterations; it++) {
     raft::copy(d_list_sizes_new_.data_handle(),
-               thrust::raw_pointer_cast(graph_.h_list_sizes_new.data()),
+               graph_.h_list_sizes_new.data_handle(),
                nrow_,
                raft::resource::get_cuda_stream(res));
-    raft::copy(thrust::raw_pointer_cast(h_graph_old_.data()),
-               thrust::raw_pointer_cast(graph_.h_graph_old.data()),
+    raft::copy(h_graph_old_.data_handle(),
+               graph_.h_graph_old.data_handle(),
                nrow_ * NUM_SAMPLES,
                raft::resource::get_cuda_stream(res));
     raft::copy(d_list_sizes_old_.data_handle(),
-               thrust::raw_pointer_cast(graph_.h_list_sizes_old.data()),
+               graph_.h_list_sizes_old.data_handle(),
                nrow_,
                raft::resource::get_cuda_stream(res));
     raft::resource::sync_stream(res);
@@ -1286,13 +1306,13 @@ void GNND<Data_t, Index_t>::build(Data_t* data, const Index_t nrow, Index_t* out
     // contains some information for local_join.
     static_assert(DEGREE_ON_DEVICE * sizeof(*(dists_buffer_.data_handle())) >=
                   NUM_SAMPLES * sizeof(*(graph_buffer_.data_handle())));
-    add_reverse_edges(thrust::raw_pointer_cast(graph_.h_graph_new.data()),
-                      thrust::raw_pointer_cast(h_rev_graph_new_.data()),
+    add_reverse_edges(graph_.h_graph_new.data_handle(),
+                      h_rev_graph_new_.data_handle(),
                       (Index_t*)dists_buffer_.data_handle(),
                       d_list_sizes_new_.data_handle(),
                       stream);
-    add_reverse_edges(thrust::raw_pointer_cast(h_graph_old_.data()),
-                      thrust::raw_pointer_cast(h_rev_graph_old_.data()),
+    add_reverse_edges(h_graph_old_.data_handle(),
+                      h_rev_graph_old_.data_handle(),
                       (Index_t*)dists_buffer_.data_handle(),
                       d_list_sizes_old_.data_handle(),
                       stream);
@@ -1316,21 +1336,21 @@ void GNND<Data_t, Index_t>::build(Data_t* data, const Index_t nrow, Index_t* out
     update_and_sample_thread.join();
 
     if (update_counter_ == -1) { break; }
-    raft::copy(thrust::raw_pointer_cast(graph_host_buffer_.data()),
+    raft::copy(graph_host_buffer_.data_handle(),
                graph_buffer_.data_handle(),
                nrow_ * DEGREE_ON_DEVICE,
                raft::resource::get_cuda_stream(res));
     raft::resource::sync_stream(res);
-    raft::copy(thrust::raw_pointer_cast(dists_host_buffer_.data()),
+    raft::copy(dists_host_buffer_.data_handle(),
                dists_buffer_.data_handle(),
                nrow_ * DEGREE_ON_DEVICE,
                raft::resource::get_cuda_stream(res));
 
-    graph_.sample_graph_new(thrust::raw_pointer_cast(graph_host_buffer_.data()), DEGREE_ON_DEVICE);
+    graph_.sample_graph_new(graph_host_buffer_.data_handle(), DEGREE_ON_DEVICE);
   }
 
-  graph_.update_graph(thrust::raw_pointer_cast(graph_host_buffer_.data()),
-                      thrust::raw_pointer_cast(dists_host_buffer_.data()),
+  graph_.update_graph(graph_host_buffer_.data_handle(),
+                      dists_host_buffer_.data_handle(),
                       DEGREE_ON_DEVICE,
                       update_counter_);
   raft::resource::sync_stream(res);
@@ -1338,6 +1358,27 @@ void GNND<Data_t, Index_t>::build(Data_t* data, const Index_t nrow, Index_t* out
 
   // Reuse graph_.h_dists as the buffer for shrink the lists in graph
   static_assert(sizeof(decltype(*(graph_.h_dists.data_handle()))) >= sizeof(Index_t));
+
+  if (return_distances) {
+    auto graph_d_dists = raft::make_device_matrix<DistData_t, int64_t, raft::row_major>(
+      res, nrow_, build_config_.node_degree);
+    raft::copy(graph_d_dists.data_handle(),
+               graph_.h_dists.data_handle(),
+               nrow_ * build_config_.node_degree,
+               raft::resource::get_cuda_stream(res));
+
+    auto output_dist_view = raft::make_device_matrix_view<DistData_t, int64_t, raft::row_major>(
+      output_distances, nrow_, build_config_.output_graph_degree);
+
+    raft::matrix::slice_coordinates coords{static_cast<int64_t>(0),
+                                           static_cast<int64_t>(0),
+                                           static_cast<int64_t>(nrow_),
+                                           static_cast<int64_t>(build_config_.output_graph_degree)};
+    raft::matrix::slice<DistData_t, int64_t, raft::row_major>(
+      res, raft::make_const_mdspan(graph_d_dists.view()), output_dist_view, coords);
+    raft::resource::sync_stream(res);
+  }
+
   Index_t* graph_shrink_buffer = (Index_t*)graph_.h_dists.data_handle();
 
 #pragma omp parallel for
@@ -1410,10 +1451,24 @@ void build(raft::resources const& res,
                            .node_degree           = extended_graph_degree,
                            .internal_node_degree  = extended_intermediate_degree,
                            .max_iterations        = params.max_iterations,
-                           .termination_threshold = params.termination_threshold};
+                           .termination_threshold = params.termination_threshold,
+                           .output_graph_degree   = params.graph_degree};
 
   GNND<const T, int> nnd(res, build_config);
-  nnd.build(dataset.data_handle(), dataset.extent(0), int_graph.data_handle());
+
+  if (idx.distances().has_value() || !params.return_distances) {
+    nnd.build(dataset.data_handle(),
+              dataset.extent(0),
+              int_graph.data_handle(),
+              params.return_distances,
+              idx.distances()
+                .value_or(raft::make_device_matrix<float, int64_t>(res, 0, 0).view())
+                .data_handle());
+  } else {
+    RAFT_EXPECTS(!params.return_distances,
+                 "Distance view not allocated. Using return_distances set to true requires "
+                 "distance view to be allocated.");
+  }
 
 #pragma omp parallel for
   for (size_t i = 0; i < static_cast<size_t>(dataset.extent(0)); i++) {
@@ -1445,11 +1500,12 @@ index<IdxT> build(
     graph_degree = intermediate_degree;
   }
 
-  index<IdxT> idx{res, dataset.extent(0), static_cast<int64_t>(graph_degree)};
+  index<IdxT> idx{
+    res, dataset.extent(0), static_cast<int64_t>(graph_degree), params.return_distances};
 
   build(res, params, dataset, idx);
 
   return idx;
 }
 
-}  // namespace  cuvs::neighbors::nn_descent::detail
+}  // namespace cuvs::neighbors::nn_descent::detail
diff --git a/cpp/src/neighbors/detail/nn_descent_batch.cuh b/cpp/src/neighbors/detail/nn_descent_batch.cuh
new file mode 100644
index 000000000..842dbe788
--- /dev/null
+++ b/cpp/src/neighbors/detail/nn_descent_batch.cuh
@@ -0,0 +1,736 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/resources.hpp>
+#include <raft/util/cudart_utils.hpp>
+#include <sys/types.h>
+#undef RAFT_EXPLICIT_INSTANTIATE_ONLY
+
+#include "nn_descent.cuh"
+#include <cuvs/neighbors/brute_force.hpp>
+#include <cuvs/neighbors/nn_descent.hpp>
+
+#include <cuvs/cluster/kmeans.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/host_mdspan.hpp>
+#include <raft/core/managed_mdarray.hpp>
+#include <raft/core/mdspan.hpp>
+#include <raft/core/mdspan_types.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/matrix/detail/gather_inplace.cuh>
+#include <raft/matrix/init.cuh>
+#include <raft/matrix/sample_rows.cuh>
+
+#include <thrust/copy.h>
+
+#include <vector_types.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <optional>
+#include <random>
+#include <type_traits>
+
+namespace cuvs::neighbors::nn_descent::detail::experimental {
+
+//
+// Run balanced kmeans on a subsample of the dataset to get centroids
+//
+template <typename T,
+          typename IdxT = uint32_t,
+          typename Accessor =
+            host_device_accessor<std::experimental::default_accessor<T>, memory_type::host>>
+void get_balanced_kmeans_centroids(
+  raft::resources const& res,
+  cuvs::distance::DistanceType metric,
+  mdspan<const T, matrix_extent<int64_t>, row_major, Accessor> dataset,
+  raft::device_matrix_view<T, IdxT> centroids)
+{
+  size_t num_rows   = static_cast<size_t>(dataset.extent(0));
+  size_t num_cols   = static_cast<size_t>(dataset.extent(1));
+  size_t n_clusters = centroids.extent(0);
+  size_t num_subsamples =
+    std::min(static_cast<size_t>(num_rows / n_clusters), static_cast<size_t>(num_rows * 0.1));
+
+  auto d_subsample_dataset =
+    raft::make_device_matrix<T, int64_t, raft::row_major>(res, num_subsamples, num_cols);
+  raft::matrix::sample_rows<T, int64_t, Accessor>(
+    res, raft::random::RngState{0}, dataset, d_subsample_dataset.view());
+
+  cuvs::cluster::kmeans::balanced_params kmeans_params;
+  kmeans_params.metric = metric;
+
+  auto d_subsample_dataset_const_view =
+    raft::make_device_matrix_view<const T, int, raft::row_major>(
+      d_subsample_dataset.data_handle(), num_subsamples, num_cols);
+  auto centroids_view = raft::make_device_matrix_view<T, int, raft::row_major>(
+    centroids.data_handle(), n_clusters, num_cols);
+  cuvs::cluster::kmeans::fit(res, kmeans_params, d_subsample_dataset_const_view, centroids_view);
+}
+
+//
+// Get the top k closest centroid indices for each data point
+// Loads the data in batches onto device if data is on host for memory efficiency
+//
+template <typename T, typename IdxT = uint32_t>
+void get_global_nearest_k(
+  raft::resources const& res,
+  size_t k,
+  size_t num_rows,
+  size_t n_clusters,
+  const T* dataset,
+  raft::host_matrix_view<IdxT, IdxT, raft::row_major> global_nearest_cluster,
+  raft::device_matrix_view<T, IdxT, raft::row_major> centroids,
+  cuvs::distance::DistanceType metric)
+{
+  size_t num_cols     = centroids.extent(1);
+  auto centroids_view = raft::make_device_matrix_view<const T, int64_t, raft::row_major>(
+    centroids.data_handle(), n_clusters, num_cols);
+
+  cudaPointerAttributes attr;
+  RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, dataset));
+  float* ptr = reinterpret_cast<float*>(attr.devicePointer);
+
+  size_t num_batches = n_clusters;
+  size_t batch_size  = (num_rows + n_clusters) / n_clusters;
+  if (ptr == nullptr) {  // data on host
+
+    auto d_dataset_batch =
+      raft::make_device_matrix<T, int64_t, raft::row_major>(res, batch_size, num_cols);
+
+    auto nearest_clusters_idx =
+      raft::make_device_matrix<int64_t, int64_t, raft::row_major>(res, batch_size, k);
+    auto nearest_clusters_idxt =
+      raft::make_device_matrix<IdxT, int64_t, raft::row_major>(res, batch_size, k);
+    auto nearest_clusters_dist =
+      raft::make_device_matrix<T, int64_t, raft::row_major>(res, batch_size, k);
+
+    for (size_t i = 0; i < num_batches; i++) {
+      size_t batch_size_ = batch_size;
+
+      if (i == num_batches - 1) { batch_size_ = num_rows - batch_size * i; }
+      raft::copy(d_dataset_batch.data_handle(),
+                 dataset + i * batch_size * num_cols,
+                 batch_size_ * num_cols,
+                 resource::get_cuda_stream(res));
+
+      std::optional<raft::device_vector_view<const T, int64_t>> norms_view;
+      cuvs::neighbors::brute_force::index<T> brute_force_index(
+        res, centroids_view, norms_view, metric);
+      cuvs::neighbors::brute_force::search(res,
+                                           brute_force_index,
+                                           raft::make_const_mdspan(d_dataset_batch.view()),
+                                           nearest_clusters_idx.view(),
+                                           nearest_clusters_dist.view());
+
+      thrust::copy(raft::resource::get_thrust_policy(res),
+                   nearest_clusters_idx.data_handle(),
+                   nearest_clusters_idx.data_handle() + nearest_clusters_idx.size(),
+                   nearest_clusters_idxt.data_handle());
+      raft::copy(global_nearest_cluster.data_handle() + i * batch_size * k,
+                 nearest_clusters_idxt.data_handle(),
+                 batch_size_ * k,
+                 resource::get_cuda_stream(res));
+    }
+  } else {  // data on device
+    auto nearest_clusters_idx =
+      raft::make_device_matrix<int64_t, int64_t, raft::row_major>(res, num_rows, k);
+    auto nearest_clusters_dist =
+      raft::make_device_matrix<T, int64_t, raft::row_major>(res, num_rows, k);
+
+    std::optional<raft::device_vector_view<const T, int64_t>> norms_view;
+    cuvs::neighbors::brute_force::index<T> brute_force_index(
+      res, centroids_view, norms_view, metric);
+    auto dataset_view =
+      raft::make_device_matrix_view<const T, int64_t, raft::row_major>(dataset, num_rows, num_cols);
+    cuvs::neighbors::brute_force::search(res,
+                                         brute_force_index,
+                                         dataset_view,
+                                         nearest_clusters_idx.view(),
+                                         nearest_clusters_dist.view());
+
+    auto nearest_clusters_idxt =
+      raft::make_device_matrix<IdxT, int64_t, raft::row_major>(res, batch_size, k);
+    for (size_t i = 0; i < num_batches; i++) {
+      size_t batch_size_ = batch_size;
+
+      if (i == num_batches - 1) { batch_size_ = num_rows - batch_size * i; }
+      thrust::copy(raft::resource::get_thrust_policy(res),
+                   nearest_clusters_idx.data_handle() + i * batch_size_ * k,
+                   nearest_clusters_idx.data_handle() + (i + 1) * batch_size_ * k,
+                   nearest_clusters_idxt.data_handle());
+      raft::copy(global_nearest_cluster.data_handle() + i * batch_size_ * k,
+                 nearest_clusters_idxt.data_handle(),
+                 batch_size_ * k,
+                 resource::get_cuda_stream(res));
+    }
+  }
+}
+
+//
+// global_nearest_cluster [num_rows X k=2] : top 2 closest clusters for each data point
+// inverted_indices [num_rows x k vector] : sparse vector for data indices for each cluster
+// cluster_size [n_cluster] : cluster size for each cluster
+// offset [n_cluster] : offset in inverted_indices for each cluster
+// Loads the data in batches onto device if data is on host for memory efficiency
+//
+template <typename IdxT = uint32_t>
+void get_inverted_indices(raft::resources const& res,
+                          size_t n_clusters,
+                          size_t& max_cluster_size,
+                          size_t& min_cluster_size,
+                          raft::host_matrix_view<IdxT, IdxT> global_nearest_cluster,
+                          raft::host_vector_view<IdxT, IdxT> inverted_indices,
+                          raft::host_vector_view<IdxT, IdxT> cluster_size,
+                          raft::host_vector_view<IdxT, IdxT> offset)
+{
+  // build sparse inverted indices and get number of data points for each cluster
+  size_t num_rows = global_nearest_cluster.extent(0);
+  size_t k        = global_nearest_cluster.extent(1);
+
+  auto local_offset = raft::make_host_vector<IdxT>(n_clusters);
+
+  max_cluster_size = 0;
+  min_cluster_size = std::numeric_limits<size_t>::max();
+
+  std::fill(cluster_size.data_handle(), cluster_size.data_handle() + n_clusters, 0);
+  std::fill(local_offset.data_handle(), local_offset.data_handle() + n_clusters, 0);
+
+  // TODO: this part isn't really a bottleneck but maybe worth trying omp parallel
+  // for with atomic add
+  for (size_t i = 0; i < num_rows; i++) {
+    for (size_t j = 0; j < k; j++) {
+      IdxT cluster_id = global_nearest_cluster(i, j);
+      cluster_size(cluster_id) += 1;
+    }
+  }
+
+  offset(0) = 0;
+  for (size_t i = 1; i < n_clusters; i++) {
+    offset(i) = offset(i - 1) + cluster_size(i - 1);
+  }
+  for (size_t i = 0; i < num_rows; i++) {
+    for (size_t j = 0; j < k; j++) {
+      IdxT cluster_id = global_nearest_cluster(i, j);
+      inverted_indices(offset(cluster_id) + local_offset(cluster_id)) = i;
+      local_offset(cluster_id) += 1;
+    }
+  }
+
+  max_cluster_size = static_cast<size_t>(
+    *std::max_element(cluster_size.data_handle(), cluster_size.data_handle() + n_clusters));
+  min_cluster_size = static_cast<size_t>(
+    *std::min_element(cluster_size.data_handle(), cluster_size.data_handle() + n_clusters));
+}
+
+template <typename KeyType, typename ValueType>
+struct KeyValuePair {
+  KeyType key;
+  ValueType value;
+};
+
+template <typename KeyType, typename ValueType>
+struct CustomKeyComparator {
+  __device__ bool operator()(const KeyValuePair<KeyType, ValueType>& a,
+                             const KeyValuePair<KeyType, ValueType>& b) const
+  {
+    if (a.key == b.key) { return a.value < b.value; }
+    return a.key < b.key;
+  }
+};
+
+template <typename IdxT, int BLOCK_SIZE, int ITEMS_PER_THREAD>
+RAFT_KERNEL merge_subgraphs(IdxT* cluster_data_indices,
+                            size_t graph_degree,
+                            size_t num_cluster_in_batch,
+                            float* global_distances,
+                            float* batch_distances,
+                            IdxT* global_indices,
+                            IdxT* batch_indices)
+{
+  size_t batch_row = blockIdx.x;
+  typedef cub::BlockMergeSort<KeyValuePair<float, IdxT>, BLOCK_SIZE, ITEMS_PER_THREAD>
+    BlockMergeSortType;
+  __shared__ typename cub::BlockMergeSort<KeyValuePair<float, IdxT>, BLOCK_SIZE, ITEMS_PER_THREAD>::
+    TempStorage tmpSmem;
+
+  extern __shared__ char sharedMem[];
+  float* blockKeys  = reinterpret_cast<float*>(sharedMem);
+  IdxT* blockValues = reinterpret_cast<IdxT*>(&sharedMem[graph_degree * 2 * sizeof(float)]);
+  int16_t* uniqueMask =
+    reinterpret_cast<int16_t*>(&sharedMem[graph_degree * 2 * (sizeof(float) + sizeof(IdxT))]);
+
+  if (batch_row < num_cluster_in_batch) {
+    // load batch or global depending on threadIdx
+    size_t global_row = cluster_data_indices[batch_row];
+
+    KeyValuePair<float, IdxT> threadKeyValuePair[ITEMS_PER_THREAD];
+
+    size_t halfway   = BLOCK_SIZE / 2;
+    size_t do_global = threadIdx.x < halfway;
+
+    float* distances;
+    IdxT* indices;
+
+    if (do_global) {
+      distances = global_distances;
+      indices   = global_indices;
+    } else {
+      distances = batch_distances;
+      indices   = batch_indices;
+    }
+
+    size_t idxBase = (threadIdx.x * do_global + (threadIdx.x - halfway) * (1lu - do_global)) *
+                     static_cast<size_t>(ITEMS_PER_THREAD);
+    size_t arrIdxBase = (global_row * do_global + batch_row * (1lu - do_global)) * graph_degree;
+    for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+      size_t colId = idxBase + i;
+      if (colId < graph_degree) {
+        threadKeyValuePair[i].key   = distances[arrIdxBase + colId];
+        threadKeyValuePair[i].value = indices[arrIdxBase + colId];
+      } else {
+        threadKeyValuePair[i].key   = std::numeric_limits<float>::max();
+        threadKeyValuePair[i].value = std::numeric_limits<IdxT>::max();
+      }
+    }
+
+    __syncthreads();
+
+    BlockMergeSortType(tmpSmem).Sort(threadKeyValuePair, CustomKeyComparator<float, IdxT>{});
+
+    // load sorted result into shared memory to get unique values
+    idxBase = threadIdx.x * ITEMS_PER_THREAD;
+    for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+      size_t colId = idxBase + i;
+      if (colId < 2 * graph_degree) {
+        blockKeys[colId]   = threadKeyValuePair[i].key;
+        blockValues[colId] = threadKeyValuePair[i].value;
+      }
+    }
+
+    __syncthreads();
+
+    // get unique mask
+    if (threadIdx.x == 0) { uniqueMask[0] = 1; }
+    for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+      size_t colId = idxBase + i;
+      if (colId > 0 && colId < 2 * graph_degree) {
+        uniqueMask[colId] = static_cast<int16_t>(blockValues[colId] != blockValues[colId - 1]);
+      }
+    }
+
+    __syncthreads();
+
+    // prefix sum
+    if (threadIdx.x == 0) {
+      for (int i = 1; i < 2 * graph_degree; i++) {
+        uniqueMask[i] += uniqueMask[i - 1];
+      }
+    }
+
+    __syncthreads();
+    // load unique values to global memory
+    if (threadIdx.x == 0) {
+      global_distances[global_row * graph_degree] = blockKeys[0];
+      global_indices[global_row * graph_degree]   = blockValues[0];
+    }
+
+    for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+      size_t colId = idxBase + i;
+      if (colId > 0 && colId < 2 * graph_degree) {
+        bool is_unique       = uniqueMask[colId] != uniqueMask[colId - 1];
+        int16_t global_colId = uniqueMask[colId] - 1;
+        if (is_unique && static_cast<size_t>(global_colId) < graph_degree) {
+          global_distances[global_row * graph_degree + global_colId] = blockKeys[colId];
+          global_indices[global_row * graph_degree + global_colId]   = blockValues[colId];
+        }
+      }
+    }
+  }
+}
+
+//
+// builds knn graph using NN Descent and merge with global graph
+//
+template <typename T,
+          typename IdxT = uint32_t,
+          typename Accessor =
+            host_device_accessor<std::experimental::default_accessor<T>, memory_type::host>>
+void build_and_merge(raft::resources const& res,
+                     const index_params& params,
+                     size_t num_data_in_cluster,
+                     size_t graph_degree,
+                     size_t int_graph_node_degree,
+                     T* cluster_data,
+                     IdxT* cluster_data_indices,
+                     int* int_graph,
+                     IdxT* inverted_indices,
+                     IdxT* global_indices_d,
+                     float* global_distances_d,
+                     IdxT* batch_indices_h,
+                     IdxT* batch_indices_d,
+                     float* batch_distances_d,
+                     GNND<const T, int>& nnd)
+{
+  nnd.build(cluster_data, num_data_in_cluster, int_graph, true, batch_distances_d);
+
+  // remap indices
+#pragma omp parallel for
+  for (size_t i = 0; i < num_data_in_cluster; i++) {
+    for (size_t j = 0; j < graph_degree; j++) {
+      size_t local_idx                      = int_graph[i * int_graph_node_degree + j];
+      batch_indices_h[i * graph_degree + j] = inverted_indices[local_idx];
+    }
+  }
+
+  raft::copy(batch_indices_d,
+             batch_indices_h,
+             num_data_in_cluster * graph_degree,
+             raft::resource::get_cuda_stream(res));
+
+  size_t num_elems     = graph_degree * 2;
+  size_t sharedMemSize = num_elems * (sizeof(float) + sizeof(IdxT) + sizeof(int16_t));
+
+  if (num_elems <= 128) {
+    merge_subgraphs<IdxT, 32, 4>
+      <<<num_data_in_cluster, 32, sharedMemSize, raft::resource::get_cuda_stream(res)>>>(
+        cluster_data_indices,
+        graph_degree,
+        num_data_in_cluster,
+        global_distances_d,
+        batch_distances_d,
+        global_indices_d,
+        batch_indices_d);
+  } else if (num_elems <= 512) {
+    merge_subgraphs<IdxT, 128, 4>
+      <<<num_data_in_cluster, 128, sharedMemSize, raft::resource::get_cuda_stream(res)>>>(
+        cluster_data_indices,
+        graph_degree,
+        num_data_in_cluster,
+        global_distances_d,
+        batch_distances_d,
+        global_indices_d,
+        batch_indices_d);
+  } else if (num_elems <= 1024) {
+    merge_subgraphs<IdxT, 128, 8>
+      <<<num_data_in_cluster, 128, sharedMemSize, raft::resource::get_cuda_stream(res)>>>(
+        cluster_data_indices,
+        graph_degree,
+        num_data_in_cluster,
+        global_distances_d,
+        batch_distances_d,
+        global_indices_d,
+        batch_indices_d);
+  } else if (num_elems <= 2048) {
+    merge_subgraphs<IdxT, 256, 8>
+      <<<num_data_in_cluster, 256, sharedMemSize, raft::resource::get_cuda_stream(res)>>>(
+        cluster_data_indices,
+        graph_degree,
+        num_data_in_cluster,
+        global_distances_d,
+        batch_distances_d,
+        global_indices_d,
+        batch_indices_d);
+  } else {
+    // this is as far as we can get due to the shared mem usage of cub::BlockMergeSort
+    RAFT_FAIL("The degree of knn is too large (%lu). It must be smaller than 1024", graph_degree);
+  }
+  raft::resource::sync_stream(res);
+}
+
+//
+// For each cluster, gather the data samples that belong to that cluster, and
+// call build_and_merge
+//
+template <typename T, typename IdxT = uint32_t>
+void cluster_nnd(raft::resources const& res,
+                 const index_params& params,
+                 size_t graph_degree,
+                 size_t extended_graph_degree,
+                 size_t max_cluster_size,
+                 raft::host_matrix_view<const T, int64_t> dataset,
+                 IdxT* offsets,
+                 IdxT* cluster_size,
+                 IdxT* cluster_data_indices,
+                 int* int_graph,
+                 IdxT* inverted_indices,
+                 IdxT* global_indices_h,
+                 float* global_distances_h,
+                 IdxT* batch_indices_h,
+                 IdxT* batch_indices_d,
+                 float* batch_distances_d,
+                 const BuildConfig& build_config)
+{
+  size_t num_rows = dataset.extent(0);
+  size_t num_cols = dataset.extent(1);
+
+  GNND<const T, int> nnd(res, build_config);
+
+  auto cluster_data_matrix =
+    raft::make_host_matrix<T, int64_t, row_major>(max_cluster_size, num_cols);
+
+  for (size_t cluster_id = 0; cluster_id < params.n_clusters; cluster_id++) {
+    RAFT_LOG_DEBUG(
+      "# Data on host. Running clusters: %lu / %lu", cluster_id + 1, params.n_clusters);
+    size_t num_data_in_cluster = cluster_size[cluster_id];
+    size_t offset              = offsets[cluster_id];
+
+#pragma omp parallel for
+    for (size_t i = 0; i < num_data_in_cluster; i++) {
+      for (size_t j = 0; j < num_cols; j++) {
+        size_t global_row         = (inverted_indices + offset)[i];
+        cluster_data_matrix(i, j) = dataset(global_row, j);
+      }
+    }
+
+    build_and_merge<T, IdxT>(res,
+                             params,
+                             num_data_in_cluster,
+                             graph_degree,
+                             extended_graph_degree,
+                             cluster_data_matrix.data_handle(),
+                             cluster_data_indices + offset,
+                             int_graph,
+                             inverted_indices + offset,
+                             global_indices_h,
+                             global_distances_h,
+                             batch_indices_h,
+                             batch_indices_d,
+                             batch_distances_d,
+                             nnd);
+    nnd.reset(res);
+  }
+}
+
+template <typename T, typename IdxT = uint32_t>
+void cluster_nnd(raft::resources const& res,
+                 const index_params& params,
+                 size_t graph_degree,
+                 size_t extended_graph_degree,
+                 size_t max_cluster_size,
+                 raft::device_matrix_view<const T, int64_t> dataset,
+                 IdxT* offsets,
+                 IdxT* cluster_size,
+                 IdxT* cluster_data_indices,
+                 int* int_graph,
+                 IdxT* inverted_indices,
+                 IdxT* global_indices_h,
+                 float* global_distances_h,
+                 IdxT* batch_indices_h,
+                 IdxT* batch_indices_d,
+                 float* batch_distances_d,
+                 const BuildConfig& build_config)
+{
+  size_t num_rows = dataset.extent(0);
+  size_t num_cols = dataset.extent(1);
+
+  GNND<const T, int> nnd(res, build_config);
+
+  auto cluster_data_matrix =
+    raft::make_device_matrix<T, int64_t, row_major>(res, max_cluster_size, num_cols);
+
+  for (size_t cluster_id = 0; cluster_id < params.n_clusters; cluster_id++) {
+    RAFT_LOG_DEBUG(
+      "# Data on device. Running clusters: %lu / %lu", cluster_id + 1, params.n_clusters);
+    size_t num_data_in_cluster = cluster_size[cluster_id];
+    size_t offset              = offsets[cluster_id];
+
+    auto cluster_data_view = raft::make_device_matrix_view<T, IdxT>(
+      cluster_data_matrix.data_handle(), num_data_in_cluster, num_cols);
+    auto cluster_data_indices_view = raft::make_device_vector_view<const IdxT, IdxT>(
+      cluster_data_indices + offset, num_data_in_cluster);
+
+    auto dataset_IdxT =
+      raft::make_device_matrix_view<const T, IdxT>(dataset.data_handle(), num_rows, num_cols);
+    raft::matrix::gather(res, dataset_IdxT, cluster_data_indices_view, cluster_data_view);
+
+    build_and_merge<T, IdxT>(res,
+                             params,
+                             num_data_in_cluster,
+                             graph_degree,
+                             extended_graph_degree,
+                             cluster_data_view.data_handle(),
+                             cluster_data_indices + offset,
+                             int_graph,
+                             inverted_indices + offset,
+                             global_indices_h,
+                             global_distances_h,
+                             batch_indices_h,
+                             batch_indices_d,
+                             batch_distances_d,
+                             nnd);
+    nnd.reset(res);
+  }
+}
+
+template <typename T,
+          typename IdxT = uint32_t,
+          typename Accessor =
+            host_device_accessor<std::experimental::default_accessor<float>, memory_type::host>>
+void batch_build(raft::resources const& res,
+                 const index_params& params,
+                 mdspan<const T, matrix_extent<int64_t>, row_major, Accessor> dataset,
+                 index<IdxT>& global_idx)
+{
+  size_t graph_degree        = params.graph_degree;
+  size_t intermediate_degree = params.intermediate_graph_degree;
+
+  size_t num_rows = static_cast<size_t>(dataset.extent(0));
+  size_t num_cols = static_cast<size_t>(dataset.extent(1));
+
+  auto centroids =
+    raft::make_device_matrix<T, IdxT, raft::row_major>(res, params.n_clusters, num_cols);
+  get_balanced_kmeans_centroids<T, IdxT>(res, params.metric, dataset, centroids.view());
+
+  size_t k                    = 2;
+  auto global_nearest_cluster = raft::make_host_matrix<IdxT, IdxT, raft::row_major>(num_rows, k);
+  get_global_nearest_k<T, IdxT>(res,
+                                k,
+                                num_rows,
+                                params.n_clusters,
+                                dataset.data_handle(),
+                                global_nearest_cluster.view(),
+                                centroids.view(),
+                                params.metric);
+
+  auto inverted_indices = raft::make_host_vector<IdxT, IdxT, raft::row_major>(num_rows * k);
+  auto cluster_size     = raft::make_host_vector<IdxT, IdxT, raft::row_major>(params.n_clusters);
+  auto offset           = raft::make_host_vector<IdxT, IdxT, raft::row_major>(params.n_clusters);
+
+  size_t max_cluster_size, min_cluster_size;
+  get_inverted_indices(res,
+                       params.n_clusters,
+                       max_cluster_size,
+                       min_cluster_size,
+                       global_nearest_cluster.view(),
+                       inverted_indices.view(),
+                       cluster_size.view(),
+                       offset.view());
+
+  if (intermediate_degree >= min_cluster_size) {
+    RAFT_LOG_WARN(
+      "Intermediate graph degree cannot be larger than minimum cluster size, reducing it to %lu",
+      dataset.extent(0));
+    intermediate_degree = min_cluster_size - 1;
+  }
+  if (intermediate_degree < graph_degree) {
+    RAFT_LOG_WARN(
+      "Graph degree (%lu) cannot be larger than intermediate graph degree (%lu), reducing "
+      "graph_degree.",
+      graph_degree,
+      intermediate_degree);
+    graph_degree = intermediate_degree;
+  }
+
+  size_t extended_graph_degree =
+    align32::roundUp(static_cast<size_t>(graph_degree * (graph_degree <= 32 ? 1.0 : 1.3)));
+  size_t extended_intermediate_degree = align32::roundUp(
+    static_cast<size_t>(intermediate_degree * (intermediate_degree <= 32 ? 1.0 : 1.3)));
+
+  auto int_graph = raft::make_host_matrix<int, int64_t, row_major>(
+    max_cluster_size, static_cast<int64_t>(extended_graph_degree));
+
+  BuildConfig build_config{.max_dataset_size      = max_cluster_size,
+                           .dataset_dim           = num_cols,
+                           .node_degree           = extended_graph_degree,
+                           .internal_node_degree  = extended_intermediate_degree,
+                           .max_iterations        = params.max_iterations,
+                           .termination_threshold = params.termination_threshold,
+                           .output_graph_degree   = graph_degree};
+
+  auto global_indices_h   = raft::make_managed_matrix<IdxT, int64_t>(res, num_rows, graph_degree);
+  auto global_distances_h = raft::make_managed_matrix<float, int64_t>(res, num_rows, graph_degree);
+
+  std::fill(global_indices_h.data_handle(),
+            global_indices_h.data_handle() + num_rows * graph_degree,
+            std::numeric_limits<IdxT>::max());
+  std::fill(global_distances_h.data_handle(),
+            global_distances_h.data_handle() + num_rows * graph_degree,
+            std::numeric_limits<float>::max());
+
+  auto batch_indices_h =
+    raft::make_host_matrix<IdxT, int64_t, row_major>(max_cluster_size, graph_degree);
+  auto batch_indices_d =
+    raft::make_device_matrix<IdxT, int64_t, row_major>(res, max_cluster_size, graph_degree);
+  auto batch_distances_d =
+    raft::make_device_matrix<float, int64_t, row_major>(res, max_cluster_size, graph_degree);
+
+  auto cluster_data_indices = raft::make_device_vector<IdxT, IdxT>(res, num_rows * k);
+  raft::copy(cluster_data_indices.data_handle(),
+             inverted_indices.data_handle(),
+             num_rows * k,
+             resource::get_cuda_stream(res));
+
+  cluster_nnd<T, IdxT>(res,
+                       params,
+                       graph_degree,
+                       extended_graph_degree,
+                       max_cluster_size,
+                       dataset,
+                       offset.data_handle(),
+                       cluster_size.data_handle(),
+                       cluster_data_indices.data_handle(),
+                       int_graph.data_handle(),
+                       inverted_indices.data_handle(),
+                       global_indices_h.data_handle(),
+                       global_distances_h.data_handle(),
+                       batch_indices_h.data_handle(),
+                       batch_indices_d.data_handle(),
+                       batch_distances_d.data_handle(),
+                       build_config);
+
+  raft::copy(global_idx.graph().data_handle(),
+             global_indices_h.data_handle(),
+             num_rows * graph_degree,
+             raft::resource::get_cuda_stream(res));
+  if (params.return_distances && global_idx.distances().has_value()) {
+    raft::copy(global_idx.distances().value().data_handle(),
+               global_distances_h.data_handle(),
+               num_rows * graph_degree,
+               raft::resource::get_cuda_stream(res));
+  }
+}
+
+template <typename T,
+          typename IdxT = uint32_t,
+          typename Accessor =
+            host_device_accessor<std::experimental::default_accessor<float>, memory_type::host>>
+index<IdxT> batch_build(raft::resources const& res,
+                        const index_params& params,
+                        mdspan<const T, matrix_extent<int64_t>, row_major, Accessor> dataset)
+{
+  size_t intermediate_degree = params.intermediate_graph_degree;
+  size_t graph_degree        = params.graph_degree;
+
+  if (intermediate_degree < graph_degree) {
+    RAFT_LOG_WARN(
+      "Graph degree (%lu) cannot be larger than intermediate graph degree (%lu), reducing "
+      "graph_degree.",
+      graph_degree,
+      intermediate_degree);
+    graph_degree = intermediate_degree;
+  }
+
+  index<IdxT> idx{
+    res, dataset.extent(0), static_cast<int64_t>(graph_degree), params.return_distances};
+
+  batch_build(res, params, dataset, idx);
+
+  return idx;
+}
+
+}  // namespace cuvs::neighbors::nn_descent::detail::experimental
diff --git a/cpp/src/neighbors/nn_descent.cuh b/cpp/src/neighbors/nn_descent.cuh
index 582da72c1..ed91dac91 100644
--- a/cpp/src/neighbors/nn_descent.cuh
+++ b/cpp/src/neighbors/nn_descent.cuh
@@ -17,9 +17,14 @@
 #pragma once
 
 #include "detail/nn_descent.cuh"
+#include "detail/nn_descent_batch.cuh"
+
+#include <cmath>
+#include <cstdint>
 #include <cuvs/neighbors/nn_descent.hpp>
 
 #include <raft/core/device_mdspan.hpp>
+#include <raft/core/error.hpp>
 #include <raft/core/host_mdspan.hpp>
 
 namespace cuvs::neighbors::nn_descent {
@@ -61,7 +66,15 @@ auto build(raft::resources const& res,
            index_params const& params,
            raft::device_matrix_view<const T, int64_t, raft::row_major> dataset) -> index<IdxT>
 {
-  return detail::build<T, IdxT>(res, params, dataset);
+  if (params.n_clusters > 1) {
+    if constexpr (std::is_same_v<T, float>) {
+      return detail::experimental::batch_build<T, IdxT>(res, params, dataset);
+    } else {
+      RAFT_FAIL("Batched nn-descent is only supported for float precision");
+    }
+  } else {
+    return detail::build<T, IdxT>(res, params, dataset);
+  }
 }
 
 /**
@@ -100,7 +113,15 @@ void build(raft::resources const& res,
            raft::device_matrix_view<const T, int64_t, raft::row_major> dataset,
            index<IdxT>& idx)
 {
-  detail::build<T, IdxT>(res, params, dataset, idx);
+  if (params.n_clusters > 1) {
+    if constexpr (std::is_same_v<T, float>) {
+      detail::experimental::batch_build<T, IdxT>(res, params, dataset, idx);
+    } else {
+      RAFT_FAIL("Batched nn-descent is only supported for float precision");
+    }
+  } else {
+    detail::build<T, IdxT>(res, params, dataset, idx);
+  }
 }
 
 /**
@@ -135,7 +156,15 @@ auto build(raft::resources const& res,
            index_params const& params,
            raft::host_matrix_view<const T, int64_t, raft::row_major> dataset) -> index<IdxT>
 {
-  return detail::build<T, IdxT>(res, params, dataset);
+  if (params.n_clusters > 1) {
+    if constexpr (std::is_same_v<T, float>) {
+      return detail::experimental::batch_build<T, IdxT>(res, params, dataset);
+    } else {
+      RAFT_FAIL("Batched nn-descent is only supported for float precision");
+    }
+  } else {
+    return detail::build<T, IdxT>(res, params, dataset);
+  }
 }
 
 /**
@@ -174,7 +203,15 @@ void build(raft::resources const& res,
            raft::host_matrix_view<const T, int64_t, raft::row_major> dataset,
            index<IdxT>& idx)
 {
-  detail::build<T, IdxT>(res, params, dataset, idx);
+  if (params.n_clusters > 1) {
+    if constexpr (std::is_same_v<T, float>) {
+      detail::experimental::batch_build<T, IdxT>(res, params, dataset, idx);
+    } else {
+      RAFT_FAIL("Batched nn-descent is only supported for float precision");
+    }
+  } else {
+    detail::build<T, IdxT>(res, params, dataset, idx);
+  }
 }
 
 /** @} */  // end group nn-descent
diff --git a/cpp/src/neighbors/nn_descent_float.cu b/cpp/src/neighbors/nn_descent_float.cu
index c6d356671..fa85db127 100644
--- a/cpp/src/neighbors/nn_descent_float.cu
+++ b/cpp/src/neighbors/nn_descent_float.cu
@@ -19,21 +19,38 @@
 
 namespace cuvs::neighbors::nn_descent {
 
-#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT)                                       \
-  auto build(raft::resources const& handle,                                       \
-             const cuvs::neighbors::nn_descent::index_params& params,             \
-             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset) \
-    ->cuvs::neighbors::nn_descent::index<IdxT>                                    \
-  {                                                                               \
-    return cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset);  \
-  };                                                                              \
-                                                                                  \
-  auto build(raft::resources const& handle,                                       \
-             const cuvs::neighbors::nn_descent::index_params& params,             \
-             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset)   \
-    ->cuvs::neighbors::nn_descent::index<IdxT>                                    \
-  {                                                                               \
-    return cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset);  \
+#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT)                                                   \
+  auto build(raft::resources const& handle,                                                   \
+             const cuvs::neighbors::nn_descent::index_params& params,                         \
+             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset,             \
+             std::optional<raft::host_matrix_view<uint32_t, int64_t, raft::row_major>> graph) \
+    ->cuvs::neighbors::nn_descent::index<IdxT>                                                \
+  {                                                                                           \
+    if (!graph.has_value()) {                                                                 \
+      return cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset);            \
+    } else {                                                                                  \
+      std::optional<raft::device_matrix_view<float, int64_t, raft::row_major>> distances =    \
+        std::nullopt;                                                                         \
+      cuvs::neighbors::nn_descent::index<IdxT> idx{handle, graph.value(), distances};         \
+      cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset, idx);              \
+      return idx;                                                                             \
+    };                                                                                        \
+  }                                                                                           \
+  auto build(raft::resources const& handle,                                                   \
+             const cuvs::neighbors::nn_descent::index_params& params,                         \
+             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset,               \
+             std::optional<raft::host_matrix_view<uint32_t, int64_t, raft::row_major>> graph) \
+    ->cuvs::neighbors::nn_descent::index<IdxT>                                                \
+  {                                                                                           \
+    if (!graph.has_value()) {                                                                 \
+      return cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset);            \
+    } else {                                                                                  \
+      std::optional<raft::device_matrix_view<float, int64_t, raft::row_major>> distances =    \
+        std::nullopt;                                                                         \
+      cuvs::neighbors::nn_descent::index<IdxT> idx{handle, graph.value(), distances};         \
+      cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset, idx);              \
+      return idx;                                                                             \
+    }                                                                                         \
   };
 
 CUVS_INST_NN_DESCENT_BUILD(float, uint32_t);
diff --git a/cpp/src/neighbors/nn_descent_half.cu b/cpp/src/neighbors/nn_descent_half.cu
index 587993031..2ee45d435 100644
--- a/cpp/src/neighbors/nn_descent_half.cu
+++ b/cpp/src/neighbors/nn_descent_half.cu
@@ -19,21 +19,39 @@
 
 namespace cuvs::neighbors::nn_descent {
 
-#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT)                                       \
-  auto build(raft::resources const& handle,                                       \
-             const cuvs::neighbors::nn_descent::index_params& params,             \
-             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset) \
-    ->cuvs::neighbors::nn_descent::index<IdxT>                                    \
-  {                                                                               \
-    return cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset);  \
-  };                                                                              \
-                                                                                  \
-  auto build(raft::resources const& handle,                                       \
-             const cuvs::neighbors::nn_descent::index_params& params,             \
-             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset)   \
-    ->cuvs::neighbors::nn_descent::index<IdxT>                                    \
-  {                                                                               \
-    return cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset);  \
+#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT)                                                   \
+  auto build(raft::resources const& handle,                                                   \
+             const cuvs::neighbors::nn_descent::index_params& params,                         \
+             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset,             \
+             std::optional<raft::host_matrix_view<uint32_t, int64_t, raft::row_major>> graph) \
+    ->cuvs::neighbors::nn_descent::index<IdxT>                                                \
+  {                                                                                           \
+    if (!graph.has_value()) {                                                                 \
+      return cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset);            \
+    } else {                                                                                  \
+      std::optional<raft::device_matrix_view<float, int64_t, raft::row_major>> distances =    \
+        std::nullopt;                                                                         \
+      cuvs::neighbors::nn_descent::index<IdxT> idx{handle, graph.value(), distances};         \
+      cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset, idx);              \
+      return idx;                                                                             \
+    }                                                                                         \
+  };                                                                                          \
+                                                                                              \
+  auto build(raft::resources const& handle,                                                   \
+             const cuvs::neighbors::nn_descent::index_params& params,                         \
+             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset,               \
+             std::optional<raft::host_matrix_view<uint32_t, int64_t, raft::row_major>> graph) \
+    ->cuvs::neighbors::nn_descent::index<IdxT>                                                \
+  {                                                                                           \
+    if (!graph.has_value()) {                                                                 \
+      return cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset);            \
+    } else {                                                                                  \
+      std::optional<raft::device_matrix_view<float, int64_t, raft::row_major>> distances =    \
+        std::nullopt;                                                                         \
+      cuvs::neighbors::nn_descent::index<IdxT> idx{handle, graph.value(), distances};         \
+      cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset, idx);              \
+      return idx;                                                                             \
+    }                                                                                         \
   };
 
 CUVS_INST_NN_DESCENT_BUILD(half, uint32_t);
diff --git a/cpp/src/neighbors/nn_descent_int8.cu b/cpp/src/neighbors/nn_descent_int8.cu
index 813a01746..e150f511b 100644
--- a/cpp/src/neighbors/nn_descent_int8.cu
+++ b/cpp/src/neighbors/nn_descent_int8.cu
@@ -19,21 +19,39 @@
 
 namespace cuvs::neighbors::nn_descent {
 
-#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT)                                       \
-  auto build(raft::resources const& handle,                                       \
-             const cuvs::neighbors::nn_descent::index_params& params,             \
-             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset) \
-    ->cuvs::neighbors::nn_descent::index<IdxT>                                    \
-  {                                                                               \
-    return cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset);  \
-  };                                                                              \
-                                                                                  \
-  auto build(raft::resources const& handle,                                       \
-             const cuvs::neighbors::nn_descent::index_params& params,             \
-             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset)   \
-    ->cuvs::neighbors::nn_descent::index<IdxT>                                    \
-  {                                                                               \
-    return cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset);  \
+#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT)                                                   \
+  auto build(raft::resources const& handle,                                                   \
+             const cuvs::neighbors::nn_descent::index_params& params,                         \
+             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset,             \
+             std::optional<raft::host_matrix_view<uint32_t, int64_t, raft::row_major>> graph) \
+    ->cuvs::neighbors::nn_descent::index<IdxT>                                                \
+  {                                                                                           \
+    if (!graph.has_value()) {                                                                 \
+      return cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset);            \
+    } else {                                                                                  \
+      std::optional<raft::device_matrix_view<float, int64_t, raft::row_major>> distances =    \
+        std::nullopt;                                                                         \
+      cuvs::neighbors::nn_descent::index<IdxT> idx{handle, graph.value(), distances};         \
+      cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset, idx);              \
+      return idx;                                                                             \
+    }                                                                                         \
+  };                                                                                          \
+                                                                                              \
+  auto build(raft::resources const& handle,                                                   \
+             const cuvs::neighbors::nn_descent::index_params& params,                         \
+             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset,               \
+             std::optional<raft::host_matrix_view<uint32_t, int64_t, raft::row_major>> graph) \
+    ->cuvs::neighbors::nn_descent::index<IdxT>                                                \
+  {                                                                                           \
+    if (!graph.has_value()) {                                                                 \
+      return cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset);            \
+    } else {                                                                                  \
+      std::optional<raft::device_matrix_view<float, int64_t, raft::row_major>> distances =    \
+        std::nullopt;                                                                         \
+      cuvs::neighbors::nn_descent::index<IdxT> idx{handle, graph.value(), distances};         \
+      cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset, idx);              \
+      return idx;                                                                             \
+    }                                                                                         \
   };
 
 CUVS_INST_NN_DESCENT_BUILD(int8_t, uint32_t);
diff --git a/cpp/src/neighbors/nn_descent_uint8.cu b/cpp/src/neighbors/nn_descent_uint8.cu
index 9d73dd90f..d8657777b 100644
--- a/cpp/src/neighbors/nn_descent_uint8.cu
+++ b/cpp/src/neighbors/nn_descent_uint8.cu
@@ -19,21 +19,39 @@
 
 namespace cuvs::neighbors::nn_descent {
 
-#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT)                                       \
-  auto build(raft::resources const& handle,                                       \
-             const cuvs::neighbors::nn_descent::index_params& params,             \
-             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset) \
-    ->cuvs::neighbors::nn_descent::index<IdxT>                                    \
-  {                                                                               \
-    return cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset);  \
-  };                                                                              \
-                                                                                  \
-  auto build(raft::resources const& handle,                                       \
-             const cuvs::neighbors::nn_descent::index_params& params,             \
-             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset)   \
-    ->cuvs::neighbors::nn_descent::index<IdxT>                                    \
-  {                                                                               \
-    return cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset);  \
+#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT)                                                   \
+  auto build(raft::resources const& handle,                                                   \
+             const cuvs::neighbors::nn_descent::index_params& params,                         \
+             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset,             \
+             std::optional<raft::host_matrix_view<uint32_t, int64_t, raft::row_major>> graph) \
+    ->cuvs::neighbors::nn_descent::index<IdxT>                                                \
+  {                                                                                           \
+    if (!graph.has_value()) {                                                                 \
+      return cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset);            \
+    } else {                                                                                  \
+      std::optional<raft::device_matrix_view<float, int64_t, raft::row_major>> distances =    \
+        std::nullopt;                                                                         \
+      cuvs::neighbors::nn_descent::index<IdxT> idx{handle, graph.value(), distances};         \
+      cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset, idx);              \
+      return idx;                                                                             \
+    }                                                                                         \
+  };                                                                                          \
+                                                                                              \
+  auto build(raft::resources const& handle,                                                   \
+             const cuvs::neighbors::nn_descent::index_params& params,                         \
+             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset,               \
+             std::optional<raft::host_matrix_view<uint32_t, int64_t, raft::row_major>> graph) \
+    ->cuvs::neighbors::nn_descent::index<IdxT>                                                \
+  {                                                                                           \
+    if (!graph.has_value()) {                                                                 \
+      return cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset);            \
+    } else {                                                                                  \
+      std::optional<raft::device_matrix_view<float, int64_t, raft::row_major>> distances =    \
+        std::nullopt;                                                                         \
+      cuvs::neighbors::nn_descent::index<IdxT> idx{handle, graph.value(), distances};         \
+      cuvs::neighbors::nn_descent::build<T, IdxT>(handle, params, dataset, idx);              \
+      return idx;                                                                             \
+    }                                                                                         \
   };
 
 CUVS_INST_NN_DESCENT_BUILD(uint8_t, uint32_t);
diff --git a/cpp/test/neighbors/ann_nn_descent.cuh b/cpp/test/neighbors/ann_nn_descent.cuh
index bce0f9899..7d2575c2b 100644
--- a/cpp/test/neighbors/ann_nn_descent.cuh
+++ b/cpp/test/neighbors/ann_nn_descent.cuh
@@ -18,9 +18,13 @@
 #include "../test_utils.cuh"
 #include "ann_utils.cuh"
 
+#include <cuvs/distance/distance.hpp>
 #include <cuvs/neighbors/nn_descent.hpp>
+
 #include <raft/core/resource/cuda_stream.hpp>
+#include <raft/util/cudart_utils.hpp>
 #include <raft/util/itertools.hpp>
+#include <rmm/device_uvector.hpp>
 
 #include "naive_knn.cuh"
 
@@ -42,6 +46,15 @@ struct AnnNNDescentInputs {
   double min_recall;
 };
 
+struct AnnNNDescentBatchInputs {
+  std::pair<double, size_t> recall_cluster;
+  int n_rows;
+  int dim;
+  int graph_degree;
+  cuvs::distance::DistanceType metric;
+  bool host_dataset;
+};
+
 inline ::std::ostream& operator<<(::std::ostream& os, const AnnNNDescentInputs& p)
 {
   os << "dataset shape=" << p.n_rows << "x" << p.dim << ", graph_degree=" << p.graph_degree
@@ -50,6 +63,14 @@ inline ::std::ostream& operator<<(::std::ostream& os, const AnnNNDescentInputs&
   return os;
 }
 
+inline ::std::ostream& operator<<(::std::ostream& os, const AnnNNDescentBatchInputs& p)
+{
+  os << "dataset shape=" << p.n_rows << "x" << p.dim << ", graph_degree=" << p.graph_degree
+     << ", metric=" << static_cast<int>(p.metric) << (p.host_dataset ? ", host" : ", device")
+     << ", clusters=" << p.recall_cluster.second << std::endl;
+  return os;
+}
+
 template <typename DistanceT, typename DataT, typename IdxT>
 class AnnNNDescentTest : public ::testing::TestWithParam<AnnNNDescentInputs> {
  public:
@@ -65,7 +86,9 @@ class AnnNNDescentTest : public ::testing::TestWithParam<AnnNNDescentInputs> {
   {
     size_t queries_size = ps.n_rows * ps.graph_degree;
     std::vector<IdxT> indices_NNDescent(queries_size);
+    std::vector<DistanceT> distances_NNDescent(queries_size);
     std::vector<IdxT> indices_naive(queries_size);
+    std::vector<DistanceT> distances_naive(queries_size);
 
     {
       rmm::device_uvector<DistanceT> distances_naive_dev(queries_size, stream_);
@@ -81,16 +104,18 @@ class AnnNNDescentTest : public ::testing::TestWithParam<AnnNNDescentInputs> {
                                         ps.graph_degree,
                                         ps.metric);
       raft::update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_);
+      raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_);
       raft::resource::sync_stream(handle_);
     }
 
     {
       {
-        cuvs::neighbors::nn_descent::index_params index_params;
+        nn_descent::index_params index_params;
         index_params.metric                    = ps.metric;
         index_params.graph_degree              = ps.graph_degree;
         index_params.intermediate_graph_degree = 2 * ps.graph_degree;
         index_params.max_iterations            = 100;
+        index_params.return_distances          = true;
 
         auto database_view = raft::make_device_matrix_view<const DataT, int64_t>(
           (const DataT*)database.data(), ps.n_rows, ps.dim);
@@ -101,22 +126,40 @@ class AnnNNDescentTest : public ::testing::TestWithParam<AnnNNDescentInputs> {
             raft::copy(database_host.data_handle(), database.data(), database.size(), stream_);
             auto database_host_view = raft::make_host_matrix_view<const DataT, int64_t>(
               (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim);
-            auto index =
-              cuvs::neighbors::nn_descent::build(handle_, index_params, database_host_view);
-            raft::update_host(
+            auto index = nn_descent::build(handle_, index_params, database_host_view);
+            raft::copy(
               indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_);
+            if (index.distances().has_value()) {
+              raft::copy(distances_NNDescent.data(),
+                         index.distances().value().data_handle(),
+                         queries_size,
+                         stream_);
+            }
+
           } else {
-            auto index = cuvs::neighbors::nn_descent::build(handle_, index_params, database_view);
-            raft::update_host(
+            auto index = nn_descent::build(handle_, index_params, database_view);
+            raft::copy(
               indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_);
+            if (index.distances().has_value()) {
+              raft::copy(distances_NNDescent.data(),
+                         index.distances().value().data_handle(),
+                         queries_size,
+                         stream_);
+            }
           };
         }
         raft::resource::sync_stream(handle_);
       }
 
       double min_recall = ps.min_recall;
-      EXPECT_TRUE(eval_recall(
-        indices_naive, indices_NNDescent, ps.n_rows, ps.graph_degree, 0.001, min_recall));
+      EXPECT_TRUE(eval_neighbours(indices_naive,
+                                  indices_NNDescent,
+                                  distances_naive,
+                                  distances_NNDescent,
+                                  ps.n_rows,
+                                  ps.graph_degree,
+                                  0.001,
+                                  min_recall));
     }
   }
 
@@ -146,6 +189,125 @@ class AnnNNDescentTest : public ::testing::TestWithParam<AnnNNDescentInputs> {
   rmm::device_uvector<DataT> database;
 };
 
+template <typename DistanceT, typename DataT, typename IdxT>
+class AnnNNDescentBatchTest : public ::testing::TestWithParam<AnnNNDescentBatchInputs> {
+ public:
+  AnnNNDescentBatchTest()
+    : stream_(raft::resource::get_cuda_stream(handle_)),
+      ps(::testing::TestWithParam<AnnNNDescentBatchInputs>::GetParam()),
+      database(0, stream_)
+  {
+  }
+
+  void testNNDescentBatch()
+  {
+    size_t queries_size = ps.n_rows * ps.graph_degree;
+    std::vector<IdxT> indices_NNDescent(queries_size);
+    std::vector<DistanceT> distances_NNDescent(queries_size);
+    std::vector<IdxT> indices_naive(queries_size);
+    std::vector<DistanceT> distances_naive(queries_size);
+
+    {
+      rmm::device_uvector<DistanceT> distances_naive_dev(queries_size, stream_);
+      rmm::device_uvector<IdxT> indices_naive_dev(queries_size, stream_);
+      naive_knn<DistanceT, DataT, IdxT>(handle_,
+                                        distances_naive_dev.data(),
+                                        indices_naive_dev.data(),
+                                        database.data(),
+                                        database.data(),
+                                        ps.n_rows,
+                                        ps.n_rows,
+                                        ps.dim,
+                                        ps.graph_degree,
+                                        ps.metric);
+      raft::update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_);
+      raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_);
+      raft::resource::sync_stream(handle_);
+    }
+
+    {
+      {
+        nn_descent::index_params index_params;
+        index_params.metric                    = ps.metric;
+        index_params.graph_degree              = ps.graph_degree;
+        index_params.intermediate_graph_degree = 2 * ps.graph_degree;
+        index_params.max_iterations            = 10;
+        index_params.return_distances          = true;
+        index_params.n_clusters                = ps.recall_cluster.second;
+
+        auto database_view = raft::make_device_matrix_view<const DataT, int64_t>(
+          (const DataT*)database.data(), ps.n_rows, ps.dim);
+
+        {
+          if (ps.host_dataset) {
+            auto database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
+            raft::copy(database_host.data_handle(), database.data(), database.size(), stream_);
+            auto database_host_view = raft::make_host_matrix_view<const DataT, int64_t>(
+              (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim);
+            auto index = nn_descent::build(handle_, index_params, database_host_view);
+            raft::copy(
+              indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_);
+            if (index.distances().has_value()) {
+              raft::copy(distances_NNDescent.data(),
+                         index.distances().value().data_handle(),
+                         queries_size,
+                         stream_);
+            }
+
+          } else {
+            auto index = nn_descent::build(handle_, index_params, database_view);
+            raft::copy(
+              indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_);
+            if (index.distances().has_value()) {
+              raft::copy(distances_NNDescent.data(),
+                         index.distances().value().data_handle(),
+                         queries_size,
+                         stream_);
+            }
+          };
+        }
+        raft::resource::sync_stream(handle_);
+      }
+      double min_recall = ps.recall_cluster.first;
+      EXPECT_TRUE(eval_neighbours(indices_naive,
+                                  indices_NNDescent,
+                                  distances_naive,
+                                  distances_NNDescent,
+                                  ps.n_rows,
+                                  ps.graph_degree,
+                                  0.01,
+                                  min_recall,
+                                  true,
+                                  static_cast<size_t>(ps.graph_degree * 0.1)));
+    }
+  }
+
+  void SetUp() override
+  {
+    database.resize(((size_t)ps.n_rows) * ps.dim, stream_);
+    raft::random::RngState r(1234ULL);
+    if constexpr (std::is_same<DataT, float>{}) {
+      raft::random::normal(handle_, r, database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0));
+    } else {
+      raft::random::uniformInt(
+        handle_, r, database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20));
+    }
+    raft::resource::sync_stream(handle_);
+  }
+
+  void TearDown() override
+  {
+    raft::resource::sync_stream(handle_);
+    database.resize(0, stream_);
+  }
+
+ private:
+  raft::resources handle_;
+  rmm::cuda_stream_view stream_;
+  AnnNNDescentBatchInputs ps;
+  rmm::device_uvector<DataT> database;
+};
+
 const std::vector<AnnNNDescentInputs> inputs = raft::util::itertools::product<AnnNNDescentInputs>(
   {1000, 2000},                                              // n_rows
   {3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024},  // dim
@@ -154,4 +316,15 @@ const std::vector<AnnNNDescentInputs> inputs = raft::util::itertools::product<An
   {false, true},
   {0.90});
 
-}  // namespace  cuvs::neighbors::nn_descent
+// TODO : Investigate why this test is failing Reference issue https
+// :  // github.com/rapidsai/raft/issues/2450
+const std::vector<AnnNNDescentBatchInputs> inputsBatch =
+  raft::util::itertools::product<AnnNNDescentBatchInputs>(
+    {std::make_pair(0.9, 3lu), std::make_pair(0.9, 2lu)},  // min_recall, n_clusters
+    {4000, 5000},                                          // n_rows
+    {192, 512},                                            // dim
+    {32, 64},                                              // graph_degree
+    {cuvs::distance::DistanceType::L2Expanded},
+    {false, true});
+
+}  // namespace cuvs::neighbors::nn_descent
diff --git a/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu b/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu
index 64c0e0291..7a24f96a1 100644
--- a/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu
+++ b/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu
@@ -23,6 +23,12 @@ namespace cuvs::neighbors::nn_descent {
 typedef AnnNNDescentTest<float, float, std::uint32_t> AnnNNDescentTestF_U32;
 TEST_P(AnnNNDescentTestF_U32, AnnNNDescent) { this->testNNDescent(); }
 
+// typedef AnnNNDescentBatchTest<float, float, std::uint32_t> AnnNNDescentBatchTestF_U32;
+// TEST_P(AnnNNDescentBatchTestF_U32, AnnNNDescentBatch) { this->testNNDescentBatch(); }
+
 INSTANTIATE_TEST_CASE_P(AnnNNDescentTest, AnnNNDescentTestF_U32, ::testing::ValuesIn(inputs));
+// INSTANTIATE_TEST_CASE_P(AnnNNDescentBatchTest,
+//                         AnnNNDescentBatchTestF_U32,
+//                         ::testing::ValuesIn(inputsBatch));
 
 }  // namespace   cuvs::neighbors::nn_descent
diff --git a/cpp/test/neighbors/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh
index b08e1d725..94bccade2 100644
--- a/cpp/test/neighbors/ann_utils.cuh
+++ b/cpp/test/neighbors/ann_utils.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cstddef>
 #include <cuvs/distance/distance.hpp>
 #include <raft/core/device_mdarray.hpp>  // raft::make_device_matrix
 #include <raft/core/resource/cuda_stream.hpp>
@@ -165,9 +166,14 @@ auto calc_recall(const std::vector<T>& expected_idx,
 /** check uniqueness of indices
  */
 template <typename T>
-auto check_unique_indices(const std::vector<T>& actual_idx, size_t rows, size_t cols)
+auto check_unique_indices(const std::vector<T>& actual_idx,
+                          size_t rows,
+                          size_t cols,
+                          size_t max_duplicates = 0)
 {
   size_t max_count;
+  size_t dup_count = 0lu;
+
   std::set<T> unique_indices;
   for (size_t i = 0; i < rows; ++i) {
     unique_indices.clear();
@@ -180,8 +186,11 @@ auto check_unique_indices(const std::vector<T>& actual_idx, size_t rows, size_t
       } else if (unique_indices.find(act_idx) == unique_indices.end()) {
         unique_indices.insert(act_idx);
       } else {
-        return testing::AssertionFailure()
-               << "Duplicated index " << act_idx << " at k " << k << " for query " << i << "! ";
+        dup_count++;
+        if (dup_count > max_duplicates) {
+          return testing::AssertionFailure()
+                 << "Duplicated index " << act_idx << " at k " << k << " for query " << i << "! ";
+        }
       }
     }
   }
@@ -264,7 +273,8 @@ auto eval_neighbours(const std::vector<T>& expected_idx,
                      size_t cols,
                      double eps,
                      double min_recall,
-                     bool test_unique = true) -> testing::AssertionResult
+                     bool test_unique      = true,
+                     size_t max_duplicates = 0) -> testing::AssertionResult
 {
   auto [actual_recall, match_count, total_count] =
     calc_recall(expected_idx, actual_idx, expected_dist, actual_dist, rows, cols, eps);
@@ -284,7 +294,7 @@ auto eval_neighbours(const std::vector<T>& expected_idx,
            << min_recall << "); eps = " << eps << ". ";
   }
   if (test_unique)
-    return check_unique_indices(actual_idx, rows, cols);
+    return check_unique_indices(actual_idx, rows, cols, max_duplicates);
   else
     return testing::AssertionSuccess();
 }
diff --git a/python/cuvs/cuvs/test/test_hnsw.py b/python/cuvs/cuvs/test/test_hnsw.py
index 0ae97266b..8bd2e8b76 100644
--- a/python/cuvs/cuvs/test/test_hnsw.py
+++ b/python/cuvs/cuvs/test/test_hnsw.py
@@ -23,7 +23,7 @@
 
 
 def run_hnsw_build_search_test(
-    n_rows=1000,
+    n_rows=10000,
     n_cols=10,
     n_queries=100,
     k=10,

From fdb118002a482e878ec48fcaa7f11a15efd59140 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 13 Nov 2024 21:32:29 -0600
Subject: [PATCH 25/47] enforce wheel size limits, README formatting in CI
 (#464)

Contributes to https://github.com/rapidsai/build-planning/issues/110

Proposes adding 2 types of validation on wheels in CI, to ensure we continue to produce wheels that are suitable for PyPI.

* checks on wheel size (compressed),
  - *to be sure they're under PyPI limits*
  - *and to prompt discussion on PRs that significantly increase wheel sizes*
* checks on README formatting
  - *to ensure they'll render properly as the PyPI project homepages*
  - *e.g. like how https://github.com/scikit-learn/scikit-learn/blob/main/README.rst becomes https://pypi.org/project/scikit-learn/*

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cuvs/pull/464
---
 ci/build_wheel_cuvs.sh     |  5 ++++-
 ci/validate_wheel.sh       | 21 +++++++++++++++++++++
 python/cuvs/pyproject.toml |  8 ++++++++
 3 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100755 ci/validate_wheel.sh

diff --git a/ci/build_wheel_cuvs.sh b/ci/build_wheel_cuvs.sh
index e03da9f19..444657cc0 100755
--- a/ci/build_wheel_cuvs.sh
+++ b/ci/build_wheel_cuvs.sh
@@ -3,6 +3,8 @@
 
 set -euo pipefail
 
+package_dir="python/cuvs"
+
 case "${RAPIDS_CUDA_VERSION}" in
   12.*)
     EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=ON"
@@ -15,4 +17,5 @@ esac
 # Set up skbuild options. Enable sccache in skbuild config options
 export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUVS_CPP=OFF${EXTRA_CMAKE_ARGS}"
 
-ci/build_wheel.sh cuvs python/cuvs
+ci/build_wheel.sh cuvs ${package_dir}
+ci/validate_wheel.sh ${package_dir} final_dist
diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh
new file mode 100755
index 000000000..5910a5c59
--- /dev/null
+++ b/ci/validate_wheel.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+package_dir=$1
+wheel_dir_relative_path=$2
+
+cd "${package_dir}"
+
+rapids-logger "validate packages with 'pydistcheck'"
+
+pydistcheck \
+    --inspect \
+    "$(echo ${wheel_dir_relative_path}/*.whl)"
+
+rapids-logger "validate packages with 'twine'"
+
+twine check \
+    --strict \
+    "$(echo ${wheel_dir_relative_path}/*.whl)"
diff --git a/python/cuvs/pyproject.toml b/python/cuvs/pyproject.toml
index 30d784c67..d40026776 100644
--- a/python/cuvs/pyproject.toml
+++ b/python/cuvs/pyproject.toml
@@ -133,6 +133,14 @@ build-backend = "scikit_build_core.build"
 dependencies-file = "../../dependencies.yaml"
 matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
 
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# detect when package size grows significantly
+max_allowed_size_compressed = '1.4G'
+
 [tool.pytest.ini_options]
 filterwarnings = [
     "error",

From bb9c669500cf0401114f4a5810d0f3a0ea1db6b3 Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Thu, 14 Nov 2024 21:25:58 +0100
Subject: [PATCH 26/47] Fix include errors, header, and unsafe locks in
 iface.hpp (#467)

Fix a few issues with the internal header `neighbors/iface/iface.hpp`  leading to compile time errors and dangerous runtime behavior:

  - Add missing includes
  - Use `std::lock_guard` to avoid a deadlock on exception
  - Add NVIDIA header
  - Avoid an extra stream sync during search.

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Victor Lafargue (https://github.com/viclafargue)
  - Corey J. Nolet (https://github.com/cjnolet)
  - Ben Frederickson (https://github.com/benfred)

URL: https://github.com/rapidsai/cuvs/pull/467
---
 cpp/src/neighbors/cagra_c.cpp     |  2 ++
 cpp/src/neighbors/iface/iface.hpp | 53 +++++++++++++++++--------------
 cpp/src/neighbors/ivf_flat_c.cpp  |  2 ++
 cpp/src/neighbors/mg/mg.cuh       |  2 ++
 examples/cpp/src/common.cuh       |  4 +++
 5 files changed, 39 insertions(+), 24 deletions(-)
 mode change 100755 => 100644 cpp/src/neighbors/ivf_flat_c.cpp

diff --git a/cpp/src/neighbors/cagra_c.cpp b/cpp/src/neighbors/cagra_c.cpp
index 6985ff094..326a89665 100644
--- a/cpp/src/neighbors/cagra_c.cpp
+++ b/cpp/src/neighbors/cagra_c.cpp
@@ -29,6 +29,8 @@
 #include <cuvs/neighbors/cagra.h>
 #include <cuvs/neighbors/cagra.hpp>
 
+#include <fstream>
+
 namespace {
 
 template <typename T>
diff --git a/cpp/src/neighbors/iface/iface.hpp b/cpp/src/neighbors/iface/iface.hpp
index a329db429..9b3da75a4 100644
--- a/cpp/src/neighbors/iface/iface.hpp
+++ b/cpp/src/neighbors/iface/iface.hpp
@@ -1,4 +1,20 @@
-#include <mutex>
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
 
 #include <cuvs/neighbors/cagra.hpp>
 #include <cuvs/neighbors/common.hpp>
@@ -6,6 +22,9 @@
 #include <cuvs/neighbors/ivf_pq.hpp>
 #include <raft/core/device_resources.hpp>
 
+#include <fstream>
+#include <mutex>
+
 namespace cuvs::neighbors {
 
 using namespace raft;
@@ -16,7 +35,7 @@ void build(const raft::device_resources& handle,
            const cuvs::neighbors::index_params* index_params,
            raft::mdspan<const T, matrix_extent<int64_t>, row_major, Accessor> index_dataset)
 {
-  interface.mutex_->lock();
+  std::lock_guard(*interface.mutex_);
 
   if constexpr (std::is_same<AnnIndexType, ivf_flat::index<T, IdxT>>::value) {
     auto idx = cuvs::neighbors::ivf_flat::build(
@@ -32,8 +51,6 @@ void build(const raft::device_resources& handle,
     interface.index_.emplace(std::move(idx));
   }
   resource::sync_stream(handle);
-
-  interface.mutex_->unlock();
 }
 
 template <typename AnnIndexType, typename T, typename IdxT, typename Accessor1, typename Accessor2>
@@ -44,7 +61,7 @@ void extend(
   std::optional<raft::mdspan<const IdxT, vector_extent<int64_t>, layout_c_contiguous, Accessor2>>
     new_indices)
 {
-  interface.mutex_->lock();
+  std::lock_guard(*interface.mutex_);
 
   if constexpr (std::is_same<AnnIndexType, ivf_flat::index<T, IdxT>>::value) {
     auto idx =
@@ -58,8 +75,6 @@ void extend(
     RAFT_FAIL("CAGRA does not implement the extend method");
   }
   resource::sync_stream(handle);
-
-  interface.mutex_->unlock();
 }
 
 template <typename AnnIndexType, typename T, typename IdxT>
@@ -70,7 +85,7 @@ void search(const raft::device_resources& handle,
             raft::device_matrix_view<IdxT, int64_t, row_major> neighbors,
             raft::device_matrix_view<float, int64_t, row_major> distances)
 {
-  // interface.mutex_->lock();
+  // std::lock_guard(*interface.mutex_);
   if constexpr (std::is_same<AnnIndexType, ivf_flat::index<T, int64_t>>::value) {
     cuvs::neighbors::ivf_flat::search(
       handle,
@@ -94,9 +109,7 @@ void search(const raft::device_resources& handle,
                                    neighbors,
                                    distances);
   }
-  resource::sync_stream(handle);
-
-  // interface.mutex_->unlock();
+  // resource::sync_stream(handle);
 }
 
 // for MG ANN only
@@ -108,7 +121,7 @@ void search(const raft::device_resources& handle,
             raft::device_matrix_view<IdxT, int64_t, row_major> d_neighbors,
             raft::device_matrix_view<float, int64_t, row_major> d_distances)
 {
-  // interface.mutex_->lock();
+  // std::lock_guard(*interface.mutex_);
 
   int64_t n_rows = h_queries.extent(0);
   int64_t n_dims = h_queries.extent(1);
@@ -120,8 +133,6 @@ void search(const raft::device_resources& handle,
   auto d_query_view = raft::make_const_mdspan(d_queries.view());
 
   search(handle, interface, search_params, d_query_view, d_neighbors, d_distances);
-
-  // interface.mutex_->unlock();
 }
 
 template <typename AnnIndexType, typename T, typename IdxT>
@@ -129,7 +140,7 @@ void serialize(const raft::device_resources& handle,
                const cuvs::neighbors::iface<AnnIndexType, T, IdxT>& interface,
                std::ostream& os)
 {
-  interface.mutex_->lock();
+  std::lock_guard(*interface.mutex_);
 
   if constexpr (std::is_same<AnnIndexType, ivf_flat::index<T, IdxT>>::value) {
     ivf_flat::serialize(handle, os, interface.index_.value());
@@ -138,8 +149,6 @@ void serialize(const raft::device_resources& handle,
   } else if constexpr (std::is_same<AnnIndexType, cagra::index<T, IdxT>>::value) {
     cagra::serialize(handle, os, interface.index_.value(), true);
   }
-
-  interface.mutex_->unlock();
 }
 
 template <typename AnnIndexType, typename T, typename IdxT>
@@ -147,7 +156,7 @@ void deserialize(const raft::device_resources& handle,
                  cuvs::neighbors::iface<AnnIndexType, T, IdxT>& interface,
                  std::istream& is)
 {
-  interface.mutex_->lock();
+  std::lock_guard(*interface.mutex_);
 
   if constexpr (std::is_same<AnnIndexType, ivf_flat::index<T, IdxT>>::value) {
     ivf_flat::index<T, IdxT> idx(handle);
@@ -162,8 +171,6 @@ void deserialize(const raft::device_resources& handle,
     cagra::deserialize(handle, is, &idx);
     interface.index_.emplace(std::move(idx));
   }
-
-  interface.mutex_->unlock();
 }
 
 template <typename AnnIndexType, typename T, typename IdxT>
@@ -171,7 +178,7 @@ void deserialize(const raft::device_resources& handle,
                  cuvs::neighbors::iface<AnnIndexType, T, IdxT>& interface,
                  const std::string& filename)
 {
-  interface.mutex_->lock();
+  std::lock_guard(*interface.mutex_);
 
   std::ifstream is(filename, std::ios::in | std::ios::binary);
   if (!is) { RAFT_FAIL("Cannot open file %s", filename.c_str()); }
@@ -191,8 +198,6 @@ void deserialize(const raft::device_resources& handle,
   }
 
   is.close();
-
-  interface.mutex_->unlock();
 }
 
-};  // namespace cuvs::neighbors
\ No newline at end of file
+};  // namespace cuvs::neighbors
diff --git a/cpp/src/neighbors/ivf_flat_c.cpp b/cpp/src/neighbors/ivf_flat_c.cpp
old mode 100755
new mode 100644
index c14c1edc0..2acc6b678
--- a/cpp/src/neighbors/ivf_flat_c.cpp
+++ b/cpp/src/neighbors/ivf_flat_c.cpp
@@ -29,6 +29,8 @@
 #include <cuvs/neighbors/ivf_flat.h>
 #include <cuvs/neighbors/ivf_flat.hpp>
 
+#include <fstream>
+
 namespace {
 
 template <typename T, typename IdxT>
diff --git a/cpp/src/neighbors/mg/mg.cuh b/cpp/src/neighbors/mg/mg.cuh
index d3f635bc4..e9cdc30f6 100644
--- a/cpp/src/neighbors/mg/mg.cuh
+++ b/cpp/src/neighbors/mg/mg.cuh
@@ -25,6 +25,8 @@
 #include <cuvs/neighbors/common.hpp>
 #include <cuvs/neighbors/mg.hpp>
 
+#include <fstream>
+
 namespace cuvs::neighbors {
 using namespace raft;
 
diff --git a/examples/cpp/src/common.cuh b/examples/cpp/src/common.cuh
index 1c93dec0e..8e109a764 100644
--- a/examples/cpp/src/common.cuh
+++ b/examples/cpp/src/common.cuh
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#pragma once
+
 #include <cstdint>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_resources.hpp>
@@ -28,6 +30,8 @@
 #include <thrust/device_ptr.h>
 #include <thrust/iterator/counting_iterator.h>
 
+#include <fstream>
+
 // Fill dataset and queries with synthetic data.
 void generate_dataset(raft::device_resources const &dev_resources,
                       raft::device_matrix_view<float, int64_t> dataset,

From 7ab2bfdd250613137a5622471212dab528319306 Mon Sep 17 00:00:00 2001
From: Divye Gala <divyegala@gmail.com>
Date: Fri, 15 Nov 2024 12:16:17 -0500
Subject: [PATCH 27/47] Add `InnerProduct` and `CosineExpanded` metric support
 in NN Descent (#177)

Closes #171

Authors:
  - Divye Gala (https://github.com/divyegala)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/177
---
 cpp/CMakeLists.txt                            |  1 +
 cpp/include/cuvs/neighbors/nn_descent.hpp     | 24 ++---
 .../neighbors/detail/cagra/cagra_build.cuh    | 12 ++-
 cpp/src/neighbors/detail/nn_descent.cuh       | 87 +++++++++++++------
 cpp/src/neighbors/nn_descent_index.cpp        | 29 +++++++
 cpp/test/neighbors/ann_cagra.cuh              | 10 +--
 cpp/test/neighbors/ann_nn_descent.cuh         | 32 ++++---
 python/cuvs/cuvs/test/test_cagra.py           |  4 +-
 python/cuvs/cuvs/test/test_hnsw.py            |  4 +-
 9 files changed, 139 insertions(+), 64 deletions(-)
 create mode 100644 cpp/src/neighbors/nn_descent_index.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index c493af488..81b82aa7b 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -436,6 +436,7 @@ if(BUILD_SHARED_LIBS)
     src/neighbors/nn_descent.cu
     src/neighbors/nn_descent_float.cu
     src/neighbors/nn_descent_half.cu
+    src/neighbors/nn_descent_index.cpp
     src/neighbors/nn_descent_int8.cu
     src/neighbors/nn_descent_uint8.cu
     src/neighbors/reachability.cu
diff --git a/cpp/include/cuvs/neighbors/nn_descent.hpp b/cpp/include/cuvs/neighbors/nn_descent.hpp
index bd41d1ff7..9cd8192b5 100644
--- a/cpp/include/cuvs/neighbors/nn_descent.hpp
+++ b/cpp/include/cuvs/neighbors/nn_descent.hpp
@@ -61,11 +61,10 @@ struct index_params : cuvs::neighbors::index_params {
   /** @brief Construct NN descent parameters for a specific kNN graph degree
    *
    * @param graph_degree output graph degree
+   * @param metric distance metric to use
    */
-  index_params(size_t graph_degree = 64)
-    : graph_degree(graph_degree), intermediate_graph_degree(1.5 * graph_degree)
-  {
-  }
+  index_params(size_t graph_degree                 = 64,
+               cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded);
 };
 
 /**
@@ -103,11 +102,16 @@ struct index : cuvs::neighbors::index {
    * @param n_rows number of rows in knn-graph
    * @param n_cols number of cols in knn-graph
    * @param return_distances whether to return distances
+   * @param metric distance metric to use
    */
-  index(raft::resources const& res, int64_t n_rows, int64_t n_cols, bool return_distances = false)
+  index(raft::resources const& res,
+        int64_t n_rows,
+        int64_t n_cols,
+        bool return_distances               = false,
+        cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded)
     : cuvs::neighbors::index(),
       res_{res},
-      metric_{cuvs::distance::DistanceType::L2Expanded},
+      metric_{metric},
       graph_{raft::make_host_matrix<IdxT, int64_t, raft::row_major>(n_rows, n_cols)},
       graph_view_{graph_.view()},
       return_distances_{return_distances}
@@ -129,14 +133,16 @@ struct index : cuvs::neighbors::index {
    * @param graph_view raft::host_matrix_view<IdxT, int64_t, raft::row_major> for storing knn-graph
    * @param distances_view optional raft::device_matrix_view<float, int64_t, row_major> for storing
    * distances
+   * @param metric distance metric to use
    */
   index(raft::resources const& res,
         raft::host_matrix_view<IdxT, int64_t, raft::row_major> graph_view,
         std::optional<raft::device_matrix_view<float, int64_t, row_major>> distances_view =
-          std::nullopt)
+          std::nullopt,
+        cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded)
     : cuvs::neighbors::index(),
       res_{res},
-      metric_{cuvs::distance::DistanceType::L2Expanded},
+      metric_{metric},
       graph_{raft::make_host_matrix<IdxT, int64_t, raft::row_major>(0, 0)},
       graph_view_{graph_view},
       distances_view_{distances_view},
@@ -473,8 +479,6 @@ auto build(raft::resources const& res,
            std::optional<raft::host_matrix_view<uint32_t, int64_t, raft::row_major>> graph =
              std::nullopt) -> cuvs::neighbors::nn_descent::index<uint32_t>;
 
-/** @} */
-
 /**
  * @brief Test if we have enough GPU memory to run NN descent algorithm.
  *
diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 6209ff819..b7fec724b 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -436,11 +436,11 @@ index<T, IdxT> build(
   auto knn_build_params = params.graph_build_params;
   if (std::holds_alternative<std::monostate>(params.graph_build_params)) {
     // Heuristic to decide default build algo and its params.
-    if (params.metric == cuvs::distance::DistanceType::L2Expanded &&
-        cuvs::neighbors::nn_descent::has_enough_device_memory(
+    if (cuvs::neighbors::nn_descent::has_enough_device_memory(
           res, dataset.extents(), sizeof(IdxT))) {
       RAFT_LOG_DEBUG("NN descent solver");
-      knn_build_params = cagra::graph_build_params::nn_descent_params(intermediate_degree);
+      knn_build_params =
+        cagra::graph_build_params::nn_descent_params(intermediate_degree, params.metric);
     } else {
       RAFT_LOG_DEBUG("Selecting IVF-PQ solver");
       knn_build_params = cagra::graph_build_params::ivf_pq_params(dataset.extents(), params.metric);
@@ -453,9 +453,6 @@ index<T, IdxT> build(
       std::get<cuvs::neighbors::cagra::graph_build_params::ivf_pq_params>(knn_build_params);
     build_knn_graph(res, dataset, knn_graph->view(), ivf_pq_params);
   } else {
-    RAFT_EXPECTS(
-      params.metric == cuvs::distance::DistanceType::L2Expanded,
-      "L2Expanded is the only distance metrics supported for CAGRA build with nn_descent");
     auto nn_descent_params =
       std::get<cagra::graph_build_params::nn_descent_params>(knn_build_params);
 
@@ -466,7 +463,8 @@ index<T, IdxT> build(
         "nn-descent graph_degree.",
         nn_descent_params.graph_degree,
         intermediate_degree);
-      nn_descent_params = cagra::graph_build_params::nn_descent_params(intermediate_degree);
+      nn_descent_params =
+        cagra::graph_build_params::nn_descent_params(intermediate_degree, params.metric);
     }
 
     // Use nn-descent to build CAGRA knn graph
diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh
index 883d82d76..c62a52540 100644
--- a/cpp/src/neighbors/detail/nn_descent.cuh
+++ b/cpp/src/neighbors/detail/nn_descent.cuh
@@ -19,6 +19,7 @@
 #include "ann_utils.cuh"
 #include "cagra/device_common.hpp"
 
+#include <cuvs/distance/distance.hpp>
 #include <cuvs/neighbors/nn_descent.hpp>
 
 #include <raft/core/device_mdarray.hpp>
@@ -216,6 +217,7 @@ struct BuildConfig {
   size_t max_iterations{50};
   float termination_threshold{0.0001};
   size_t output_graph_degree{32};
+  cuvs::distance::DistanceType metric{cuvs::distance::DistanceType::L2Expanded};
 };
 
 template <typename Index_t>
@@ -454,11 +456,13 @@ __device__ __forceinline__ void load_vec(Data_t* vec_buffer,
 // TODO: Replace with RAFT utilities https://github.com/rapidsai/raft/issues/1827
 /** Calculate L2 norm, and cast data to __half */
 template <typename Data_t>
-RAFT_KERNEL preprocess_data_kernel(const Data_t* input_data,
-                                   __half* output_data,
-                                   int dim,
-                                   DistData_t* l2_norms,
-                                   size_t list_offset = 0)
+RAFT_KERNEL preprocess_data_kernel(
+  const Data_t* input_data,
+  __half* output_data,
+  int dim,
+  DistData_t* l2_norms,
+  size_t list_offset                  = 0,
+  cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded)
 {
   extern __shared__ char buffer[];
   __shared__ float l2_norm;
@@ -468,26 +472,32 @@ RAFT_KERNEL preprocess_data_kernel(const Data_t* input_data,
   load_vec(s_vec, input_data + blockIdx.x * dim, dim, dim, threadIdx.x % raft::warp_size());
   if (threadIdx.x == 0) { l2_norm = 0; }
   __syncthreads();
-  int lane_id = threadIdx.x % raft::warp_size();
-  for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) {
-    int idx         = step * raft::warp_size() + lane_id;
-    float part_dist = 0;
-    if (idx < dim) {
-      part_dist = s_vec[idx];
-      part_dist = part_dist * part_dist;
-    }
-    __syncwarp();
-    for (int offset = raft::warp_size() >> 1; offset >= 1; offset >>= 1) {
-      part_dist += __shfl_down_sync(raft::warp_full_mask(), part_dist, offset);
+
+  if (metric == cuvs::distance::DistanceType::L2Expanded ||
+      metric == cuvs::distance::DistanceType::CosineExpanded) {
+    int lane_id = threadIdx.x % raft::warp_size();
+    for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) {
+      int idx         = step * raft::warp_size() + lane_id;
+      float part_dist = 0;
+      if (idx < dim) {
+        part_dist = s_vec[idx];
+        part_dist = part_dist * part_dist;
+      }
+      __syncwarp();
+      for (int offset = raft::warp_size() >> 1; offset >= 1; offset >>= 1) {
+        part_dist += __shfl_down_sync(raft::warp_full_mask(), part_dist, offset);
+      }
+      if (lane_id == 0) { l2_norm += part_dist; }
+      __syncwarp();
     }
-    if (lane_id == 0) { l2_norm += part_dist; }
-    __syncwarp();
   }
 
   for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) {
     int idx = step * raft::warp_size() + threadIdx.x;
     if (idx < dim) {
-      if (l2_norms == nullptr) {
+      if (metric == cuvs::distance::DistanceType::InnerProduct) {
+        output_data[list_id * dim + idx] = input_data[(size_t)blockIdx.x * dim + idx];
+      } else if (metric == cuvs::distance::DistanceType::CosineExpanded) {
         output_data[list_id * dim + idx] =
           (float)input_data[(size_t)blockIdx.x * dim + idx] / sqrt(l2_norm);
       } else {
@@ -715,7 +725,8 @@ __launch_bounds__(BLOCK_SIZE, 4)
                     DistData_t* dists,
                     int graph_width,
                     int* locks,
-                    DistData_t* l2_norms)
+                    DistData_t* l2_norms,
+                    cuvs::distance::DistanceType metric)
 {
 #if (__CUDA_ARCH__ >= 700)
   using namespace nvcuda;
@@ -827,8 +838,10 @@ __launch_bounds__(BLOCK_SIZE, 4)
   for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) {
     if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_new_size &&
         i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) {
-      if (l2_norms == nullptr) {
+      if (metric == cuvs::distance::DistanceType::InnerProduct) {
         s_distances[i] = -s_distances[i];
+      } else if (metric == cuvs::distance::DistanceType::CosineExpanded) {
+        s_distances[i] = 1.0 - s_distances[i];
       } else {
         s_distances[i] = l2_norms[new_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] +
                          l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] -
@@ -906,8 +919,10 @@ __launch_bounds__(BLOCK_SIZE, 4)
   for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) {
     if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_old_size &&
         i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) {
-      if (l2_norms == nullptr) {
+      if (metric == cuvs::distance::DistanceType::InnerProduct) {
         s_distances[i] = -s_distances[i];
+      } else if (metric == cuvs::distance::DistanceType::CosineExpanded) {
+        s_distances[i] = 1.0 - s_distances[i];
       } else {
         s_distances[i] = l2_norms[old_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] +
                          l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] -
@@ -1161,7 +1176,7 @@ GNND<Data_t, Index_t>::GNND(raft::resources const& res, const BuildConfig& build
     ndim_(build_config.dataset_dim),
     d_data_{raft::make_device_matrix<__half, size_t, raft::row_major>(
       res, nrow_, build_config.dataset_dim)},
-    l2_norms_{raft::make_device_vector<DistData_t, size_t>(res, nrow_)},
+    l2_norms_{raft::make_device_vector<DistData_t, size_t>(res, 0)},
     graph_buffer_{
       raft::make_device_matrix<ID_t, size_t, raft::row_major>(res, nrow_, DEGREE_ON_DEVICE)},
     dists_buffer_{
@@ -1181,11 +1196,16 @@ GNND<Data_t, Index_t>::GNND(raft::resources const& res, const BuildConfig& build
     d_list_sizes_old_{raft::make_device_vector<int2, size_t>(res, nrow_)}
 {
   static_assert(NUM_SAMPLES <= 32);
+
   raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits<float>::max());
   auto graph_buffer_view = raft::make_device_matrix_view<Index_t, int64_t>(
     reinterpret_cast<Index_t*>(graph_buffer_.data_handle()), nrow_, DEGREE_ON_DEVICE);
   raft::matrix::fill(res, graph_buffer_view, std::numeric_limits<Index_t>::max());
   raft::matrix::fill(res, d_locks_.view(), 0);
+
+  if (build_config.metric == cuvs::distance::DistanceType::L2Expanded) {
+    l2_norms_ = raft::make_device_vector<DistData_t, size_t>(res, nrow_);
+  }
 };
 
 template <typename Data_t, typename Index_t>
@@ -1228,7 +1248,8 @@ void GNND<Data_t, Index_t>::local_join(cudaStream_t stream)
                                                       dists_buffer_.data_handle(),
                                                       DEGREE_ON_DEVICE,
                                                       d_locks_.data_handle(),
-                                                      l2_norms_.data_handle());
+                                                      l2_norms_.data_handle(),
+                                                      build_config_.metric);
 }
 
 template <typename Data_t, typename Index_t>
@@ -1261,7 +1282,8 @@ void GNND<Data_t, Index_t>::build(Data_t* data,
                 d_data_.data_handle(),
                 build_config_.dataset_dim,
                 l2_norms_.data_handle(),
-                batch.offset());
+                batch.offset(),
+                build_config_.metric);
   }
 
   graph_.clear();
@@ -1417,6 +1439,11 @@ void build(raft::resources const& res,
   RAFT_EXPECTS(dataset.extent(0) < std::numeric_limits<int>::max() - 1,
                "The dataset size for GNND should be less than %d",
                std::numeric_limits<int>::max() - 1);
+  auto allowed_metrics = params.metric == cuvs::distance::DistanceType::L2Expanded ||
+                         params.metric == cuvs::distance::DistanceType::CosineExpanded ||
+                         params.metric == cuvs::distance::DistanceType::InnerProduct;
+  RAFT_EXPECTS(allowed_metrics && idx.metric() == params.metric,
+               "The metric for NN Descent should be L2Expanded, CosineExpanded or InnerProduct");
   size_t intermediate_degree = params.intermediate_graph_degree;
   size_t graph_degree        = params.graph_degree;
 
@@ -1452,7 +1479,8 @@ void build(raft::resources const& res,
                            .internal_node_degree  = extended_intermediate_degree,
                            .max_iterations        = params.max_iterations,
                            .termination_threshold = params.termination_threshold,
-                           .output_graph_degree   = params.graph_degree};
+                           .output_graph_degree   = params.graph_degree,
+                           .metric                = params.metric};
 
   GNND<const T, int> nnd(res, build_config);
 
@@ -1500,8 +1528,11 @@ index<IdxT> build(
     graph_degree = intermediate_degree;
   }
 
-  index<IdxT> idx{
-    res, dataset.extent(0), static_cast<int64_t>(graph_degree), params.return_distances};
+  index<IdxT> idx{res,
+                  dataset.extent(0),
+                  static_cast<int64_t>(graph_degree),
+                  params.return_distances,
+                  params.metric};
 
   build(res, params, dataset, idx);
 
diff --git a/cpp/src/neighbors/nn_descent_index.cpp b/cpp/src/neighbors/nn_descent_index.cpp
new file mode 100644
index 000000000..25d5b6af8
--- /dev/null
+++ b/cpp/src/neighbors/nn_descent_index.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstddef>
+#include <cuvs/distance/distance.hpp>
+#include <cuvs/neighbors/nn_descent.hpp>
+
+namespace cuvs::neighbors::nn_descent {
+
+index_params::index_params(size_t graph_degree, cuvs::distance::DistanceType metric)
+{
+  this->graph_degree              = graph_degree;
+  this->intermediate_graph_degree = 1.5 * graph_degree;
+  this->metric                    = metric;
+}
+}  // namespace cuvs::neighbors::nn_descent
\ No newline at end of file
diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh
index 37d42dd1d..660246c67 100644
--- a/cpp/test/neighbors/ann_cagra.cuh
+++ b/cpp/test/neighbors/ann_cagra.cuh
@@ -361,8 +361,8 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {
                                           // not used for knn_graph building.
         switch (ps.build_algo) {
           case graph_build_algo::IVF_PQ:
-            index_params.graph_build_params =
-              graph_build_params::ivf_pq_params(raft::matrix_extent<int64_t>(ps.n_rows, ps.dim));
+            index_params.graph_build_params = graph_build_params::ivf_pq_params(
+              raft::matrix_extent<int64_t>(ps.n_rows, ps.dim), index_params.metric);
             if (ps.ivf_pq_search_refine_ratio) {
               std::get<cuvs::neighbors::cagra::graph_build_params::ivf_pq_params>(
                 index_params.graph_build_params)
@@ -370,8 +370,8 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {
             }
             break;
           case graph_build_algo::NN_DESCENT: {
-            index_params.graph_build_params =
-              graph_build_params::nn_descent_params(index_params.intermediate_graph_degree);
+            index_params.graph_build_params = graph_build_params::nn_descent_params(
+              index_params.intermediate_graph_degree, index_params.metric);
             break;
           }
           case graph_build_algo::AUTO:
@@ -389,7 +389,7 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {
           (const DataT*)database.data(), ps.n_rows, ps.dim);
 
         {
-          cagra::index<DataT, IdxT> index(handle_);
+          cagra::index<DataT, IdxT> index(handle_, index_params.metric);
           if (ps.host_dataset) {
             auto database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
             raft::copy(database_host.data_handle(), database.data(), database.size(), stream_);
diff --git a/cpp/test/neighbors/ann_nn_descent.cuh b/cpp/test/neighbors/ann_nn_descent.cuh
index 7d2575c2b..09861a219 100644
--- a/cpp/test/neighbors/ann_nn_descent.cuh
+++ b/cpp/test/neighbors/ann_nn_descent.cuh
@@ -27,6 +27,7 @@
 #include <rmm/device_uvector.hpp>
 
 #include "naive_knn.cuh"
+#include <cuvs/distance/distance.hpp>
 
 #include <gtest/gtest.h>
 
@@ -107,7 +108,6 @@ class AnnNNDescentTest : public ::testing::TestWithParam<AnnNNDescentInputs> {
       raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_);
       raft::resource::sync_stream(handle_);
     }
-
     {
       {
         nn_descent::index_params index_params;
@@ -124,6 +124,7 @@ class AnnNNDescentTest : public ::testing::TestWithParam<AnnNNDescentInputs> {
           if (ps.host_dataset) {
             auto database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
             raft::copy(database_host.data_handle(), database.data(), database.size(), stream_);
+            raft::resource::sync_stream(handle_);
             auto database_host_view = raft::make_host_matrix_view<const DataT, int64_t>(
               (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim);
             auto index = nn_descent::build(handle_, index_params, database_host_view);
@@ -151,6 +152,13 @@ class AnnNNDescentTest : public ::testing::TestWithParam<AnnNNDescentInputs> {
         raft::resource::sync_stream(handle_);
       }
 
+      if (ps.metric == cuvs::distance::DistanceType::InnerProduct) {
+        std::transform(
+          distances_naive.begin(), distances_naive.end(), distances_naive.begin(), [](auto x) {
+            return -x;
+          });
+      }
+
       double min_recall = ps.min_recall;
       EXPECT_TRUE(eval_neighbours(indices_naive,
                                   indices_NNDescent,
@@ -169,9 +177,11 @@ class AnnNNDescentTest : public ::testing::TestWithParam<AnnNNDescentInputs> {
     raft::random::RngState r(1234ULL);
     if constexpr (std::is_same<DataT, float>{}) {
       raft::random::normal(handle_, r, database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0));
-    } else {
+    } else if constexpr (std::is_same<DataT, int8_t>{}) {
       raft::random::uniformInt(
-        handle_, r, database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20));
+        handle_, r, database.data(), ps.n_rows * ps.dim, DataT(-5), DataT(5));
+    } else {
+      raft::random::uniformInt(handle_, r, database.data(), ps.n_rows * ps.dim, DataT(0), DataT(5));
     }
     raft::resource::sync_stream(handle_);
   }
@@ -308,13 +318,15 @@ class AnnNNDescentBatchTest : public ::testing::TestWithParam<AnnNNDescentBatchI
   rmm::device_uvector<DataT> database;
 };
 
-const std::vector<AnnNNDescentInputs> inputs = raft::util::itertools::product<AnnNNDescentInputs>(
-  {1000, 2000},                                              // n_rows
-  {3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024},  // dim
-  {32, 64},                                                  // graph_degree
-  {cuvs::distance::DistanceType::L2Expanded},
-  {false, true},
-  {0.90});
+const std::vector<AnnNNDescentInputs> inputs =
+  raft::util::itertools::product<AnnNNDescentInputs>({2000, 4000},            // n_rows
+                                                     {4, 16, 64, 256, 1024},  // dim
+                                                     {32, 64},                // graph_degree
+                                                     {cuvs::distance::DistanceType::L2Expanded,
+                                                      cuvs::distance::DistanceType::InnerProduct,
+                                                      cuvs::distance::DistanceType::CosineExpanded},
+                                                     {false, true},
+                                                     {0.90});
 
 // TODO : Investigate why this test is failing Reference issue https
 // :  // github.com/rapidsai/raft/issues/2450
diff --git a/python/cuvs/cuvs/test/test_cagra.py b/python/cuvs/cuvs/test/test_cagra.py
index 92b88f013..56e132c23 100644
--- a/python/cuvs/cuvs/test/test_cagra.py
+++ b/python/cuvs/cuvs/test/test_cagra.py
@@ -122,8 +122,9 @@ def run_cagra_build_search_test(
 @pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8])
 @pytest.mark.parametrize("array_type", ["device", "host"])
 @pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"])
+@pytest.mark.parametrize("metric", ["euclidean"])
 def test_cagra_dataset_dtype_host_device(
-    dtype, array_type, inplace, build_algo
+    dtype, array_type, inplace, build_algo, metric
 ):
     # Note that inner_product tests use normalized input which we cannot
     # represent in int8, therefore we test only sqeuclidean metric here.
@@ -132,6 +133,7 @@ def test_cagra_dataset_dtype_host_device(
         inplace=inplace,
         array_type=array_type,
         build_algo=build_algo,
+        metric=metric,
     )
 
 
diff --git a/python/cuvs/cuvs/test/test_hnsw.py b/python/cuvs/cuvs/test/test_hnsw.py
index 8bd2e8b76..20a35401e 100644
--- a/python/cuvs/cuvs/test/test_hnsw.py
+++ b/python/cuvs/cuvs/test/test_hnsw.py
@@ -41,8 +41,6 @@ def run_hnsw_build_search_test(
             pytest.skip(
                 "inner_product metric is not supported for int8/uint8 data"
             )
-        if build_algo == "nn_descent":
-            pytest.skip("inner_product metric is not supported for nn_descent")
 
     build_params = cagra.IndexParams(
         metric=metric,
@@ -83,7 +81,7 @@ def run_hnsw_build_search_test(
 @pytest.mark.parametrize("k", [10, 20])
 @pytest.mark.parametrize("ef", [30, 40])
 @pytest.mark.parametrize("num_threads", [2, 4])
-@pytest.mark.parametrize("metric", ["sqeuclidean"])
+@pytest.mark.parametrize("metric", ["sqeuclidean", "inner_product"])
 @pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"])
 def test_hnsw(dtype, k, ef, num_threads, metric, build_algo):
     # Note that inner_product tests use normalized input which we cannot

From 7b879116684501f36ca5a19a74c01fcecb52e962 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Fri, 15 Nov 2024 16:12:42 -0600
Subject: [PATCH 28/47] use different wheel-size thresholds based on CUDA
 version (#469)

`cuvs-cu11` wheels are significantly larger than `cuvs-cu12` wheels, because (among other reasons) they are not able to dynamically link to CUDA math library wheels.

In #464, I proposed a size limit for CI checks of "max CUDA 11 wheel size + a buffer".

This PR proposes using different thresholds based on CUDA major version, following these discussions:

* https://github.com/rapidsai/cugraph/pull/4754#discussion_r1842526907
* https://github.com/rapidsai/cuml/pull/6136#discussion_r1841774811

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Mike Sarahan (https://github.com/msarahan)

URL: https://github.com/rapidsai/cuvs/pull/469
---
 ci/validate_wheel.sh       | 14 ++++++++++++++
 python/cuvs/pyproject.toml |  4 +---
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh
index 5910a5c59..f2b235765 100755
--- a/ci/validate_wheel.sh
+++ b/ci/validate_wheel.sh
@@ -6,12 +6,26 @@ set -euo pipefail
 package_dir=$1
 wheel_dir_relative_path=$2
 
+RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}"
+
+# some packages are much larger on CUDA 11 than on CUDA 12
+if [[ "${RAPIDS_CUDA_MAJOR}" == "11" ]]; then
+    PYDISTCHECK_ARGS=(
+        --max-allowed-size-compressed '1.4G'
+    )
+else
+    PYDISTCHECK_ARGS=(
+        --max-allowed-size-compressed '950M'
+    )
+fi
+
 cd "${package_dir}"
 
 rapids-logger "validate packages with 'pydistcheck'"
 
 pydistcheck \
     --inspect \
+    "${PYDISTCHECK_ARGS[@]}" \
     "$(echo ${wheel_dir_relative_path}/*.whl)"
 
 rapids-logger "validate packages with 'twine'"
diff --git a/python/cuvs/pyproject.toml b/python/cuvs/pyproject.toml
index d40026776..92e4993c7 100644
--- a/python/cuvs/pyproject.toml
+++ b/python/cuvs/pyproject.toml
@@ -135,12 +135,10 @@ matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
 
 [tool.pydistcheck]
 select = [
+    # NOTE: size threshold is managed via CLI args in CI scripts
     "distro-too-large-compressed",
 ]
 
-# detect when package size grows significantly
-max_allowed_size_compressed = '1.4G'
-
 [tool.pytest.ini_options]
 filterwarnings = [
     "error",

From 27d45533d91f13ce00eabed409468a2b47452f4d Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Mon, 18 Nov 2024 14:55:13 -0800
Subject: [PATCH 29/47] Move check_input_array from pylibraft (#474)

With the changes in https://github.com/rapidsai/raft/pull/2498 we no longer have a pylibraft.neighbors module - but were still using a utility function `_check_input_array` from it in cuvs. Move this over to cuvs to unblock ci

Authors:
  - Ben Frederickson (https://github.com/benfred)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/474
---
 .../neighbors/brute_force/brute_force.pyx     |  2 +-
 python/cuvs/cuvs/neighbors/cagra/cagra.pyx    |  3 +-
 python/cuvs/cuvs/neighbors/common.py          | 36 +++++++++++++++++++
 .../cuvs/cuvs/neighbors/filters/filters.pyx   |  2 +-
 python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx      |  2 +-
 .../cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx |  2 +-
 python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx  |  2 +-
 python/cuvs/cuvs/neighbors/refine.pyx         |  2 +-
 8 files changed, 44 insertions(+), 7 deletions(-)
 create mode 100644 python/cuvs/cuvs/neighbors/common.py

diff --git a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx
index 559302ccc..9d1d24eae 100644
--- a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx
+++ b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx
@@ -31,9 +31,9 @@ from cuvs.distance_type cimport cuvsDistanceType
 from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray
 from pylibraft.common.cai_wrapper import wrap_array
 from pylibraft.common.interruptible import cuda_interruptible
-from pylibraft.neighbors.common import _check_input_array
 
 from cuvs.distance import DISTANCE_TYPES
+from cuvs.neighbors.common import _check_input_array
 
 from cuvs.common.c_api cimport cuvsResources_t
 
diff --git a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx
index 95209dbeb..752aef741 100644
--- a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx
+++ b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx
@@ -32,7 +32,8 @@ from cuvs.common cimport cydlpack
 from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray
 from pylibraft.common.cai_wrapper import wrap_array
 from pylibraft.common.interruptible import cuda_interruptible
-from pylibraft.neighbors.common import _check_input_array
+
+from cuvs.neighbors.common import _check_input_array
 
 from libc.stdint cimport (
     int8_t,
diff --git a/python/cuvs/cuvs/neighbors/common.py b/python/cuvs/cuvs/neighbors/common.py
new file mode 100644
index 000000000..c14b9f8c9
--- /dev/null
+++ b/python/cuvs/cuvs/neighbors/common.py
@@ -0,0 +1,36 @@
+#
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def _check_input_array(cai, exp_dt, exp_rows=None, exp_cols=None):
+    if cai.dtype not in exp_dt:
+        raise TypeError("dtype %s not supported" % cai.dtype)
+
+    if not cai.c_contiguous:
+        raise ValueError("Row major input is expected")
+
+    if exp_cols is not None and cai.shape[1] != exp_cols:
+        raise ValueError(
+            "Incorrect number of columns, expected {} got {}".format(
+                exp_cols, cai.shape[1]
+            )
+        )
+
+    if exp_rows is not None and cai.shape[0] != exp_rows:
+        raise ValueError(
+            "Incorrect number of rows, expected {} , got {}".format(
+                exp_rows, cai.shape[0]
+            )
+        )
diff --git a/python/cuvs/cuvs/neighbors/filters/filters.pyx b/python/cuvs/cuvs/neighbors/filters/filters.pyx
index 3a81cb786..9bc2a905c 100644
--- a/python/cuvs/cuvs/neighbors/filters/filters.pyx
+++ b/python/cuvs/cuvs/neighbors/filters/filters.pyx
@@ -20,11 +20,11 @@ import numpy as np
 from libc.stdint cimport uintptr_t
 
 from cuvs.common cimport cydlpack
+from cuvs.neighbors.common import _check_input_array
 
 from .filters cimport BITMAP, NO_FILTER, cuvsFilter
 
 from pylibraft.common.cai_wrapper import wrap_array
-from pylibraft.neighbors.common import _check_input_array
 
 
 cdef class Prefilter:
diff --git a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx
index 018fcfef9..bcfaf167e 100644
--- a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx
+++ b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx
@@ -21,6 +21,7 @@ from libcpp.string cimport string
 
 from cuvs.common.exceptions import check_cuvs
 from cuvs.common.resources import auto_sync_resources
+from cuvs.neighbors.common import _check_input_array
 
 from cuvs.common cimport cydlpack
 
@@ -36,7 +37,6 @@ import uuid
 from pylibraft.common import auto_convert_output
 from pylibraft.common.cai_wrapper import wrap_array
 from pylibraft.common.interruptible import cuda_interruptible
-from pylibraft.neighbors.common import _check_input_array
 
 
 cdef class SearchParams:
diff --git a/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx b/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx
index 25b9b2aee..7a169e1a0 100644
--- a/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx
+++ b/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx
@@ -31,9 +31,9 @@ from cuvs.distance_type cimport cuvsDistanceType
 from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray
 from pylibraft.common.cai_wrapper import wrap_array
 from pylibraft.common.interruptible import cuda_interruptible
-from pylibraft.neighbors.common import _check_input_array
 
 from cuvs.distance import DISTANCE_TYPES
+from cuvs.neighbors.common import _check_input_array
 
 from libc.stdint cimport (
     int8_t,
diff --git a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx
index 3add1df75..531302ee6 100644
--- a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx
+++ b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx
@@ -31,9 +31,9 @@ from cuvs.distance_type cimport cuvsDistanceType
 from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray
 from pylibraft.common.cai_wrapper import wrap_array
 from pylibraft.common.interruptible import cuda_interruptible
-from pylibraft.neighbors.common import _check_input_array
 
 from cuvs.distance import DISTANCE_TYPES
+from cuvs.neighbors.common import _check_input_array
 
 from libc.stdint cimport (
     int8_t,
diff --git a/python/cuvs/cuvs/neighbors/refine.pyx b/python/cuvs/cuvs/neighbors/refine.pyx
index 0eccc4108..b7aa35dca 100644
--- a/python/cuvs/cuvs/neighbors/refine.pyx
+++ b/python/cuvs/cuvs/neighbors/refine.pyx
@@ -31,13 +31,13 @@ from cuvs.distance_type cimport cuvsDistanceType
 from pylibraft.common import auto_convert_output, device_ndarray
 from pylibraft.common.cai_wrapper import wrap_array
 from pylibraft.common.interruptible import cuda_interruptible
-from pylibraft.neighbors.common import _check_input_array
 
 from cuvs.distance import DISTANCE_TYPES
 
 from cuvs.common.c_api cimport cuvsResources_t
 
 from cuvs.common.exceptions import check_cuvs
+from cuvs.neighbors.common import _check_input_array
 
 
 @auto_sync_resources

From f127b06b83e3c9e3c3034fdc902441edbf841b90 Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Tue, 19 Nov 2024 14:01:22 +0100
Subject: [PATCH 30/47] Fix an OOB error in device-side cuvs::neighbors::refine
 and CAGRA kern_prune (#460)

IVF-Flat index expects all valid indices during build, which may not be the case in the context of refinement.
At the same time, `cagra::detail::graph::kern_prune` fails with OOB error if some indices are invalid.

This PR tweaks both kernels to avoid touching the input data with an invalid index.

Fixes https://github.com/rapidsai/cuvs/issues/337

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Micka (https://github.com/lowener)

URL: https://github.com/rapidsai/cuvs/pull/460
---
 cpp/src/neighbors/detail/cagra/graph_core.cuh |  1 +
 cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh |  8 +-
 cpp/test/CMakeLists.txt                       |  1 +
 .../ann_cagra/bug_extreme_inputs_oob.cu       | 73 +++++++++++++++++++
 4 files changed, 81 insertions(+), 2 deletions(-)
 create mode 100644 cpp/test/neighbors/ann_cagra/bug_extreme_inputs_oob.cu

diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
index 4253cb781..daeac82b9 100644
--- a/cpp/src/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -156,6 +156,7 @@ __global__ void kern_prune(const IdxT* const knn_graph,  // [graph_chunk_size, g
   // count number of detours (A->D->B)
   for (uint32_t kAD = 0; kAD < graph_degree - 1; kAD++) {
     const uint64_t iD = knn_graph[kAD + (graph_degree * iA)];
+    if (iD >= graph_size) { continue; }
     for (uint32_t kDB = threadIdx.x; kDB < graph_degree; kDB += blockDim.x) {
       const uint64_t iB_candidate = knn_graph[kDB + ((uint64_t)graph_degree * iD)];
       for (uint32_t kAB = kAD + 1; kAB < graph_degree; kAB++) {
diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh
index fb110d810..d6ffc1218 100644
--- a/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh
+++ b/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh
@@ -132,6 +132,10 @@ RAFT_KERNEL build_index_kernel(const LabelT* labels,
 {
   const IdxT i = IdxT(blockDim.x) * IdxT(blockIdx.x) + threadIdx.x;
   if (i >= n_rows) { return; }
+  auto source_ix = source_ixs == nullptr ? i + batch_offset : source_ixs[i];
+  // In the context of refinement, some indices may be invalid (the generating NN algorithm does
+  // not return enough valid items). Do not add the item to the index in this case.
+  if (source_ix == ivf::kInvalidRecord<IdxT> || source_ix == raft::upper_bound<IdxT>()) { return; }
 
   auto list_id     = labels[i];
   auto inlist_id   = atomicAdd(list_sizes_ptr + list_id, 1);
@@ -139,7 +143,7 @@ RAFT_KERNEL build_index_kernel(const LabelT* labels,
   auto* list_data  = list_data_ptrs[list_id];
 
   // Record the source vector id in the index
-  list_index[inlist_id] = source_ixs == nullptr ? i + batch_offset : source_ixs[i];
+  list_index[inlist_id] = source_ix;
 
   // The data is written in interleaved groups of `index::kGroupSize` vectors
   using interleaved_group = raft::Pow2<kIndexGroupSize>;
@@ -151,7 +155,7 @@ RAFT_KERNEL build_index_kernel(const LabelT* labels,
 
   // Point to the source vector
   if constexpr (gather_src) {
-    source_vecs += source_ixs[i] * dim;
+    source_vecs += source_ix * dim;
   } else {
     source_vecs += i * dim;
   }
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 1ed8466b3..7754a5043 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -137,6 +137,7 @@ if(BUILD_TESTS)
     NAME
     NEIGHBORS_ANN_CAGRA_TEST
     PATH
+    neighbors/ann_cagra/bug_extreme_inputs_oob.cu
     neighbors/ann_cagra/bug_multi_cta_crash.cu
     neighbors/ann_cagra/test_float_uint32_t.cu
     neighbors/ann_cagra/test_half_uint32_t.cu
diff --git a/cpp/test/neighbors/ann_cagra/bug_extreme_inputs_oob.cu b/cpp/test/neighbors/ann_cagra/bug_extreme_inputs_oob.cu
new file mode 100644
index 000000000..e21a54e9e
--- /dev/null
+++ b/cpp/test/neighbors/ann_cagra/bug_extreme_inputs_oob.cu
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include <cuvs/neighbors/cagra.hpp>
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/random/rng.cuh>
+
+#include <cstdint>
+
+namespace cuvs::neighbors::cagra {
+
+class cagra_extreme_inputs_oob_test : public ::testing::Test {
+ public:
+  using data_type = float;
+
+ protected:
+  void run()
+  {
+    cagra::index_params ix_ps;
+    graph_build_params::ivf_pq_params gb_params{};
+    gb_params.refinement_rate       = 2;
+    ix_ps.graph_build_params        = gb_params;
+    ix_ps.graph_degree              = 64;
+    ix_ps.intermediate_graph_degree = 128;
+
+    [[maybe_unused]] auto ix = cagra::build(res, ix_ps, raft::make_const_mdspan(dataset->view()));
+    raft::resource::sync_stream(res);
+  }
+
+  void SetUp() override
+  {
+    dataset.emplace(raft::make_device_matrix<data_type, int64_t>(res, n_samples, n_dim));
+    raft::random::RngState r(1234ULL);
+    raft::random::normal(
+      res, r, dataset->data_handle(), n_samples * n_dim, data_type(0), data_type(1e20));
+    raft::resource::sync_stream(res);
+  }
+
+  void TearDown() override
+  {
+    dataset.reset();
+    raft::resource::sync_stream(res);
+  }
+
+ private:
+  raft::resources res;
+  std::optional<raft::device_matrix<data_type, int64_t>> dataset = std::nullopt;
+
+  constexpr static int64_t n_samples                   = 100000;
+  constexpr static int64_t n_dim                       = 200;
+  constexpr static cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded;
+};
+
+TEST_F(cagra_extreme_inputs_oob_test, cagra_extreme_inputs_oob_test) { this->run(); }
+
+}  // namespace cuvs::neighbors::cagra

From 06afd5bd27d07ad6e58544c06f920d570b7df983 Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Wed, 20 Nov 2024 15:26:05 -0800
Subject: [PATCH 31/47] Migrate sparse knn and distances code from raft (#457)

Authors:
  - Ben Frederickson (https://github.com/benfred)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/457
---
 cpp/CMakeLists.txt                            |   2 +
 cpp/include/cuvs/distance/distance.hpp        |  81 ++
 cpp/include/cuvs/neighbors/brute_force.hpp    | 104 +++
 .../distance/detail/sparse/bin_distance.cuh   | 231 +++++
 cpp/src/distance/detail/sparse/common.hpp     |  59 ++
 cpp/src/distance/detail/sparse/coo_spmv.cuh   | 211 +++++
 .../detail/sparse/coo_spmv_kernel.cuh         | 229 +++++
 .../coo_spmv_strategies/base_strategy.cuh     | 149 +++
 .../coo_mask_row_iterators.cuh                | 234 +++++
 .../dense_smem_strategy.cuh                   | 121 +++
 .../coo_spmv_strategies/hash_strategy.cuh     | 296 ++++++
 .../distance/detail/sparse/ip_distance.cuh    |  89 ++
 .../distance/detail/sparse/l2_distance.cuh    | 502 +++++++++++
 .../distance/detail/sparse/lp_distance.cuh    | 333 +++++++
 cpp/src/distance/detail/sparse/utils.cuh      | 171 ++++
 cpp/src/distance/sparse_distance.cu           |  85 ++
 cpp/src/distance/sparse_distance.cuh          | 115 +++
 cpp/src/neighbors/detail/sparse_knn.cuh       | 437 +++++++++
 cpp/src/neighbors/sparse_brute_force.cu       |  72 ++
 cpp/test/CMakeLists.txt                       |   3 +-
 cpp/test/distance/sparse_distance.cu          | 850 ++++++++++++++++++
 cpp/test/neighbors/sparse_brute_force.cu      | 175 ++++
 22 files changed, 4548 insertions(+), 1 deletion(-)
 create mode 100644 cpp/src/distance/detail/sparse/bin_distance.cuh
 create mode 100644 cpp/src/distance/detail/sparse/common.hpp
 create mode 100644 cpp/src/distance/detail/sparse/coo_spmv.cuh
 create mode 100644 cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh
 create mode 100644 cpp/src/distance/detail/sparse/coo_spmv_strategies/base_strategy.cuh
 create mode 100644 cpp/src/distance/detail/sparse/coo_spmv_strategies/coo_mask_row_iterators.cuh
 create mode 100644 cpp/src/distance/detail/sparse/coo_spmv_strategies/dense_smem_strategy.cuh
 create mode 100644 cpp/src/distance/detail/sparse/coo_spmv_strategies/hash_strategy.cuh
 create mode 100644 cpp/src/distance/detail/sparse/ip_distance.cuh
 create mode 100644 cpp/src/distance/detail/sparse/l2_distance.cuh
 create mode 100644 cpp/src/distance/detail/sparse/lp_distance.cuh
 create mode 100644 cpp/src/distance/detail/sparse/utils.cuh
 create mode 100644 cpp/src/distance/sparse_distance.cu
 create mode 100644 cpp/src/distance/sparse_distance.cuh
 create mode 100644 cpp/src/neighbors/detail/sparse_knn.cuh
 create mode 100644 cpp/src/neighbors/sparse_brute_force.cu
 create mode 100644 cpp/test/distance/sparse_distance.cu
 create mode 100644 cpp/test/neighbors/sparse_brute_force.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 81b82aa7b..32093776c 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -369,6 +369,7 @@ if(BUILD_SHARED_LIBS)
     src/distance/detail/fused_distance_nn.cu
     src/distance/distance.cu
     src/distance/pairwise_distance.cu
+    src/distance/sparse_distance.cu
     src/neighbors/brute_force.cu
     src/neighbors/cagra_build_float.cu
     src/neighbors/cagra_build_half.cu
@@ -449,6 +450,7 @@ if(BUILD_SHARED_LIBS)
     src/neighbors/refine/detail/refine_host_int8_t_float.cpp
     src/neighbors/refine/detail/refine_host_uint8_t_float.cpp
     src/neighbors/sample_filter.cu
+    src/neighbors/sparse_brute_force.cu
     src/neighbors/vamana_build_float.cu
     src/neighbors/vamana_build_uint8.cu
     src/neighbors/vamana_build_int8.cu
diff --git a/cpp/include/cuvs/distance/distance.hpp b/cpp/include/cuvs/distance/distance.hpp
index def72641e..42c574e58 100644
--- a/cpp/include/cuvs/distance/distance.hpp
+++ b/cpp/include/cuvs/distance/distance.hpp
@@ -20,6 +20,7 @@
 
 #include <cstdint>
 #include <cuda_fp16.h>
+#include <raft/core/device_csr_matrix.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resources.hpp>
 
@@ -331,6 +332,86 @@ void pairwise_distance(
   cuvs::distance::DistanceType metric,
   float metric_arg = 2.0f);
 
+/**
+ * @brief Compute sparse pairwise distances between x and y, using the provided
+ * input configuration and distance function.
+ *
+ * @code{.cpp}
+ * #include <raft/core/device_resources.hpp>
+ * #include <raft/core/device_csr_matrix.hpp>
+ * #include <raft/core/device_mdspan.hpp>
+ *
+ * int x_n_rows = 100000;
+ * int y_n_rows = 50000;
+ * int n_cols = 10000;
+ *
+ * raft::device_resources handle;
+ * auto x = raft::make_device_csr_matrix<float>(handle, x_n_rows, n_cols);
+ * auto y = raft::make_device_csr_matrix<float>(handle, y_n_rows, n_cols);
+ *
+ * ...
+ * // populate data
+ * ...
+ *
+ * auto out = raft::make_device_matrix<float>(handle, x_nrows, y_nrows);
+ * auto metric = cuvs::distance::DistanceType::L2Expanded;
+ * raft::sparse::distance::pairwise_distance(handle, x.view(), y.view(), out, metric);
+ * @endcode
+ *
+ * @param[in] handle raft::resources
+ * @param[in] x raft::device_csr_matrix_view
+ * @param[in] y raft::device_csr_matrix_view
+ * @param[out] dist raft::device_matrix_view dense matrix
+ * @param[in] metric distance metric to use
+ * @param[in] metric_arg metric argument (used for Minkowski distance)
+ */
+void pairwise_distance(raft::resources const& handle,
+                       raft::device_csr_matrix_view<const float, int, int, int> x,
+                       raft::device_csr_matrix_view<const float, int, int, int> y,
+                       raft::device_matrix_view<float, int, raft::row_major> dist,
+                       cuvs::distance::DistanceType metric,
+                       float metric_arg = 2.0f);
+
+/**
+ * @brief Compute sparse pairwise distances between x and y, using the provided
+ * input configuration and distance function.
+ *
+ * @code{.cpp}
+ * #include <raft/core/device_resources.hpp>
+ * #include <raft/core/device_csr_matrix.hpp>
+ * #include <raft/core/device_mdspan.hpp>
+ *
+ * int x_n_rows = 100000;
+ * int y_n_rows = 50000;
+ * int n_cols = 10000;
+ *
+ * raft::device_resources handle;
+ * auto x = raft::make_device_csr_matrix<double>(handle, x_n_rows, n_cols);
+ * auto y = raft::make_device_csr_matrix<double>(handle, y_n_rows, n_cols);
+ *
+ * ...
+ * // populate data
+ * ...
+ *
+ * auto out = raft::make_device_matrix<double>(handle, x_nrows, y_nrows);
+ * auto metric = cuvs::distance::DistanceType::L2Expanded;
+ * raft::sparse::distance::pairwise_distance(handle, x.view(), y.view(), out, metric);
+ * @endcode
+ *
+ * @param[in] handle raft::resources
+ * @param[in] x raft::device_csr_matrix_view
+ * @param[in] y raft::device_csr_matrix_view
+ * @param[out] dist raft::device_matrix_view dense matrix
+ * @param[in] metric distance metric to use
+ * @param[in] metric_arg metric argument (used for Minkowski distance)
+ */
+void pairwise_distance(raft::resources const& handle,
+                       raft::device_csr_matrix_view<const double, int, int, int> x,
+                       raft::device_csr_matrix_view<const double, int, int, int> y,
+                       raft::device_matrix_view<double, int, raft::row_major> dist,
+                       cuvs::distance::DistanceType metric,
+                       float metric_arg = 2.0f);
+
 /** @} */  // end group pairwise_distance_runtime
 
 };  // namespace cuvs::distance
diff --git a/cpp/include/cuvs/neighbors/brute_force.hpp b/cpp/include/cuvs/neighbors/brute_force.hpp
index 428fa592a..ba67797ee 100644
--- a/cpp/include/cuvs/neighbors/brute_force.hpp
+++ b/cpp/include/cuvs/neighbors/brute_force.hpp
@@ -18,6 +18,7 @@
 
 #include "common.hpp"
 #include <cuvs/neighbors/common.hpp>
+#include <raft/core/device_csr_matrix.hpp>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/handle.hpp>
@@ -375,4 +376,107 @@ void search(raft::resources const& handle,
  * @}
  */
 
+/**
+ * @defgroup sparse_bruteforce_cpp_index Sparse Brute Force index
+ * @{
+ */
+/**
+ * @brief Sparse Brute Force index.
+ *
+ * @tparam T Data element type
+ * @tparam IdxT Index element type
+ */
+template <typename T, typename IdxT>
+struct sparse_index {
+ public:
+  sparse_index(const sparse_index&)            = delete;
+  sparse_index(sparse_index&&)                 = default;
+  sparse_index& operator=(const sparse_index&) = delete;
+  sparse_index& operator=(sparse_index&&)      = default;
+  ~sparse_index()                              = default;
+
+  /** Construct a sparse brute force sparse_index from dataset */
+  sparse_index(raft::resources const& res,
+               raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> dataset,
+               cuvs::distance::DistanceType metric,
+               T metric_arg);
+
+  /** Distance metric used for retrieval */
+  cuvs::distance::DistanceType metric() const noexcept { return metric_; }
+
+  /** Metric argument */
+  T metric_arg() const noexcept { return metric_arg_; }
+
+  raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> dataset() const noexcept
+  {
+    return dataset_;
+  }
+
+ private:
+  raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> dataset_;
+  cuvs::distance::DistanceType metric_;
+  T metric_arg_;
+};
+/**
+ * @}
+ */
+
+/**
+ * @defgroup sparse_bruteforce_cpp_index_build Sparse Brute Force index build
+ * @{
+ */
+
+/*
+ * @brief Build the Sparse index from the dataset
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   // create and fill the index from a CSR dataset
+ *   auto index = brute_force::build(handle, dataset, metric);
+ * @endcode
+ *
+ * @param[in] handle
+ * @param[in] dataset A sparse CSR matrix in device memory to search against
+ * @param[in] metric cuvs::distance::DistanceType
+ * @param[in] metric_arg metric argument
+ *
+ * @return the constructed Sparse brute-force index
+ */
+auto build(raft::resources const& handle,
+           raft::device_csr_matrix_view<const float, int, int, int> dataset,
+           cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded,
+           float metric_arg = 0) -> cuvs::neighbors::brute_force::sparse_index<float, int>;
+/**
+ * @}
+ */
+
+/**
+ * @defgroup sparse_bruteforce_cpp_index_search Sparse Brute Force index search
+ * @{
+ */
+struct sparse_search_params {
+  int batch_size_index = 2 << 14;
+  int batch_size_query = 2 << 14;
+};
+
+/*
+ * @brief Search the sparse bruteforce index for nearest neighbors
+ *
+ * @param[in] handle
+ * @param[in] index Sparse brute-force constructed index
+ * @param[in] queries a sparse CSR matrix on the device to query
+ * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
+ * [n_queries, k]
+ * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
+ */
+void search(raft::resources const& handle,
+            const sparse_search_params& params,
+            const sparse_index<float, int>& index,
+            raft::device_csr_matrix_view<const float, int, int, int> dataset,
+            raft::device_matrix_view<int, int64_t, raft::row_major> neighbors,
+            raft::device_matrix_view<float, int64_t, raft::row_major> distances);
+/**
+ * @}
+ */
 }  // namespace cuvs::neighbors::brute_force
diff --git a/cpp/src/distance/detail/sparse/bin_distance.cuh b/cpp/src/distance/detail/sparse/bin_distance.cuh
new file mode 100644
index 000000000..1a63a8eb9
--- /dev/null
+++ b/cpp/src/distance/detail/sparse/bin_distance.cuh
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "common.hpp"
+#include "ip_distance.cuh"
+
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/sparse/detail/utils.h>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <limits.h>
+
+#include <nvfunctional>
+
+namespace cuvs {
+namespace distance {
+namespace detail {
+namespace sparse {
+// @TODO: Move this into sparse prims (coo_norm)
+template <typename value_idx, typename value_t>
+RAFT_KERNEL compute_binary_row_norm_kernel(value_t* out,
+                                           const value_idx* __restrict__ coo_rows,
+                                           const value_t* __restrict__ data,
+                                           value_idx nnz)
+{
+  value_idx i = blockDim.x * blockIdx.x + threadIdx.x;
+  if (i < nnz) {
+    // We do conditional here only because it's
+    // possible there could be some stray zeros in
+    // the sparse structure and removing them would be
+    // more expensive.
+    atomicAdd(&out[coo_rows[i]], data[i] == 1.0);
+  }
+}
+
+template <typename value_idx, typename value_t, typename expansion_f>
+RAFT_KERNEL compute_binary_warp_kernel(value_t* __restrict__ C,
+                                       const value_t* __restrict__ Q_norms,
+                                       const value_t* __restrict__ R_norms,
+                                       value_idx n_rows,
+                                       value_idx n_cols,
+                                       expansion_f expansion_func)
+{
+  std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
+  value_idx i     = tid / n_cols;
+  value_idx j     = tid % n_cols;
+
+  if (i >= n_rows || j >= n_cols) return;
+
+  value_t q_norm            = Q_norms[i];
+  value_t r_norm            = R_norms[j];
+  value_t dot               = C[(size_t)i * n_cols + j];
+  C[(size_t)i * n_cols + j] = expansion_func(dot, q_norm, r_norm);
+}
+
+template <typename value_idx, typename value_t, typename expansion_f, int tpb = 1024>
+void compute_binary(value_t* C,
+                    const value_t* Q_norms,
+                    const value_t* R_norms,
+                    value_idx n_rows,
+                    value_idx n_cols,
+                    expansion_f expansion_func,
+                    cudaStream_t stream)
+{
+  int blocks = raft::ceildiv<size_t>((size_t)n_rows * n_cols, tpb);
+  compute_binary_warp_kernel<<<blocks, tpb, 0, stream>>>(
+    C, Q_norms, R_norms, n_rows, n_cols, expansion_func);
+}
+
+template <typename value_idx, typename value_t, typename expansion_f, int tpb = 1024>
+void compute_bin_distance(value_t* out,
+                          const value_idx* Q_coo_rows,
+                          const value_t* Q_data,
+                          value_idx Q_nnz,
+                          const value_idx* R_coo_rows,
+                          const value_t* R_data,
+                          value_idx R_nnz,
+                          value_idx m,
+                          value_idx n,
+                          cudaStream_t stream,
+                          expansion_f expansion_func)
+{
+  rmm::device_uvector<value_t> Q_norms(m, stream);
+  rmm::device_uvector<value_t> R_norms(n, stream);
+  RAFT_CUDA_TRY(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t)));
+  RAFT_CUDA_TRY(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t)));
+
+  compute_binary_row_norm_kernel<<<raft::ceildiv(Q_nnz, tpb), tpb, 0, stream>>>(
+    Q_norms.data(), Q_coo_rows, Q_data, Q_nnz);
+  compute_binary_row_norm_kernel<<<raft::ceildiv(R_nnz, tpb), tpb, 0, stream>>>(
+    R_norms.data(), R_coo_rows, R_data, R_nnz);
+
+  compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func, stream);
+}
+
+/**
+ * Jaccard distance using the expanded form:
+ * 1 - (sum(x_k * y_k) / ((sum(x_k) + sum(y_k)) - sum(x_k * y_k))
+ */
+template <typename value_idx = int, typename value_t = float>
+class jaccard_expanded_distances_t : public distances_t<value_t> {
+ public:
+  explicit jaccard_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config),
+      workspace(0, raft::resource::get_cuda_stream(config.handle)),
+      ip_dists(config)
+  {
+  }
+
+  void compute(value_t* out_dists)
+  {
+    ip_dists.compute(out_dists);
+
+    value_idx* b_indices = ip_dists.b_rows_coo();
+    value_t* b_data      = ip_dists.b_data_coo();
+
+    rmm::device_uvector<value_idx> search_coo_rows(
+      config_->a_nnz, raft::resource::get_cuda_stream(config_->handle));
+    raft::sparse::convert::csr_to_coo(config_->a_indptr,
+                                      config_->a_nrows,
+                                      search_coo_rows.data(),
+                                      config_->a_nnz,
+                                      raft::resource::get_cuda_stream(config_->handle));
+
+    compute_bin_distance(out_dists,
+                         search_coo_rows.data(),
+                         config_->a_data,
+                         config_->a_nnz,
+                         b_indices,
+                         b_data,
+                         config_->b_nnz,
+                         config_->a_nrows,
+                         config_->b_nrows,
+                         raft::resource::get_cuda_stream(config_->handle),
+                         [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
+                           value_t q_r_union = q_norm + r_norm;
+                           value_t denom     = q_r_union - dot;
+
+                           value_t jacc = ((denom != 0) * dot) / ((denom == 0) + denom);
+
+                           // flip the similarity when both rows are 0
+                           bool both_empty = q_r_union == 0;
+                           return 1 - ((!both_empty * jacc) + both_empty);
+                         });
+  }
+
+  ~jaccard_expanded_distances_t() = default;
+
+ private:
+  const distances_config_t<value_idx, value_t>* config_;
+  rmm::device_uvector<char> workspace;
+  ip_distances_t<value_idx, value_t> ip_dists;
+};
+
+/**
+ * Dice distance using the expanded form:
+ * 1 - ((2 * sum(x_k * y_k)) / (sum(x_k) + sum(y_k)))
+ */
+template <typename value_idx = int, typename value_t = float>
+class dice_expanded_distances_t : public distances_t<value_t> {
+ public:
+  explicit dice_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config),
+      workspace(0, raft::resource::get_cuda_stream(config.handle)),
+      ip_dists(config)
+  {
+  }
+
+  void compute(value_t* out_dists)
+  {
+    ip_dists.compute(out_dists);
+
+    value_idx* b_indices = ip_dists.b_rows_coo();
+    value_t* b_data      = ip_dists.b_data_coo();
+
+    rmm::device_uvector<value_idx> search_coo_rows(
+      config_->a_nnz, raft::resource::get_cuda_stream(config_->handle));
+    raft::sparse::convert::csr_to_coo(config_->a_indptr,
+                                      config_->a_nrows,
+                                      search_coo_rows.data(),
+                                      config_->a_nnz,
+                                      raft::resource::get_cuda_stream(config_->handle));
+
+    compute_bin_distance(out_dists,
+                         search_coo_rows.data(),
+                         config_->a_data,
+                         config_->a_nnz,
+                         b_indices,
+                         b_data,
+                         config_->b_nnz,
+                         config_->a_nrows,
+                         config_->b_nrows,
+                         raft::resource::get_cuda_stream(config_->handle),
+                         [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
+                           value_t q_r_union = q_norm + r_norm;
+                           value_t dice      = (2 * dot) / q_r_union;
+                           bool both_empty   = q_r_union == 0;
+                           return 1 - ((!both_empty * dice) + both_empty);
+                         });
+  }
+
+  ~dice_expanded_distances_t() = default;
+
+ private:
+  const distances_config_t<value_idx, value_t>* config_;
+  rmm::device_uvector<char> workspace;
+  ip_distances_t<value_idx, value_t> ip_dists;
+};
+
+}  // END namespace sparse
+}  // END namespace detail
+}  // END namespace distance
+}  // END namespace cuvs
diff --git a/cpp/src/distance/detail/sparse/common.hpp b/cpp/src/distance/detail/sparse/common.hpp
new file mode 100644
index 000000000..803dabe56
--- /dev/null
+++ b/cpp/src/distance/detail/sparse/common.hpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/resources.hpp>
+
+namespace cuvs {
+namespace distance {
+namespace detail {
+namespace sparse {
+
+template <typename value_idx, typename value_t>
+struct distances_config_t {
+  distances_config_t(raft::resources const& handle_) : handle(handle_) {}
+
+  // left side
+  value_idx a_nrows;
+  value_idx a_ncols;
+  value_idx a_nnz;
+  value_idx* a_indptr;
+  value_idx* a_indices;
+  value_t* a_data;
+
+  // right side
+  value_idx b_nrows;
+  value_idx b_ncols;
+  value_idx b_nnz;
+  value_idx* b_indptr;
+  value_idx* b_indices;
+  value_t* b_data;
+
+  raft::resources const& handle;
+};
+
+template <typename value_t>
+class distances_t {
+ public:
+  virtual void compute(value_t* out) {}
+  virtual ~distances_t() = default;
+};
+
+}  // namespace sparse
+}  // namespace detail
+}  // namespace distance
+}  // namespace cuvs
diff --git a/cpp/src/distance/detail/sparse/coo_spmv.cuh b/cpp/src/distance/detail/sparse/coo_spmv.cuh
new file mode 100644
index 000000000..181b531f7
--- /dev/null
+++ b/cpp/src/distance/detail/sparse/coo_spmv.cuh
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "common.hpp"
+#include "coo_spmv_strategies/dense_smem_strategy.cuh"
+#include "coo_spmv_strategies/hash_strategy.cuh"
+
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/sparse/csr.hpp>
+#include <raft/sparse/detail/cusparse_wrappers.h>
+#include <raft/sparse/detail/utils.h>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <cusparse_v2.h>
+#include <limits.h>
+
+#include <nvfunctional>
+
+namespace cuvs {
+namespace distance {
+namespace detail {
+namespace sparse {
+
+template <typename value_idx,
+          typename value_t,
+          int threads_per_block = 1024,
+          typename product_f,
+          typename accum_f,
+          typename write_f,
+          typename strategy_t>
+inline void balanced_coo_pairwise_generalized_spmv(
+  value_t* out_dists,
+  const distances_config_t<value_idx, value_t>& config_,
+  value_idx* coo_rows_b,
+  product_f product_func,
+  accum_f accum_func,
+  write_f write_func,
+  strategy_t strategy,
+  int chunk_size = 500000)
+{
+  uint64_t n = (uint64_t)sizeof(value_t) * (uint64_t)config_.a_nrows * (uint64_t)config_.b_nrows;
+  RAFT_CUDA_TRY(cudaMemsetAsync(out_dists, 0, n, raft::resource::get_cuda_stream(config_.handle)));
+
+  strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size);
+};
+
+/**
+ * Performs generalized sparse-matrix-sparse-matrix multiplication via a
+ * sparse-matrix-sparse-vector layout `out=A*B` where generalized product()
+ * and sum() operations can be used in place of the standard sum and product:
+ *
+ * out_ij = sum_k(product(A_ik, B_ik)) The sum goes through values of
+ * k=0..n_cols-1 where B_kj is nonzero.
+ *
+ * The product and sum operations shall form a semiring algebra with the
+ * following properties:
+ * 1. {+, 0} is a commutative sum reduction monoid with identity element 0
+ * 2. {*, 1} is a product monoid with identity element 1
+ * 3. Multiplication by 0 annihilates x. e.g. product(x, 0) = 0
+ *
+ * Each vector of A is loaded into shared memory in dense form and the
+ * non-zeros of B load balanced across the threads of each block.
+ * @tparam value_idx index type
+ * @tparam value_t value type
+ * @tparam threads_per_block block size
+ * @tparam product_f semiring product() function
+ * @tparam accum_f semiring sum() function
+ * @tparam write_f atomic semiring sum() function
+ * @param[out] out_dists dense array of out distances of size m * n in row-major
+ *             format.
+ * @param[in] config_ distance config object
+ * @param[in] coo_rows_b coo row array for B
+ * @param[in] product_func semiring product() function
+ * @param[in] accum_func semiring sum() function
+ * @param[in] write_func atomic semiring sum() function
+ * @param[in] chunk_size number of nonzeros of B to process for each row of A
+ *            this value was found through profiling and represents a reasonable
+ *            setting for both large and small densities
+ */
+template <typename value_idx,
+          typename value_t,
+          int threads_per_block = 1024,
+          typename product_f,
+          typename accum_f,
+          typename write_f>
+inline void balanced_coo_pairwise_generalized_spmv(
+  value_t* out_dists,
+  const distances_config_t<value_idx, value_t>& config_,
+  value_idx* coo_rows_b,
+  product_f product_func,
+  accum_f accum_func,
+  write_f write_func,
+  int chunk_size = 500000)
+{
+  uint64_t n = (uint64_t)sizeof(value_t) * (uint64_t)config_.a_nrows * (uint64_t)config_.b_nrows;
+  RAFT_CUDA_TRY(cudaMemsetAsync(out_dists, 0, n, raft::resource::get_cuda_stream(config_.handle)));
+
+  int max_cols = max_cols_per_block<value_idx, value_t>();
+
+  if (max_cols > config_.a_ncols) {
+    dense_smem_strategy<value_idx, value_t, threads_per_block> strategy(config_);
+    strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size);
+  } else {
+    hash_strategy<value_idx, value_t, threads_per_block> strategy(config_);
+    strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size);
+  }
+};
+
+template <typename value_idx,
+          typename value_t,
+          int threads_per_block = 1024,
+          typename product_f,
+          typename accum_f,
+          typename write_f,
+          typename strategy_t>
+inline void balanced_coo_pairwise_generalized_spmv_rev(
+  value_t* out_dists,
+  const distances_config_t<value_idx, value_t>& config_,
+  value_idx* coo_rows_a,
+  product_f product_func,
+  accum_f accum_func,
+  write_f write_func,
+  strategy_t strategy,
+  int chunk_size = 500000)
+{
+  strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size);
+};
+
+/**
+ * Used for computing distances where the reduction (e.g. product()) function
+ * requires an implicit union (product(x, 0) = x) to capture the difference A-B.
+ * This is necessary in some applications because the standard semiring algebra
+ * endowed with the default multiplication product monoid will only
+ * compute the intersection & B-A.
+ *
+ * This particular function is meant to accompany the function
+ * `balanced_coo_pairwise_generalized_spmv` and executes the product operation
+ * on only those columns that exist in B and not A.
+ *
+ * The product and sum operations shall enable the computation of a
+ * non-annihilating semiring algebra with the following properties:
+ * 1. {+, 0} is a commutative sum reduction monoid with identity element 0
+ * 2. {*, 0} is a product monoid with identity element 0
+ * 3. Multiplication by 0 does not annihilate x. e.g. product(x, 0) = x
+ *
+ * Manattan distance sum(abs(x_k-y_k)) is a great example of when this type of
+ * execution pattern is necessary.
+ *
+ * @tparam value_idx index type
+ * @tparam value_t value type
+ * @tparam threads_per_block block size
+ * @tparam product_f semiring product() function
+ * @tparam accum_f semiring sum() function
+ * @tparam write_f atomic semiring sum() function
+ * @param[out] out_dists dense array of out distances of size m * n
+ * @param[in] config_ distance config object
+ * @param[in] coo_rows_a coo row array for A
+ * @param[in] product_func semiring product() function
+ * @param[in] accum_func semiring sum() function
+ * @param[in] write_func atomic semiring sum() function
+ * @param[in] chunk_size number of nonzeros of B to process for each row of A
+ *            this value was found through profiling and represents a reasonable
+ *            setting for both large and small densities
+ */
+template <typename value_idx,
+          typename value_t,
+          int threads_per_block = 1024,
+          typename product_f,
+          typename accum_f,
+          typename write_f>
+inline void balanced_coo_pairwise_generalized_spmv_rev(
+  value_t* out_dists,
+  const distances_config_t<value_idx, value_t>& config_,
+  value_idx* coo_rows_a,
+  product_f product_func,
+  accum_f accum_func,
+  write_f write_func,
+  int chunk_size = 500000)
+{
+  // try dense first
+  int max_cols = max_cols_per_block<value_idx, value_t>();
+
+  if (max_cols > config_.b_ncols) {
+    dense_smem_strategy<value_idx, value_t, threads_per_block> strategy(config_);
+    strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size);
+  } else {
+    hash_strategy<value_idx, value_t, threads_per_block> strategy(config_);
+    strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size);
+  }
+};
+
+}  // namespace sparse
+}  // namespace detail
+}  // namespace distance
+}  // namespace cuvs
diff --git a/cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh b/cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh
new file mode 100644
index 000000000..1f4b19af4
--- /dev/null
+++ b/cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_radix_sort.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/cub.cuh>
+
+namespace cuvs {
+namespace distance {
+namespace detail {
+namespace sparse {
+__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group)
+{
+  return __ffs(peer_group) - 1;
+}
+
+/**
+ * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with
+ * sparse-matrix-sparse-vector multiplication layout (SPMV).
+ * This is intended to be scheduled n_chunks_b times for each row of a.
+ * The steps are as follows:
+ *
+ * 1. Load row from A into dense vector in shared memory.
+ *    This can be further chunked in the future if necessary to support larger
+ *    column sizes.
+ * 2. Threads of block all step through chunks of B in parallel.
+ *    When a new row is encountered in row_indices_b, a segmented
+ *    reduction is performed across the warps and then across the
+ *    block and the final value written out to host memory.
+ *
+ * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf
+ *
+ * @tparam value_idx index type
+ * @tparam value_t value type
+ * @tparam tpb threads per block configured on launch
+ * @tparam rev if this is true, the reduce/accumulate functions are only
+ *         executed when A[col] == 0.0. when executed before/after !rev
+ *         and A & B are reversed, this allows the full symmetric difference
+ *         and intersection to be computed.
+ * @tparam kv_t data type stored in shared mem cache
+ * @tparam product_f reduce function type (semiring product() function).
+ *                  accepts two arguments of value_t and returns a value_t
+ * @tparam accum_f accumulation function type (semiring sum() function).
+ *                 accepts two arguments of value_t and returns a value_t
+ * @tparam write_f function to write value out. this should be mathematically
+ *                 equivalent to the accumulate function but implemented as
+ *                 an atomic operation on global memory. Accepts two arguments
+ *                 of value_t* and value_t and updates the value given by the
+ *                 pointer.
+ * @param[in] indptrA column pointer array for A
+ * @param[in] indicesA column indices array for A
+ * @param[in] dataA data array for A
+ * @param[in] rowsB coo row array for B
+ * @param[in] indicesB column indices array for B
+ * @param[in] dataB data array for B
+ * @param[in] m number of rows in A
+ * @param[in] n number of rows in B
+ * @param[in] dim number of features
+ * @param[in] nnz_b number of nonzeros in B
+ * @param[out] out array of size m*n
+ * @param[in] n_blocks_per_row number of blocks of B per row of A
+ * @param[in] chunk_size number of nnz for B to use for each row of A
+ * @param[in] buffer_size amount of smem to use for each row of A
+ * @param[in] product_func semiring product() function
+ * @param[in] accum_func semiring sum() function
+ * @param[in] write_func atomic semiring sum() function
+ */
+template <typename strategy_t,
+          typename indptr_it,
+          typename value_idx,
+          typename value_t,
+          bool rev,
+          int tpb,
+          typename product_f,
+          typename accum_f,
+          typename write_f>
+RAFT_KERNEL balanced_coo_generalized_spmv_kernel(strategy_t strategy,
+                                                 indptr_it indptrA,
+                                                 value_idx* indicesA,
+                                                 value_t* dataA,
+                                                 value_idx nnz_a,
+                                                 value_idx* rowsB,
+                                                 value_idx* indicesB,
+                                                 value_t* dataB,
+                                                 value_idx m,
+                                                 value_idx n,
+                                                 int dim,
+                                                 value_idx nnz_b,
+                                                 value_t* out,
+                                                 int n_blocks_per_row,
+                                                 int chunk_size,
+                                                 value_idx b_ncols,
+                                                 product_f product_func,
+                                                 accum_f accum_func,
+                                                 write_f write_func)
+{
+  typedef cub::WarpReduce<value_t> warp_reduce;
+
+  value_idx cur_row_a        = indptrA.get_row_idx(n_blocks_per_row);
+  value_idx cur_chunk_offset = blockIdx.x % n_blocks_per_row;
+
+  // chunk starting offset
+  value_idx ind_offset = cur_chunk_offset * chunk_size * tpb;
+  // how many total cols will be processed by this block (should be <= chunk_size * n_threads)
+  value_idx active_chunk_size = min(chunk_size * tpb, nnz_b - ind_offset);
+
+  int tid     = threadIdx.x;
+  int warp_id = tid / raft::warp_size();
+
+  // compute id relative to current warp
+  unsigned int lane_id = tid & (raft::warp_size() - 1);
+  value_idx ind        = ind_offset + threadIdx.x;
+
+  extern __shared__ char smem[];
+
+  typename strategy_t::smem_type A                = (typename strategy_t::smem_type)(smem);
+  typename warp_reduce::TempStorage* temp_storage = (typename warp_reduce::TempStorage*)(A + dim);
+
+  auto inserter = strategy.init_insert(A, dim);
+
+  __syncthreads();
+
+  value_idx start_offset_a, stop_offset_a;
+  bool first_a_chunk, last_a_chunk;
+  indptrA.get_row_offsets(
+    cur_row_a, start_offset_a, stop_offset_a, n_blocks_per_row, first_a_chunk, last_a_chunk);
+
+  // Convert current row vector in A to dense
+  for (int i = tid; i <= (stop_offset_a - start_offset_a); i += blockDim.x) {
+    strategy.insert(inserter, indicesA[start_offset_a + i], dataA[start_offset_a + i]);
+  }
+
+  __syncthreads();
+
+  auto finder = strategy.init_find(A, dim);
+
+  if (cur_row_a > m || cur_chunk_offset > n_blocks_per_row) return;
+  if (ind >= nnz_b) return;
+
+  value_idx start_index_a = 0, stop_index_a = b_ncols - 1;
+  indptrA.get_indices_boundary(indicesA,
+                               cur_row_a,
+                               start_offset_a,
+                               stop_offset_a,
+                               start_index_a,
+                               stop_index_a,
+                               first_a_chunk,
+                               last_a_chunk);
+
+  value_idx cur_row_b = -1;
+  value_t c           = 0.0;
+
+  auto warp_red = warp_reduce(*(temp_storage + warp_id));
+
+  if (tid < active_chunk_size) {
+    cur_row_b = rowsB[ind];
+
+    auto index_b   = indicesB[ind];
+    auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b);
+
+    if (in_bounds) {
+      value_t a_col = strategy.find(finder, index_b);
+      if (!rev || a_col == 0.0) { c = product_func(a_col, dataB[ind]); }
+    }
+  }
+
+  // loop through chunks in parallel, reducing when a new row is
+  // encountered by each thread
+  for (int i = tid; i < active_chunk_size; i += blockDim.x) {
+    value_idx ind_next   = ind + blockDim.x;
+    value_idx next_row_b = -1;
+
+    if (i + blockDim.x < active_chunk_size) next_row_b = rowsB[ind_next];
+
+    bool diff_rows = next_row_b != cur_row_b;
+
+    if (__any_sync(0xffffffff, diff_rows)) {
+      // grab the threads currently participating in loops.
+      // because any other threads should have returned already.
+      unsigned int peer_group = __match_any_sync(0xffffffff, cur_row_b);
+      bool is_leader          = get_lowest_peer(peer_group) == lane_id;
+      value_t v               = warp_red.HeadSegmentedReduce(c, is_leader, accum_func);
+
+      // thread with lowest lane id among peers writes out
+      if (is_leader && v != 0.0) {
+        // this conditional should be uniform, since rev is constant
+        size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b : (size_t)cur_row_b * m + cur_row_a;
+        write_func(out + idx, v);
+      }
+
+      c = 0.0;
+    }
+
+    if (next_row_b != -1) {
+      ind = ind_next;
+
+      auto index_b   = indicesB[ind];
+      auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b);
+      if (in_bounds) {
+        value_t a_col = strategy.find(finder, index_b);
+
+        if (!rev || a_col == 0.0) { c = accum_func(c, product_func(a_col, dataB[ind])); }
+      }
+
+      cur_row_b = next_row_b;
+    }
+  }
+}
+
+}  // namespace sparse
+}  // namespace detail
+}  // namespace distance
+}  // namespace cuvs
diff --git a/cpp/src/distance/detail/sparse/coo_spmv_strategies/base_strategy.cuh b/cpp/src/distance/detail/sparse/coo_spmv_strategies/base_strategy.cuh
new file mode 100644
index 000000000..457b25eea
--- /dev/null
+++ b/cpp/src/distance/detail/sparse/coo_spmv_strategies/base_strategy.cuh
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../common.hpp"
+#include "../coo_spmv_kernel.cuh"
+#include "../utils.cuh"
+#include "coo_mask_row_iterators.cuh"
+
+#include <raft/core/resource/cuda_stream.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+namespace cuvs {
+namespace distance {
+namespace detail {
+namespace sparse {
+
+template <typename value_idx, typename value_t, int tpb>
+class coo_spmv_strategy {
+ public:
+  coo_spmv_strategy(const distances_config_t<value_idx, value_t>& config_) : config(config_)
+  {
+    smem = raft::getSharedMemPerBlock();
+  }
+
+  template <typename strategy_t,
+            typename indptr_it,
+            typename product_f,
+            typename accum_f,
+            typename write_f>
+  void _dispatch_base(strategy_t& strategy,
+                      int smem_dim,
+                      indptr_it& a_indptr,
+                      value_t* out_dists,
+                      value_idx* coo_rows_b,
+                      product_f product_func,
+                      accum_f accum_func,
+                      write_f write_func,
+                      int chunk_size,
+                      int n_blocks,
+                      int n_blocks_per_row)
+  {
+    RAFT_CUDA_TRY(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel<strategy_t,
+                                                                              indptr_it,
+                                                                              value_idx,
+                                                                              value_t,
+                                                                              false,
+                                                                              tpb,
+                                                                              product_f,
+                                                                              accum_f,
+                                                                              write_f>,
+                                         cudaFuncCachePreferShared));
+
+    balanced_coo_generalized_spmv_kernel<strategy_t, indptr_it, value_idx, value_t, false, tpb>
+      <<<n_blocks, tpb, smem, raft::resource::get_cuda_stream(config.handle)>>>(strategy,
+                                                                                a_indptr,
+                                                                                config.a_indices,
+                                                                                config.a_data,
+                                                                                config.a_nnz,
+                                                                                coo_rows_b,
+                                                                                config.b_indices,
+                                                                                config.b_data,
+                                                                                config.a_nrows,
+                                                                                config.b_nrows,
+                                                                                smem_dim,
+                                                                                config.b_nnz,
+                                                                                out_dists,
+                                                                                n_blocks_per_row,
+                                                                                chunk_size,
+                                                                                config.b_ncols,
+                                                                                product_func,
+                                                                                accum_func,
+                                                                                write_func);
+  }
+
+  template <typename strategy_t,
+            typename indptr_it,
+            typename product_f,
+            typename accum_f,
+            typename write_f>
+  void _dispatch_base_rev(strategy_t& strategy,
+                          int smem_dim,
+                          indptr_it& b_indptr,
+                          value_t* out_dists,
+                          value_idx* coo_rows_a,
+                          product_f product_func,
+                          accum_f accum_func,
+                          write_f write_func,
+                          int chunk_size,
+                          int n_blocks,
+                          int n_blocks_per_row)
+  {
+    RAFT_CUDA_TRY(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel<strategy_t,
+                                                                              indptr_it,
+                                                                              value_idx,
+                                                                              value_t,
+                                                                              true,
+                                                                              tpb,
+                                                                              product_f,
+                                                                              accum_f,
+                                                                              write_f>,
+                                         cudaFuncCachePreferShared));
+
+    balanced_coo_generalized_spmv_kernel<strategy_t, indptr_it, value_idx, value_t, true, tpb>
+      <<<n_blocks, tpb, smem, raft::resource::get_cuda_stream(config.handle)>>>(strategy,
+                                                                                b_indptr,
+                                                                                config.b_indices,
+                                                                                config.b_data,
+                                                                                config.b_nnz,
+                                                                                coo_rows_a,
+                                                                                config.a_indices,
+                                                                                config.a_data,
+                                                                                config.b_nrows,
+                                                                                config.a_nrows,
+                                                                                smem_dim,
+                                                                                config.a_nnz,
+                                                                                out_dists,
+                                                                                n_blocks_per_row,
+                                                                                chunk_size,
+                                                                                config.a_ncols,
+                                                                                product_func,
+                                                                                accum_func,
+                                                                                write_func);
+  }
+
+ protected:
+  int smem;
+  const distances_config_t<value_idx, value_t>& config;
+};
+
+}  // namespace sparse
+}  // namespace detail
+}  // namespace distance
+}  // namespace cuvs
diff --git a/cpp/src/distance/detail/sparse/coo_spmv_strategies/coo_mask_row_iterators.cuh b/cpp/src/distance/detail/sparse/coo_spmv_strategies/coo_mask_row_iterators.cuh
new file mode 100644
index 000000000..a9040e1d8
--- /dev/null
+++ b/cpp/src/distance/detail/sparse/coo_spmv_strategies/coo_mask_row_iterators.cuh
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../common.hpp"
+#include "../utils.cuh"
+
+#include <raft/util/cuda_dev_essentials.cuh>  // raft::ceildiv
+
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/scan.h>
+#include <thrust/transform.h>
+
+namespace cuvs {
+namespace distance {
+namespace detail {
+namespace sparse {
+
+template <typename value_idx>
+class mask_row_it {
+ public:
+  mask_row_it(const value_idx* full_indptr_,
+              const value_idx& n_rows_,
+              value_idx* mask_row_idx_ = NULL)
+    : full_indptr(full_indptr_), mask_row_idx(mask_row_idx_), n_rows(n_rows_)
+  {
+  }
+
+  __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b)
+  {
+    if (mask_row_idx != NULL) {
+      return mask_row_idx[blockIdx.x / n_blocks_nnz_b];
+    } else {
+      return blockIdx.x / n_blocks_nnz_b;
+    }
+  }
+
+  __device__ inline void get_row_offsets(const value_idx& row_idx,
+                                         value_idx& start_offset,
+                                         value_idx& stop_offset,
+                                         const value_idx& n_blocks_nnz_b,
+                                         bool& first_a_chunk,
+                                         bool& last_a_chunk)
+  {
+    start_offset = full_indptr[row_idx];
+    stop_offset  = full_indptr[row_idx + 1] - 1;
+  }
+
+  __device__ constexpr inline void get_indices_boundary(const value_idx* indices,
+                                                        value_idx& indices_len,
+                                                        value_idx& start_offset,
+                                                        value_idx& stop_offset,
+                                                        value_idx& start_index,
+                                                        value_idx& stop_index,
+                                                        bool& first_a_chunk,
+                                                        bool& last_a_chunk)
+  {
+    // do nothing;
+  }
+
+  __device__ constexpr inline bool check_indices_bounds(value_idx& start_index_a,
+                                                        value_idx& stop_index_a,
+                                                        value_idx& index_b)
+  {
+    return true;
+  }
+
+  const value_idx *full_indptr, &n_rows;
+  value_idx* mask_row_idx;
+};
+
+template <typename value_idx>
+RAFT_KERNEL fill_chunk_indices_kernel(value_idx* n_chunks_per_row,
+                                      value_idx* chunk_indices,
+                                      value_idx n_rows)
+{
+  auto tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid < n_rows) {
+    auto start = n_chunks_per_row[tid];
+    auto end   = n_chunks_per_row[tid + 1];
+
+#pragma unroll
+    for (int i = start; i < end; i++) {
+      chunk_indices[i] = tid;
+    }
+  }
+}
+
+template <typename value_idx>
+class chunked_mask_row_it : public mask_row_it<value_idx> {
+ public:
+  chunked_mask_row_it(const value_idx* full_indptr_,
+                      const value_idx& n_rows_,
+                      value_idx* mask_row_idx_,
+                      int row_chunk_size_,
+                      const value_idx* n_chunks_per_row_,
+                      const value_idx* chunk_indices_,
+                      const cudaStream_t stream_)
+    : mask_row_it<value_idx>(full_indptr_, n_rows_, mask_row_idx_),
+      row_chunk_size(row_chunk_size_),
+      n_chunks_per_row(n_chunks_per_row_),
+      chunk_indices(chunk_indices_),
+      stream(stream_)
+  {
+  }
+
+  static void init(const value_idx* indptr,
+                   const value_idx* mask_row_idx,
+                   const value_idx& n_rows,
+                   const int row_chunk_size,
+                   rmm::device_uvector<value_idx>& n_chunks_per_row,
+                   rmm::device_uvector<value_idx>& chunk_indices,
+                   cudaStream_t stream)
+  {
+    auto policy = rmm::exec_policy(stream);
+
+    constexpr value_idx first_element = 0;
+    n_chunks_per_row.set_element_async(0, first_element, stream);
+    n_chunks_per_row_functor chunk_functor(indptr, row_chunk_size);
+    thrust::transform(
+      policy, mask_row_idx, mask_row_idx + n_rows, n_chunks_per_row.begin() + 1, chunk_functor);
+
+    thrust::inclusive_scan(
+      policy, n_chunks_per_row.begin() + 1, n_chunks_per_row.end(), n_chunks_per_row.begin() + 1);
+
+    raft::update_host(&total_row_blocks, n_chunks_per_row.data() + n_rows, 1, stream);
+
+    fill_chunk_indices(n_rows, n_chunks_per_row, chunk_indices, stream);
+  }
+
+  __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b)
+  {
+    return this->mask_row_idx[chunk_indices[blockIdx.x / n_blocks_nnz_b]];
+  }
+
+  __device__ inline void get_row_offsets(const value_idx& row_idx,
+                                         value_idx& start_offset,
+                                         value_idx& stop_offset,
+                                         const int& n_blocks_nnz_b,
+                                         bool& first_a_chunk,
+                                         bool& last_a_chunk)
+  {
+    auto chunk_index    = blockIdx.x / n_blocks_nnz_b;
+    auto chunk_val      = chunk_indices[chunk_index];
+    auto prev_n_chunks  = n_chunks_per_row[chunk_val];
+    auto relative_chunk = chunk_index - prev_n_chunks;
+    first_a_chunk       = relative_chunk == 0;
+
+    start_offset = this->full_indptr[row_idx] + relative_chunk * row_chunk_size;
+    stop_offset  = start_offset + row_chunk_size;
+
+    auto final_stop_offset = this->full_indptr[row_idx + 1];
+
+    last_a_chunk = stop_offset >= final_stop_offset;
+    stop_offset  = last_a_chunk ? final_stop_offset - 1 : stop_offset - 1;
+  }
+
+  __device__ inline void get_indices_boundary(const value_idx* indices,
+                                              value_idx& row_idx,
+                                              value_idx& start_offset,
+                                              value_idx& stop_offset,
+                                              value_idx& start_index,
+                                              value_idx& stop_index,
+                                              bool& first_a_chunk,
+                                              bool& last_a_chunk)
+  {
+    start_index = first_a_chunk ? start_index : indices[start_offset - 1] + 1;
+    stop_index  = last_a_chunk ? stop_index : indices[stop_offset];
+  }
+
+  __device__ inline bool check_indices_bounds(value_idx& start_index_a,
+                                              value_idx& stop_index_a,
+                                              value_idx& index_b)
+  {
+    return (index_b >= start_index_a && index_b <= stop_index_a);
+  }
+
+  inline static value_idx total_row_blocks = 0;
+  const cudaStream_t stream;
+  const value_idx *n_chunks_per_row, *chunk_indices;
+  value_idx row_chunk_size;
+
+  struct n_chunks_per_row_functor {
+   public:
+    n_chunks_per_row_functor(const value_idx* indptr_, value_idx row_chunk_size_)
+      : indptr(indptr_), row_chunk_size(row_chunk_size_)
+    {
+    }
+
+    __host__ __device__ value_idx operator()(const value_idx& i)
+    {
+      auto degree = indptr[i + 1] - indptr[i];
+      return raft::ceildiv(degree, (value_idx)row_chunk_size);
+    }
+
+    const value_idx* indptr;
+    value_idx row_chunk_size;
+  };
+
+ private:
+  static void fill_chunk_indices(const value_idx& n_rows,
+                                 rmm::device_uvector<value_idx>& n_chunks_per_row,
+                                 rmm::device_uvector<value_idx>& chunk_indices,
+                                 cudaStream_t stream)
+  {
+    auto n_threads = std::min(n_rows, 256);
+    auto n_blocks  = raft::ceildiv(n_rows, (value_idx)n_threads);
+
+    chunk_indices.resize(total_row_blocks, stream);
+
+    fill_chunk_indices_kernel<value_idx>
+      <<<n_blocks, n_threads, 0, stream>>>(n_chunks_per_row.data(), chunk_indices.data(), n_rows);
+  }
+};
+
+}  // namespace sparse
+}  // namespace detail
+}  // namespace distance
+}  // namespace cuvs
diff --git a/cpp/src/distance/detail/sparse/coo_spmv_strategies/dense_smem_strategy.cuh b/cpp/src/distance/detail/sparse/coo_spmv_strategies/dense_smem_strategy.cuh
new file mode 100644
index 000000000..baa913a6c
--- /dev/null
+++ b/cpp/src/distance/detail/sparse/coo_spmv_strategies/dense_smem_strategy.cuh
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "base_strategy.cuh"
+
+#include <raft/util/cuda_dev_essentials.cuh>  // raft::ceildiv
+
+namespace cuvs {
+namespace distance {
+namespace detail {
+namespace sparse {
+
+template <typename value_idx, typename value_t, int tpb>
+class dense_smem_strategy : public coo_spmv_strategy<value_idx, value_t, tpb> {
+ public:
+  using smem_type   = value_t*;
+  using insert_type = smem_type;
+  using find_type   = smem_type;
+
+  dense_smem_strategy(const distances_config_t<value_idx, value_t>& config_)
+    : coo_spmv_strategy<value_idx, value_t, tpb>(config_)
+  {
+  }
+
+  inline static int smem_per_block(int n_cols)
+  {
+    return (n_cols * sizeof(value_t)) + ((1024 / raft::warp_size()) * sizeof(value_t));
+  }
+
+  template <typename product_f, typename accum_f, typename write_f>
+  void dispatch(value_t* out_dists,
+                value_idx* coo_rows_b,
+                product_f product_func,
+                accum_f accum_func,
+                write_f write_func,
+                int chunk_size)
+  {
+    auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * 1024);
+    auto n_blocks         = this->config.a_nrows * n_blocks_per_row;
+
+    mask_row_it<value_idx> a_indptr(this->config.a_indptr, this->config.a_nrows);
+
+    this->_dispatch_base(*this,
+                         this->config.b_ncols,
+                         a_indptr,
+                         out_dists,
+                         coo_rows_b,
+                         product_func,
+                         accum_func,
+                         write_func,
+                         chunk_size,
+                         n_blocks,
+                         n_blocks_per_row);
+  }
+
+  template <typename product_f, typename accum_f, typename write_f>
+  void dispatch_rev(value_t* out_dists,
+                    value_idx* coo_rows_a,
+                    product_f product_func,
+                    accum_f accum_func,
+                    write_f write_func,
+                    int chunk_size)
+  {
+    auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * 1024);
+    auto n_blocks         = this->config.b_nrows * n_blocks_per_row;
+
+    mask_row_it<value_idx> b_indptr(this->config.b_indptr, this->config.b_nrows);
+
+    this->_dispatch_base_rev(*this,
+                             this->config.a_ncols,
+                             b_indptr,
+                             out_dists,
+                             coo_rows_a,
+                             product_func,
+                             accum_func,
+                             write_func,
+                             chunk_size,
+                             n_blocks,
+                             n_blocks_per_row);
+  }
+
+  __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size)
+  {
+    for (int k = threadIdx.x; k < cache_size; k += blockDim.x) {
+      cache[k] = 0.0;
+    }
+    return cache;
+  }
+
+  __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value)
+  {
+    cache[key] = value;
+  }
+
+  __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size)
+  {
+    return cache;
+  }
+
+  __device__ inline value_t find(find_type cache, const value_idx& key) { return cache[key]; }
+};
+
+}  // namespace sparse
+}  // namespace detail
+}  // namespace distance
+}  // namespace cuvs
diff --git a/cpp/src/distance/detail/sparse/coo_spmv_strategies/hash_strategy.cuh b/cpp/src/distance/detail/sparse/coo_spmv_strategies/hash_strategy.cuh
new file mode 100644
index 000000000..cf212076b
--- /dev/null
+++ b/cpp/src/distance/detail/sparse/coo_spmv_strategies/hash_strategy.cuh
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "base_strategy.cuh"
+
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
+
+#include <cuco/static_map.cuh>
+#include <thrust/copy.h>
+#include <thrust/iterator/counting_iterator.h>
+
+// this is needed by cuco as key, value must be bitwise comparable.
+// compilers don't declare float/double as bitwise comparable
+// but that is too strict
+// for example, the following is true (or 0):
+// float a = 5;
+// float b = 5;
+// memcmp(&a, &b, sizeof(float));
+CUCO_DECLARE_BITWISE_COMPARABLE(float);
+CUCO_DECLARE_BITWISE_COMPARABLE(double);
+
+namespace cuvs {
+namespace distance {
+namespace detail {
+namespace sparse {
+
+template <typename value_idx, typename value_t, int tpb>
+class hash_strategy : public coo_spmv_strategy<value_idx, value_t, tpb> {
+ public:
+  using insert_type = typename cuco::legacy::
+    static_map<value_idx, value_t, cuda::thread_scope_block>::device_mutable_view;
+  using smem_type = typename insert_type::slot_type*;
+  using find_type =
+    typename cuco::legacy::static_map<value_idx, value_t, cuda::thread_scope_block>::device_view;
+
+  hash_strategy(const distances_config_t<value_idx, value_t>& config_,
+                float capacity_threshold_ = 0.5,
+                int map_size_             = get_map_size())
+    : coo_spmv_strategy<value_idx, value_t, tpb>(config_),
+      capacity_threshold(capacity_threshold_),
+      map_size(map_size_)
+  {
+  }
+
+  void chunking_needed(const value_idx* indptr,
+                       const value_idx n_rows,
+                       rmm::device_uvector<value_idx>& mask_indptr,
+                       std::tuple<value_idx, value_idx>& n_rows_divided,
+                       cudaStream_t stream)
+  {
+    auto policy = raft::resource::get_thrust_policy(this->config.handle);
+
+    auto less                   = thrust::copy_if(policy,
+                                thrust::make_counting_iterator(value_idx(0)),
+                                thrust::make_counting_iterator(n_rows),
+                                mask_indptr.data(),
+                                fits_in_hash_table(indptr, 0, capacity_threshold * map_size));
+    std::get<0>(n_rows_divided) = less - mask_indptr.data();
+
+    auto more = thrust::copy_if(
+      policy,
+      thrust::make_counting_iterator(value_idx(0)),
+      thrust::make_counting_iterator(n_rows),
+      less,
+      fits_in_hash_table(
+        indptr, capacity_threshold * map_size, std::numeric_limits<value_idx>::max()));
+    std::get<1>(n_rows_divided) = more - less;
+  }
+
+  template <typename product_f, typename accum_f, typename write_f>
+  void dispatch(value_t* out_dists,
+                value_idx* coo_rows_b,
+                product_f product_func,
+                accum_f accum_func,
+                write_f write_func,
+                int chunk_size)
+  {
+    auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * tpb);
+    rmm::device_uvector<value_idx> mask_indptr(
+      this->config.a_nrows, raft::resource::get_cuda_stream(this->config.handle));
+    std::tuple<value_idx, value_idx> n_rows_divided;
+
+    chunking_needed(this->config.a_indptr,
+                    this->config.a_nrows,
+                    mask_indptr,
+                    n_rows_divided,
+                    raft::resource::get_cuda_stream(this->config.handle));
+
+    auto less_rows = std::get<0>(n_rows_divided);
+    if (less_rows > 0) {
+      mask_row_it<value_idx> less(this->config.a_indptr, less_rows, mask_indptr.data());
+
+      auto n_less_blocks = less_rows * n_blocks_per_row;
+      this->_dispatch_base(*this,
+                           map_size,
+                           less,
+                           out_dists,
+                           coo_rows_b,
+                           product_func,
+                           accum_func,
+                           write_func,
+                           chunk_size,
+                           n_less_blocks,
+                           n_blocks_per_row);
+    }
+
+    auto more_rows = std::get<1>(n_rows_divided);
+    if (more_rows > 0) {
+      rmm::device_uvector<value_idx> n_chunks_per_row(
+        more_rows + 1, raft::resource::get_cuda_stream(this->config.handle));
+      rmm::device_uvector<value_idx> chunk_indices(
+        0, raft::resource::get_cuda_stream(this->config.handle));
+      chunked_mask_row_it<value_idx>::init(this->config.a_indptr,
+                                           mask_indptr.data() + less_rows,
+                                           more_rows,
+                                           capacity_threshold * map_size,
+                                           n_chunks_per_row,
+                                           chunk_indices,
+                                           raft::resource::get_cuda_stream(this->config.handle));
+
+      chunked_mask_row_it<value_idx> more(this->config.a_indptr,
+                                          more_rows,
+                                          mask_indptr.data() + less_rows,
+                                          capacity_threshold * map_size,
+                                          n_chunks_per_row.data(),
+                                          chunk_indices.data(),
+                                          raft::resource::get_cuda_stream(this->config.handle));
+
+      auto n_more_blocks = more.total_row_blocks * n_blocks_per_row;
+      this->_dispatch_base(*this,
+                           map_size,
+                           more,
+                           out_dists,
+                           coo_rows_b,
+                           product_func,
+                           accum_func,
+                           write_func,
+                           chunk_size,
+                           n_more_blocks,
+                           n_blocks_per_row);
+    }
+  }
+
+  template <typename product_f, typename accum_f, typename write_f>
+  void dispatch_rev(value_t* out_dists,
+                    value_idx* coo_rows_a,
+                    product_f product_func,
+                    accum_f accum_func,
+                    write_f write_func,
+                    int chunk_size)
+  {
+    auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * tpb);
+    rmm::device_uvector<value_idx> mask_indptr(
+      this->config.b_nrows, raft::resource::get_cuda_stream(this->config.handle));
+    std::tuple<value_idx, value_idx> n_rows_divided;
+
+    chunking_needed(this->config.b_indptr,
+                    this->config.b_nrows,
+                    mask_indptr,
+                    n_rows_divided,
+                    raft::resource::get_cuda_stream(this->config.handle));
+
+    auto less_rows = std::get<0>(n_rows_divided);
+    if (less_rows > 0) {
+      mask_row_it<value_idx> less(this->config.b_indptr, less_rows, mask_indptr.data());
+
+      auto n_less_blocks = less_rows * n_blocks_per_row;
+      this->_dispatch_base_rev(*this,
+                               map_size,
+                               less,
+                               out_dists,
+                               coo_rows_a,
+                               product_func,
+                               accum_func,
+                               write_func,
+                               chunk_size,
+                               n_less_blocks,
+                               n_blocks_per_row);
+    }
+
+    auto more_rows = std::get<1>(n_rows_divided);
+    if (more_rows > 0) {
+      rmm::device_uvector<value_idx> n_chunks_per_row(
+        more_rows + 1, raft::resource::get_cuda_stream(this->config.handle));
+      rmm::device_uvector<value_idx> chunk_indices(
+        0, raft::resource::get_cuda_stream(this->config.handle));
+      chunked_mask_row_it<value_idx>::init(this->config.b_indptr,
+                                           mask_indptr.data() + less_rows,
+                                           more_rows,
+                                           capacity_threshold * map_size,
+                                           n_chunks_per_row,
+                                           chunk_indices,
+                                           raft::resource::get_cuda_stream(this->config.handle));
+
+      chunked_mask_row_it<value_idx> more(this->config.b_indptr,
+                                          more_rows,
+                                          mask_indptr.data() + less_rows,
+                                          capacity_threshold * map_size,
+                                          n_chunks_per_row.data(),
+                                          chunk_indices.data(),
+                                          raft::resource::get_cuda_stream(this->config.handle));
+
+      auto n_more_blocks = more.total_row_blocks * n_blocks_per_row;
+      this->_dispatch_base_rev(*this,
+                               map_size,
+                               more,
+                               out_dists,
+                               coo_rows_a,
+                               product_func,
+                               accum_func,
+                               write_func,
+                               chunk_size,
+                               n_more_blocks,
+                               n_blocks_per_row);
+    }
+  }
+
+  __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size)
+  {
+    return insert_type::make_from_uninitialized_slots(cooperative_groups::this_thread_block(),
+                                                      cache,
+                                                      cache_size,
+                                                      cuco::empty_key{value_idx{-1}},
+                                                      cuco::empty_value{value_t{0}});
+  }
+
+  __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value)
+  {
+    auto success = cache.insert(cuco::pair<value_idx, value_t>(key, value));
+  }
+
+  __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size)
+  {
+    return find_type(
+      cache, cache_size, cuco::empty_key{value_idx{-1}}, cuco::empty_value{value_t{0}});
+  }
+
+  __device__ inline value_t find(find_type cache, const value_idx& key)
+  {
+    auto a_pair = cache.find(key);
+
+    value_t a_col = 0.0;
+    if (a_pair != cache.end()) { a_col = a_pair->second; }
+    return a_col;
+  }
+
+  struct fits_in_hash_table {
+   public:
+    fits_in_hash_table(const value_idx* indptr_, value_idx degree_l_, value_idx degree_r_)
+      : indptr(indptr_), degree_l(degree_l_), degree_r(degree_r_)
+    {
+    }
+
+    __host__ __device__ bool operator()(const value_idx& i)
+    {
+      auto degree = indptr[i + 1] - indptr[i];
+
+      return degree >= degree_l && degree < degree_r;
+    }
+
+   private:
+    const value_idx* indptr;
+    const value_idx degree_l, degree_r;
+  };
+
+  inline static int get_map_size()
+  {
+    return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) /
+           sizeof(typename insert_type::slot_type);
+  }
+
+ private:
+  float capacity_threshold;
+  int map_size;
+};
+
+}  // namespace sparse
+}  // namespace detail
+}  // namespace distance
+}  // namespace cuvs
diff --git a/cpp/src/distance/detail/sparse/ip_distance.cuh b/cpp/src/distance/detail/sparse/ip_distance.cuh
new file mode 100644
index 000000000..3a11d4e99
--- /dev/null
+++ b/cpp/src/distance/detail/sparse/ip_distance.cuh
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "common.hpp"
+#include "coo_spmv.cuh"
+
+#include <raft/core/operators.cuh>
+#include <raft/core/operators.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/sparse/convert/coo.cuh>
+#include <raft/sparse/detail/cusparse_wrappers.h>
+#include <raft/sparse/detail/utils.h>
+#include <raft/sparse/linalg/transpose.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <limits.h>
+
+#include <nvfunctional>
+
+namespace cuvs {
+namespace distance {
+namespace detail {
+namespace sparse {
+
+template <typename value_idx, typename value_t>
+class ip_distances_t : public distances_t<value_t> {
+ public:
+  /**
+   * Computes simple sparse inner product distances as sum(x_y * y_k)
+   * @param[in] config specifies inputs, outputs, and sizes
+   */
+  ip_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config), coo_rows_b(config.b_nnz, raft::resource::get_cuda_stream(config.handle))
+  {
+    raft::sparse::convert::csr_to_coo(config_->b_indptr,
+                                      config_->b_nrows,
+                                      coo_rows_b.data(),
+                                      config_->b_nnz,
+                                      raft::resource::get_cuda_stream(config_->handle));
+  }
+
+  /**
+   * Performs pairwise distance computation and computes output distances
+   * @param out_distances dense output matrix (size a_nrows * b_nrows)
+   */
+  void compute(value_t* out_distances)
+  {
+    /**
+     * Compute pairwise distances and return dense matrix in row-major format
+     */
+    balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(out_distances,
+                                                               *config_,
+                                                               coo_rows_b.data(),
+                                                               raft::mul_op(),
+                                                               raft::add_op(),
+                                                               raft::atomic_add_op());
+  }
+
+  value_idx* b_rows_coo() { return coo_rows_b.data(); }
+
+  value_t* b_data_coo() { return config_->b_data; }
+
+ private:
+  const distances_config_t<value_idx, value_t>* config_;
+  rmm::device_uvector<value_idx> coo_rows_b;
+};
+
+}  // END namespace sparse
+}  // END namespace detail
+}  // END namespace distance
+}  // END namespace cuvs
diff --git a/cpp/src/distance/detail/sparse/l2_distance.cuh b/cpp/src/distance/detail/sparse/l2_distance.cuh
new file mode 100644
index 000000000..40e7070fc
--- /dev/null
+++ b/cpp/src/distance/detail/sparse/l2_distance.cuh
@@ -0,0 +1,502 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "common.hpp"
+#include "ip_distance.cuh"
+#include <cuvs/distance/distance.hpp>
+
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/linalg/unary_op.cuh>
+#include <raft/sparse/csr.hpp>
+#include <raft/sparse/detail/cusparse_wrappers.h>
+#include <raft/sparse/detail/utils.h>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/for_each.h>
+#include <thrust/iterator/counting_iterator.h>
+
+#include <algorithm>
+#include <nvfunctional>
+
+namespace cuvs {
+namespace distance {
+namespace detail {
+namespace sparse {
+
+// @TODO: Move this into sparse prims (coo_norm)
+template <typename value_idx, typename value_t>
+RAFT_KERNEL compute_row_norm_kernel(value_t* out,
+                                    const value_idx* __restrict__ coo_rows,
+                                    const value_t* __restrict__ data,
+                                    value_idx nnz)
+{
+  value_idx i = blockDim.x * blockIdx.x + threadIdx.x;
+  if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i] * data[i]); }
+}
+
+template <typename value_idx, typename value_t>
+RAFT_KERNEL compute_row_sum_kernel(value_t* out,
+                                   const value_idx* __restrict__ coo_rows,
+                                   const value_t* __restrict__ data,
+                                   value_idx nnz)
+{
+  value_idx i = blockDim.x * blockIdx.x + threadIdx.x;
+  if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i]); }
+}
+
+template <typename value_idx, typename value_t, typename expansion_f>
+RAFT_KERNEL compute_euclidean_warp_kernel(value_t* __restrict__ C,
+                                          const value_t* __restrict__ Q_sq_norms,
+                                          const value_t* __restrict__ R_sq_norms,
+                                          value_idx n_rows,
+                                          value_idx n_cols,
+                                          expansion_f expansion_func)
+{
+  std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
+  value_idx i     = tid / n_cols;
+  value_idx j     = tid % n_cols;
+
+  if (i >= n_rows || j >= n_cols) return;
+
+  value_t dot = C[(size_t)i * n_cols + j];
+
+  // e.g. Euclidean expansion func = -2.0 * dot + q_norm + r_norm
+  value_t val = expansion_func(dot, Q_sq_norms[i], R_sq_norms[j]);
+
+  // correct for small instabilities
+  C[(size_t)i * n_cols + j] = val * (fabs(val) >= 0.0001);
+}
+
+template <typename value_idx, typename value_t>
+RAFT_KERNEL compute_correlation_warp_kernel(value_t* __restrict__ C,
+                                            const value_t* __restrict__ Q_sq_norms,
+                                            const value_t* __restrict__ R_sq_norms,
+                                            const value_t* __restrict__ Q_norms,
+                                            const value_t* __restrict__ R_norms,
+                                            value_idx n_rows,
+                                            value_idx n_cols,
+                                            value_idx n)
+{
+  std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
+  value_idx i     = tid / n_cols;
+  value_idx j     = tid % n_cols;
+
+  if (i >= n_rows || j >= n_cols) return;
+
+  value_t dot  = C[(size_t)i * n_cols + j];
+  value_t Q_l1 = Q_norms[i];
+  value_t R_l1 = R_norms[j];
+
+  value_t Q_l2 = Q_sq_norms[i];
+  value_t R_l2 = R_sq_norms[j];
+
+  value_t numer   = n * dot - (Q_l1 * R_l1);
+  value_t Q_denom = n * Q_l2 - (Q_l1 * Q_l1);
+  value_t R_denom = n * R_l2 - (R_l1 * R_l1);
+
+  value_t val = 1 - (numer / raft::sqrt(Q_denom * R_denom));
+
+  // correct for small instabilities
+  C[(size_t)i * n_cols + j] = val * (fabs(val) >= 0.0001);
+}
+
+template <typename value_idx, typename value_t, int tpb = 256, typename expansion_f>
+void compute_euclidean(value_t* C,
+                       const value_t* Q_sq_norms,
+                       const value_t* R_sq_norms,
+                       value_idx n_rows,
+                       value_idx n_cols,
+                       cudaStream_t stream,
+                       expansion_f expansion_func)
+{
+  int blocks = raft::ceildiv<size_t>((size_t)n_rows * n_cols, tpb);
+  compute_euclidean_warp_kernel<<<blocks, tpb, 0, stream>>>(
+    C, Q_sq_norms, R_sq_norms, n_rows, n_cols, expansion_func);
+}
+
+template <typename value_idx, typename value_t, int tpb = 256, typename expansion_f>
+void compute_l2(value_t* out,
+                const value_idx* Q_coo_rows,
+                const value_t* Q_data,
+                value_idx Q_nnz,
+                const value_idx* R_coo_rows,
+                const value_t* R_data,
+                value_idx R_nnz,
+                value_idx m,
+                value_idx n,
+                cudaStream_t stream,
+                expansion_f expansion_func)
+{
+  rmm::device_uvector<value_t> Q_sq_norms(m, stream);
+  rmm::device_uvector<value_t> R_sq_norms(n, stream);
+  RAFT_CUDA_TRY(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t)));
+  RAFT_CUDA_TRY(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t)));
+
+  compute_row_norm_kernel<<<raft::ceildiv(Q_nnz, tpb), tpb, 0, stream>>>(
+    Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz);
+  compute_row_norm_kernel<<<raft::ceildiv(R_nnz, tpb), tpb, 0, stream>>>(
+    R_sq_norms.data(), R_coo_rows, R_data, R_nnz);
+
+  compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream, expansion_func);
+}
+
+template <typename value_idx, typename value_t, int tpb = 256>
+void compute_correlation(value_t* C,
+                         const value_t* Q_sq_norms,
+                         const value_t* R_sq_norms,
+                         const value_t* Q_norms,
+                         const value_t* R_norms,
+                         value_idx n_rows,
+                         value_idx n_cols,
+                         value_idx n,
+                         cudaStream_t stream)
+{
+  int blocks = raft::ceildiv<size_t>((size_t)n_rows * n_cols, tpb);
+  compute_correlation_warp_kernel<<<blocks, tpb, 0, stream>>>(
+    C, Q_sq_norms, R_sq_norms, Q_norms, R_norms, n_rows, n_cols, n);
+}
+
+template <typename value_idx, typename value_t, int tpb = 256>
+void compute_corr(value_t* out,
+                  const value_idx* Q_coo_rows,
+                  const value_t* Q_data,
+                  value_idx Q_nnz,
+                  const value_idx* R_coo_rows,
+                  const value_t* R_data,
+                  value_idx R_nnz,
+                  value_idx m,
+                  value_idx n,
+                  value_idx n_cols,
+                  cudaStream_t stream)
+{
+  // sum_sq for std dev
+  rmm::device_uvector<value_t> Q_sq_norms(m, stream);
+  rmm::device_uvector<value_t> R_sq_norms(n, stream);
+
+  // sum for mean
+  rmm::device_uvector<value_t> Q_norms(m, stream);
+  rmm::device_uvector<value_t> R_norms(n, stream);
+
+  RAFT_CUDA_TRY(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t)));
+  RAFT_CUDA_TRY(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t)));
+
+  RAFT_CUDA_TRY(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t)));
+  RAFT_CUDA_TRY(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t)));
+
+  compute_row_norm_kernel<<<raft::ceildiv(Q_nnz, tpb), tpb, 0, stream>>>(
+    Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz);
+  compute_row_norm_kernel<<<raft::ceildiv(R_nnz, tpb), tpb, 0, stream>>>(
+    R_sq_norms.data(), R_coo_rows, R_data, R_nnz);
+
+  compute_row_sum_kernel<<<raft::ceildiv(Q_nnz, tpb), tpb, 0, stream>>>(
+    Q_norms.data(), Q_coo_rows, Q_data, Q_nnz);
+  compute_row_sum_kernel<<<raft::ceildiv(R_nnz, tpb), tpb, 0, stream>>>(
+    R_norms.data(), R_coo_rows, R_data, R_nnz);
+
+  compute_correlation(out,
+                      Q_sq_norms.data(),
+                      R_sq_norms.data(),
+                      Q_norms.data(),
+                      R_norms.data(),
+                      m,
+                      n,
+                      n_cols,
+                      stream);
+}
+
+/**
+ * L2 distance using the expanded form: sum(x_k)^2 + sum(y_k)^2 - 2 * sum(x_k * y_k)
+ * The expanded form is more efficient for sparse data.
+ */
+template <typename value_idx = int, typename value_t = float>
+class l2_expanded_distances_t : public distances_t<value_t> {
+ public:
+  explicit l2_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config), ip_dists(config)
+  {
+  }
+
+  void compute(value_t* out_dists)
+  {
+    ip_dists.compute(out_dists);
+
+    value_idx* b_indices = ip_dists.b_rows_coo();
+    value_t* b_data      = ip_dists.b_data_coo();
+
+    rmm::device_uvector<value_idx> search_coo_rows(
+      config_->a_nnz, raft::resource::get_cuda_stream(config_->handle));
+    raft::sparse::convert::csr_to_coo(config_->a_indptr,
+                                      config_->a_nrows,
+                                      search_coo_rows.data(),
+                                      config_->a_nnz,
+                                      raft::resource::get_cuda_stream(config_->handle));
+
+    compute_l2(out_dists,
+               search_coo_rows.data(),
+               config_->a_data,
+               config_->a_nnz,
+               b_indices,
+               b_data,
+               config_->b_nnz,
+               config_->a_nrows,
+               config_->b_nrows,
+               raft::resource::get_cuda_stream(config_->handle),
+               [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
+                 return -2 * dot + q_norm + r_norm;
+               });
+  }
+
+  ~l2_expanded_distances_t() = default;
+
+ protected:
+  const distances_config_t<value_idx, value_t>* config_;
+  ip_distances_t<value_idx, value_t> ip_dists;
+};
+
+/**
+ * L2 sqrt distance performing the sqrt operation after the distance computation
+ * The expanded form is more efficient for sparse data.
+ */
+template <typename value_idx = int, typename value_t = float>
+class l2_sqrt_expanded_distances_t : public l2_expanded_distances_t<value_idx, value_t> {
+ public:
+  explicit l2_sqrt_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : l2_expanded_distances_t<value_idx, value_t>(config)
+  {
+  }
+
+  void compute(value_t* out_dists) override
+  {
+    l2_expanded_distances_t<value_idx, value_t>::compute(out_dists);
+    // Sqrt Post-processing
+    raft::linalg::unaryOp<value_t>(
+      out_dists,
+      out_dists,
+      this->config_->a_nrows * this->config_->b_nrows,
+      [] __device__(value_t input) {
+        int neg = input < 0 ? -1 : 1;
+        return raft::sqrt(abs(input) * neg);
+      },
+      raft::resource::get_cuda_stream(this->config_->handle));
+  }
+
+  ~l2_sqrt_expanded_distances_t() = default;
+};
+
+template <typename value_idx, typename value_t>
+class correlation_expanded_distances_t : public distances_t<value_t> {
+ public:
+  explicit correlation_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config), ip_dists(config)
+  {
+  }
+
+  void compute(value_t* out_dists)
+  {
+    ip_dists.compute(out_dists);
+
+    value_idx* b_indices = ip_dists.b_rows_coo();
+    value_t* b_data      = ip_dists.b_data_coo();
+
+    rmm::device_uvector<value_idx> search_coo_rows(
+      config_->a_nnz, raft::resource::get_cuda_stream(config_->handle));
+    raft::sparse::convert::csr_to_coo(config_->a_indptr,
+                                      config_->a_nrows,
+                                      search_coo_rows.data(),
+                                      config_->a_nnz,
+                                      raft::resource::get_cuda_stream(config_->handle));
+
+    compute_corr(out_dists,
+                 search_coo_rows.data(),
+                 config_->a_data,
+                 config_->a_nnz,
+                 b_indices,
+                 b_data,
+                 config_->b_nnz,
+                 config_->a_nrows,
+                 config_->b_nrows,
+                 config_->b_ncols,
+                 raft::resource::get_cuda_stream(config_->handle));
+  }
+
+  ~correlation_expanded_distances_t() = default;
+
+ protected:
+  const distances_config_t<value_idx, value_t>* config_;
+  ip_distances_t<value_idx, value_t> ip_dists;
+};
+
+/**
+ * Cosine distance using the expanded form: 1 - ( sum(x_k * y_k) / (sqrt(sum(x_k)^2) *
+ * sqrt(sum(y_k)^2))) The expanded form is more efficient for sparse data.
+ */
+template <typename value_idx = int, typename value_t = float>
+class cosine_expanded_distances_t : public distances_t<value_t> {
+ public:
+  explicit cosine_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config),
+      workspace(0, raft::resource::get_cuda_stream(config.handle)),
+      ip_dists(config)
+  {
+  }
+
+  void compute(value_t* out_dists)
+  {
+    ip_dists.compute(out_dists);
+
+    value_idx* b_indices = ip_dists.b_rows_coo();
+    value_t* b_data      = ip_dists.b_data_coo();
+
+    rmm::device_uvector<value_idx> search_coo_rows(
+      config_->a_nnz, raft::resource::get_cuda_stream(config_->handle));
+    raft::sparse::convert::csr_to_coo(config_->a_indptr,
+                                      config_->a_nrows,
+                                      search_coo_rows.data(),
+                                      config_->a_nnz,
+                                      raft::resource::get_cuda_stream(config_->handle));
+
+    compute_l2(out_dists,
+               search_coo_rows.data(),
+               config_->a_data,
+               config_->a_nnz,
+               b_indices,
+               b_data,
+               config_->b_nnz,
+               config_->a_nrows,
+               config_->b_nrows,
+               raft::resource::get_cuda_stream(config_->handle),
+               [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
+                 value_t norms = raft::sqrt(q_norm) * raft::sqrt(r_norm);
+                 // deal with potential for 0 in denominator by forcing 0/1 instead
+                 value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms);
+
+                 // flip the similarity when both rows are 0
+                 bool both_empty = (q_norm == 0) && (r_norm == 0);
+                 return 1 - ((!both_empty * cos) + both_empty);
+               });
+  }
+
+  ~cosine_expanded_distances_t() = default;
+
+ private:
+  const distances_config_t<value_idx, value_t>* config_;
+  rmm::device_uvector<char> workspace;
+  ip_distances_t<value_idx, value_t> ip_dists;
+};
+
+/**
+ * Hellinger distance using the expanded form: sqrt(1 - sum(sqrt(x_k) * sqrt(y_k)))
+ * The expanded form is more efficient for sparse data.
+ *
+ * This distance computation modifies A and B by computing a sqrt
+ * and then performing a `pow(x, 2)` to convert it back. Because of this,
+ * it is possible that the values in A and B might differ slightly
+ * after this is invoked.
+ */
+template <typename value_idx = int, typename value_t = float>
+class hellinger_expanded_distances_t : public distances_t<value_t> {
+ public:
+  explicit hellinger_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config), workspace(0, raft::resource::get_cuda_stream(config.handle))
+  {
+  }
+
+  void compute(value_t* out_dists)
+  {
+    rmm::device_uvector<value_idx> coo_rows(std::max(config_->b_nnz, config_->a_nnz),
+                                            raft::resource::get_cuda_stream(config_->handle));
+
+    raft::sparse::convert::csr_to_coo(config_->b_indptr,
+                                      config_->b_nrows,
+                                      coo_rows.data(),
+                                      config_->b_nnz,
+                                      raft::resource::get_cuda_stream(config_->handle));
+
+    balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
+      out_dists,
+      *config_,
+      coo_rows.data(),
+      [] __device__(value_t a, value_t b) { return raft::sqrt(a) * raft::sqrt(b); },
+      raft::add_op(),
+      raft::atomic_add_op());
+
+    raft::linalg::unaryOp<value_t>(
+      out_dists,
+      out_dists,
+      config_->a_nrows * config_->b_nrows,
+      [=] __device__(value_t input) {
+        // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative
+        bool rectifier = (1 - input) > 0;
+        return raft::sqrt(rectifier * (1 - input));
+      },
+      raft::resource::get_cuda_stream(config_->handle));
+  }
+
+  ~hellinger_expanded_distances_t() = default;
+
+ private:
+  const distances_config_t<value_idx, value_t>* config_;
+  rmm::device_uvector<char> workspace;
+};
+
+template <typename value_idx = int, typename value_t = float>
+class russelrao_expanded_distances_t : public distances_t<value_t> {
+ public:
+  explicit russelrao_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config),
+      workspace(0, raft::resource::get_cuda_stream(config.handle)),
+      ip_dists(config)
+  {
+  }
+
+  void compute(value_t* out_dists)
+  {
+    ip_dists.compute(out_dists);
+
+    value_t n_cols     = config_->a_ncols;
+    value_t n_cols_inv = 1.0 / n_cols;
+    raft::linalg::unaryOp<value_t>(
+      out_dists,
+      out_dists,
+      config_->a_nrows * config_->b_nrows,
+      [=] __device__(value_t input) { return (n_cols - input) * n_cols_inv; },
+      raft::resource::get_cuda_stream(config_->handle));
+
+    auto exec_policy  = rmm::exec_policy(raft::resource::get_cuda_stream(config_->handle));
+    auto diags        = thrust::counting_iterator<value_idx>(0);
+    value_idx b_nrows = config_->b_nrows;
+    thrust::for_each(exec_policy, diags, diags + config_->a_nrows, [=] __device__(value_idx input) {
+      out_dists[input * b_nrows + input] = 0.0;
+    });
+  }
+
+  ~russelrao_expanded_distances_t() = default;
+
+ private:
+  const distances_config_t<value_idx, value_t>* config_;
+  rmm::device_uvector<char> workspace;
+  ip_distances_t<value_idx, value_t> ip_dists;
+};
+
+}  // END namespace sparse
+}  // END namespace detail
+}  // END namespace distance
+}  // END namespace cuvs
diff --git a/cpp/src/distance/detail/sparse/lp_distance.cuh b/cpp/src/distance/detail/sparse/lp_distance.cuh
new file mode 100644
index 000000000..18e7b04e4
--- /dev/null
+++ b/cpp/src/distance/detail/sparse/lp_distance.cuh
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "common.hpp"
+
+#include <raft/core/operators.cuh>
+#include <raft/core/operators.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/distance/distance_types.hpp>
+#include <raft/sparse/convert/coo.cuh>
+#include <raft/sparse/csr.hpp>
+#include <raft/sparse/detail/utils.h>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <limits.h>
+
+#include <algorithm>
+#include <nvfunctional>
+
+namespace cuvs {
+namespace distance {
+namespace detail {
+namespace sparse {
+
+template <typename value_idx = int,
+          typename value_t   = float,
+          typename product_f,
+          typename accum_f,
+          typename write_f>
+void unexpanded_lp_distances(value_t* out_dists,
+                             const distances_config_t<value_idx, value_t>* config_,
+                             product_f product_func,
+                             accum_f accum_func,
+                             write_f write_func)
+{
+  rmm::device_uvector<value_idx> coo_rows(std::max(config_->b_nnz, config_->a_nnz),
+                                          raft::resource::get_cuda_stream(config_->handle));
+
+  raft::sparse::convert::csr_to_coo(config_->b_indptr,
+                                    config_->b_nrows,
+                                    coo_rows.data(),
+                                    config_->b_nnz,
+                                    raft::resource::get_cuda_stream(config_->handle));
+
+  balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
+    out_dists, *config_, coo_rows.data(), product_func, accum_func, write_func);
+
+  raft::sparse::convert::csr_to_coo(config_->a_indptr,
+                                    config_->a_nrows,
+                                    coo_rows.data(),
+                                    config_->a_nnz,
+                                    raft::resource::get_cuda_stream(config_->handle));
+
+  balanced_coo_pairwise_generalized_spmv_rev<value_idx, value_t>(
+    out_dists, *config_, coo_rows.data(), product_func, accum_func, write_func);
+}
+
+/**
+ * Computes L1 distances for sparse input. This does not have
+ * an equivalent expanded form, so it is only executed in
+ * an unexpanded form.
+ * @tparam value_idx
+ * @tparam value_t
+ */
+template <typename value_idx = int, typename value_t = float>
+class l1_unexpanded_distances_t : public distances_t<value_t> {
+ public:
+  l1_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config) : config_(&config)
+  {
+  }
+
+  void compute(value_t* out_dists)
+  {
+    unexpanded_lp_distances<value_idx, value_t>(
+      out_dists, config_, raft::absdiff_op(), raft::add_op(), raft::atomic_add_op());
+  }
+
+ private:
+  const distances_config_t<value_idx, value_t>* config_;
+};
+
+template <typename value_idx = int, typename value_t = float>
+class l2_unexpanded_distances_t : public distances_t<value_t> {
+ public:
+  l2_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config) : config_(&config)
+  {
+  }
+
+  void compute(value_t* out_dists)
+  {
+    unexpanded_lp_distances<value_idx, value_t>(
+      out_dists, config_, raft::sqdiff_op(), raft::add_op(), raft::atomic_add_op());
+  }
+
+ protected:
+  const distances_config_t<value_idx, value_t>* config_;
+};
+
+template <typename value_idx = int, typename value_t = float>
+class l2_sqrt_unexpanded_distances_t : public l2_unexpanded_distances_t<value_idx, value_t> {
+ public:
+  l2_sqrt_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : l2_unexpanded_distances_t<value_idx, value_t>(config)
+  {
+  }
+
+  void compute(value_t* out_dists)
+  {
+    l2_unexpanded_distances_t<value_idx, value_t>::compute(out_dists);
+
+    uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows;
+    // Sqrt Post-processing
+    raft::linalg::unaryOp<value_t>(
+      out_dists,
+      out_dists,
+      n,
+      [] __device__(value_t input) {
+        int neg = input < 0 ? -1 : 1;
+        return raft::sqrt(abs(input) * neg);
+      },
+      raft::resource::get_cuda_stream(this->config_->handle));
+  }
+};
+
+template <typename value_idx = int, typename value_t = float>
+class linf_unexpanded_distances_t : public distances_t<value_t> {
+ public:
+  explicit linf_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config)
+  {
+  }
+
+  void compute(value_t* out_dists)
+  {
+    unexpanded_lp_distances<value_idx, value_t>(
+      out_dists, config_, raft::absdiff_op(), raft::max_op(), raft::atomic_max_op());
+  }
+
+ private:
+  const distances_config_t<value_idx, value_t>* config_;
+};
+
+template <typename value_idx = int, typename value_t = float>
+class canberra_unexpanded_distances_t : public distances_t<value_t> {
+ public:
+  explicit canberra_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config)
+  {
+  }
+
+  void compute(value_t* out_dists)
+  {
+    unexpanded_lp_distances<value_idx, value_t>(
+      out_dists,
+      config_,
+      [] __device__(value_t a, value_t b) {
+        value_t d = fabs(a) + fabs(b);
+
+        // deal with potential for 0 in denominator by
+        // forcing 1/0 instead
+        return ((d != 0) * fabs(a - b)) / (d + (d == 0));
+      },
+      raft::add_op(),
+      raft::atomic_add_op());
+  }
+
+ private:
+  const distances_config_t<value_idx, value_t>* config_;
+};
+
+template <typename value_idx = int, typename value_t = float>
+class lp_unexpanded_distances_t : public distances_t<value_t> {
+ public:
+  explicit lp_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config,
+                                     value_t p_)
+    : config_(&config), p(p_)
+  {
+  }
+
+  void compute(value_t* out_dists)
+  {
+    unexpanded_lp_distances<value_idx, value_t>(
+      out_dists,
+      config_,
+      raft::compose_op(raft::pow_const_op<value_t>(p), raft::sub_op()),
+      raft::add_op(),
+      raft::atomic_add_op());
+
+    uint64_t n         = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows;
+    value_t one_over_p = value_t{1} / p;
+    raft::linalg::unaryOp<value_t>(out_dists,
+                                   out_dists,
+                                   n,
+                                   raft::pow_const_op<value_t>(one_over_p),
+                                   raft::resource::get_cuda_stream(config_->handle));
+  }
+
+ private:
+  const distances_config_t<value_idx, value_t>* config_;
+  value_t p;
+};
+
+template <typename value_idx = int, typename value_t = float>
+class hamming_unexpanded_distances_t : public distances_t<value_t> {
+ public:
+  explicit hamming_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config)
+  {
+  }
+
+  void compute(value_t* out_dists)
+  {
+    unexpanded_lp_distances<value_idx, value_t>(
+      out_dists, config_, raft::notequal_op(), raft::add_op(), raft::atomic_add_op());
+
+    uint64_t n     = (uint64_t)config_->a_nrows * (uint64_t)config_->b_nrows;
+    value_t n_cols = 1.0 / config_->a_ncols;
+    raft::linalg::unaryOp<value_t>(out_dists,
+                                   out_dists,
+                                   n,
+                                   raft::mul_const_op<value_t>(n_cols),
+                                   raft::resource::get_cuda_stream(config_->handle));
+  }
+
+ private:
+  const distances_config_t<value_idx, value_t>* config_;
+};
+
+template <typename value_idx = int, typename value_t = float>
+class jensen_shannon_unexpanded_distances_t : public distances_t<value_t> {
+ public:
+  explicit jensen_shannon_unexpanded_distances_t(
+    const distances_config_t<value_idx, value_t>& config)
+    : config_(&config)
+  {
+  }
+
+  void compute(value_t* out_dists)
+  {
+    unexpanded_lp_distances<value_idx, value_t>(
+      out_dists,
+      config_,
+      [] __device__(value_t a, value_t b) {
+        value_t m   = 0.5f * (a + b);
+        bool a_zero = a == 0;
+        bool b_zero = b == 0;
+
+        value_t x = (!a_zero * m) / (a_zero + a);
+        value_t y = (!b_zero * m) / (b_zero + b);
+
+        bool x_zero = x == 0;
+        bool y_zero = y == 0;
+
+        return (-a * (!x_zero * log(x + x_zero))) + (-b * (!y_zero * log(y + y_zero)));
+      },
+      raft::add_op(),
+      raft::atomic_add_op());
+
+    uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows;
+    raft::linalg::unaryOp<value_t>(
+      out_dists,
+      out_dists,
+      n,
+      [=] __device__(value_t input) { return raft::sqrt(0.5 * input); },
+      raft::resource::get_cuda_stream(config_->handle));
+  }
+
+ private:
+  const distances_config_t<value_idx, value_t>* config_;
+};
+
+template <typename value_idx = int, typename value_t = float>
+class kl_divergence_unexpanded_distances_t : public distances_t<value_t> {
+ public:
+  explicit kl_divergence_unexpanded_distances_t(
+    const distances_config_t<value_idx, value_t>& config)
+    : config_(&config)
+  {
+  }
+
+  void compute(value_t* out_dists)
+  {
+    rmm::device_uvector<value_idx> coo_rows(std::max(config_->b_nnz, config_->a_nnz),
+                                            raft::resource::get_cuda_stream(config_->handle));
+
+    raft::sparse::convert::csr_to_coo(config_->b_indptr,
+                                      config_->b_nrows,
+                                      coo_rows.data(),
+                                      config_->b_nnz,
+                                      raft::resource::get_cuda_stream(config_->handle));
+
+    balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
+      out_dists,
+      *config_,
+      coo_rows.data(),
+      [] __device__(value_t a, value_t b) { return a * log(a / b); },
+      raft::add_op(),
+      raft::atomic_add_op());
+
+    uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows;
+    raft::linalg::unaryOp<value_t>(out_dists,
+                                   out_dists,
+                                   n,
+                                   raft::mul_const_op<value_t>(0.5),
+                                   raft::resource::get_cuda_stream(config_->handle));
+  }
+
+ private:
+  const distances_config_t<value_idx, value_t>* config_;
+};
+
+}  // END namespace sparse
+}  // END namespace detail
+}  // END namespace distance
+}  // END namespace cuvs
diff --git a/cpp/src/distance/detail/sparse/utils.cuh b/cpp/src/distance/detail/sparse/utils.cuh
new file mode 100644
index 000000000..dc7ae6df6
--- /dev/null
+++ b/cpp/src/distance/detail/sparse/utils.cuh
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/math.hpp>
+
+#include <cub/cub.cuh>
+#include <cuda_fp16.h>
+#include <cuda_pipeline.h>
+
+namespace cuvs {
+namespace distance {
+namespace detail {
+namespace sparse {
+
+/**
+ * Computes the maximum number of columns that can be stored
+ * in shared memory in dense form with the given block size
+ * and precision.
+ * @return the maximum number of columns that can be stored in smem
+ */
+template <typename value_idx, typename value_t, int tpb = 1024>
+inline int max_cols_per_block()
+{
+  // max cols = (total smem available - cub reduction smem)
+  return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) /
+         sizeof(value_t);
+}
+
+template <typename value_idx, typename value_t, typename dot_t = value_t>
+RAFT_KERNEL faster_dot_on_csr_kernel(dot_t* __restrict__ dot,
+                                     const value_idx* __restrict__ indptr,
+                                     const value_idx* __restrict__ cols,
+                                     const value_t* __restrict__ A,
+                                     const value_t* __restrict__ B,
+                                     const value_idx nnz,
+                                     const value_idx n_rows,
+                                     const value_idx dim)
+{
+  auto vec_id  = threadIdx.x;
+  auto lane_id = threadIdx.x & 0x1f;
+
+  extern __shared__ char smem[];
+  value_t* s_A      = (value_t*)smem;
+  value_idx cur_row = -1;
+
+  for (int row = blockIdx.x; row < n_rows; row += gridDim.x) {
+    for (int dot_id = blockIdx.y + indptr[row]; dot_id < indptr[row + 1]; dot_id += gridDim.y) {
+      if (dot_id >= nnz) { return; }
+      const value_idx col               = cols[dot_id] * dim;
+      const value_t* __restrict__ B_col = B + col;
+
+      if (threadIdx.x == 0) { dot[dot_id] = 0.0; }
+      __syncthreads();
+
+      if (cur_row != row) {
+        for (value_idx k = vec_id; k < dim; k += blockDim.x) {
+          s_A[k] = A[row * dim + k];
+        }
+        cur_row = row;
+      }
+
+      dot_t l_dot_ = 0.0;
+      for (value_idx k = vec_id; k < dim; k += blockDim.x) {
+        asm("prefetch.global.L2 [%0];" ::"l"(B_col + k + blockDim.x));
+        if constexpr ((std::is_same_v<dot_t, float> && std::is_same_v<value_t, half>)) {
+          l_dot_ += __half2float(s_A[k]) * __half2float(__ldcg(B_col + k));
+        } else {
+          l_dot_ += s_A[k] * __ldcg(B_col + k);
+        }
+      }
+
+      typedef cub::WarpReduce<dot_t> WarpReduce;
+      __shared__ typename WarpReduce::TempStorage temp_storage;
+      dot_t warp_sum = WarpReduce(temp_storage).Sum(l_dot_);
+
+      if (lane_id == 0) { atomicAdd_block(dot + dot_id, warp_sum); }
+    }
+  }
+}
+
+template <typename value_idx, typename value_t, typename dot_t = value_t>
+void faster_dot_on_csr(raft::resources const& handle,
+                       dot_t* dot,
+                       const value_idx nnz,
+                       const value_idx* indptr,
+                       const value_idx* cols,
+                       const value_t* A,
+                       const value_t* B,
+                       const value_idx n_rows,
+                       const value_idx dim)
+{
+  if (nnz == 0 || n_rows == 0) return;
+
+  auto stream = raft::resource::get_cuda_stream(handle);
+
+  constexpr value_idx MAX_ROW_PER_ITER = 500;
+  int dev_id, sm_count, blocks_per_sm;
+
+  const int smem_size = dim * sizeof(value_t);
+  cudaGetDevice(&dev_id);
+  cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
+
+  if (dim < 128) {
+    constexpr int tpb = 64;
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+      &blocks_per_sm, faster_dot_on_csr_kernel<value_idx, value_t, dot_t>, tpb, smem_size);
+    auto block_x = std::min(n_rows, MAX_ROW_PER_ITER);
+    auto block_y =
+      (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x;
+    dim3 blocks(block_x, block_y, 1);
+
+    faster_dot_on_csr_kernel<value_idx, value_t, dot_t>
+      <<<blocks, tpb, smem_size, stream>>>(dot, indptr, cols, A, B, nnz, n_rows, dim);
+
+  } else if (dim < 256) {
+    constexpr int tpb = 128;
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+      &blocks_per_sm, faster_dot_on_csr_kernel<value_idx, value_t, dot_t>, tpb, smem_size);
+    auto block_x = std::min(n_rows, MAX_ROW_PER_ITER);
+    auto block_y =
+      (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x;
+    dim3 blocks(block_x, block_y, 1);
+
+    faster_dot_on_csr_kernel<value_idx, value_t, dot_t>
+      <<<blocks, tpb, smem_size, stream>>>(dot, indptr, cols, A, B, nnz, n_rows, dim);
+  } else if (dim < 512) {
+    constexpr int tpb = 256;
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+      &blocks_per_sm, faster_dot_on_csr_kernel<value_idx, value_t, dot_t>, tpb, smem_size);
+    auto block_x = std::min(n_rows, MAX_ROW_PER_ITER);
+    auto block_y =
+      (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x;
+    dim3 blocks(block_x, block_y, 1);
+
+    faster_dot_on_csr_kernel<value_idx, value_t, dot_t>
+      <<<blocks, tpb, smem_size, stream>>>(dot, indptr, cols, A, B, nnz, n_rows, dim);
+  } else {
+    constexpr int tpb = 512;
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+      &blocks_per_sm, faster_dot_on_csr_kernel<value_idx, value_t, dot_t>, tpb, smem_size);
+    auto block_x = std::min(n_rows, MAX_ROW_PER_ITER);
+    auto block_y =
+      (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x;
+    dim3 blocks(block_x, block_y, 1);
+
+    faster_dot_on_csr_kernel<value_idx, value_t, dot_t>
+      <<<blocks, tpb, smem_size, stream>>>(dot, indptr, cols, A, B, nnz, n_rows, dim);
+  }
+
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+}  // namespace sparse
+}  // namespace detail
+}  // namespace distance
+}  // namespace cuvs
diff --git a/cpp/src/distance/sparse_distance.cu b/cpp/src/distance/sparse_distance.cu
new file mode 100644
index 000000000..338c4e908
--- /dev/null
+++ b/cpp/src/distance/sparse_distance.cu
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/resource/cuda_stream.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include "sparse_distance.cuh"
+
+namespace cuvs {
+namespace distance {
+
+template <typename ElementType, typename IndexType>
+void pairwise_distance(
+  raft::resources const& handle,
+  raft::device_csr_matrix_view<const ElementType, IndexType, IndexType, IndexType> x,
+  raft::device_csr_matrix_view<const ElementType, IndexType, IndexType, IndexType> y,
+  raft::device_matrix_view<ElementType, IndexType, raft::row_major> dist,
+  cuvs::distance::DistanceType metric,
+  float metric_arg = 2.0f)
+{
+  auto x_structure = x.structure_view();
+  auto y_structure = y.structure_view();
+
+  RAFT_EXPECTS(x_structure.get_n_cols() == y_structure.get_n_cols(),
+               "Number of columns must be equal");
+
+  RAFT_EXPECTS(dist.extent(0) == x_structure.get_n_rows(),
+               "Number of rows in output must be equal to "
+               "number of rows in X");
+  RAFT_EXPECTS(dist.extent(1) == y_structure.get_n_rows(),
+               "Number of columns in output must be equal to "
+               "number of rows in Y");
+
+  detail::sparse::distances_config_t<IndexType, ElementType> input_config(handle);
+  input_config.a_nrows   = x_structure.get_n_rows();
+  input_config.a_ncols   = x_structure.get_n_cols();
+  input_config.a_nnz     = x_structure.get_nnz();
+  input_config.a_indptr  = const_cast<IndexType*>(x_structure.get_indptr().data());
+  input_config.a_indices = const_cast<IndexType*>(x_structure.get_indices().data());
+  input_config.a_data    = const_cast<ElementType*>(x.get_elements().data());
+
+  input_config.b_nrows   = y_structure.get_n_rows();
+  input_config.b_ncols   = y_structure.get_n_cols();
+  input_config.b_nnz     = y_structure.get_nnz();
+  input_config.b_indptr  = const_cast<IndexType*>(y_structure.get_indptr().data());
+  input_config.b_indices = const_cast<IndexType*>(y_structure.get_indices().data());
+  input_config.b_data    = const_cast<ElementType*>(y.get_elements().data());
+
+  pairwiseDistance(dist.data_handle(), input_config, metric, metric_arg);
+}
+
+void pairwise_distance(raft::resources const& handle,
+                       raft::device_csr_matrix_view<const float, int, int, int> x,
+                       raft::device_csr_matrix_view<const float, int, int, int> y,
+                       raft::device_matrix_view<float, int, raft::row_major> dist,
+                       cuvs::distance::DistanceType metric,
+                       float metric_arg)
+{
+  pairwise_distance<float, int>(handle, x, y, dist, metric, metric_arg);
+}
+
+void pairwise_distance(raft::resources const& handle,
+                       raft::device_csr_matrix_view<const double, int, int, int> x,
+                       raft::device_csr_matrix_view<const double, int, int, int> y,
+                       raft::device_matrix_view<double, int, raft::row_major> dist,
+                       cuvs::distance::DistanceType metric,
+                       float metric_arg)
+{
+  pairwise_distance<double, int>(handle, x, y, dist, metric, metric_arg);
+}
+}  // namespace distance
+}  // namespace cuvs
diff --git a/cpp/src/distance/sparse_distance.cuh b/cpp/src/distance/sparse_distance.cuh
new file mode 100644
index 000000000..0d6dc0e6f
--- /dev/null
+++ b/cpp/src/distance/sparse_distance.cuh
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/sparse/bin_distance.cuh"
+#include "detail/sparse/common.hpp"
+#include "detail/sparse/ip_distance.cuh"
+#include "detail/sparse/l2_distance.cuh"
+#include "detail/sparse/lp_distance.cuh"
+
+#include <cuvs/distance/distance.hpp>
+
+#include <raft/core/device_csr_matrix.hpp>
+
+#include <unordered_set>
+
+namespace cuvs {
+namespace distance {
+/**
+ * Compute pairwise distances between A and B, using the provided
+ * input configuration and distance function.
+ *
+ * @tparam value_idx index type
+ * @tparam value_t value type
+ * @param[out] out dense output array (size A.nrows * B.nrows)
+ * @param[in] input_config input argument configuration
+ * @param[in] metric distance metric to use
+ * @param[in] metric_arg metric argument (used for Minkowski distance)
+ */
+template <typename value_idx = int, typename value_t = float>
+void pairwiseDistance(value_t* out,
+                      detail::sparse::distances_config_t<value_idx, value_t> input_config,
+                      cuvs::distance::DistanceType metric,
+                      float metric_arg)
+{
+  switch (metric) {
+    case cuvs::distance::DistanceType::L2Expanded:
+      detail::sparse::l2_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      break;
+    case cuvs::distance::DistanceType::L2SqrtExpanded:
+      detail::sparse::l2_sqrt_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      break;
+    case cuvs::distance::DistanceType::InnerProduct:
+      detail::sparse::ip_distances_t<value_idx, value_t>(input_config).compute(out);
+      break;
+    case cuvs::distance::DistanceType::L2Unexpanded:
+      detail::sparse::l2_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      break;
+    case cuvs::distance::DistanceType::L2SqrtUnexpanded:
+      detail::sparse::l2_sqrt_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      break;
+    case cuvs::distance::DistanceType::L1:
+      detail::sparse::l1_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      break;
+    case cuvs::distance::DistanceType::LpUnexpanded:
+      detail::sparse::lp_unexpanded_distances_t<value_idx, value_t>(input_config, metric_arg)
+        .compute(out);
+      break;
+    case cuvs::distance::DistanceType::Linf:
+      detail::sparse::linf_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      break;
+    case cuvs::distance::DistanceType::Canberra:
+      detail::sparse::canberra_unexpanded_distances_t<value_idx, value_t>(input_config)
+        .compute(out);
+      break;
+    case cuvs::distance::DistanceType::JaccardExpanded:
+      detail::sparse::jaccard_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      break;
+    case cuvs::distance::DistanceType::CosineExpanded:
+      detail::sparse::cosine_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      break;
+    case cuvs::distance::DistanceType::HellingerExpanded:
+      detail::sparse::hellinger_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      break;
+    case cuvs::distance::DistanceType::DiceExpanded:
+      detail::sparse::dice_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      break;
+    case cuvs::distance::DistanceType::CorrelationExpanded:
+      detail::sparse::correlation_expanded_distances_t<value_idx, value_t>(input_config)
+        .compute(out);
+      break;
+    case cuvs::distance::DistanceType::RusselRaoExpanded:
+      detail::sparse::russelrao_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      break;
+    case cuvs::distance::DistanceType::HammingUnexpanded:
+      detail::sparse::hamming_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      break;
+    case cuvs::distance::DistanceType::JensenShannon:
+      detail::sparse::jensen_shannon_unexpanded_distances_t<value_idx, value_t>(input_config)
+        .compute(out);
+      break;
+    case cuvs::distance::DistanceType::KLDivergence:
+      detail::sparse::kl_divergence_unexpanded_distances_t<value_idx, value_t>(input_config)
+        .compute(out);
+      break;
+
+    default: THROW("Unsupported distance: %d", metric);
+  }
+}
+};  // namespace distance
+};  // namespace cuvs
diff --git a/cpp/src/neighbors/detail/sparse_knn.cuh b/cpp/src/neighbors/detail/sparse_knn.cuh
new file mode 100644
index 000000000..9c8e971b9
--- /dev/null
+++ b/cpp/src/neighbors/detail/sparse_knn.cuh
@@ -0,0 +1,437 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include "../../distance/sparse_distance.cuh"
+#include "knn_merge_parts.cuh"
+#include <cuvs/distance/distance.hpp>
+
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/linalg/unary_op.cuh>
+
+#include <cuvs/selection/select_k.hpp>
+
+#include <raft/sparse/coo.hpp>
+#include <raft/sparse/csr.hpp>
+#include <raft/sparse/detail/utils.h>
+#include <raft/sparse/op/slice.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <algorithm>
+
+namespace cuvs::neighbors::detail {
+
+template <typename value_idx, typename value_t>
+struct csr_batcher_t {
+  csr_batcher_t(value_idx batch_size,
+                value_idx n_rows,
+                const value_idx* csr_indptr,
+                const value_idx* csr_indices,
+                const value_t* csr_data)
+    : batch_start_(0),
+      batch_stop_(0),
+      batch_rows_(0),
+      total_rows_(n_rows),
+      batch_size_(batch_size),
+      csr_indptr_(csr_indptr),
+      csr_indices_(csr_indices),
+      csr_data_(csr_data),
+      batch_csr_start_offset_(0),
+      batch_csr_stop_offset_(0)
+  {
+  }
+
+  void set_batch(int batch_num)
+  {
+    batch_start_ = batch_num * batch_size_;
+    batch_stop_  = batch_start_ + batch_size_ - 1;  // zero-based indexing
+
+    if (batch_stop_ >= total_rows_) batch_stop_ = total_rows_ - 1;  // zero-based indexing
+
+    batch_rows_ = (batch_stop_ - batch_start_) + 1;
+  }
+
+  value_idx get_batch_csr_indptr_nnz(value_idx* batch_indptr, cudaStream_t stream)
+  {
+    raft::sparse::op::csr_row_slice_indptr(batch_start_,
+                                           batch_stop_,
+                                           csr_indptr_,
+                                           batch_indptr,
+                                           &batch_csr_start_offset_,
+                                           &batch_csr_stop_offset_,
+                                           stream);
+
+    return batch_csr_stop_offset_ - batch_csr_start_offset_;
+  }
+
+  void get_batch_csr_indices_data(value_idx* csr_indices, value_t* csr_data, cudaStream_t stream)
+  {
+    raft::sparse::op::csr_row_slice_populate(batch_csr_start_offset_,
+                                             batch_csr_stop_offset_,
+                                             csr_indices_,
+                                             csr_data_,
+                                             csr_indices,
+                                             csr_data,
+                                             stream);
+  }
+
+  value_idx batch_rows() const { return batch_rows_; }
+
+  value_idx batch_start() const { return batch_start_; }
+
+  value_idx batch_stop() const { return batch_stop_; }
+
+ private:
+  value_idx batch_size_;
+  value_idx batch_start_;
+  value_idx batch_stop_;
+  value_idx batch_rows_;
+
+  value_idx total_rows_;
+
+  const value_idx* csr_indptr_;
+  const value_idx* csr_indices_;
+  const value_t* csr_data_;
+
+  value_idx batch_csr_start_offset_;
+  value_idx batch_csr_stop_offset_;
+};
+
+template <typename value_idx, typename value_t>
+class sparse_knn_t {
+ public:
+  sparse_knn_t(const value_idx* idxIndptr_,
+               const value_idx* idxIndices_,
+               const value_t* idxData_,
+               size_t idxNNZ_,
+               int n_idx_rows_,
+               int n_idx_cols_,
+               const value_idx* queryIndptr_,
+               const value_idx* queryIndices_,
+               const value_t* queryData_,
+               size_t queryNNZ_,
+               int n_query_rows_,
+               int n_query_cols_,
+               value_idx* output_indices_,
+               value_t* output_dists_,
+               int k_,
+               raft::resources const& handle_,
+               size_t batch_size_index_             = 2 << 14,  // approx 1M
+               size_t batch_size_query_             = 2 << 14,
+               cuvs::distance::DistanceType metric_ = cuvs::distance::DistanceType::L2Expanded,
+               float metricArg_                     = 0)
+    : idxIndptr(idxIndptr_),
+      idxIndices(idxIndices_),
+      idxData(idxData_),
+      idxNNZ(idxNNZ_),
+      n_idx_rows(n_idx_rows_),
+      n_idx_cols(n_idx_cols_),
+      queryIndptr(queryIndptr_),
+      queryIndices(queryIndices_),
+      queryData(queryData_),
+      queryNNZ(queryNNZ_),
+      n_query_rows(n_query_rows_),
+      n_query_cols(n_query_cols_),
+      output_indices(output_indices_),
+      output_dists(output_dists_),
+      k(k_),
+      handle(handle_),
+      batch_size_index(batch_size_index_),
+      batch_size_query(batch_size_query_),
+      metric(metric_),
+      metricArg(metricArg_)
+  {
+  }
+
+  void run()
+  {
+    using namespace raft::sparse;
+
+    int n_batches_query = raft::ceildiv((size_t)n_query_rows, batch_size_query);
+    csr_batcher_t<value_idx, value_t> query_batcher(
+      batch_size_query, n_query_rows, queryIndptr, queryIndices, queryData);
+
+    size_t rows_processed = 0;
+
+    for (int i = 0; i < n_batches_query; i++) {
+      /**
+       * Compute index batch info
+       */
+      query_batcher.set_batch(i);
+
+      /**
+       * Slice CSR to rows in batch
+       */
+
+      rmm::device_uvector<value_idx> query_batch_indptr(query_batcher.batch_rows() + 1,
+                                                        raft::resource::get_cuda_stream(handle));
+
+      value_idx n_query_batch_nnz = query_batcher.get_batch_csr_indptr_nnz(
+        query_batch_indptr.data(), raft::resource::get_cuda_stream(handle));
+
+      rmm::device_uvector<value_idx> query_batch_indices(n_query_batch_nnz,
+                                                         raft::resource::get_cuda_stream(handle));
+      rmm::device_uvector<value_t> query_batch_data(n_query_batch_nnz,
+                                                    raft::resource::get_cuda_stream(handle));
+
+      query_batcher.get_batch_csr_indices_data(query_batch_indices.data(),
+                                               query_batch_data.data(),
+                                               raft::resource::get_cuda_stream(handle));
+
+      // A 3-partition temporary merge space to scale the batching. 2 parts for subsequent
+      // batches and 1 space for the results of the merge, which get copied back to the top
+      rmm::device_uvector<value_idx> merge_buffer_indices(0,
+                                                          raft::resource::get_cuda_stream(handle));
+      rmm::device_uvector<value_t> merge_buffer_dists(0, raft::resource::get_cuda_stream(handle));
+
+      value_t* dists_merge_buffer_ptr;
+      value_idx* indices_merge_buffer_ptr;
+
+      int n_batches_idx = raft::ceildiv((size_t)n_idx_rows, batch_size_index);
+      csr_batcher_t<value_idx, value_t> idx_batcher(
+        batch_size_index, n_idx_rows, idxIndptr, idxIndices, idxData);
+
+      for (int j = 0; j < n_batches_idx; j++) {
+        idx_batcher.set_batch(j);
+
+        merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3,
+                                    raft::resource::get_cuda_stream(handle));
+        merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3,
+                                  raft::resource::get_cuda_stream(handle));
+
+        /**
+         * Slice CSR to rows in batch
+         */
+        rmm::device_uvector<value_idx> idx_batch_indptr(idx_batcher.batch_rows() + 1,
+                                                        raft::resource::get_cuda_stream(handle));
+        rmm::device_uvector<value_idx> idx_batch_indices(0,
+                                                         raft::resource::get_cuda_stream(handle));
+        rmm::device_uvector<value_t> idx_batch_data(0, raft::resource::get_cuda_stream(handle));
+
+        value_idx idx_batch_nnz = idx_batcher.get_batch_csr_indptr_nnz(
+          idx_batch_indptr.data(), raft::resource::get_cuda_stream(handle));
+
+        idx_batch_indices.resize(idx_batch_nnz, raft::resource::get_cuda_stream(handle));
+        idx_batch_data.resize(idx_batch_nnz, raft::resource::get_cuda_stream(handle));
+
+        idx_batcher.get_batch_csr_indices_data(
+          idx_batch_indices.data(), idx_batch_data.data(), raft::resource::get_cuda_stream(handle));
+
+        /**
+         * Compute distances
+         */
+        uint64_t dense_size =
+          (uint64_t)idx_batcher.batch_rows() * (uint64_t)query_batcher.batch_rows();
+        rmm::device_uvector<value_t> batch_dists(dense_size,
+                                                 raft::resource::get_cuda_stream(handle));
+
+        RAFT_CUDA_TRY(cudaMemset(batch_dists.data(), 0, batch_dists.size() * sizeof(value_t)));
+
+        compute_distances(idx_batcher,
+                          query_batcher,
+                          idx_batch_nnz,
+                          n_query_batch_nnz,
+                          idx_batch_indptr.data(),
+                          idx_batch_indices.data(),
+                          idx_batch_data.data(),
+                          query_batch_indptr.data(),
+                          query_batch_indices.data(),
+                          query_batch_data.data(),
+                          batch_dists.data());
+
+        // Build batch indices array
+        rmm::device_uvector<value_idx> batch_indices(batch_dists.size(),
+                                                     raft::resource::get_cuda_stream(handle));
+
+        // populate batch indices array
+        value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows();
+
+        iota_fill(
+          batch_indices.data(), batch_rows, batch_cols, raft::resource::get_cuda_stream(handle));
+
+        /**
+         * Perform k-selection on batch & merge with other k-selections
+         */
+        size_t merge_buffer_offset = batch_rows * k;
+        dists_merge_buffer_ptr     = merge_buffer_dists.data() + merge_buffer_offset;
+        indices_merge_buffer_ptr   = merge_buffer_indices.data() + merge_buffer_offset;
+
+        perform_k_selection(idx_batcher,
+                            query_batcher,
+                            batch_dists.data(),
+                            batch_indices.data(),
+                            dists_merge_buffer_ptr,
+                            indices_merge_buffer_ptr);
+
+        value_t* dists_merge_buffer_tmp_ptr     = dists_merge_buffer_ptr;
+        value_idx* indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr;
+
+        // Merge results of difference batches if necessary
+        if (idx_batcher.batch_start() > 0) {
+          size_t merge_buffer_tmp_out  = batch_rows * k * 2;
+          dists_merge_buffer_tmp_ptr   = merge_buffer_dists.data() + merge_buffer_tmp_out;
+          indices_merge_buffer_tmp_ptr = merge_buffer_indices.data() + merge_buffer_tmp_out;
+
+          merge_batches(idx_batcher,
+                        query_batcher,
+                        merge_buffer_dists.data(),
+                        merge_buffer_indices.data(),
+                        dists_merge_buffer_tmp_ptr,
+                        indices_merge_buffer_tmp_ptr);
+        }
+
+        // copy merged output back into merge buffer partition for next iteration
+        raft::copy_async<value_idx>(merge_buffer_indices.data(),
+                                    indices_merge_buffer_tmp_ptr,
+                                    batch_rows * k,
+                                    raft::resource::get_cuda_stream(handle));
+        raft::copy_async<value_t>(merge_buffer_dists.data(),
+                                  dists_merge_buffer_tmp_ptr,
+                                  batch_rows * k,
+                                  raft::resource::get_cuda_stream(handle));
+      }
+
+      // Copy final merged batch to output array
+      raft::copy_async<value_idx>(output_indices + (rows_processed * k),
+                                  merge_buffer_indices.data(),
+                                  query_batcher.batch_rows() * k,
+                                  raft::resource::get_cuda_stream(handle));
+      raft::copy_async<value_t>(output_dists + (rows_processed * k),
+                                merge_buffer_dists.data(),
+                                query_batcher.batch_rows() * k,
+                                raft::resource::get_cuda_stream(handle));
+
+      rows_processed += query_batcher.batch_rows();
+    }
+  }
+
+ private:
+  void merge_batches(csr_batcher_t<value_idx, value_t>& idx_batcher,
+                     csr_batcher_t<value_idx, value_t>& query_batcher,
+                     value_t* merge_buffer_dists,
+                     value_idx* merge_buffer_indices,
+                     value_t* out_dists,
+                     value_idx* out_indices)
+  {
+    // build translation buffer to shift resulting indices by the batch
+    std::vector<value_idx> id_ranges;
+    id_ranges.push_back(0);
+    id_ranges.push_back(idx_batcher.batch_start());
+
+    rmm::device_uvector<value_idx> trans(id_ranges.size(), raft::resource::get_cuda_stream(handle));
+    raft::update_device(
+      trans.data(), id_ranges.data(), id_ranges.size(), raft::resource::get_cuda_stream(handle));
+
+    // combine merge buffers only if there's more than 1 partition to combine
+    cuvs::neighbors::detail::knn_merge_parts(merge_buffer_dists,
+                                             merge_buffer_indices,
+                                             out_dists,
+                                             out_indices,
+                                             query_batcher.batch_rows(),
+                                             2,
+                                             k,
+                                             raft::resource::get_cuda_stream(handle),
+                                             trans.data());
+  }
+
+  void perform_k_selection(csr_batcher_t<value_idx, value_t> idx_batcher,
+                           csr_batcher_t<value_idx, value_t> query_batcher,
+                           value_t* batch_dists,
+                           value_idx* batch_indices,
+                           value_t* out_dists,
+                           value_idx* out_indices)
+  {
+    // populate batch indices array
+    value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows();
+
+    // build translation buffer to shift resulting indices by the batch
+    std::vector<value_idx> id_ranges;
+    id_ranges.push_back(0);
+    id_ranges.push_back(idx_batcher.batch_start());
+
+    // in the case where the number of idx rows in the batch is < k, we
+    // want to adjust k.
+    value_idx n_neighbors = std::min(static_cast<value_idx>(k), batch_cols);
+
+    bool ascending = cuvs::distance::is_min_close(metric);
+
+    // kernel to slice first (min) k cols and copy into batched merge buffer
+    cuvs::selection::select_k(
+      handle,
+      raft::make_device_matrix_view<const value_t, int64_t>(batch_dists, batch_rows, batch_cols),
+      raft::make_device_matrix_view<const value_idx, int64_t>(
+        batch_indices, batch_rows, batch_cols),
+      raft::make_device_matrix_view<value_t, int64_t>(out_dists, batch_rows, n_neighbors),
+      raft::make_device_matrix_view<value_idx, int64_t>(out_indices, batch_rows, n_neighbors),
+      ascending,
+      true);
+  }
+
+  void compute_distances(csr_batcher_t<value_idx, value_t>& idx_batcher,
+                         csr_batcher_t<value_idx, value_t>& query_batcher,
+                         size_t idx_batch_nnz,
+                         size_t query_batch_nnz,
+                         value_idx* idx_batch_indptr,
+                         value_idx* idx_batch_indices,
+                         value_t* idx_batch_data,
+                         value_idx* query_batch_indptr,
+                         value_idx* query_batch_indices,
+                         value_t* query_batch_data,
+                         value_t* batch_dists)
+  {
+    /**
+     * Compute distances
+     */
+    cuvs::distance::detail::sparse::distances_config_t<value_idx, value_t> dist_config(handle);
+    dist_config.b_nrows = idx_batcher.batch_rows();
+    dist_config.b_ncols = n_idx_cols;
+    dist_config.b_nnz   = idx_batch_nnz;
+
+    dist_config.b_indptr  = idx_batch_indptr;
+    dist_config.b_indices = idx_batch_indices;
+    dist_config.b_data    = idx_batch_data;
+
+    dist_config.a_nrows = query_batcher.batch_rows();
+    dist_config.a_ncols = n_query_cols;
+    dist_config.a_nnz   = query_batch_nnz;
+
+    dist_config.a_indptr  = query_batch_indptr;
+    dist_config.a_indices = query_batch_indices;
+    dist_config.a_data    = query_batch_data;
+
+    cuvs::distance::pairwiseDistance(batch_dists, dist_config, metric, metricArg);
+  }
+
+  const value_idx *idxIndptr, *idxIndices, *queryIndptr, *queryIndices;
+  value_idx* output_indices;
+  const value_t *idxData, *queryData;
+  value_t* output_dists;
+
+  size_t idxNNZ, queryNNZ, batch_size_index, batch_size_query;
+
+  cuvs::distance::DistanceType metric;
+
+  float metricArg;
+
+  int n_idx_rows, n_idx_cols, n_query_rows, n_query_cols, k;
+
+  raft::resources const& handle;
+};
+
+};  // namespace cuvs::neighbors::detail
diff --git a/cpp/src/neighbors/sparse_brute_force.cu b/cpp/src/neighbors/sparse_brute_force.cu
new file mode 100644
index 000000000..e277961ec
--- /dev/null
+++ b/cpp/src/neighbors/sparse_brute_force.cu
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuvs/neighbors/brute_force.hpp>
+
+#include "detail/sparse_knn.cuh"
+
+namespace cuvs::neighbors::brute_force {
+template <typename T, typename IdxT>
+sparse_index<T, IdxT>::sparse_index(raft::resources const& res,
+                                    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> dataset,
+                                    cuvs::distance::DistanceType metric,
+                                    T metric_arg)
+  : dataset_(dataset), metric_(metric), metric_arg_(metric_arg)
+{
+}
+
+auto build(raft::resources const& handle,
+           raft::device_csr_matrix_view<const float, int, int, int> dataset,
+           cuvs::distance::DistanceType metric,
+           float metric_arg) -> cuvs::neighbors::brute_force::sparse_index<float, int>
+{
+  return sparse_index<float, int>(handle, dataset, metric, metric_arg);
+}
+
+void search(raft::resources const& handle,
+            const sparse_search_params& params,
+            const sparse_index<float, int>& index,
+            raft::device_csr_matrix_view<const float, int, int, int> query,
+            raft::device_matrix_view<int, int64_t, raft::row_major> neighbors,
+            raft::device_matrix_view<float, int64_t, raft::row_major> distances)
+{
+  auto idx_structure   = index.dataset().structure_view();
+  auto query_structure = query.structure_view();
+  int k                = neighbors.extent(1);
+
+  detail::sparse_knn_t<int, float>(idx_structure.get_indptr().data(),
+                                   idx_structure.get_indices().data(),
+                                   index.dataset().get_elements().data(),
+                                   idx_structure.get_nnz(),
+                                   idx_structure.get_n_rows(),
+                                   idx_structure.get_n_cols(),
+                                   query_structure.get_indptr().data(),
+                                   query_structure.get_indices().data(),
+                                   query.get_elements().data(),
+                                   query_structure.get_nnz(),
+                                   query_structure.get_n_rows(),
+                                   query_structure.get_n_cols(),
+                                   neighbors.data_handle(),
+                                   distances.data_handle(),
+                                   k,
+                                   handle,
+                                   params.batch_size_index,
+                                   params.batch_size_query,
+                                   index.metric(),
+                                   index.metric_arg())
+    .run();
+}
+}  // namespace cuvs::neighbors::brute_force
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 7754a5043..286d721d7 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -94,7 +94,7 @@ endfunction()
 if(BUILD_TESTS)
   ConfigureTest(
     NAME NEIGHBORS_TEST PATH neighbors/brute_force.cu neighbors/brute_force_prefiltered.cu
-    neighbors/refine.cu GPUS 1 PERCENT 100
+    neighbors/sparse_brute_force.cu neighbors/refine.cu GPUS 1 PERCENT 100
   )
 
   ConfigureTest(
@@ -206,6 +206,7 @@ if(BUILD_TESTS)
     distance/dist_lp_unexp.cu
     distance/dist_russell_rao.cu
     distance/masked_nn.cu
+    distance/sparse_distance.cu
     sparse/neighbors/cross_component_nn.cu
     GPUS
     1
diff --git a/cpp/test/distance/sparse_distance.cu b/cpp/test/distance/sparse_distance.cu
new file mode 100644
index 000000000..f95487414
--- /dev/null
+++ b/cpp/test/distance/sparse_distance.cu
@@ -0,0 +1,850 @@
+/*
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.cuh"
+
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/sparse/detail/cusparse_wrappers.h>
+#include <raft/util/cudart_utils.hpp>
+
+#include <cuvs/distance/distance.hpp>
+
+#include <cusparse_v2.h>
+#include <gtest/gtest.h>
+
+namespace cuvs {
+namespace distance {
+
+using namespace raft;
+using namespace raft::sparse;
+
+template <typename value_idx, typename value_t>
+struct SparseDistanceInputs {
+  value_idx n_cols;
+
+  std::vector<value_idx> indptr_h;
+  std::vector<value_idx> indices_h;
+  std::vector<value_t> data_h;
+
+  std::vector<value_t> out_dists_ref_h;
+
+  cuvs::distance::DistanceType metric;
+
+  float metric_arg = 0.0;
+};
+
+template <typename value_idx, typename value_t>
+::std::ostream& operator<<(::std::ostream& os, const SparseDistanceInputs<value_idx, value_t>& dims)
+{
+  return os;
+}
+
+template <typename value_idx, typename value_t>
+class SparseDistanceTest
+  : public ::testing::TestWithParam<SparseDistanceInputs<value_idx, value_t>> {
+ public:
+  SparseDistanceTest()
+    : params(::testing::TestWithParam<SparseDistanceInputs<value_idx, value_t>>::GetParam()),
+      indptr(0, resource::get_cuda_stream(handle)),
+      indices(0, resource::get_cuda_stream(handle)),
+      data(0, resource::get_cuda_stream(handle)),
+      out_dists(0, resource::get_cuda_stream(handle)),
+      out_dists_ref(0, resource::get_cuda_stream(handle))
+  {
+  }
+
+  void SetUp() override
+  {
+    make_data();
+
+    int out_size = static_cast<value_idx>(params.indptr_h.size() - 1) *
+                   static_cast<value_idx>(params.indptr_h.size() - 1);
+
+    out_dists.resize(out_size, resource::get_cuda_stream(handle));
+
+    auto out = raft::make_device_matrix_view<value_t, value_idx>(
+      out_dists.data(),
+      static_cast<value_idx>(params.indptr_h.size() - 1),
+      static_cast<value_idx>(params.indptr_h.size() - 1));
+
+    auto x_structure = raft::make_device_compressed_structure_view<value_idx, value_idx, value_idx>(
+      indptr.data(),
+      indices.data(),
+      static_cast<value_idx>(params.indptr_h.size() - 1),
+      params.n_cols,
+      static_cast<value_idx>(params.indices_h.size()));
+    auto x = raft::make_device_csr_matrix_view<const value_t>(data.data(), x_structure);
+
+    cuvs::distance::pairwise_distance(handle, x, x, out, params.metric, params.metric_arg);
+
+    RAFT_CUDA_TRY(cudaStreamSynchronize(resource::get_cuda_stream(handle)));
+  }
+
+  void compare()
+  {
+    ASSERT_TRUE(devArrMatch(out_dists_ref.data(),
+                            out_dists.data(),
+                            params.out_dists_ref_h.size(),
+                            CompareApprox<value_t>(1e-3)));
+  }
+
+ protected:
+  void make_data()
+  {
+    std::vector<value_idx> indptr_h  = params.indptr_h;
+    std::vector<value_idx> indices_h = params.indices_h;
+    std::vector<value_t> data_h      = params.data_h;
+
+    auto stream = resource::get_cuda_stream(handle);
+    indptr.resize(indptr_h.size(), stream);
+    indices.resize(indices_h.size(), stream);
+    data.resize(data_h.size(), stream);
+
+    update_device(indptr.data(), indptr_h.data(), indptr_h.size(), stream);
+    update_device(indices.data(), indices_h.data(), indices_h.size(), stream);
+    update_device(data.data(), data_h.data(), data_h.size(), stream);
+
+    std::vector<value_t> out_dists_ref_h = params.out_dists_ref_h;
+
+    out_dists_ref.resize((indptr_h.size() - 1) * (indptr_h.size() - 1), stream);
+
+    update_device(out_dists_ref.data(),
+                  out_dists_ref_h.data(),
+                  out_dists_ref_h.size(),
+                  resource::get_cuda_stream(handle));
+  }
+
+  raft::resources handle;
+
+  // input data
+  rmm::device_uvector<value_idx> indptr, indices;
+  rmm::device_uvector<value_t> data;
+
+  // output data
+  rmm::device_uvector<value_t> out_dists, out_dists_ref;
+
+  SparseDistanceInputs<value_idx, value_t> params;
+};
+
+const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
+  {5,
+   {0, 0, 1, 2},
+
+   {1, 2},
+   {0.5, 0.5},
+   {0, 1, 1, 1, 0, 1, 1, 1, 0},
+   cuvs::distance::DistanceType::CosineExpanded,
+   0.0},
+  {5,
+   {0, 0, 1, 2},
+
+   {1, 2},
+   {1.0, 1.0},
+   {0, 1, 1, 1, 0, 1, 1, 1, 0},
+   cuvs::distance::DistanceType::JaccardExpanded,
+   0.0},
+  {2,
+   {0, 2, 4, 6, 8},
+   {0, 1, 0, 1, 0, 1, 0, 1},  // indices
+   {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f},
+   {
+     // dense output
+     0.0,
+     4.0,
+     3026.0,
+     226.0,
+     4.0,
+     0.0,
+     2930.0,
+     234.0,
+     3026.0,
+     2930.0,
+     0.0,
+     1832.0,
+     226.0,
+     234.0,
+     1832.0,
+     0.0,
+   },
+   cuvs::distance::DistanceType::L2Expanded,
+   0.0},
+  {2,
+   {0, 2, 4, 6, 8},
+   {0, 1, 0, 1, 0, 1, 0, 1},
+   {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f},
+   {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0},
+   cuvs::distance::DistanceType::InnerProduct,
+   0.0},
+  {2,
+   {0, 2, 4, 6, 8},
+   {0, 1, 0, 1, 0, 1, 0, 1},  // indices
+   {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f},
+   {
+     // dense output
+     0.0,
+     4.0,
+     3026.0,
+     226.0,
+     4.0,
+     0.0,
+     2930.0,
+     234.0,
+     3026.0,
+     2930.0,
+     0.0,
+     1832.0,
+     226.0,
+     234.0,
+     1832.0,
+     0.0,
+   },
+   cuvs::distance::DistanceType::L2Unexpanded,
+   0.0},
+
+  {10,
+   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
+    6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
+    0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
+    0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
+    0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
+    0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0.,         0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219, 0.58146987, 0.44940102,
+    1.,         0.76978799, 0.39419924, 0.,         0.97577154, 0.48904013, 0.48300801, 0.45087445,
+    0.73323749, 0.21050481, 0.54847744, 0.78021386, 0.54823225, 0.97577154, 0.,         0.51413997,
+    0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819,  1.,         0.79593037, 0.48904013,
+    0.51413997, 0.,         0.28605559, 0.35772784, 1.,         0.60889396, 0.43324829, 0.84923694,
+    0.45658883, 0.48300801, 0.31195441, 0.28605559, 0.,         0.58623212, 0.6745457,  0.60287165,
+    0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784, 0.58623212, 0.,
+    0.77917274, 0.48390993, 0.24558392, 0.99166225, 0.58146987, 0.73323749, 0.67534399, 1.,
+    0.6745457,  0.77917274, 0.,         0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481,
+    0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0.,         0.51360432, 0.68185144,
+    1.,         0.54847744, 0.8321819,  0.43324829, 0.67676228, 0.24558392, 0.76064776, 0.51360432,
+    0.,         1.,         0.76978799, 0.78021386, 1.,         0.84923694, 0.73155632, 0.99166225,
+    0.61547536, 0.68185144, 1.,         0.},
+   cuvs::distance::DistanceType::CosineExpanded,
+   0.0},
+
+  {10,
+   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
+    6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+    1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
+    1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.},
+   {0.0,
+    0.42857142857142855,
+    0.7142857142857143,
+    0.75,
+    0.2857142857142857,
+    0.75,
+    0.7142857142857143,
+    0.5,
+    1.0,
+    0.6666666666666666,
+    0.42857142857142855,
+    0.0,
+    0.75,
+    0.625,
+    0.375,
+    0.42857142857142855,
+    0.75,
+    0.375,
+    0.75,
+    0.7142857142857143,
+    0.7142857142857143,
+    0.75,
+    0.0,
+    0.7142857142857143,
+    0.42857142857142855,
+    0.7142857142857143,
+    0.6666666666666666,
+    0.625,
+    0.6666666666666666,
+    1.0,
+    0.75,
+    0.625,
+    0.7142857142857143,
+    0.0,
+    0.5,
+    0.5714285714285714,
+    1.0,
+    0.8,
+    0.5,
+    0.6666666666666666,
+    0.2857142857142857,
+    0.375,
+    0.42857142857142855,
+    0.5,
+    0.0,
+    0.6666666666666666,
+    0.7777777777777778,
+    0.4444444444444444,
+    0.7777777777777778,
+    0.75,
+    0.75,
+    0.42857142857142855,
+    0.7142857142857143,
+    0.5714285714285714,
+    0.6666666666666666,
+    0.0,
+    0.7142857142857143,
+    0.5,
+    0.5,
+    0.8571428571428571,
+    0.7142857142857143,
+    0.75,
+    0.6666666666666666,
+    1.0,
+    0.7777777777777778,
+    0.7142857142857143,
+    0.0,
+    0.42857142857142855,
+    0.8571428571428571,
+    0.8333333333333334,
+    0.5,
+    0.375,
+    0.625,
+    0.8,
+    0.4444444444444444,
+    0.5,
+    0.42857142857142855,
+    0.0,
+    0.7777777777777778,
+    0.75,
+    1.0,
+    0.75,
+    0.6666666666666666,
+    0.5,
+    0.7777777777777778,
+    0.5,
+    0.8571428571428571,
+    0.7777777777777778,
+    0.0,
+    1.0,
+    0.6666666666666666,
+    0.7142857142857143,
+    1.0,
+    0.6666666666666666,
+    0.75,
+    0.8571428571428571,
+    0.8333333333333334,
+    0.75,
+    1.0,
+    0.0},
+   cuvs::distance::DistanceType::JaccardExpanded,
+   0.0},
+
+  {10,
+   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
+    6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
+    0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
+    0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
+    0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
+    0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0.0,
+    3.3954660629919076,
+    5.6469232737388815,
+    6.373112846266441,
+    4.0212880272531715,
+    6.916281504639404,
+    5.741508386786526,
+    5.411470999663036,
+    9.0,
+    4.977014354725805,
+    3.3954660629919076,
+    0.0,
+    7.56256082439209,
+    5.540261147481582,
+    4.832322929216881,
+    4.62003193872216,
+    6.498056792320361,
+    4.309846252268695,
+    6.317531174829905,
+    6.016362684141827,
+    5.6469232737388815,
+    7.56256082439209,
+    0.0,
+    5.974878731322299,
+    4.898357301336036,
+    6.442097410320605,
+    5.227077347287883,
+    7.134101195584642,
+    5.457753923371659,
+    7.0,
+    6.373112846266441,
+    5.540261147481582,
+    5.974878731322299,
+    0.0,
+    5.5507273748583,
+    4.897749658726415,
+    9.0,
+    8.398776718824767,
+    3.908281400328807,
+    4.83431066343688,
+    4.0212880272531715,
+    4.832322929216881,
+    4.898357301336036,
+    5.5507273748583,
+    0.0,
+    6.632989819428174,
+    7.438852294822894,
+    5.6631570310967465,
+    7.579428202635459,
+    6.760811985364303,
+    6.916281504639404,
+    4.62003193872216,
+    6.442097410320605,
+    4.897749658726415,
+    6.632989819428174,
+    0.0,
+    5.249404187382862,
+    6.072559523278559,
+    4.07661278488929,
+    6.19678948003145,
+    5.741508386786526,
+    6.498056792320361,
+    5.227077347287883,
+    9.0,
+    7.438852294822894,
+    5.249404187382862,
+    0.0,
+    3.854811639654704,
+    6.652724827169063,
+    5.298236851430971,
+    5.411470999663036,
+    4.309846252268695,
+    7.134101195584642,
+    8.398776718824767,
+    5.6631570310967465,
+    6.072559523278559,
+    3.854811639654704,
+    0.0,
+    7.529184598969917,
+    6.903282911791188,
+    9.0,
+    6.317531174829905,
+    5.457753923371659,
+    3.908281400328807,
+    7.579428202635459,
+    4.07661278488929,
+    6.652724827169063,
+    7.529184598969917,
+    0.0,
+    7.0,
+    4.977014354725805,
+    6.016362684141827,
+    7.0,
+    4.83431066343688,
+    6.760811985364303,
+    6.19678948003145,
+    5.298236851430971,
+    6.903282911791188,
+    7.0,
+    0.0},
+   cuvs::distance::DistanceType::Canberra,
+   0.0},
+
+  {10,
+   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
+    6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
+    0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
+    0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
+    0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
+    0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0.0,
+    1.31462855332296,
+    1.3690307816129905,
+    1.698603990921237,
+    1.3460470789553531,
+    1.6636670712582544,
+    1.2651744044972217,
+    1.1938329352055201,
+    1.8811409082590185,
+    1.3653115050624267,
+    1.31462855332296,
+    0.0,
+    1.9447722703291133,
+    1.42818777206562,
+    1.4685491458946494,
+    1.3071999866010466,
+    1.4988622861692171,
+    0.9698559287406783,
+    1.4972023224597841,
+    1.5243383567266802,
+    1.3690307816129905,
+    1.9447722703291133,
+    0.0,
+    1.2748400840107568,
+    1.0599569946448246,
+    1.546591282841402,
+    1.147526531928459,
+    1.447002179128145,
+    1.5982242387673176,
+    1.3112533607072414,
+    1.698603990921237,
+    1.42818777206562,
+    1.2748400840107568,
+    0.0,
+    1.038121552545461,
+    1.011788365364402,
+    1.3907391109256988,
+    1.3128200942311496,
+    1.19595706584447,
+    1.3233328139624725,
+    1.3460470789553531,
+    1.4685491458946494,
+    1.0599569946448246,
+    1.038121552545461,
+    0.0,
+    1.3642741698145529,
+    1.3493868683808095,
+    1.394942694628328,
+    1.572881849642552,
+    1.380122665319464,
+    1.6636670712582544,
+    1.3071999866010466,
+    1.546591282841402,
+    1.011788365364402,
+    1.3642741698145529,
+    0.0,
+    1.018961640373018,
+    1.0114394258945634,
+    0.8338711034820684,
+    1.1247823842299223,
+    1.2651744044972217,
+    1.4988622861692171,
+    1.147526531928459,
+    1.3907391109256988,
+    1.3493868683808095,
+    1.018961640373018,
+    0.0,
+    0.7701238110357329,
+    1.245486437864406,
+    0.5551259549534626,
+    1.1938329352055201,
+    0.9698559287406783,
+    1.447002179128145,
+    1.3128200942311496,
+    1.394942694628328,
+    1.0114394258945634,
+    0.7701238110357329,
+    0.0,
+    1.1886800117391216,
+    1.0083692448135637,
+    1.8811409082590185,
+    1.4972023224597841,
+    1.5982242387673176,
+    1.19595706584447,
+    1.572881849642552,
+    0.8338711034820684,
+    1.245486437864406,
+    1.1886800117391216,
+    0.0,
+    1.3661374102525012,
+    1.3653115050624267,
+    1.5243383567266802,
+    1.3112533607072414,
+    1.3233328139624725,
+    1.380122665319464,
+    1.1247823842299223,
+    0.5551259549534626,
+    1.0083692448135637,
+    1.3661374102525012,
+    0.0},
+   cuvs::distance::DistanceType::LpUnexpanded,
+   2.0},
+
+  {10,
+   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
+    6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
+    0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
+    0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
+    0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
+    0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0.0,
+    0.9251771844789913,
+    0.9036452083899731,
+    0.9251771844789913,
+    0.8706483735804971,
+    0.9251771844789913,
+    0.717493881903289,
+    0.6920214832303888,
+    0.9251771844789913,
+    0.9251771844789913,
+    0.9251771844789913,
+    0.0,
+    0.9036452083899731,
+    0.8655339692155823,
+    0.8706483735804971,
+    0.8655339692155823,
+    0.8655339692155823,
+    0.6329837991017668,
+    0.8655339692155823,
+    0.8655339692155823,
+    0.9036452083899731,
+    0.9036452083899731,
+    0.0,
+    0.7988276152181608,
+    0.7028075145996631,
+    0.9036452083899731,
+    0.9036452083899731,
+    0.9036452083899731,
+    0.8429599432532096,
+    0.9036452083899731,
+    0.9251771844789913,
+    0.8655339692155823,
+    0.7988276152181608,
+    0.0,
+    0.48376552205293305,
+    0.8206394616536681,
+    0.8206394616536681,
+    0.8206394616536681,
+    0.8429599432532096,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.8706483735804971,
+    0.7028075145996631,
+    0.48376552205293305,
+    0.0,
+    0.8706483735804971,
+    0.8706483735804971,
+    0.8706483735804971,
+    0.8429599432532096,
+    0.8706483735804971,
+    0.9251771844789913,
+    0.8655339692155823,
+    0.9036452083899731,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.0,
+    0.8853924473642432,
+    0.535821510936138,
+    0.6497196601457607,
+    0.8853924473642432,
+    0.717493881903289,
+    0.8655339692155823,
+    0.9036452083899731,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.8853924473642432,
+    0.0,
+    0.5279604218147174,
+    0.6658348373853169,
+    0.33799874888632914,
+    0.6920214832303888,
+    0.6329837991017668,
+    0.9036452083899731,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.535821510936138,
+    0.5279604218147174,
+    0.0,
+    0.662579808115858,
+    0.5079750812968089,
+    0.9251771844789913,
+    0.8655339692155823,
+    0.8429599432532096,
+    0.8429599432532096,
+    0.8429599432532096,
+    0.6497196601457607,
+    0.6658348373853169,
+    0.662579808115858,
+    0.0,
+    0.8429599432532096,
+    0.9251771844789913,
+    0.8655339692155823,
+    0.9036452083899731,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.8853924473642432,
+    0.33799874888632914,
+    0.5079750812968089,
+    0.8429599432532096,
+    0.0},
+   cuvs::distance::DistanceType::Linf,
+   0.0},
+
+  {15,
+   {0, 5, 8, 9, 15, 20, 26, 31, 34, 38, 45},
+   {0, 1, 5,  6, 9, 1,  4,  14, 7, 3,  4,  7, 9, 11, 14, 0, 3, 7, 8, 12, 0,  2, 5,
+    7, 8, 14, 4, 9, 10, 11, 13, 4, 10, 14, 5, 6, 8,  9,  0, 2, 3, 4, 6,  10, 11},
+   {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507, 0.73789274, 0.08450219,
+    1.,         0.20184723, 0.18036963, 0.12581403, 0.13867603, 0.24040536, 0.11288773, 0.00290246,
+    0.09120187, 0.31190555, 0.43245423, 0.16153588, 0.3233026,  0.05279589, 0.1387149,  0.05962761,
+    0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881, 0.15605804, 0.3867739,
+    0.24908977, 0.36413632, 0.37643732, 0.28910679, 0.0198409,  0.31461499, 0.24412279, 0.08327667,
+    0.04444576, 0.05047969, 0.26190054, 0.2077349,  0.10803964},
+   {1.05367121e-08, 8.35309089e-01, 1.00000000e+00, 9.24116813e-01,
+    9.90039274e-01, 7.97613546e-01, 8.91271059e-01, 1.00000000e+00,
+    6.64669302e-01, 8.59439512e-01, 8.35309089e-01, 1.05367121e-08,
+    1.00000000e+00, 7.33151506e-01, 1.00000000e+00, 9.86880955e-01,
+    9.19154851e-01, 5.38849774e-01, 1.00000000e+00, 8.98332369e-01,
+    1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 8.03303970e-01,
+    6.64465915e-01, 8.69374690e-01, 1.00000000e+00, 1.00000000e+00,
+    1.00000000e+00, 1.00000000e+00, 9.24116813e-01, 7.33151506e-01,
+    8.03303970e-01, 0.00000000e+00, 8.16225843e-01, 9.39818306e-01,
+    7.27700415e-01, 7.30155528e-01, 8.89451011e-01, 8.05419635e-01,
+    9.90039274e-01, 1.00000000e+00, 6.64465915e-01, 8.16225843e-01,
+    0.00000000e+00, 6.38804490e-01, 1.00000000e+00, 1.00000000e+00,
+    9.52559809e-01, 9.53789212e-01, 7.97613546e-01, 9.86880955e-01,
+    8.69374690e-01, 9.39818306e-01, 6.38804490e-01, 0.0,
+    1.00000000e+00, 9.72569112e-01, 8.24907516e-01, 8.07933016e-01,
+    8.91271059e-01, 9.19154851e-01, 1.00000000e+00, 7.27700415e-01,
+    1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 7.63596268e-01,
+    8.40131263e-01, 7.40428532e-01, 1.00000000e+00, 5.38849774e-01,
+    1.00000000e+00, 7.30155528e-01, 1.00000000e+00, 9.72569112e-01,
+    7.63596268e-01, 0.00000000e+00, 1.00000000e+00, 7.95485011e-01,
+    6.64669302e-01, 1.00000000e+00, 1.00000000e+00, 8.89451011e-01,
+    9.52559809e-01, 8.24907516e-01, 8.40131263e-01, 1.00000000e+00,
+    0.00000000e+00, 8.51370877e-01, 8.59439512e-01, 8.98332369e-01,
+    1.00000000e+00, 8.05419635e-01, 9.53789212e-01, 8.07933016e-01,
+    7.40428532e-01, 7.95485011e-01, 8.51370877e-01, 1.49011612e-08},
+   // Dataset is L1 normalized into pdfs
+   cuvs::distance::DistanceType::HellingerExpanded,
+   0.0},
+
+  {4,
+   {0, 1, 1, 2, 4},
+   {3, 2, 0, 1},  // indices
+   {0.99296, 0.42180, 0.11687, 0.305869},
+   {
+     // dense output
+     0.0,
+     0.99296,
+     1.41476,
+     1.415707,
+     0.99296,
+     0.0,
+     0.42180,
+     0.42274,
+     1.41476,
+     0.42180,
+     0.0,
+     0.84454,
+     1.41570,
+     0.42274,
+     0.84454,
+     0.0,
+   },
+   cuvs::distance::DistanceType::L1,
+   0.0},
+  {5,
+   {0, 3, 8, 12, 16, 20, 25, 30, 35, 40, 45},
+   {0, 3, 4, 0, 1, 2, 3, 4, 1, 2, 3, 4, 0, 2, 3, 4, 0, 1, 3, 4, 0, 1, 2,
+    3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4},
+   {0.70862347, 0.8232774,  0.12108795, 0.84527547, 0.94937088, 0.03258545, 0.99584118, 0.76835667,
+    0.34426657, 0.2357925,  0.01274851, 0.11422017, 0.3437756,  0.31967718, 0.5956055,  0.31610373,
+    0.04147273, 0.03724415, 0.21515727, 0.04751052, 0.50283183, 0.99957274, 0.01395933, 0.96032529,
+    0.88438711, 0.46095378, 0.27432481, 0.54294211, 0.54280225, 0.59503329, 0.61364678, 0.22837736,
+    0.56609561, 0.29809423, 0.76736686, 0.56460608, 0.98165371, 0.02140123, 0.19881268, 0.26057815,
+    0.31648823, 0.89874295, 0.27366735, 0.5119944,  0.11416134},
+   {// dense output
+    0.,         0.48769777, 1.88014197, 0.26127048, 0.26657011, 0.7874794,  0.76962708, 1.122858,
+    1.1232498,  1.08166081, 0.48769777, 0.,         1.31332116, 0.98318907, 0.42661815, 0.09279052,
+    1.35187836, 1.38429055, 0.40658897, 0.56136388, 1.88014197, 1.31332116, 0.,         1.82943642,
+    1.54826077, 1.05918884, 1.59360067, 1.34698954, 0.60215168, 0.46993848, 0.26127048, 0.98318907,
+    1.82943642, 0.,         0.29945563, 1.08494093, 0.22934281, 0.82801925, 1.74288748, 1.50610116,
+    0.26657011, 0.42661815, 1.54826077, 0.29945563, 0.,         0.45060069, 0.77814948, 1.45245711,
+    1.18328348, 0.82486987, 0.7874794,  0.09279052, 1.05918884, 1.08494093, 0.45060069, 0.,
+    1.29899154, 1.40683824, 0.48505269, 0.53862363, 0.76962708, 1.35187836, 1.59360067, 0.22934281,
+    0.77814948, 1.29899154, 0.,         0.33202426, 1.92108999, 1.88812175, 1.122858,   1.38429055,
+    1.34698954, 0.82801925, 1.45245711, 1.40683824, 0.33202426, 0.,         1.47318624, 1.92660889,
+    1.1232498,  0.40658897, 0.60215168, 1.74288748, 1.18328348, 0.48505269, 1.92108999, 1.47318624,
+    0.,         0.24992619, 1.08166081, 0.56136388, 0.46993848, 1.50610116, 0.82486987, 0.53862363,
+    1.88812175, 1.92660889, 0.24992619, 0.},
+   cuvs::distance::DistanceType::CorrelationExpanded,
+   0.0},
+  {5,
+   {0, 1, 2, 4, 4, 5, 6, 7, 9, 9, 10},
+   {1, 4, 0, 4, 1, 3, 0, 1, 3, 0},
+   {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.},
+   {// dense output
+    0.,  1.,  1.,  1., 0.8, 1., 1.,  0.8, 1., 1.,  1.,  0., 0.8, 1., 1.,  1.,  1.,  1.,  1., 1.,
+    1.,  0.8, 0.,  1., 1.,  1., 0.8, 1.,  1., 0.8, 1.,  1., 1.,  0., 1.,  1.,  1.,  1.,  1., 1.,
+    0.8, 1.,  1.,  1., 0.,  1., 1.,  0.8, 1., 1.,  1.,  1., 1.,  1., 1.,  0.,  1.,  0.8, 1., 1.,
+    1.,  1.,  0.8, 1., 1.,  1., 0.,  1.,  1., 0.8, 0.8, 1., 1.,  1., 0.8, 0.8, 1.,  0.,  1., 1.,
+    1.,  1.,  1.,  1., 1.,  1., 1.,  1.,  0., 1.,  1.,  1., 0.8, 1., 1.,  1.,  0.8, 1.,  1., 0.},
+   cuvs::distance::DistanceType::RusselRaoExpanded,
+   0.0},
+  {5,
+   {0, 1, 1, 3, 3, 4, 4, 6, 9, 10, 10},
+   {0, 3, 4, 4, 2, 3, 0, 2, 3, 2},
+   {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.},
+   {// dense output
+    0.,  0.2, 0.6, 0.2, 0.4, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, 0.,  0.4, 0.,  0.2, 0.,  0.4,
+    0.6, 0.2, 0.,  0.6, 0.4, 0.,  0.4, 0.2, 0.4, 0.4, 0.6, 0.6, 0.4, 0.2, 0.,  0.4, 0.,
+    0.2, 0.,  0.4, 0.6, 0.2, 0.,  0.4, 0.2, 0.2, 0.2, 0.,  0.2, 0.6, 0.8, 0.4, 0.2, 0.2,
+    0.,  0.4, 0.,  0.2, 0.,  0.4, 0.6, 0.2, 0.,  0.6, 0.4, 0.4, 0.4, 0.6, 0.4, 0.,  0.2,
+    0.2, 0.4, 0.4, 0.6, 0.6, 0.6, 0.8, 0.6, 0.2, 0.,  0.4, 0.6, 0.4, 0.2, 0.6, 0.2, 0.4,
+    0.2, 0.2, 0.4, 0.,  0.2, 0.2, 0.,  0.4, 0.,  0.2, 0.,  0.4, 0.6, 0.2, 0.},
+   cuvs::distance::DistanceType::HammingUnexpanded,
+   0.0},
+  {3,
+   {0, 1, 2},
+   {0, 1},
+   {1.0, 1.0},
+   {0.0, 0.83255, 0.83255, 0.0},
+   cuvs::distance::DistanceType::JensenShannon,
+   0.0},
+  {2,
+   {0, 1, 3},
+   {0, 0, 1},
+   {1.0, 0.5, 0.5},
+   {0, 0.4645014, 0.4645014, 0},
+   cuvs::distance::DistanceType::JensenShannon,
+   0.0},
+  {3,
+   {0, 1, 2},
+   {0, 0},
+   {1.0, 1.0},
+   {0.0, 0.0, 0.0, 0.0},
+   cuvs::distance::DistanceType::JensenShannon,
+   0.0},
+
+  {3,
+   {0, 1, 2},
+   {0, 1},
+   {1.0, 1.0},
+   {0.0, 1.0, 1.0, 0.0},
+   cuvs::distance::DistanceType::DiceExpanded,
+   0.0},
+  {3,
+   {0, 1, 3},
+   {0, 0, 1},
+   {1.0, 1.0, 1.0},
+   {0, 0.333333, 0.333333, 0},
+   cuvs::distance::DistanceType::DiceExpanded,
+   0.0},
+
+};
+
+typedef SparseDistanceTest<int, float> SparseDistanceTestF;
+TEST_P(SparseDistanceTestF, Result) { compare(); }
+INSTANTIATE_TEST_CASE_P(SparseDistanceTests,
+                        SparseDistanceTestF,
+                        ::testing::ValuesIn(inputs_i32_f));
+
+}  // end namespace distance
+}  // end namespace cuvs
diff --git a/cpp/test/neighbors/sparse_brute_force.cu b/cpp/test/neighbors/sparse_brute_force.cu
new file mode 100644
index 000000000..cb68989d4
--- /dev/null
+++ b/cpp/test/neighbors/sparse_brute_force.cu
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.cuh"
+
+#include <cuvs/neighbors/brute_force.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/util/cudart_utils.hpp>
+
+#include <cusparse_v2.h>
+#include <gtest/gtest.h>
+
+namespace cuvs {
+namespace neighbors {
+
+using namespace raft;
+using namespace raft::sparse;
+
+template <typename value_idx, typename value_t>
+struct SparseKNNInputs {
+  value_idx n_cols;
+
+  std::vector<value_idx> indptr_h;
+  std::vector<value_idx> indices_h;
+  std::vector<value_t> data_h;
+
+  std::vector<value_t> out_dists_ref_h;
+  std::vector<value_idx> out_indices_ref_h;
+
+  int k;
+
+  int batch_size_index = 2;
+  int batch_size_query = 2;
+
+  cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2SqrtExpanded;
+};
+
+template <typename value_idx, typename value_t>
+::std::ostream& operator<<(::std::ostream& os, const SparseKNNInputs<value_idx, value_t>& dims)
+{
+  return os;
+}
+
+template <typename value_idx, typename value_t>
+class SparseKNNTest : public ::testing::TestWithParam<SparseKNNInputs<value_idx, value_t>> {
+ public:
+  SparseKNNTest()
+    : params(::testing::TestWithParam<SparseKNNInputs<value_idx, value_t>>::GetParam()),
+      indptr(0, resource::get_cuda_stream(handle)),
+      indices(0, resource::get_cuda_stream(handle)),
+      data(0, resource::get_cuda_stream(handle)),
+      out_indices(0, resource::get_cuda_stream(handle)),
+      out_dists(0, resource::get_cuda_stream(handle)),
+      out_indices_ref(0, resource::get_cuda_stream(handle)),
+      out_dists_ref(0, resource::get_cuda_stream(handle))
+  {
+  }
+
+ protected:
+  void SetUp() override
+  {
+    n_rows = params.indptr_h.size() - 1;
+    nnz    = params.indices_h.size();
+    k      = params.k;
+
+    make_data();
+
+    auto index_structure =
+      raft::make_device_compressed_structure_view<value_idx, value_idx, value_idx>(
+        indptr.data(), indices.data(), n_rows, params.n_cols, nnz);
+    auto index_csr = raft::make_device_csr_matrix_view<const value_t>(data.data(), index_structure);
+
+    auto index = cuvs::neighbors::brute_force::build(handle, index_csr, params.metric);
+
+    cuvs::neighbors::brute_force::sparse_search_params search_params;
+    search_params.batch_size_index = params.batch_size_index;
+    search_params.batch_size_query = params.batch_size_query;
+
+    cuvs::neighbors::brute_force::search(
+      handle,
+      search_params,
+      index,
+      index_csr,
+      raft::make_device_matrix_view<value_idx, int64_t>(out_indices.data(), n_rows, k),
+      raft::make_device_matrix_view<value_t, int64_t>(out_dists.data(), n_rows, k));
+
+    RAFT_CUDA_TRY(cudaStreamSynchronize(resource::get_cuda_stream(handle)));
+  }
+
+  void compare()
+  {
+    ASSERT_TRUE(devArrMatch(
+      out_dists_ref.data(), out_dists.data(), n_rows * k, CompareApprox<value_t>(1e-4)));
+    ASSERT_TRUE(
+      devArrMatch(out_indices_ref.data(), out_indices.data(), n_rows * k, Compare<value_idx>()));
+  }
+
+ protected:
+  void make_data()
+  {
+    std::vector<value_idx> indptr_h  = params.indptr_h;
+    std::vector<value_idx> indices_h = params.indices_h;
+    std::vector<value_t> data_h      = params.data_h;
+
+    auto stream = resource::get_cuda_stream(handle);
+    indptr.resize(indptr_h.size(), stream);
+    indices.resize(indices_h.size(), stream);
+    data.resize(data_h.size(), stream);
+
+    update_device(indptr.data(), indptr_h.data(), indptr_h.size(), stream);
+    update_device(indices.data(), indices_h.data(), indices_h.size(), stream);
+    update_device(data.data(), data_h.data(), data_h.size(), stream);
+
+    std::vector<value_t> out_dists_ref_h     = params.out_dists_ref_h;
+    std::vector<value_idx> out_indices_ref_h = params.out_indices_ref_h;
+
+    out_indices_ref.resize(out_indices_ref_h.size(), stream);
+    out_dists_ref.resize(out_dists_ref_h.size(), stream);
+
+    update_device(
+      out_indices_ref.data(), out_indices_ref_h.data(), out_indices_ref_h.size(), stream);
+    update_device(out_dists_ref.data(), out_dists_ref_h.data(), out_dists_ref_h.size(), stream);
+
+    out_dists.resize(n_rows * k, stream);
+    out_indices.resize(n_rows * k, stream);
+  }
+
+  raft::resources handle;
+
+  int n_rows, nnz, k;
+
+  // input data
+  rmm::device_uvector<value_idx> indptr, indices;
+  rmm::device_uvector<value_t> data;
+
+  // output data
+  rmm::device_uvector<value_idx> out_indices;
+  rmm::device_uvector<value_t> out_dists;
+
+  rmm::device_uvector<value_idx> out_indices_ref;
+  rmm::device_uvector<value_t> out_dists_ref;
+
+  SparseKNNInputs<value_idx, value_t> params;
+};
+
+const std::vector<SparseKNNInputs<int, float>> inputs_i32_f = {
+  {9,                                                 // ncols
+   {0, 2, 4, 6, 8},                                   // indptr
+   {0, 4, 0, 3, 0, 2, 0, 8},                          // indices
+   {0.0f, 1.0f, 5.0f, 6.0f, 5.0f, 6.0f, 0.0f, 1.0f},  // data
+   {0, 1.41421, 0, 7.87401, 0, 7.87401, 0, 1.41421},  // dists
+   {0, 3, 1, 0, 2, 0, 3, 0},                          // inds
+   2,
+   2,
+   2,
+   cuvs::distance::DistanceType::L2SqrtExpanded}};
+typedef SparseKNNTest<int, float> SparseKNNTestF;
+TEST_P(SparseKNNTestF, Result) { compare(); }
+INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF, ::testing::ValuesIn(inputs_i32_f));
+
+};  // end namespace neighbors
+};  // end namespace cuvs

From 710e9f5a541c518deffb91f75a87cd4fe1372a8a Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Fri, 22 Nov 2024 09:25:27 -0500
Subject: [PATCH 32/47] Add `kIsSingleSource` to
 `PairwiseDistanceEpilogueElementwise` (#485)

With raft having recently migrated to cutlass 3.5.1, this field is now required.

Also remove `raft_cutlass` from symbol exclusions.

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Micka (https://github.com/lowener)

URL: https://github.com/rapidsai/cuvs/pull/485
---
 .github/workflows/pr.yaml                                       | 2 +-
 .github/workflows/test.yaml                                     | 2 +-
 .../distance/detail/pairwise_distance_epilogue_elementwise.h    | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index e18e82df0..78648235f 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -88,7 +88,7 @@ jobs:
     with:
       build_type: pull-request
       enable_check_symbols: true
-      symbol_exclusions: (void (thrust::|cub::)|raft_cutlass)
+      symbol_exclusions: (void (thrust::|cub::))
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 5f60c0a34..27dc99a11 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -23,7 +23,7 @@ jobs:
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
       enable_check_symbols: true
-      symbol_exclusions: (void (thrust::|cub::)|raft_cutlass)
+      symbol_exclusions: (void (thrust::|cub::))
   conda-cpp-tests:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12
diff --git a/cpp/src/distance/detail/pairwise_distance_epilogue_elementwise.h b/cpp/src/distance/detail/pairwise_distance_epilogue_elementwise.h
index f9955334d..f4a7feaba 100644
--- a/cpp/src/distance/detail/pairwise_distance_epilogue_elementwise.h
+++ b/cpp/src/distance/detail/pairwise_distance_epilogue_elementwise.h
@@ -61,6 +61,7 @@ class PairwiseDistanceEpilogueElementwise {
   using ElementT                      = ElementT_;
   static int const kElementsPerAccess = ElementsPerAccess;
   static int const kCount             = kElementsPerAccess;
+  static bool const kIsSingleSource   = true;
 
   using DistanceOp = DistanceOp_;
   using FinalOp    = FinalOp_;

From 96d98b12df0030bc21c8588e8905df9cdc00784e Mon Sep 17 00:00:00 2001
From: Azurethi <Github@Azurethi.me>
Date: Sat, 23 Nov 2024 11:02:30 -0500
Subject: [PATCH 33/47] Fix broken link in README.md references (#473)

Fixed the broken link for "Top-K Algorithms on GPU: A Comprehensive Study and New Methods"

Authors:
  - https://github.com/Azurethi

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/473
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 572e8d098..23759f598 100755
--- a/README.md
+++ b/README.md
@@ -242,7 +242,7 @@ If you are interested in contributing to the cuVS library, please read our [Cont
 
 For the interested reader, many of the accelerated implementations in cuVS are also based on research papers which can provide a lot more background. We also ask you to please cite the corresponding algorithms by referencing them in your own research. 
 - [CAGRA: Highly Parallel Graph Construction and Approximate Nearest Neighbor Search](https://arxiv.org/abs/2308.15136)
-- [Top-K Algorithms on GPU: A Comprehensive Study and New Methods](https://dl.acm.org/doi/10.1145/3581784.3607062>)
+- [Top-K Algorithms on GPU: A Comprehensive Study and New Methods](https://dl.acm.org/doi/10.1145/3581784.3607062)
 - [Fast K-NN Graph Construction by GPU Based NN-Descent](https://dl.acm.org/doi/abs/10.1145/3459637.3482344?casa_token=O_nan1B1F5cAAAAA:QHWDEhh0wmd6UUTLY9_Gv6c3XI-5DXM9mXVaUXOYeStlpxTPmV3nKvABRfoivZAaQ3n8FWyrkWw>)
 - [cuSLINK: Single-linkage Agglomerative Clustering on the GPU](https://arxiv.org/abs/2306.16354)
 - [GPU Semiring Primitives for Sparse Neighborhood Methods](https://arxiv.org/abs/2104.06357)

From e1359e1a36ee48d2474a03a3b05c67b6610b220c Mon Sep 17 00:00:00 2001
From: Micka <mide@nvidia.com>
Date: Mon, 25 Nov 2024 21:09:26 +0100
Subject: [PATCH 34/47] Add serialization API to brute-force (#461)

I noticed it was missing while switching Milvus to cuVS

Authors:
  - Micka (https://github.com/lowener)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/461
---
 .gitignore                                    |   1 +
 cpp/CMakeLists.txt                            |   1 +
 cpp/include/cuvs/neighbors/brute_force.h      |  60 +++++
 cpp/include/cuvs/neighbors/brute_force.hpp    | 243 ++++++++++++++++++
 cpp/src/neighbors/brute_force.cu              |  15 ++
 cpp/src/neighbors/brute_force_c.cpp           |  55 +++-
 cpp/src/neighbors/brute_force_serialize.cu    | 169 ++++++++++++
 cpp/test/neighbors/ann_brute_force.cuh        |  18 +-
 docs/source/c_api/neighbors_bruteforce_c.rst  |   8 +
 docs/source/c_api/neighbors_hnsw_c.rst        |   4 +-
 docs/source/c_api/neighbors_ivf_flat_c.rst    |   8 +
 docs/source/c_api/neighbors_ivf_pq_c.rst      |   8 +
 docs/source/cpp_api/neighbors_bruteforce.rst  |   8 +
 .../python_api/neighbors_brute_force.rst      |  10 +
 docs/source/python_api/neighbors_cagra.rst    |  10 +
 docs/source/python_api/neighbors_hnsw.rst     |  10 +
 docs/source/python_api/neighbors_ivf_flat.rst |  10 +
 docs/source/python_api/neighbors_ivf_pq.rst   |  10 +
 .../cuvs/neighbors/brute_force/__init__.py    |   4 +-
 .../neighbors/brute_force/brute_force.pxd     |   8 +
 .../neighbors/brute_force/brute_force.pyx     |  86 +++++++
 python/cuvs/cuvs/test/test_serialization.py   |  38 ++-
 22 files changed, 767 insertions(+), 17 deletions(-)
 create mode 100644 cpp/src/neighbors/brute_force_serialize.cu

diff --git a/.gitignore b/.gitignore
index 97eab287d..da6eb07f6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -75,6 +75,7 @@ compile_commands.json
 .clangd/
 
 # serialized ann indexes
+brute_force_index
 cagra_index
 ivf_flat_index
 ivf_pq_index
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 32093776c..eb2e7c7a4 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -371,6 +371,7 @@ if(BUILD_SHARED_LIBS)
     src/distance/pairwise_distance.cu
     src/distance/sparse_distance.cu
     src/neighbors/brute_force.cu
+    src/neighbors/brute_force_serialize.cu
     src/neighbors/cagra_build_float.cu
     src/neighbors/cagra_build_half.cu
     src/neighbors/cagra_build_int8.cu
diff --git a/cpp/include/cuvs/neighbors/brute_force.h b/cpp/include/cuvs/neighbors/brute_force.h
index c9e172f62..33b92f11b 100644
--- a/cpp/include/cuvs/neighbors/brute_force.h
+++ b/cpp/include/cuvs/neighbors/brute_force.h
@@ -166,6 +166,66 @@ cuvsError_t cuvsBruteForceSearch(cuvsResources_t res,
  * @}
  */
 
+/**
+ * @defgroup bruteforce_c_serialize BRUTEFORCE C-API serialize functions
+ * @{
+ */
+/**
+ * Save the index to file.
+ * The serialization format can be subject to changes, therefore loading
+ * an index saved with a previous version of cuvs is not guaranteed
+ * to work.
+ *
+ * @code{.c}
+ * #include <cuvs/neighbors/brute_force.h>
+ *
+ * // Create cuvsResources_t
+ * cuvsResources_t res;
+ * cuvsError_t res_create_status = cuvsResourcesCreate(&res);
+ *
+ * // create an index with `cuvsBruteforceBuild`
+ * cuvsBruteForceSerialize(res, "/path/to/index", index);
+ * @endcode
+ *
+ * @param[in] res cuvsResources_t opaque C handle
+ * @param[in] filename the file name for saving the index
+ * @param[in] index BRUTEFORCE index
+ *
+ */
+cuvsError_t cuvsBruteForceSerialize(cuvsResources_t res,
+                                    const char* filename,
+                                    cuvsBruteForceIndex_t index);
+
+/**
+ * Load index from file.
+ * The serialization format can be subject to changes, therefore loading
+ * an index saved with a previous version of cuvs is not guaranteed
+ * to work.
+ *
+ * @code{.c}
+ * #include <cuvs/neighbors/brute_force.h>
+ *
+ * // Create cuvsResources_t
+ * cuvsResources_t res;
+ * cuvsError_t res_create_status = cuvsResourcesCreate(&res);
+ *
+ * // Deserialize an index previously built with `cuvsBruteforceBuild`
+ * cuvsBruteForceIndex_t index;
+ * cuvsBruteForceIndexCreate(&index);
+ * cuvsBruteForceDeserialize(res, "/path/to/index", index);
+ * @endcode
+ *
+ * @param[in] res cuvsResources_t opaque C handle
+ * @param[in] filename the name of the file that stores the index
+ * @param[out] index BRUTEFORCE index loaded disk
+ */
+cuvsError_t cuvsBruteForceDeserialize(cuvsResources_t res,
+                                      const char* filename,
+                                      cuvsBruteForceIndex_t index);
+
+/**
+ * @}
+ */
 #ifdef __cplusplus
 }
 #endif
diff --git a/cpp/include/cuvs/neighbors/brute_force.hpp b/cpp/include/cuvs/neighbors/brute_force.hpp
index ba67797ee..d040e03db 100644
--- a/cpp/include/cuvs/neighbors/brute_force.hpp
+++ b/cpp/include/cuvs/neighbors/brute_force.hpp
@@ -48,6 +48,14 @@ struct index : cuvs::neighbors::index {
   index& operator=(index&&)      = default;
   ~index()                       = default;
 
+  /**
+   * @brief Construct an empty index.
+   *
+   * Constructs an empty index. This index will either need to be trained with `build`
+   * or loaded from a saved copy with `deserialize`
+   */
+  index(raft::resources const& handle);
+
   /** Construct a brute force index from dataset
    *
    * Constructs a brute force index from a dataset. This lets us precompute norms for
@@ -479,4 +487,239 @@ void search(raft::resources const& handle,
 /**
  * @}
  */
+
+/**
+ * @defgroup bruteforce_cpp_index_serialize Bruteforce index serialize functions
+ * @{
+ */
+/**
+ * Save the index to file.
+ * The serialization format can be subject to changes, therefore loading
+ * an index saved with a previous version of cuvs is not guaranteed
+ * to work.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/brute_force.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create a string with a filepath
+ * std::string filename("/path/to/index");
+ * // create an index with `auto index = brute_force::build(...);`
+ * cuvs::neighbors::brute_force::serialize(handle, filename, index);
+ * @endcode
+ *
+ * @tparam T data element type
+ *
+ * @param[in] handle the raft handle
+ * @param[in] filename the file name for saving the index
+ * @param[in] index brute force index
+ * @param[in] include_dataset whether to include the dataset in the serialized
+ * output
+ */
+void serialize(raft::resources const& handle,
+               const std::string& filename,
+               const cuvs::neighbors::brute_force::index<half, float>& index,
+               bool include_dataset = true);
+/**
+ * Save the index to file.
+ * The serialization format can be subject to changes, therefore loading
+ * an index saved with a previous version of cuvs is not guaranteed
+ * to work.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/brute_force.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create a string with a filepath
+ * std::string filename("/path/to/index");
+ * // create an index with `auto index = brute_force::build(...);`
+ * cuvs::neighbors::brute_force::serialize(handle, filename, index);
+ * @endcode
+ *
+ * @tparam T data element type
+ *
+ * @param[in] handle the raft handle
+ * @param[in] filename the file name for saving the index
+ * @param[in] index brute force index
+ * @param[in] include_dataset whether to include the dataset in the serialized
+ * output
+ *
+ */
+void serialize(raft::resources const& handle,
+               const std::string& filename,
+               const cuvs::neighbors::brute_force::index<float, float>& index,
+               bool include_dataset = true);
+
+/**
+ * Write the index to an output stream
+ * The serialization format can be subject to changes, therefore loading
+ * an index saved with a previous version of cuvs is not guaranteed
+ * to work.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/brute_force.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create an output stream
+ * std::ostream os(std::cout.rdbuf());
+ * // create an index with `auto index = cuvs::neighbors::brute_force::build(...);`
+ * cuvs::neighbors::brute_force::serialize(handle, os, index);
+ * @endcode
+ *
+ * @param[in] handle the raft handle
+ * @param[in] os output stream
+ * @param[in] index brute force index
+ * @param[in] include_dataset Whether or not to write out the dataset to the file.
+ */
+void serialize(raft::resources const& handle,
+               std::ostream& os,
+               const cuvs::neighbors::brute_force::index<half, float>& index,
+               bool include_dataset = true);
+
+/**
+ * Write the index to an output stream
+ * The serialization format can be subject to changes, therefore loading
+ * an index saved with a previous version of cuvs is not guaranteed
+ * to work.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/brute_force.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create an output stream
+ * std::ostream os(std::cout.rdbuf());
+ * // create an index with `auto index = cuvs::neighbors::brute_force::build(...);`
+ * cuvs::neighbors::brute_force::serialize(handle, os, index);
+ * @endcode
+ *
+ * @param[in] handle the raft handle
+ * @param[in] os output stream
+ * @param[in] index brute force index
+ * @param[in] include_dataset Whether or not to write out the dataset to the file.
+ */
+void serialize(raft::resources const& handle,
+               std::ostream& os,
+               const cuvs::neighbors::brute_force::index<float, float>& index,
+               bool include_dataset = true);
+
+/**
+ * Load index from file.
+ * The serialization format can be subject to changes, therefore loading
+ * an index saved with a previous version of cuvs is not guaranteed
+ * to work.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/brute_force.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create a string with a filepath
+ * std::string filename("/path/to/index");
+ * using T    = half; // data element type
+ * brute_force::index<T, float> index(handle);
+ * cuvs::neighbors::brute_force::deserialize(handle, filename, index);
+ * @endcode
+ *
+ * @param[in] handle the raft handle
+ * @param[in] filename the name of the file that stores the index
+ * @param[out] index brute force index
+ *
+ */
+void deserialize(raft::resources const& handle,
+                 const std::string& filename,
+                 cuvs::neighbors::brute_force::index<half, float>* index);
+/**
+ * Load index from file.
+ * The serialization format can be subject to changes, therefore loading
+ * an index saved with a previous version of cuvs is not guaranteed
+ * to work.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/brute_force.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create a string with a filepath
+ * std::string filename("/path/to/index");
+ * using T    = float; // data element type
+ * brute_force::index<T, float> index(handle);
+ * cuvs::neighbors::brute_force::deserialize(handle, filename, index);
+ * @endcode
+ *
+ * @param[in] handle the raft handle
+ * @param[in] filename the name of the file that stores the index
+ * @param[out] index brute force index
+ *
+ */
+void deserialize(raft::resources const& handle,
+                 const std::string& filename,
+                 cuvs::neighbors::brute_force::index<float, float>* index);
+/**
+ * Load index from input stream
+ * The serialization format can be subject to changes, therefore loading
+ * an index saved with a previous version of cuvs is not guaranteed
+ * to work.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/brute_force.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create an input stream
+ * std::istream is(std::cin.rdbuf());
+ * using T    = half; // data element type
+ * brute_force::index<T, float> index(handle);
+ * cuvs::neighbors::brute_force::deserialize(handle, is, index);
+ * @endcode
+ *
+ * @param[in] handle the raft handle
+ * @param[in] is input stream
+ * @param[out] index brute force index
+ *
+ */
+void deserialize(raft::resources const& handle,
+                 std::istream& is,
+                 cuvs::neighbors::brute_force::index<half, float>* index);
+/**
+ * Load index from input stream
+ * The serialization format can be subject to changes, therefore loading
+ * an index saved with a previous version of cuvs is not guaranteed
+ * to work.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ * #include <cuvs/neighbors/brute_force.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create an input stream
+ * std::istream is(std::cin.rdbuf());
+ * using T    = float; // data element type
+ * brute_force::index<T, float> index(handle);
+ * cuvs::neighbors::brute_force::deserialize(handle, is, index);
+ * @endcode
+ *
+ * @param[in] handle the raft handle
+ * @param[in] is input stream
+ * @param[out] index brute force index
+ *
+ */
+void deserialize(raft::resources const& handle,
+                 std::istream& is,
+                 cuvs::neighbors::brute_force::index<float, float>* index);
+/**
+ * @}
+ */
+
 }  // namespace cuvs::neighbors::brute_force
diff --git a/cpp/src/neighbors/brute_force.cu b/cpp/src/neighbors/brute_force.cu
index b0f87e9ac..d534676e3 100644
--- a/cpp/src/neighbors/brute_force.cu
+++ b/cpp/src/neighbors/brute_force.cu
@@ -21,6 +21,21 @@
 #include <raft/core/copy.hpp>
 
 namespace cuvs::neighbors::brute_force {
+
+template <typename T, typename DistT>
+index<T, DistT>::index(raft::resources const& res)
+  // this constructor is just for a temporary index, for use in the deserialization
+  // api. all the parameters here will get replaced with loaded values - that aren't
+  // necessarily known ahead of time before deserialization.
+  // TODO: do we even need a handle here - could just construct one?
+  : cuvs::neighbors::index(),
+    metric_(cuvs::distance::DistanceType::L2Expanded),
+    dataset_(raft::make_device_matrix<T, int64_t>(res, 0, 0)),
+    norms_(std::nullopt),
+    metric_arg_(0)
+{
+}
+
 template <typename T, typename DistT>
 index<T, DistT>::index(raft::resources const& res,
                        raft::host_matrix_view<const T, int64_t, raft::row_major> dataset,
diff --git a/cpp/src/neighbors/brute_force_c.cpp b/cpp/src/neighbors/brute_force_c.cpp
index eda79aa31..f1a8c995d 100644
--- a/cpp/src/neighbors/brute_force_c.cpp
+++ b/cpp/src/neighbors/brute_force_c.cpp
@@ -17,10 +17,12 @@
 
 #include <cstdint>
 #include <dlpack/dlpack.h>
+#include <fstream>
 
 #include <raft/core/error.hpp>
 #include <raft/core/mdspan_types.hpp>
 #include <raft/core/resources.hpp>
+#include <raft/core/serialize.hpp>
 
 #include <cuvs/core/c_api.h>
 #include <cuvs/core/exceptions.hpp>
@@ -91,6 +93,22 @@ void _search(cuvsResources_t res,
   }
 }
 
+template <typename T>
+void _serialize(cuvsResources_t res, const char* filename, cuvsBruteForceIndex index)
+{
+  auto res_ptr   = reinterpret_cast<raft::resources*>(res);
+  auto index_ptr = reinterpret_cast<cuvs::neighbors::brute_force::index<T>*>(index.addr);
+  cuvs::neighbors::brute_force::serialize(*res_ptr, std::string(filename), *index_ptr);
+}
+
+template <typename T>
+void* _deserialize(cuvsResources_t res, const char* filename)
+{
+  auto res_ptr = reinterpret_cast<raft::resources*>(res);
+  auto index   = new cuvs::neighbors::brute_force::index<T>(*res_ptr);
+  cuvs::neighbors::brute_force::deserialize(*res_ptr, std::string(filename), index);
+  return index;
+}
 }  // namespace
 
 extern "C" cuvsError_t cuvsBruteForceIndexCreate(cuvsBruteForceIndex_t* index)
@@ -129,7 +147,7 @@ extern "C" cuvsError_t cuvsBruteForceBuild(cuvsResources_t res,
     if (dataset.dtype.code == kDLFloat && dataset.dtype.bits == 32) {
       index->addr =
         reinterpret_cast<uintptr_t>(_build<float>(res, dataset_tensor, metric, metric_arg));
-      index->dtype.code = kDLFloat;
+      index->dtype = dataset.dtype;
     } else {
       RAFT_FAIL("Unsupported dataset DLtensor dtype: %d and bits: %d",
                 dataset.dtype.code,
@@ -174,3 +192,38 @@ extern "C" cuvsError_t cuvsBruteForceSearch(cuvsResources_t res,
     }
   });
 }
+
+extern "C" cuvsError_t cuvsBruteForceDeserialize(cuvsResources_t res,
+                                                 const char* filename,
+                                                 cuvsBruteForceIndex_t index)
+{
+  return cuvs::core::translate_exceptions([=] {
+    // read the numpy dtype from the beginning of the file
+    std::ifstream is(filename, std::ios::in | std::ios::binary);
+    if (!is) { RAFT_FAIL("Cannot open file %s", filename); }
+    char dtype_string[4];
+    is.read(dtype_string, 4);
+    auto dtype = raft::detail::numpy_serializer::parse_descr(std::string(dtype_string, 4));
+
+    index->dtype.bits = dtype.itemsize * 8;
+    if (dtype.kind == 'f' && dtype.itemsize == 4) {
+      index->dtype.code = kDLFloat;
+      index->addr       = reinterpret_cast<uintptr_t>(_deserialize<float>(res, filename));
+    } else {
+      RAFT_FAIL("Unsupported index dtype: %d and bits: %d", index->dtype.code, index->dtype.bits);
+    }
+  });
+}
+
+extern "C" cuvsError_t cuvsBruteForceSerialize(cuvsResources_t res,
+                                               const char* filename,
+                                               cuvsBruteForceIndex_t index)
+{
+  return cuvs::core::translate_exceptions([=] {
+    if (index->dtype.code == kDLFloat && index->dtype.bits == 32) {
+      _serialize<float>(res, filename, *index);
+    } else {
+      RAFT_FAIL("Unsupported index dtype: %d and bits: %d", index->dtype.code, index->dtype.bits);
+    }
+  });
+}
\ No newline at end of file
diff --git a/cpp/src/neighbors/brute_force_serialize.cu b/cpp/src/neighbors/brute_force_serialize.cu
new file mode 100644
index 000000000..1b5b5111e
--- /dev/null
+++ b/cpp/src/neighbors/brute_force_serialize.cu
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuvs/neighbors/brute_force.hpp>
+#include <raft/core/copy.cuh>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/core/serialize.hpp>
+
+#include <fstream>
+
+namespace cuvs::neighbors::brute_force {
+
+int constexpr serialization_version = 0;
+
+template <typename T, typename DistT>
+void serialize(raft::resources const& handle,
+               std::ostream& os,
+               const index<T, DistT>& index,
+               bool include_dataset = true)
+{
+  RAFT_LOG_DEBUG(
+    "Saving brute force index, size %zu, dim %u", static_cast<size_t>(index.size()), index.dim());
+
+  auto dtype_string = raft::detail::numpy_serializer::get_numpy_dtype<T>().to_string();
+  dtype_string.resize(4);
+  os << dtype_string;
+
+  raft::serialize_scalar(handle, os, serialization_version);
+  raft::serialize_scalar(handle, os, index.size());
+  raft::serialize_scalar(handle, os, index.dim());
+  raft::serialize_scalar(handle, os, index.metric());
+  raft::serialize_scalar(handle, os, index.metric_arg());
+  raft::serialize_scalar(handle, os, include_dataset);
+  if (include_dataset) { raft::serialize_mdspan(handle, os, index.dataset()); }
+  auto has_norms = index.has_norms();
+  raft::serialize_scalar(handle, os, has_norms);
+  if (has_norms) { raft::serialize_mdspan(handle, os, index.norms()); }
+  raft::resource::sync_stream(handle);
+}
+
+void serialize(raft::resources const& handle,
+               const std::string& filename,
+               const index<half, float>& index,
+               bool include_dataset)
+{
+  auto os = std::ofstream{filename, std::ios::out | std::ios::binary};
+  RAFT_EXPECTS(os, "Cannot open file %s", filename.c_str());
+  serialize<half, float>(handle, os, index, include_dataset);
+}
+
+void serialize(raft::resources const& handle,
+               const std::string& filename,
+               const index<float, float>& index,
+               bool include_dataset)
+{
+  auto os = std::ofstream{filename, std::ios::out | std::ios::binary};
+  RAFT_EXPECTS(os, "Cannot open file %s", filename.c_str());
+  serialize<float, float>(handle, os, index, include_dataset);
+}
+
+void serialize(raft::resources const& handle,
+               std::ostream& os,
+               const index<half, float>& index,
+               bool include_dataset)
+{
+  serialize<half, float>(handle, os, index, include_dataset);
+}
+
+void serialize(raft::resources const& handle,
+               std::ostream& os,
+               const index<float, float>& index,
+               bool include_dataset)
+{
+  serialize<float, float>(handle, os, index, include_dataset);
+}
+
+template <typename T, typename DistT>
+auto deserialize(raft::resources const& handle, std::istream& is)
+{
+  auto dtype_string = std::array<char, 4>{};
+  is.read(dtype_string.data(), 4);
+
+  auto ver = raft::deserialize_scalar<int>(handle, is);
+  if (ver != serialization_version) {
+    RAFT_FAIL("serialization version mismatch, expected %d, got %d ", serialization_version, ver);
+  }
+  std::int64_t rows = raft::deserialize_scalar<size_t>(handle, is);
+  std::int64_t dim  = raft::deserialize_scalar<size_t>(handle, is);
+  auto metric       = raft::deserialize_scalar<cuvs::distance::DistanceType>(handle, is);
+  auto metric_arg   = raft::deserialize_scalar<DistT>(handle, is);
+
+  auto dataset_storage = raft::make_host_matrix<T>(std::int64_t{}, std::int64_t{});
+  auto include_dataset = raft::deserialize_scalar<bool>(handle, is);
+  if (include_dataset) {
+    dataset_storage = raft::make_host_matrix<T>(rows, dim);
+    raft::deserialize_mdspan(handle, is, dataset_storage.view());
+  }
+
+  auto has_norms     = raft::deserialize_scalar<bool>(handle, is);
+  auto norms_storage = has_norms ? std::optional{raft::make_host_vector<DistT, std::int64_t>(rows)}
+                                 : std::optional<raft::host_vector<DistT, std::int64_t>>{};
+  // TODO(wphicks): Use mdbuffer here when available
+  auto norms_storage_dev =
+    has_norms ? std::optional{raft::make_device_vector<DistT, std::int64_t>(handle, rows)}
+              : std::optional<raft::device_vector<DistT, std::int64_t>>{};
+  if (has_norms) {
+    raft::deserialize_mdspan(handle, is, norms_storage->view());
+    raft::copy(handle, norms_storage_dev->view(), norms_storage->view());
+  }
+
+  auto result = index<T, DistT>(handle,
+                                raft::make_const_mdspan(dataset_storage.view()),
+                                std::move(norms_storage_dev),
+                                metric,
+                                metric_arg);
+  raft::resource::sync_stream(handle);
+
+  return result;
+}
+
+void deserialize(raft::resources const& handle,
+                 const std::string& filename,
+                 cuvs::neighbors::brute_force::index<half, float>* index)
+{
+  auto is = std::ifstream{filename, std::ios::in | std::ios::binary};
+  RAFT_EXPECTS(is, "Cannot open file %s", filename.c_str());
+
+  *index = deserialize<half, float>(handle, is);
+}
+
+void deserialize(raft::resources const& handle,
+                 const std::string& filename,
+                 cuvs::neighbors::brute_force::index<float, float>* index)
+{
+  auto is = std::ifstream{filename, std::ios::in | std::ios::binary};
+  RAFT_EXPECTS(is, "Cannot open file %s", filename.c_str());
+
+  *index = deserialize<float, float>(handle, is);
+}
+
+void deserialize(raft::resources const& handle,
+                 std::istream& is,
+                 cuvs::neighbors::brute_force::index<half, float>* index)
+{
+  *index = deserialize<half, float>(handle, is);
+}
+
+void deserialize(raft::resources const& handle,
+                 std::istream& is,
+                 cuvs::neighbors::brute_force::index<float, float>* index)
+{
+  *index = deserialize<float, float>(handle, is);
+}
+
+}  // namespace cuvs::neighbors::brute_force
diff --git a/cpp/test/neighbors/ann_brute_force.cuh b/cpp/test/neighbors/ann_brute_force.cuh
index c2afa4e8b..03d6e820c 100644
--- a/cpp/test/neighbors/ann_brute_force.cuh
+++ b/cpp/test/neighbors/ann_brute_force.cuh
@@ -114,12 +114,28 @@ class AnnBruteForceTest : public ::testing::TestWithParam<AnnBruteForceInputs<Id
                                                       0.001f,
                                                       stream_,
                                                       true));
+
+      brute_force::serialize(handle_, std::string{"brute_force_index"}, idx, true);
+      auto index_loaded = brute_force::index<DataT, T>(handle_);
+      brute_force::deserialize(handle_, std::string{"brute_force_index"}, &index_loaded);
+
       brute_force::search(handle_,
-                          idx,
+                          index_loaded,
                           search_queries_view,
                           indices_out_view,
                           dists_out_view,
                           cuvs::neighbors::filtering::none_sample_filter{});
+      raft::resource::sync_stream(handle_);
+
+      ASSERT_TRUE(cuvs::neighbors::devArrMatchKnnPair(indices_naive_dev.data(),
+                                                      indices_bruteforce_dev.data(),
+                                                      distances_naive_dev.data(),
+                                                      distances_bruteforce_dev.data(),
+                                                      ps.num_queries,
+                                                      ps.k,
+                                                      0.001f,
+                                                      stream_,
+                                                      true));
     }
   }
 
diff --git a/docs/source/c_api/neighbors_bruteforce_c.rst b/docs/source/c_api/neighbors_bruteforce_c.rst
index af0356eee..a12175209 100644
--- a/docs/source/c_api/neighbors_bruteforce_c.rst
+++ b/docs/source/c_api/neighbors_bruteforce_c.rst
@@ -32,3 +32,11 @@ Index search
     :project: cuvs
     :members:
     :content-only:
+
+Index serialize
+---------------
+
+.. doxygengroup:: bruteforce_c_index_serialize
+    :project: cuvs
+    :members:
+    :content-only:
diff --git a/docs/source/c_api/neighbors_hnsw_c.rst b/docs/source/c_api/neighbors_hnsw_c.rst
index 4d83cd3e3..988e5b6f3 100644
--- a/docs/source/c_api/neighbors_hnsw_c.rst
+++ b/docs/source/c_api/neighbors_hnsw_c.rst
@@ -29,13 +29,13 @@ Index
 Index search
 ------------
 
-.. doxygengroup:: cagra_c_index_search
+.. doxygengroup:: hnsw_c_index_search
     :project: cuvs
     :members:
     :content-only:
 
 Index serialize
-------------
+---------------
 
 .. doxygengroup:: hnsw_c_index_serialize
     :project: cuvs
diff --git a/docs/source/c_api/neighbors_ivf_flat_c.rst b/docs/source/c_api/neighbors_ivf_flat_c.rst
index 9e1ccc0d1..1254d70ef 100644
--- a/docs/source/c_api/neighbors_ivf_flat_c.rst
+++ b/docs/source/c_api/neighbors_ivf_flat_c.rst
@@ -48,3 +48,11 @@ Index search
     :project: cuvs
     :members:
     :content-only:
+
+Index serialize
+---------------
+
+.. doxygengroup:: ivf_flat_c_index_serialize
+    :project: cuvs
+    :members:
+    :content-only:
diff --git a/docs/source/c_api/neighbors_ivf_pq_c.rst b/docs/source/c_api/neighbors_ivf_pq_c.rst
index 070719609..260057b8c 100644
--- a/docs/source/c_api/neighbors_ivf_pq_c.rst
+++ b/docs/source/c_api/neighbors_ivf_pq_c.rst
@@ -48,3 +48,11 @@ Index search
     :project: cuvs
     :members:
     :content-only:
+
+Index serialize
+---------------
+
+.. doxygengroup:: ivf_pq_c_index_serialize
+    :project: cuvs
+    :members:
+    :content-only:
diff --git a/docs/source/cpp_api/neighbors_bruteforce.rst b/docs/source/cpp_api/neighbors_bruteforce.rst
index 3adcb01c5..f75e26b3c 100644
--- a/docs/source/cpp_api/neighbors_bruteforce.rst
+++ b/docs/source/cpp_api/neighbors_bruteforce.rst
@@ -34,3 +34,11 @@ Index search
     :project: cuvs
     :members:
     :content-only:
+
+Index serialize
+---------------
+
+.. doxygengroup:: bruteforce_cpp_index_serialize
+    :project: cuvs
+    :members:
+    :content-only:
diff --git a/docs/source/python_api/neighbors_brute_force.rst b/docs/source/python_api/neighbors_brute_force.rst
index 5fdc3658f..d756a6c80 100644
--- a/docs/source/python_api/neighbors_brute_force.rst
+++ b/docs/source/python_api/neighbors_brute_force.rst
@@ -20,3 +20,13 @@ Index search
 ############
 
 .. autofunction:: cuvs.neighbors.brute_force.search
+
+Index save
+##########
+
+.. autofunction:: cuvs.neighbors.brute_force.save
+
+Index load
+##########
+
+.. autofunction:: cuvs.neighbors.brute_force.load
diff --git a/docs/source/python_api/neighbors_cagra.rst b/docs/source/python_api/neighbors_cagra.rst
index 09b2e2694..e7155efb8 100644
--- a/docs/source/python_api/neighbors_cagra.rst
+++ b/docs/source/python_api/neighbors_cagra.rst
@@ -34,3 +34,13 @@ Index search
 ############
 
 .. autofunction:: cuvs.neighbors.cagra.search
+
+Index save
+##########
+
+.. autofunction:: cuvs.neighbors.cagra.save
+
+Index load
+##########
+
+.. autofunction:: cuvs.neighbors.cagra.load
diff --git a/docs/source/python_api/neighbors_hnsw.rst b/docs/source/python_api/neighbors_hnsw.rst
index 9922805b3..64fe5493b 100644
--- a/docs/source/python_api/neighbors_hnsw.rst
+++ b/docs/source/python_api/neighbors_hnsw.rst
@@ -28,3 +28,13 @@ Index search
 ############
 
 .. autofunction:: cuvs.neighbors.hnsw.search
+
+Index save
+##########
+
+.. autofunction:: cuvs.neighbors.hnsw.save
+
+Index load
+##########
+
+.. autofunction:: cuvs.neighbors.hnsw.load
diff --git a/docs/source/python_api/neighbors_ivf_flat.rst b/docs/source/python_api/neighbors_ivf_flat.rst
index 5514e5e43..f2c21e68a 100644
--- a/docs/source/python_api/neighbors_ivf_flat.rst
+++ b/docs/source/python_api/neighbors_ivf_flat.rst
@@ -32,3 +32,13 @@ Index search
 ############
 
 .. autofunction:: cuvs.neighbors.ivf_flat.search
+
+Index save
+##########
+
+.. autofunction:: cuvs.neighbors.ivf_flat.save
+
+Index load
+##########
+
+.. autofunction:: cuvs.neighbors.ivf_flat.load
diff --git a/docs/source/python_api/neighbors_ivf_pq.rst b/docs/source/python_api/neighbors_ivf_pq.rst
index e3625ba67..57668fbc3 100644
--- a/docs/source/python_api/neighbors_ivf_pq.rst
+++ b/docs/source/python_api/neighbors_ivf_pq.rst
@@ -32,3 +32,13 @@ Index search
 ############
 
 .. autofunction:: cuvs.neighbors.ivf_pq.search
+
+Index save
+##########
+
+.. autofunction:: cuvs.neighbors.ivf_pq.save
+
+Index load
+##########
+
+.. autofunction:: cuvs.neighbors.ivf_pq.load
diff --git a/python/cuvs/cuvs/neighbors/brute_force/__init__.py b/python/cuvs/cuvs/neighbors/brute_force/__init__.py
index b88c4b464..6aa0e4bb2 100644
--- a/python/cuvs/cuvs/neighbors/brute_force/__init__.py
+++ b/python/cuvs/cuvs/neighbors/brute_force/__init__.py
@@ -13,6 +13,6 @@
 # limitations under the License.
 
 
-from .brute_force import Index, build, search
+from .brute_force import Index, build, load, save, search
 
-__all__ = ["Index", "build", "search"]
+__all__ = ["Index", "build", "search", "save", "load"]
diff --git a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pxd b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pxd
index 183827916..f1fc14ba7 100644
--- a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pxd
+++ b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pxd
@@ -47,3 +47,11 @@ cdef extern from "cuvs/neighbors/brute_force.h" nogil:
                                      DLManagedTensor* neighbors,
                                      DLManagedTensor* distances,
                                      cuvsFilter filter) except +
+
+    cuvsError_t cuvsBruteForceSerialize(cuvsResources_t res,
+                                        const char * filename,
+                                        cuvsBruteForceIndex_t index) except +
+
+    cuvsError_t cuvsBruteForceDeserialize(cuvsResources_t res,
+                                          const char * filename,
+                                          cuvsBruteForceIndex_t index) except +
diff --git a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx
index 9d1d24eae..9d43bfb29 100644
--- a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx
+++ b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx
@@ -24,6 +24,7 @@ from cuvs.common.resources import auto_sync_resources
 from cython.operator cimport dereference as deref
 from libc.stdint cimport uint32_t
 from libcpp cimport bool
+from libcpp.string cimport string
 
 from cuvs.common cimport cydlpack
 from cuvs.distance_type cimport cuvsDistanceType
@@ -256,3 +257,88 @@ def search(Index index,
         ))
 
     return (distances, neighbors)
+
+
+@auto_sync_resources
+def save(filename, Index index, bool include_dataset=True, resources=None):
+    """
+    Saves the index to a file.
+
+    The serialization format can be subject to changes, therefore loading
+    an index saved with a previous version of cuvs is not guaranteed
+    to work.
+
+    Parameters
+    ----------
+    filename : string
+        Name of the file.
+    index : Index
+        Trained Brute Force index.
+    {resources_docstring}
+
+    Examples
+    --------
+    >>> import cupy as cp
+    >>> from cuvs.neighbors import brute_force
+    >>> n_samples = 50000
+    >>> n_features = 50
+    >>> dataset = cp.random.random_sample((n_samples, n_features),
+    ...                                   dtype=cp.float32)
+    >>> # Build index
+    >>> index = brute_force.build(dataset)
+    >>> # Serialize and deserialize the brute_force index built
+    >>> brute_force.save("my_index.bin", index)
+    >>> index_loaded = brute_force.load("my_index.bin")
+    """
+    cdef string c_filename = filename.encode('utf-8')
+    cdef cuvsResources_t res = <cuvsResources_t>resources.get_c_obj()
+    check_cuvs(cuvsBruteForceSerialize(res,
+                                       c_filename.c_str(),
+                                       index.index))
+
+
+@auto_sync_resources
+def load(filename, resources=None):
+    """
+    Loads index from file.
+
+    The serialization format can be subject to changes, therefore loading
+    an index saved with a previous version of cuvs is not guaranteed
+    to work.
+
+
+    Parameters
+    ----------
+    filename : string
+        Name of the file.
+    {resources_docstring}
+
+    Returns
+    -------
+    index : Index
+
+    Examples
+    --------
+    >>> import cupy as cp
+    >>> from cuvs.neighbors import brute_force
+    >>> n_samples = 50000
+    >>> n_features = 50
+    >>> dataset = cp.random.random_sample((n_samples, n_features),
+    ...                                   dtype=cp.float32)
+    >>> # Build index
+    >>> index = brute_force.build(dataset)
+    >>> # Serialize and deserialize the brute_force index built
+    >>> brute_force.save("my_index.bin", index)
+    >>> index_loaded = brute_force.load("my_index.bin")
+    """
+    cdef Index idx = Index()
+    cdef cuvsResources_t res = <cuvsResources_t>resources.get_c_obj()
+    cdef string c_filename = filename.encode('utf-8')
+
+    check_cuvs(cuvsBruteForceDeserialize(
+        res,
+        c_filename.c_str(),
+        idx.index
+    ))
+    idx.trained = True
+    return idx
diff --git a/python/cuvs/cuvs/test/test_serialization.py b/python/cuvs/cuvs/test/test_serialization.py
index 4ffccf121..1f4a54e87 100644
--- a/python/cuvs/cuvs/test/test_serialization.py
+++ b/python/cuvs/cuvs/test/test_serialization.py
@@ -17,7 +17,7 @@
 import pytest
 from pylibraft.common import device_ndarray
 
-from cuvs.neighbors import cagra, ivf_flat, ivf_pq
+from cuvs.neighbors import brute_force, cagra, ivf_flat, ivf_pq
 from cuvs.test.ann_utils import generate_data
 
 
@@ -35,6 +35,10 @@ def test_save_load_ivf_pq():
     run_save_load(ivf_pq, np.float32)
 
 
+def test_save_load_brute_force():
+    run_save_load(brute_force, np.float32)
+
+
 def run_save_load(ann_module, dtype):
     n_rows = 10000
     n_cols = 50
@@ -43,8 +47,11 @@ def run_save_load(ann_module, dtype):
     dataset = generate_data((n_rows, n_cols), dtype)
     dataset_device = device_ndarray(dataset)
 
-    build_params = ann_module.IndexParams()
-    index = ann_module.build(build_params, dataset_device)
+    if ann_module == brute_force:
+        index = ann_module.build(dataset_device)
+    else:
+        build_params = ann_module.IndexParams()
+        index = ann_module.build(build_params, dataset_device)
 
     assert index.trained
     filename = "my_index.bin"
@@ -54,20 +61,29 @@ def run_save_load(ann_module, dtype):
     queries = generate_data((n_queries, n_cols), dtype)
 
     queries_device = device_ndarray(queries)
-    search_params = ann_module.SearchParams()
     k = 10
-
-    distance_dev, neighbors_dev = ann_module.search(
-        search_params, index, queries_device, k
-    )
+    if ann_module == brute_force:
+        distance_dev, neighbors_dev = ann_module.search(
+            index, queries_device, k
+        )
+    else:
+        search_params = ann_module.SearchParams()
+        distance_dev, neighbors_dev = ann_module.search(
+            search_params, index, queries_device, k
+        )
 
     neighbors = neighbors_dev.copy_to_host()
     dist = distance_dev.copy_to_host()
     del index
 
-    distance_dev, neighbors_dev = ann_module.search(
-        search_params, loaded_index, queries_device, k
-    )
+    if ann_module == brute_force:
+        distance_dev, neighbors_dev = ann_module.search(
+            loaded_index, queries_device, k
+        )
+    else:
+        distance_dev, neighbors_dev = ann_module.search(
+            search_params, loaded_index, queries_device, k
+        )
 
     neighbors2 = neighbors_dev.copy_to_host()
     dist2 = distance_dev.copy_to_host()

From 5062594138a40231475299c7bac61083b0669fd1 Mon Sep 17 00:00:00 2001
From: tsuki <12711693+enp1s0@users.noreply.github.com>
Date: Tue, 26 Nov 2024 10:50:41 +0900
Subject: [PATCH 35/47] [Doc] Fix CAGRA search sample code (#484)

`.view()` is required

Authors:
  - tsuki (https://github.com/enp1s0)

Approvers:
  - Micka (https://github.com/lowener)

URL: https://github.com/rapidsai/cuvs/pull/484
---
 cpp/include/cuvs/neighbors/cagra.hpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index e48050756..5ceb3010e 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -363,7 +363,7 @@ struct index : cuvs::neighbors::index {
    *   // search K nearest neighbours
    *   auto neighbors = raft::make_device_matrix<uint32_t, int64_t>(res, n_queries, k);
    *   auto distances = raft::make_device_matrix<float, int64_t>(res, n_queries, k);
-   *   cagra::search(res, search_params, index, queries, neighbors, distances);
+   *   cagra::search(res, search_params, index, queries, neighbors.view(), distances.view());
    * @endcode
    *   In the above example, we have passed a host dataset to build. The returned index will own a
    * device copy of the dataset and the knn_graph. In contrast, if we pass the dataset as a
@@ -530,7 +530,7 @@ struct index : cuvs::neighbors::index {
  *   // search K nearest neighbours
  *   auto neighbors = raft::make_device_matrix<uint32_t>(res, n_queries, k);
  *   auto distances = raft::make_device_matrix<float>(res, n_queries, k);
- *   cagra::search(res, search_params, index, queries, neighbors, distances);
+ *   cagra::search(res, search_params, index, queries, neighbors.view(), distances.view());
  * @endcode
  *
  * @param[in] res
@@ -567,7 +567,7 @@ auto build(raft::resources const& res,
  *   // search K nearest neighbours
  *   auto neighbors = raft::make_device_matrix<uint32_t>(res, n_queries, k);
  *   auto distances = raft::make_device_matrix<float>(res, n_queries, k);
- *   cagra::search(res, search_params, index, queries, neighbors, distances);
+ *   cagra::search(res, search_params, index, queries, neighbors.view(), distances.view());
  * @endcode
  *
  * @param[in] res
@@ -604,7 +604,7 @@ auto build(raft::resources const& res,
  *   // search K nearest neighbours
  *   auto neighbors = raft::make_device_matrix<uint32_t>(res, n_queries, k);
  *   auto distances = raft::make_device_matrix<float>(res, n_queries, k);
- *   cagra::search(res, search_params, index, queries, neighbors, distances);
+ *   cagra::search(res, search_params, index, queries, neighbors.view(), distances.view());
  * @endcode
  *
  * @param[in] res
@@ -640,7 +640,7 @@ auto build(raft::resources const& res,
  *   // search K nearest neighbours
  *   auto neighbors = raft::make_device_matrix<uint32_t>(res, n_queries, k);
  *   auto distances = raft::make_device_matrix<float>(res, n_queries, k);
- *   cagra::search(res, search_params, index, queries, neighbors, distances);
+ *   cagra::search(res, search_params, index, queries, neighbors.view(), distances.view());
  * @endcode
  *
  * @param[in] res
@@ -676,7 +676,7 @@ auto build(raft::resources const& res,
  *   // search K nearest neighbours
  *   auto neighbors = raft::make_device_matrix<uint32_t>(res, n_queries, k);
  *   auto distances = raft::make_device_matrix<float>(res, n_queries, k);
- *   cagra::search(res, search_params, index, queries, neighbors, distances);
+ *   cagra::search(res, search_params, index, queries, neighbors.view(), distances.view());
  * @endcode
  *
  * @param[in] res
@@ -713,7 +713,7 @@ auto build(raft::resources const& res,
  *   // search K nearest neighbours
  *   auto neighbors = raft::make_device_matrix<uint32_t>(res, n_queries, k);
  *   auto distances = raft::make_device_matrix<float>(res, n_queries, k);
- *   cagra::search(res, search_params, index, queries, neighbors, distances);
+ *   cagra::search(res, search_params, index, queries, neighbors.view(), distances.view());
  * @endcode
  *
  * @param[in] res
@@ -750,7 +750,7 @@ auto build(raft::resources const& res,
  *   // search K nearest neighbours
  *   auto neighbors = raft::make_device_matrix<uint32_t>(res, n_queries, k);
  *   auto distances = raft::make_device_matrix<float>(res, n_queries, k);
- *   cagra::search(res, search_params, index, queries, neighbors, distances);
+ *   cagra::search(res, search_params, index, queries, neighbors.view(), distances.view());
  * @endcode
  *
  * @param[in] res
@@ -787,7 +787,7 @@ auto build(raft::resources const& res,
  *   // search K nearest neighbours
  *   auto neighbors = raft::make_device_matrix<uint32_t>(res, n_queries, k);
  *   auto distances = raft::make_device_matrix<float>(res, n_queries, k);
- *   cagra::search(res, search_params, index, queries, neighbors, distances);
+ *   cagra::search(res, search_params, index, queries, neighbors.view(), distances.view());
  * @endcode
  *
  * @param[in] res

From 441d2f1bcceb8f653a0fdaec5658c54c5201155b Mon Sep 17 00:00:00 2001
From: Divye Gala <divyegala@gmail.com>
Date: Mon, 2 Dec 2024 17:34:08 -0500
Subject: [PATCH 36/47] HNSW CPU Hierarchy (#465)

This PR adds an option to build the full HNSW hierarchy on the CPU when converting a CAGRA index to an hnswlib index. This lets us enable an `extend()` API.

For hnswlib:
1. Update to `v0.7.0`
2. Remove dependency as symbols are compiled within DSO

Authors:
  - Divye Gala (https://github.com/divyegala)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/465
---
 .../bench_ann_cuda-118_arch-aarch64.yaml      |   1 -
 .../bench_ann_cuda-118_arch-x86_64.yaml       |   1 -
 .../bench_ann_cuda-125_arch-aarch64.yaml      |   1 -
 .../bench_ann_cuda-125_arch-x86_64.yaml       |   1 -
 cpp/CMakeLists.txt                            |   1 +
 cpp/bench/ann/CMakeLists.txt                  |   4 +-
 cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu  |  34 +-
 .../ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h |  57 ++-
 cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h   |   2 +
 .../ann/src/hnswlib/hnswlib_benchmark.cpp     |   4 +-
 cpp/cmake/modules/ConfigureCUDA.cmake         |   8 +-
 cpp/cmake/patches/hnswlib.diff                | 327 ++++++-------
 cpp/cmake/patches/hnswlib_override.json       |  28 +-
 cpp/cmake/thirdparty/get_hnswlib.cmake        |   1 +
 cpp/include/cuvs/neighbors/hnsw.h             | 279 ++++++++++-
 cpp/include/cuvs/neighbors/hnsw.hpp           | 440 +++++++++++++++---
 cpp/src/neighbors/detail/hnsw.hpp             | 218 ++++++++-
 cpp/src/neighbors/hnsw.cpp                    |  57 ++-
 cpp/src/neighbors/hnsw_c.cpp                  | 157 ++++++-
 cpp/src/neighbors/iface/iface.hpp             |   1 +
 cpp/test/neighbors/ann_hnsw_c.cu              |   4 +-
 cpp/test/neighbors/hnsw.cu                    |   3 +-
 dependencies.yaml                             |   1 -
 docs/source/c_api/neighbors_hnsw_c.rst        |  22 +
 docs/source/cpp_api/neighbors_hnsw.rst        |  23 +-
 python/cuvs/cuvs/neighbors/hnsw/__init__.py   |  15 +-
 python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd      |  49 +-
 python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx      | 276 ++++++++---
 python/cuvs/cuvs/test/test_hnsw.py            |  89 +++-
 .../config/algos/cuvs_cagra_hnswlib.yaml      |   5 +-
 30 files changed, 1683 insertions(+), 426 deletions(-)

diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
index 21cb98180..1e602ccf1 100644
--- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -24,7 +24,6 @@ dependencies:
 - gcc_linux-aarch64=11.*
 - glog>=0.6.0
 - h5py>=3.8.0
-- hnswlib=0.6.2
 - libcublas-dev=11.11.3.6
 - libcublas=11.11.3.6
 - libcurand-dev=10.3.0.86
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index 432509bcb..b060e78c2 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -24,7 +24,6 @@ dependencies:
 - gcc_linux-64=11.*
 - glog>=0.6.0
 - h5py>=3.8.0
-- hnswlib=0.6.2
 - libcublas-dev=11.11.3.6
 - libcublas=11.11.3.6
 - libcurand-dev=10.3.0.86
diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
index 0c5043ac2..485122273 100644
--- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
@@ -25,7 +25,6 @@ dependencies:
 - gcc_linux-aarch64=11.*
 - glog>=0.6.0
 - h5py>=3.8.0
-- hnswlib=0.6.2
 - libcublas-dev
 - libcurand-dev
 - libcusolver-dev
diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
index cbb22333c..d5f48dadb 100644
--- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
@@ -25,7 +25,6 @@ dependencies:
 - gcc_linux-64=11.*
 - glog>=0.6.0
 - h5py>=3.8.0
-- hnswlib=0.6.2
 - libcublas-dev
 - libcurand-dev
 - libcusolver-dev
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index eb2e7c7a4..34b7cb898 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -577,6 +577,7 @@ if(BUILD_SHARED_LIBS)
 
   if(BUILD_CAGRA_HNSWLIB)
     target_link_libraries(cuvs_objs PRIVATE hnswlib::hnswlib)
+    target_compile_definitions(cuvs PUBLIC CUVS_BUILD_CAGRA_HNSWLIB)
     target_compile_definitions(cuvs_objs PUBLIC CUVS_BUILD_CAGRA_HNSWLIB)
   endif()
 
diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt
index 0f6b42ae9..c161a68bc 100644
--- a/cpp/bench/ann/CMakeLists.txt
+++ b/cpp/bench/ann/CMakeLists.txt
@@ -225,9 +225,7 @@ if(CUVS_ANN_BENCH_USE_CUVS_CAGRA)
 endif()
 
 if(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB)
-  ConfigureAnnBench(
-    NAME CUVS_CAGRA_HNSWLIB PATH src/cuvs/cuvs_cagra_hnswlib.cu LINKS cuvs hnswlib::hnswlib
-  )
+  ConfigureAnnBench(NAME CUVS_CAGRA_HNSWLIB PATH src/cuvs/cuvs_cagra_hnswlib.cu LINKS cuvs)
 endif()
 
 if(CUVS_ANN_BENCH_USE_CUVS_MG)
diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu
index 558ba01e0..e45a3bd5a 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu
+++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu
@@ -24,12 +24,35 @@
 
 namespace cuvs::bench {
 
+template <typename T, typename IdxT>
+void parse_build_param(const nlohmann::json& conf,
+                       typename cuvs::bench::cuvs_cagra_hnswlib<T, IdxT>::build_param& param)
+{
+  if (conf.contains("hierarchy")) {
+    if (conf.at("hierarchy") == "none") {
+      param.hnsw_index_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::NONE;
+    } else if (conf.at("hierarchy") == "cpu") {
+      param.hnsw_index_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::CPU;
+    } else {
+      THROW("Invalid value for hierarchy: %s", conf.at("hierarchy").get<std::string>().c_str());
+    }
+  }
+  if (conf.contains("ef_construction")) {
+    param.hnsw_index_params.ef_construction = conf.at("ef_construction");
+  }
+  if (conf.contains("num_threads")) {
+    param.hnsw_index_params.num_threads = conf.at("num_threads");
+  }
+}
+
 template <typename T, typename IdxT>
 void parse_search_param(const nlohmann::json& conf,
                         typename cuvs::bench::cuvs_cagra_hnswlib<T, IdxT>::search_param& param)
 {
-  param.ef = conf.at("ef");
-  if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); }
+  param.hnsw_search_param.ef = conf.at("ef");
+  if (conf.contains("num_threads")) {
+    param.hnsw_search_param.num_threads = conf.at("num_threads");
+  }
 }
 
 template <typename T>
@@ -43,9 +66,10 @@ auto create_algo(const std::string& algo_name,
 
   if constexpr (std::is_same_v<T, float> or std::is_same_v<T, std::uint8_t>) {
     if (algo_name == "raft_cagra_hnswlib" || algo_name == "cuvs_cagra_hnswlib") {
-      typename cuvs::bench::cuvs_cagra_hnswlib<T, uint32_t>::build_param param;
-      parse_build_param<T, uint32_t>(conf, param);
-      a = std::make_unique<cuvs::bench::cuvs_cagra_hnswlib<T, uint32_t>>(metric, dim, param);
+      typename cuvs::bench::cuvs_cagra_hnswlib<T, uint32_t>::build_param bparam;
+      ::parse_build_param<T, uint32_t>(conf, bparam.cagra_build_param);
+      parse_build_param<T, uint32_t>(conf, bparam);
+      a = std::make_unique<cuvs::bench::cuvs_cagra_hnswlib<T, uint32_t>>(metric, dim, bparam);
     }
   }
 
diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h
index 875fe0bba..e4169f6f8 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h
@@ -15,8 +15,8 @@
  */
 #pragma once
 
-#include "../hnswlib/hnswlib_wrapper.h"
 #include "cuvs_cagra_wrapper.h"
+#include <cuvs/neighbors/hnsw.hpp>
 
 #include <memory>
 
@@ -26,14 +26,20 @@ template <typename T, typename IdxT>
 class cuvs_cagra_hnswlib : public algo<T>, public algo_gpu {
  public:
   using search_param_base = typename algo<T>::search_param;
-  using build_param       = typename cuvs_cagra<T, IdxT>::build_param;
-  using search_param      = typename hnsw_lib<T>::search_param;
+
+  struct build_param {
+    typename cuvs_cagra<T, IdxT>::build_param cagra_build_param;
+    cuvs::neighbors::hnsw::index_params hnsw_index_params;
+  };
+
+  struct search_param : public search_param_base {
+    cuvs::neighbors::hnsw::search_params hnsw_search_param;
+  };
 
   cuvs_cagra_hnswlib(Metric metric, int dim, const build_param& param, int concurrent_searches = 1)
     : algo<T>(metric, dim),
-      cagra_build_{metric, dim, param, concurrent_searches},
-      // hnsw_lib param values don't matter since we don't build with hnsw_lib
-      hnswlib_search_{metric, dim, typename hnsw_lib<T>::build_param{50, 100}}
+      build_param_{param},
+      cagra_build_{metric, dim, param.cagra_build_param, concurrent_searches}
   {
   }
 
@@ -69,40 +75,67 @@ class cuvs_cagra_hnswlib : public algo<T>, public algo_gpu {
   }
 
  private:
+  raft::resources handle_{};
+  build_param build_param_;
+  search_param search_param_;
   cuvs_cagra<T, IdxT> cagra_build_;
-  hnsw_lib<T> hnswlib_search_;
+  std::shared_ptr<cuvs::neighbors::hnsw::index<T>> hnsw_index_;
 };
 
 template <typename T, typename IdxT>
 void cuvs_cagra_hnswlib<T, IdxT>::build(const T* dataset, size_t nrow)
 {
   cagra_build_.build(dataset, nrow);
+  auto* cagra_index      = cagra_build_.get_index();
+  auto host_dataset_view = raft::make_host_matrix_view<const T, int64_t>(dataset, nrow, this->dim_);
+  auto opt_dataset_view =
+    std::optional<raft::host_matrix_view<const T, int64_t>>(std::move(host_dataset_view));
+  hnsw_index_ = cuvs::neighbors::hnsw::from_cagra(
+    handle_, build_param_.hnsw_index_params, *cagra_index, opt_dataset_view);
 }
 
 template <typename T, typename IdxT>
 void cuvs_cagra_hnswlib<T, IdxT>::set_search_param(const search_param_base& param_)
 {
-  hnswlib_search_.set_search_param(param_);
+  search_param_ = dynamic_cast<const search_param&>(param_);
 }
 
 template <typename T, typename IdxT>
 void cuvs_cagra_hnswlib<T, IdxT>::save(const std::string& file) const
 {
-  cagra_build_.save_to_hnswlib(file);
+  cuvs::neighbors::hnsw::serialize(handle_, file, *(hnsw_index_.get()));
 }
 
 template <typename T, typename IdxT>
 void cuvs_cagra_hnswlib<T, IdxT>::load(const std::string& file)
 {
-  hnswlib_search_.load(file);
-  hnswlib_search_.set_base_layer_only();
+  cuvs::neighbors::hnsw::index<T>* idx = nullptr;
+  cuvs::neighbors::hnsw::deserialize(handle_,
+                                     build_param_.hnsw_index_params,
+                                     file,
+                                     this->dim_,
+                                     parse_metric_type(this->metric_),
+                                     &idx);
+  hnsw_index_ = std::shared_ptr<cuvs::neighbors::hnsw::index<T>>(idx);
 }
 
 template <typename T, typename IdxT>
 void cuvs_cagra_hnswlib<T, IdxT>::search(
   const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const
 {
-  hnswlib_search_.search(queries, batch_size, k, neighbors, distances);
+  // Only Latency mode is supported for now
+  auto queries_view =
+    raft::make_host_matrix_view<const T, int64_t>(queries, batch_size, this->dim_);
+  auto neighbors_view = raft::make_host_matrix_view<uint64_t, int64_t>(
+    reinterpret_cast<uint64_t*>(neighbors), batch_size, k);
+  auto distances_view = raft::make_host_matrix_view<float, int64_t>(distances, batch_size, k);
+
+  cuvs::neighbors::hnsw::search(handle_,
+                                search_param_.hnsw_search_param,
+                                *(hnsw_index_.get()),
+                                queries_view,
+                                neighbors_view,
+                                distances_view);
 }
 
 }  // namespace cuvs::bench
diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h
index b2ba35eee..f6d3d60fc 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h
@@ -154,6 +154,8 @@ class cuvs_cagra : public algo<T>, public algo_gpu {
   void save_to_hnswlib(const std::string& file) const;
   std::unique_ptr<algo<T>> copy() override;
 
+  auto get_index() const -> const cuvs::neighbors::cagra::index<T, IdxT>* { return index_.get(); }
+
  private:
   // handle_ must go first to make sure it dies last and all memory allocated in pool
   configured_raft_resources handle_{};
diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp
index 755c7c8d6..6e219d2a7 100644
--- a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp
+++ b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp
@@ -33,7 +33,7 @@ void parse_build_param(const nlohmann::json& conf,
 {
   param.ef_construction = conf.at("efConstruction");
   param.m               = conf.at("M");
-  if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); }
+  if (conf.contains("num_threads")) { param.num_threads = conf.at("num_threads"); }
 }
 
 template <typename T>
@@ -41,7 +41,7 @@ void parse_search_param(const nlohmann::json& conf,
                         typename cuvs::bench::hnsw_lib<T>::search_param& param)
 {
   param.ef = conf.at("ef");
-  if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); }
+  if (conf.contains("num_threads")) { param.num_threads = conf.at("num_threads"); }
 }
 
 template <typename T, template <typename> class Algo>
diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake
index 74da25660..3e91d9995 100644
--- a/cpp/cmake/modules/ConfigureCUDA.cmake
+++ b/cpp/cmake/modules/ConfigureCUDA.cmake
@@ -22,8 +22,12 @@ endif()
 # Be very strict when compiling with GCC as host compiler (and thus more lenient when compiling with
 # clang)
 if(CMAKE_COMPILER_IS_GNUCXX)
-  list(APPEND CUVS_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations)
-  list(APPEND CUVS_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations)
+  list(APPEND CUVS_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations
+       -Wno-reorder
+  )
+  list(APPEND CUVS_CUDA_FLAGS
+       -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations,-Wno-reorder
+  )
 
   # set warnings as errors
   if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2.0)
diff --git a/cpp/cmake/patches/hnswlib.diff b/cpp/cmake/patches/hnswlib.diff
index e7f89a8cc..f20c27d91 100644
--- a/cpp/cmake/patches/hnswlib.diff
+++ b/cpp/cmake/patches/hnswlib.diff
@@ -1,188 +1,159 @@
+diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h
+index bef0017..0ee7931 100644
 --- a/hnswlib/hnswalg.h
 +++ b/hnswlib/hnswalg.h
-@@ -3,6 +3,7 @@
- #include "visited_list_pool.h"
- #include "hnswlib.h"
- #include <atomic>
-+#include <limits>
- #include <random>
- #include <stdlib.h>
- #include <assert.h>
-@@ -16,6 +17,8 @@ namespace hnswlib {
-     template<typename dist_t>
-     class HierarchicalNSW : public AlgorithmInterface<dist_t> {
-     public:
-+        bool base_layer_only{false};
-+        int num_seeds=32;
-         static const tableint max_update_element_locks = 65536;
-         HierarchicalNSW(SpaceInterface<dist_t> *s) {
-         }
-@@ -56,7 +59,7 @@ namespace hnswlib {
-             visited_list_pool_ = new VisitedListPool(1, max_elements);
- 
-             //initializations for special treatment of the first node
--            enterpoint_node_ = -1;
-+            enterpoint_node_ = std::numeric_limits<tableint>::max();
-             maxlevel_ = -1;
- 
-             linkLists_ = (char **) malloc(sizeof(void *) * max_elements_);
-@@ -527,7 +530,7 @@ namespace hnswlib {
-                     tableint *datal = (tableint *) (data + 1);
-                     for (int i = 0; i < size; i++) {
-                         tableint cand = datal[i];
--                        if (cand < 0 || cand > max_elements_)
-+                        if (cand > max_elements_)
-                             throw std::runtime_error("cand error");
-                         dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_);
- 
-@@ -1067,7 +1070,7 @@ namespace hnswlib {
-                             tableint *datal = (tableint *) (data + 1);
-                             for (int i = 0; i < size; i++) {
-                                 tableint cand = datal[i];
--                                if (cand < 0 || cand > max_elements_)
-+                                if (cand > max_elements_)
-                                     throw std::runtime_error("cand error");
-                                 dist_t d = fstdistfunc_(data_point, getDataByInternalId(cand), dist_func_param_);
-                                 if (d < curdist) {
-@@ -1119,28 +1122,41 @@ namespace hnswlib {
-             tableint currObj = enterpoint_node_;
-             dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_);
- 
--            for (int level = maxlevel_; level > 0; level--) {
--                bool changed = true;
--                while (changed) {
--                    changed = false;
--                    unsigned int *data;
-+            if (base_layer_only) {
-+                // You can increase the number of seeds when testing large-scale dataset, num_seeds = 48 for 100M-scale
-+                for (int i = 0; i < num_seeds; i++) {
-+                    tableint obj = i * (max_elements_ / num_seeds);
-+                    dist_t dist = fstdistfunc_(query_data, getDataByInternalId(obj), dist_func_param_);
-+                    if (dist < curdist) {
-+                        curdist = dist;
-+                        currObj = obj;
-+                    }
+@@ -16,6 +16,9 @@ typedef unsigned int linklistsizeint;
+ template<typename dist_t>
+ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
+  public:
++    bool base_layer_only = false;
++    int num_seeds = 32;
++    bool base_layer_init = true;
+     static const tableint MAX_LABEL_OPERATION_LOCKS = 65536;
+     static const unsigned char DELETE_MARK = 0x01;
+ 
+@@ -1098,7 +1101,7 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
+ 
+         std::unique_lock <std::mutex> lock_el(link_list_locks_[cur_c]);
+         int curlevel = getRandomLevel(mult_);
+-        if (level > 0)
++        if (level > -1)
+             curlevel = level;
+ 
+         element_levels_[cur_c] = curlevel;
+@@ -1116,6 +1119,9 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
+         memcpy(getExternalLabeLp(cur_c), &label, sizeof(labeltype));
+         memcpy(getDataByInternalId(cur_c), data_point, data_size_);
+ 
++        if (!base_layer_init && curlevel == 0)
++            return cur_c;
++
+         if (curlevel) {
+             linkLists_[cur_c] = (char *) malloc(size_links_per_element_ * curlevel + 1);
+             if (linkLists_[cur_c] == nullptr)
+@@ -1138,7 +1144,7 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
+                         tableint *datal = (tableint *) (data + 1);
+                         for (int i = 0; i < size; i++) {
+                             tableint cand = datal[i];
+-                            if (cand < 0 || cand > max_elements_)
++                            if (static_cast<int>(cand) < 0 || cand > max_elements_)
+                                 throw std::runtime_error("cand error");
+                             dist_t d = fstdistfunc_(data_point, getDataByInternalId(cand), dist_func_param_);
+                             if (d < curdist) {
+@@ -1188,28 +1194,41 @@ class HierarchicalNSW : public AlgorithmInterface<dist_t> {
+         tableint currObj = enterpoint_node_;
+         dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_);
+ 
+-        for (int level = maxlevel_; level > 0; level--) {
+-            bool changed = true;
+-            while (changed) {
+-                changed = false;
+-                unsigned int *data;
++        if (base_layer_only) {
++            // You can increase the number of seeds when testing large-scale dataset, num_seeds = 48 for 100M-scale
++            for (int i = 0; i < num_seeds; i++) {
++                tableint obj = i * (max_elements_ / num_seeds);
++                dist_t dist = fstdistfunc_(query_data, getDataByInternalId(obj), dist_func_param_);
++                if (dist < curdist) {
++                    curdist = dist;
++                    currObj = obj;
 +                }
 +            }
-+            else{
-+                for (int level = maxlevel_; level > 0; level--) {
-+                    bool changed = true;
-+                    while (changed) {
-+                        changed = false;
-+                        unsigned int *data;
- 
--                    data = (unsigned int *) get_linklist(currObj, level);
--                    int size = getListCount(data);
--                    metric_hops++;
--                    metric_distance_computations+=size;
-+                        data = (unsigned int *) get_linklist(currObj, level);
-+                        int size = getListCount(data);
-+                        metric_hops++;
-+                        metric_distance_computations+=size;
- 
--                    tableint *datal = (tableint *) (data + 1);
--                    for (int i = 0; i < size; i++) {
--                        tableint cand = datal[i];
--                        if (cand < 0 || cand > max_elements_)
--                            throw std::runtime_error("cand error");
--                        dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_);
-+                        tableint *datal = (tableint *) (data + 1);
-+                        for (int i = 0; i < size; i++) {
-+                            tableint cand = datal[i];
-+                            if (cand > max_elements_)
-+                                throw std::runtime_error("cand error");
-+                            dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_);
- 
--                        if (d < curdist) {
--                            curdist = d;
--                            currObj = cand;
--                            changed = true;
-+                            if (d < curdist) {
-+                                curdist = d;
-+                                currObj = cand;
-+                                changed = true;
-+                            }
-                         }
++        }
++        else {
++            for (int level = maxlevel_; level > 0; level--) {
++                bool changed = true;
++                while (changed) {
++                    changed = false;
++                    unsigned int *data;
+ 
+-                data = (unsigned int *) get_linklist(currObj, level);
+-                int size = getListCount(data);
+-                metric_hops++;
+-                metric_distance_computations+=size;
++                    data = (unsigned int *) get_linklist(currObj, level);
++                    int size = getListCount(data);
++                    metric_hops++;
++                    metric_distance_computations+=size;
++
++                    tableint *datal = (tableint *) (data + 1);
++                    for (int i = 0; i < size; i++) {
++                        tableint cand = datal[i];
++                        if (static_cast<int>(cand) < 0 || cand > max_elements_)
++                            throw std::runtime_error("cand error");
++                        dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_);
+ 
+-                tableint *datal = (tableint *) (data + 1);
+-                for (int i = 0; i < size; i++) {
+-                    tableint cand = datal[i];
+-                    if (cand < 0 || cand > max_elements_)
+-                        throw std::runtime_error("cand error");
+-                    dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_);
+-
+-                    if (d < curdist) {
+-                        curdist = d;
+-                        currObj = cand;
+-                        changed = true;
++                        if (d < curdist) {
++                            curdist = d;
++                            currObj = cand;
++                            changed = true;
++                        }
                      }
                  }
+             }
 diff --git a/hnswlib/space_l2.h b/hnswlib/space_l2.h
-index 4413537..c3240f3 100644
+index 834d19f..0c0af26 100644
 --- a/hnswlib/space_l2.h
 +++ b/hnswlib/space_l2.h
-@@ -252,13 +252,14 @@ namespace hnswlib {
-         ~L2Space() {}
-     };
- 
-+    template <typename T>
-     static int
-     L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const void *__restrict qty_ptr) {
- 
-         size_t qty = *((size_t *) qty_ptr);
-         int res = 0;
--        unsigned char *a = (unsigned char *) pVect1;
--        unsigned char *b = (unsigned char *) pVect2;
-+        T *a = (T *) pVect1;
-+        T *b = (T *) pVect2;
- 
-         qty = qty >> 2;
-         for (size_t i = 0; i < qty; i++) {
-@@ -279,11 +280,12 @@ namespace hnswlib {
-         return (res);
-     }
- 
-+    template <typename T>
-     static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2, const void* __restrict qty_ptr) {
-         size_t qty = *((size_t*)qty_ptr);
-         int res = 0;
--        unsigned char* a = (unsigned char*)pVect1;
--        unsigned char* b = (unsigned char*)pVect2;
-+        T* a = (T*)pVect1;
-+        T* b = (T*)pVect2;
- 
-         for(size_t i = 0; i < qty; i++)
-         {
-@@ -294,6 +296,7 @@ namespace hnswlib {
-         return (res);
-     }
- 
-+    template <typename T>
-     class L2SpaceI : public SpaceInterface<int> {
- 
-         DISTFUNC<int> fstdistfunc_;
-@@ -302,10 +305,10 @@ namespace hnswlib {
-     public:
-         L2SpaceI(size_t dim) {
-             if(dim % 4 == 0) {
--                fstdistfunc_ = L2SqrI4x;
-+                fstdistfunc_ = L2SqrI4x<T>;
-             }
-             else {
--                fstdistfunc_ = L2SqrI;
-+                fstdistfunc_ = L2SqrI<T>;
-             }
-             dim_ = dim;
-             data_size_ = dim * sizeof(unsigned char);
-diff --git a/hnswlib/visited_list_pool.h b/hnswlib/visited_list_pool.h
-index 5e1a4a5..4195ebd 100644
---- a/hnswlib/visited_list_pool.h
-+++ b/hnswlib/visited_list_pool.h
-@@ -3,6 +3,7 @@
- #include <mutex>
- #include <string.h>
- #include <deque>
-+#include <limits>
- 
- namespace hnswlib {
-     typedef unsigned short int vl_type;
-@@ -14,7 +15,7 @@ namespace hnswlib {
-         unsigned int numelements;
- 
-         VisitedList(int numelements1) {
--            curV = -1;
-+            curV = std::numeric_limits<vl_type>::max();
-             numelements = numelements1;
-             mass = new vl_type[numelements];
+@@ -252,12 +252,13 @@ class L2Space : public SpaceInterface<float> {
+     ~L2Space() {}
+ };
+ 
++template <typename T>
+ static int
+ L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const void *__restrict qty_ptr) {
+     size_t qty = *((size_t *) qty_ptr);
+     int res = 0;
+-    unsigned char *a = (unsigned char *) pVect1;
+-    unsigned char *b = (unsigned char *) pVect2;
++    T *a = (T *) pVect1;
++    T *b = (T *) pVect2;
+ 
+     qty = qty >> 2;
+     for (size_t i = 0; i < qty; i++) {
+@@ -277,11 +278,12 @@ L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const voi
+     return (res);
+ }
+ 
++template <typename T>
+ static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2, const void* __restrict qty_ptr) {
+     size_t qty = *((size_t*)qty_ptr);
+     int res = 0;
+-    unsigned char* a = (unsigned char*)pVect1;
+-    unsigned char* b = (unsigned char*)pVect2;
++    T* a = (T*)pVect1;
++    T* b = (T*)pVect2;
+ 
+     for (size_t i = 0; i < qty; i++) {
+         res += ((*a) - (*b)) * ((*a) - (*b));
+@@ -291,6 +293,7 @@ static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2,
+     return (res);
+ }
+ 
++template <typename T>
+ class L2SpaceI : public SpaceInterface<int> {
+     DISTFUNC<int> fstdistfunc_;
+     size_t data_size_;
+@@ -299,9 +302,9 @@ class L2SpaceI : public SpaceInterface<int> {
+  public:
+     L2SpaceI(size_t dim) {
+         if (dim % 4 == 0) {
+-            fstdistfunc_ = L2SqrI4x;
++            fstdistfunc_ = L2SqrI4x<T>;
+         } else {
+-            fstdistfunc_ = L2SqrI;
++            fstdistfunc_ = L2SqrI<T>;
          }
--- 
-2.43.0
-
+         dim_ = dim;
+         data_size_ = dim * sizeof(unsigned char);
diff --git a/cpp/cmake/patches/hnswlib_override.json b/cpp/cmake/patches/hnswlib_override.json
index aef2da772..c50220e24 100644
--- a/cpp/cmake/patches/hnswlib_override.json
+++ b/cpp/cmake/patches/hnswlib_override.json
@@ -1,16 +1,16 @@
 {
-    "packages" : {
-      "hnswlib" : {
-        "version": "0.6.2",
-        "git_url": "https://github.com/nmslib/hnswlib.git",
-        "git_tag": "v${version}",
-        "patches" : [
-          {
-            "file" : "${current_json_dir}/hnswlib.diff",
-            "issue" : "Correct compilation issues",
-            "fixed_in" : ""
-          }
-        ]
-      }
+  "packages": {
+    "hnswlib": {
+      "version": "0.7.0",
+      "git_url": "https://github.com/nmslib/hnswlib.git",
+      "git_tag": "v${version}",
+      "patches": [
+        {
+          "file": "${current_json_dir}/hnswlib.diff",
+          "issue": "Correct compilation issues",
+          "fixed_in": ""
+        }
+      ]
     }
-  }
\ No newline at end of file
+  }
+}
\ No newline at end of file
diff --git a/cpp/cmake/thirdparty/get_hnswlib.cmake b/cpp/cmake/thirdparty/get_hnswlib.cmake
index 2e6c895e5..5b4d89aa2 100644
--- a/cpp/cmake/thirdparty/get_hnswlib.cmake
+++ b/cpp/cmake/thirdparty/get_hnswlib.cmake
@@ -15,6 +15,7 @@
 #=============================================================================
 
 function(find_and_configure_hnswlib)
+  message(STATUS "Finding or building hnswlib")
   set(oneValueArgs)
 
   include(${rapids-cmake-dir}/cpm/package_override.cmake)
diff --git a/cpp/include/cuvs/neighbors/hnsw.h b/cpp/include/cuvs/neighbors/hnsw.h
index 0495c574a..b7eda54b8 100644
--- a/cpp/include/cuvs/neighbors/hnsw.h
+++ b/cpp/include/cuvs/neighbors/hnsw.h
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include "cagra.h"
+
 #include <cuvs/core/c_api.h>
 #include <cuvs/distance/distance.h>
 #include <dlpack/dlpack.h>
@@ -27,32 +29,51 @@ extern "C" {
 #endif
 
 /**
- * @defgroup hnsw_c_search_params C API for hnswlib wrapper search params
+ * @defgroup hnsw_c_index_params C API for HNSW index params
  * @{
  */
 
-struct cuvsHnswSearchParams {
-  int32_t ef;
-  int32_t numThreads;
+/**
+ * @brief Hierarchy for HNSW index when converting from CAGRA index
+ *
+ * NOTE: When the value is `NONE`, the HNSW index is built as a base-layer-only index.
+ */
+enum cuvsHnswHierarchy {
+  /* Flat hierarchy, search is base-layer only */
+  NONE,
+  /* Full hierarchy is built using the CPU */
+  CPU
 };
 
-typedef struct cuvsHnswSearchParams* cuvsHnswSearchParams_t;
+struct cuvsHnswIndexParams {
+  /* hierarchy of the hnsw index */
+  cuvsHnswHierarchy hierarchy;
+  /** Size of the candidate list during hierarchy construction when hierarchy is `CPU`*/
+  int ef_construction;
+  /** Number of host threads to use to construct hierarchy when hierarchy is `CPU`
+  NOTE: Constructing the hierarchy when converting from a CAGRA graph is highly sensitive
+  to parallelism, and increasing the number of threads can reduce the quality of the index.
+   */
+  int num_threads;
+};
+
+typedef struct cuvsHnswIndexParams* cuvsHnswIndexParams_t;
 
 /**
- * @brief Allocate HNSW search params, and populate with default values
+ * @brief Allocate HNSW Index params, and populate with default values
  *
- * @param[in] params cuvsHnswSearchParams_t to allocate
+ * @param[in] params cuvsHnswIndexParams_t to allocate
  * @return cuvsError_t
  */
-cuvsError_t cuvsHnswSearchParamsCreate(cuvsHnswSearchParams_t* params);
+cuvsError_t cuvsHnswIndexParamsCreate(cuvsHnswIndexParams_t* params);
 
 /**
- * @brief De-allocate HNSW search params
+ * @brief De-allocate HNSW Index params
  *
- * @param[in] params cuvsHnswSearchParams_t to de-allocate
+ * @param[in] params
  * @return cuvsError_t
  */
-cuvsError_t cuvsHnswSearchParamsDestroy(cuvsHnswSearchParams_t params);
+cuvsError_t cuvsHnswIndexParamsDestroy(cuvsHnswIndexParams_t params);
 
 /**
  * @}
@@ -90,6 +111,184 @@ cuvsError_t cuvsHnswIndexCreate(cuvsHnswIndex_t* index);
  */
 cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index);
 
+/**
+ * @}
+ */
+
+/**
+ * @defgroup hnsw_c_extend_params Parameters for extending HNSW index
+ * @{
+ */
+
+struct cuvsHnswExtendParams {
+  /** Number of CPU threads used to extend additional vectors */
+  int num_threads;
+};
+
+typedef struct cuvsHnswExtendParams* cuvsHnswExtendParams_t;
+
+/**
+ * @brief Allocate HNSW extend params, and populate with default values
+ *
+ * @param[in] params cuvsHnswExtendParams_t to allocate
+ * @return cuvsError_t
+ */
+cuvsError_t cuvsHnswExtendParamsCreate(cuvsHnswExtendParams_t* params);
+
+/**
+ * @brief De-allocate HNSW extend params
+ *
+ * @param[in] params cuvsHnswExtendParams_t to de-allocate
+ * @return cuvsError_t
+ */
+
+cuvsError_t cuvsHnswExtendParamsDestroy(cuvsHnswExtendParams_t params);
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup hnsw_c_index_load Load CAGRA index as hnswlib index
+ * @{
+ */
+
+/**
+ * @brief Convert a CAGRA Index to an HNSW index.
+ * NOTE: When hierarchy is:
+ *       1. `NONE`: This method uses the filesystem to write the CAGRA index in
+ * `/tmp/<random_number>.bin` before reading it as an hnswlib index, then deleting the temporary
+ * file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS, as
+ * the format is not compatible with the original hnswlib.
+ *       2. `CPU`: The returned index is mutable and can be extended with additional vectors. The
+ * serialized index is also compatible with the original hnswlib library.
+ *
+ * @param[in] res cuvsResources_t opaque C handle
+ * @param[in] params cuvsHnswIndexParams_t used to load Hnsw index
+ * @param[in] cagra_index cuvsCagraIndex_t to convert to HNSW index
+ * @param[out] hnsw_index cuvsHnswIndex_t to return the HNSW index
+ *
+ * @return cuvsError_t
+ *
+ * @code{.c}
+ * #include <cuvs/core/c_api.h>
+ * #include <cuvs/neighbors/cagra.h>
+ * #include <cuvs/neighbors/hnsw.h>
+ *
+ * // Create cuvsResources_t
+ * cuvsResources_t res;
+ * cuvsError_t res_create_status = cuvsResourcesCreate(&res);
+ *
+ * // create a CAGRA index with `cuvsCagraBuild`
+ *
+ * // Convert the CAGRA index to an HNSW index
+ * cuvsHnswIndex_t hnsw_index;
+ * cuvsHnswIndexCreate(&hnsw_index);
+ * cuvsHnswIndexParams_t hnsw_params;
+ * cuvsHnswIndexParamsCreate(&hnsw_params);
+ * cuvsHnswFromCagra(res, hnsw_params, cagra_index, hnsw_index);
+ *
+ * // de-allocate `hnsw_params`, `hnsw_index` and `res`
+ * cuvsError_t hnsw_params_destroy_status = cuvsHnswIndexParamsDestroy(hnsw_params);
+ * cuvsError_t hnsw_index_destroy_status = cuvsHnswIndexDestroy(hnsw_index);
+ * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res);
+ * @endcode
+ */
+cuvsError_t cuvsHnswFromCagra(cuvsResources_t res,
+                              cuvsHnswIndexParams_t params,
+                              cuvsCagraIndex_t cagra_index,
+                              cuvsHnswIndex_t hnsw_index);
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup hnsw_c_index_extend Extend HNSW index with additional vectors
+ * @{
+ */
+
+/**
+ * @brief Add new vectors to an HNSW index
+ * NOTE: The HNSW index can only be extended when the hierarchy is `CPU`
+ *       when converting from a CAGRA index.
+
+ * @param[in] res cuvsResources_t opaque C handle
+ * @param[in] params cuvsHnswExtendParams_t used to extend Hnsw index
+ * @param[in] additional_dataset DLManagedTensor* additional dataset to extend the index
+ * @param[inout] index cuvsHnswIndex_t to extend
+  *
+  * @return cuvsError_t
+  *
+  * @code{.c}
+  * #include <cuvs/core/c_api.h>
+  * #include <cuvs/neighbors/cagra.h>
+  * #include <cuvs/neighbors/hnsw.h>
+  *
+  * // Create cuvsResources_t
+  * cuvsResources_t res;
+  * cuvsError_t res_create_status = cuvsResourcesCreate(&res);
+  *
+  * // create an index with `cuvsCagraBuild`
+  *
+  * // Convert the CAGRA index to an HNSW index
+  * cuvsHnswIndex_t hnsw_index;
+  * cuvsHnswIndexCreate(&hnsw_index);
+  * cuvsHnswIndexParams_t hnsw_params;
+  * cuvsHnswIndexParamsCreate(&hnsw_params);
+  * cuvsHnswFromCagra(res, hnsw_params, cagra_index, hnsw_index);
+  *
+  * // Extend the HNSW index with additional vectors
+  * DLManagedTensor additional_dataset;
+  * cuvsHnswExtendParams_t extend_params;
+  * cuvsHnswExtendParamsCreate(&extend_params);
+  * cuvsHnswExtend(res, extend_params, additional_dataset, hnsw_index);
+  *
+  * // de-allocate `hnsw_params`, `hnsw_index`, `extend_params` and `res`
+  * cuvsError_t hnsw_params_destroy_status = cuvsHnswIndexParamsDestroy(hnsw_params);
+  * cuvsError_t hnsw_index_destroy_status = cuvsHnswIndexDestroy(hnsw_index);
+  * cuvsError_t extend_params_destroy_status = cuvsHnswExtendParamsDestroy(extend_params);
+  * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res);
+  * @endcode
+  */
+
+cuvsError_t cuvsHnswExtend(cuvsResources_t res,
+                           cuvsHnswExtendParams_t params,
+                           DLManagedTensor* additional_dataset,
+                           cuvsHnswIndex_t index);
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup hnsw_c_search_params C API for hnswlib wrapper search params
+ * @{
+ */
+
+struct cuvsHnswSearchParams {
+  int32_t ef;
+  int32_t num_threads;
+};
+
+typedef struct cuvsHnswSearchParams* cuvsHnswSearchParams_t;
+
+/**
+ * @brief Allocate HNSW search params, and populate with default values
+ *
+ * @param[in] params cuvsHnswSearchParams_t to allocate
+ * @return cuvsError_t
+ */
+cuvsError_t cuvsHnswSearchParamsCreate(cuvsHnswSearchParams_t* params);
+
+/**
+ * @brief De-allocate HNSW search params
+ *
+ * @param[in] params cuvsHnswSearchParams_t to de-allocate
+ * @return cuvsError_t
+ */
+cuvsError_t cuvsHnswSearchParamsDestroy(cuvsHnswSearchParams_t params);
+
 /**
  * @}
  */
@@ -111,8 +310,8 @@ cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index);
  *          c. `kDLDataType.code == kDLUInt` and `kDLDataType.bits = 8`
  *        2. `neighbors`: `kDLDataType.code == kDLUInt` and `kDLDataType.bits = 64`
  *        3. `distances`: `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 32`
- * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS,
- *       as the format is not compatible with the original hnswlib.
+ * NOTE: When hierarchy is `NONE`, the HNSW index can only be searched by the hnswlib wrapper in
+ * cuVS, as the format is not compatible with the original hnswlib.
  *
  * @code {.c}
  * #include <cuvs/core/c_api.h>
@@ -131,7 +330,7 @@ cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index);
  * cuvsHnswSearchParams_t params;
  * cuvsError_t params_create_status = cuvsHnswSearchParamsCreate(&params);
  *
- * // Search the `index` built using `cuvsHnswBuild`
+ * // Search the `index` built using `cuvsHnswFromCagra`
  * cuvsError_t search_status = cuvsHnswSearch(res, params, index, &queries, &neighbors,
  * &distances);
  *
@@ -142,7 +341,7 @@ cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index);
  *
  * @param[in] res cuvsResources_t opaque C handle
  * @param[in] params cuvsHnswSearchParams_t used to search Hnsw index
- * @param[in] index cuvsHnswIndex which has been returned by `cuvsHnswBuild`
+ * @param[in] index cuvsHnswIndex which has been returned by `cuvsHnswFromCagra`
  * @param[in] queries DLManagedTensor* queries dataset to search
  * @param[out] neighbors DLManagedTensor* output `k` neighbors for queries
  * @param[out] distances DLManagedTensor* output `k` distances for queries
@@ -163,9 +362,50 @@ cuvsError_t cuvsHnswSearch(cuvsResources_t res,
  * @{
  */
 
+/**
+ * @brief Serialize a CAGRA index to a file as an hnswlib index
+ * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the
+ * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib.
+ * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib
+ * library.
+ *
+ * @param[in] res cuvsResources_t opaque C handle
+ * @param[in] filename the name of the file to save the index
+ * @param[in] index cuvsHnswIndex_t to serialize
+ * @return cuvsError_t
+ *
+ * @code{.c}
+ * #include <cuvs/core/c_api.h>
+ * #include <cuvs/neighbors/cagra.h>
+ * #include <cuvs/neighbors/hnsw.h>
+ *
+ * // Create cuvsResources_t
+ * cuvsResources_t res;
+ * cuvsError_t res_create_status = cuvsResourcesCreate(&res);
+ *
+ * // create an index with `cuvsCagraBuild`
+ *
+ * // Convert the CAGRA index to an HNSW index
+ * cuvsHnswIndex_t hnsw_index;
+ * cuvsHnswIndexCreate(&hnsw_index);
+ * cuvsHnswIndexParams_t hnsw_params;
+ * cuvsHnswIndexParamsCreate(&hnsw_params);
+ * cuvsHnswFromCagra(res, hnsw_params, cagra_index, hnsw_index);
+ *
+ * // Serialize the HNSW index
+ * cuvsHnswSerialize(res, "/path/to/index", hnsw_index);
+ *
+ * // de-allocate `hnsw_params`, `hnsw_index` and `res`
+ * cuvsError_t hnsw_params_destroy_status = cuvsHnswIndexParamsDestroy(hnsw_params);
+ * cuvsError_t hnsw_index_destroy_status = cuvsHnswIndexDestroy(hnsw_index);
+ * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res);
+ * @endcode
+ */
+cuvsError_t cuvsHnswSerialize(cuvsResources_t res, const char* filename, cuvsHnswIndex_t index);
+
 /**
  * Load hnswlib index from file which was serialized from a HNSW index.
- * NOTE: The loaded hnswlib index is immutable, and only be read by the
+ * NOTE: When hierarchy is `NONE`, the loaded hnswlib index is immutable, and only be read by the
  * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib.
  * Experimental, both the API and the serialization format are subject to change.
  *
@@ -185,17 +425,22 @@ cuvsError_t cuvsHnswSearch(cuvsResources_t res,
  * // The index should have the same dtype as the one used to build CAGRA the index
  * cuvsHnswIndex_t hnsw_index;
  * cuvsHnswIndexCreate(&hnsw_index);
+ * cuvsHnsWIndexParams_t hnsw_params;
+ * cuvsHnswIndexParamsCreate(&hnsw_params);
+ * hnsw_params->hierarchy = NONE;
  * hnsw_index->dtype = index->dtype;
- * cuvsCagraDeserialize(res, "/path/to/index", hnsw_index);
+ * cuvsHnswDeserialize(res, hnsw_params, "/path/to/index", dim, metric hnsw_index);
  * @endcode
  *
  * @param[in] res cuvsResources_t opaque C handle
+ * @param[in] params cuvsHnswIndexParams_t used to load Hnsw index
  * @param[in] filename the name of the file that stores the index
  * @param[in] dim the dimension of the vectors in the index
  * @param[in] metric the distance metric used to build the index
  * @param[out] index HNSW index loaded disk
  */
 cuvsError_t cuvsHnswDeserialize(cuvsResources_t res,
+                                cuvsHnswIndexParams_t params,
                                 const char* filename,
                                 int dim,
                                 cuvsDistanceType metric,
diff --git a/cpp/include/cuvs/neighbors/hnsw.hpp b/cpp/include/cuvs/neighbors/hnsw.hpp
index d5abd6d55..f0b433d8e 100644
--- a/cpp/include/cuvs/neighbors/hnsw.hpp
+++ b/cpp/include/cuvs/neighbors/hnsw.hpp
@@ -34,14 +34,30 @@
 namespace cuvs::neighbors::hnsw {
 
 /**
- * @defgroup hnsw_cpp_search_params Build CAGRA index and search with hnswlib
+ * @defgroup hnsw_cpp_index_params hnswlib index wrapper params
  * @{
  */
 
-struct search_params : cuvs::neighbors::search_params {
-  int ef;               // size of the candidate list
-  int num_threads = 0;  // number of host threads to use for concurrent searches. Value of 0
-                        // automatically maximizes parallelism
+/**
+ * @brief Hierarchy for HNSW index when converting from CAGRA index
+ *
+ * NOTE: When the value is `NONE`, the HNSW index is built as a base-layer-only index.
+ */
+enum class HnswHierarchy {
+  NONE,  // base-layer-only index
+  CPU    // full index with CPU-built hierarchy
+};
+
+struct index_params : cuvs::neighbors::index_params {
+  /** Hierarchy build type for HNSW index when converting from CAGRA index */
+  HnswHierarchy hierarchy = HnswHierarchy::NONE;
+  /** Size of the candidate list during hierarchy construction when hierarchy is `CPU`*/
+  int ef_construction = 200;
+  /** Number of host threads to use to construct hierarchy when hierarchy is `CPU`
+  NOTE: Constructing the hierarchy when converting from a CAGRA graph is highly sensitive
+  to parallelism, and increasing the number of threads can reduce the quality of the index.
+   */
+  int num_threads = 2;
 };
 
 /**@}*/
@@ -62,8 +78,12 @@ struct index : cuvs::neighbors::index {
    *
    * @param[in] dim dimensions of the training dataset
    * @param[in] metric distance metric to search. Supported metrics ("L2Expanded", "InnerProduct")
+   * @param[in] hierarchy hierarchy used for upper HNSW layers
    */
-  index(int dim, cuvs::distance::DistanceType metric) : dim_{dim}, metric_{metric} {}
+  index(int dim, cuvs::distance::DistanceType metric, HnswHierarchy hierarchy = HnswHierarchy::NONE)
+    : dim_{dim}, metric_{metric}, hierarchy_{hierarchy}
+  {
+  }
 
   virtual ~index() {}
 
@@ -76,6 +96,8 @@ struct index : cuvs::neighbors::index {
 
   auto metric() const -> cuvs::distance::DistanceType { return metric_; }
 
+  auto hierarchy() const -> HnswHierarchy { return hierarchy_; }
+
   /**
   @brief Set ef for search
   */
@@ -84,24 +106,41 @@ struct index : cuvs::neighbors::index {
  private:
   int dim_;
   cuvs::distance::DistanceType metric_;
+  HnswHierarchy hierarchy_;
 };
 
 /**@}*/
 
+/**
+ * @defgroup hnsw_cpp_extend_params HNSW index extend parameters
+ * @{
+ */
+
+struct extend_params {
+  /** Number of host threads to use to add additional vectors to the index.
+  Value of 0 automatically maximizes parallelism. */
+  int num_threads = 0;
+};
+
 /**
  * @defgroup hnsw_cpp_index_load Load CAGRA index as hnswlib index
  * @{
  */
 
 /**
- * @brief Construct an immutable hnswlib base-layer-only index from a CAGRA index
- * NOTE: This method uses the filesystem to write the CAGRA index in `/tmp/<random_number>.bin`
- * before reading it as an hnswlib index, then deleting the temporary file. The returned index
- * is immutable and can only be searched by the hnswlib wrapper in cuVS, as the format is not
- * compatible with the original hnswlib.
+ * @brief Construct an hnswlib index from a CAGRA index
+ * NOTE: When `hnsw::index_params.hierarchy` is:
+ *       1. `NONE`: This method uses the filesystem to write the CAGRA index in
+ * `/tmp/<random_number>.bin` before reading it as an hnswlib index, then deleting the temporary
+ * file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS, as
+ * the format is not compatible with the original hnswlib.
+ *       2. `CPU`: The returned index is mutable and can be extended with additional vectors. The
+ * serialized index is also compatible with the original hnswlib library.
  *
  * @param[in] res raft resources
+ * @param[in] params hnsw index parameters
  * @param[in] cagra_index cagra index
+ * @param[in] dataset optional dataset to avoid extra memory copy when hierarchy is `CPU`
  *
  * Usage example:
  * @code{.cpp}
@@ -110,24 +149,34 @@ struct index : cuvs::neighbors::index {
  *   // use default index parameters
  *   cagra::index_params index_params;
  *   // create and fill the index from a [N, D] dataset
- *   auto index = cagra::build<float, uint32_t>(res, index_params, dataset);
+ *   auto index = cagra::build(res, index_params, dataset);
  *
- *   // Load CAGRA index as base-layer-only hnswlib index
- *   auto hnsw_index = hnsw::from_cagra(res, index);
+ *   // Load CAGRA index as an HNSW index
+ *   hnsw::index_params hnsw_params;
+ *   auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index);
  * @endcode
  */
 std::unique_ptr<index<float>> from_cagra(
-  raft::resources const& res, const cuvs::neighbors::cagra::index<float, uint32_t>& cagra_index);
+  raft::resources const& res,
+  const index_params& params,
+  const cuvs::neighbors::cagra::index<float, uint32_t>& cagra_index,
+  std::optional<raft::host_matrix_view<const float, int64_t, raft::row_major>> dataset =
+    std::nullopt);
 
 /**
- * @brief Construct an immutable hnswlib base-layer-only index from a CAGRA index
- * NOTE: This method uses the filesystem to write the CAGRA index in `/tmp/<random_number>.bin`
- * before reading it as an hnswlib index, then deleting the temporary file.  The returned index
- * is immutable and can only be searched by the hnswlib wrapper in cuVS, as the format is not
- * compatible with the original hnswlib.
+ * @brief Construct an hnswlib index from a CAGRA index
+ * NOTE: When `hnsw::index_params.hierarchy` is:
+ *       1. `NONE`: This method uses the filesystem to write the CAGRA index in
+ * `/tmp/<random_number>.bin` before reading it as an hnswlib index, then deleting the temporary
+ * file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS, as
+ * the format is not compatible with the original hnswlib.
+ *       2. `CPU`: The returned index is mutable and can be extended with additional vectors. The
+ * serialized index is also compatible with the original hnswlib library.
  *
  * @param[in] res raft resources
+ * @param[in] params hnsw index parameters
  * @param[in] cagra_index cagra index
+ * @param[in] dataset optional dataset to avoid extra memory copy when hierarchy is `CPU`
  *
  * Usage example:
  * @code{.cpp}
@@ -136,24 +185,34 @@ std::unique_ptr<index<float>> from_cagra(
  *   // use default index parameters
  *   cagra::index_params index_params;
  *   // create and fill the index from a [N, D] dataset
- *   auto index = cagra::build<uint8_t, uint32_t>(res, index_params, dataset);
+ *   auto index = cagra::build(res, index_params, dataset);
  *
- *   // Load CAGRA index as base-layer-only hnswlib index
- *   auto hnsw_index = hnsw::from_cagra(res, index);
+ *   // Load CAGRA index as an HNSW index
+ *   hnsw::index_params hnsw_params;
+ *   auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index);
  * @endcode
  */
 std::unique_ptr<index<uint8_t>> from_cagra(
-  raft::resources const& res, const cuvs::neighbors::cagra::index<uint8_t, uint32_t>& cagra_index);
+  raft::resources const& res,
+  const index_params& params,
+  const cuvs::neighbors::cagra::index<uint8_t, uint32_t>& cagra_index,
+  std::optional<raft::host_matrix_view<const uint8_t, int64_t, raft::row_major>> dataset =
+    std::nullopt);
 
 /**
- * @brief Construct an immutable hnswlib base-layer-only index from a CAGRA index
- * NOTE: This method uses the filesystem to write the CAGRA index in `/tmp/<random_number>.bin`
- * before reading it as an hnswlib index, then deleting the temporary file.  The returned index
- * is immutable and can only be searched by the hnswlib wrapper in cuVS, as the format is not
- * compatible with the original hnswlib.
+ * @brief Construct an hnswlib index from a CAGRA index
+ * NOTE: When `hnsw::index_params.hierarchy` is:
+ *       1. `NONE`: This method uses the filesystem to write the CAGRA index in
+ * `/tmp/<random_number>.bin` before reading it as an hnswlib index, then deleting the temporary
+ * file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS, as
+ * the format is not compatible with the original hnswlib.
+ *       2. `CPU`: The returned index is mutable and can be extended with additional vectors. The
+ * serialized index is also compatible with the original hnswlib library.
  *
  * @param[in] res raft resources
+ * @param[in] params hnsw index parameters
  * @param[in] cagra_index cagra index
+ * @param[in] dataset optional dataset to avoid extra memory copy when hierarchy is `CPU`
  *
  * Usage example:
  * @code{.cpp}
@@ -162,14 +221,138 @@ std::unique_ptr<index<uint8_t>> from_cagra(
  *   // use default index parameters
  *   cagra::index_params index_params;
  *   // create and fill the index from a [N, D] dataset
- *   auto index = cagra::build<int8_t, uint32_t>(res, index_params, dataset);
+ *   auto index = cagra::build(res, index_params, dataset);
  *
- *   // Load CAGRA index as base-layer-only hnswlib index
- *   auto hnsw_index = hnsw::from_cagra(res, index);
+ *   // Load CAGRA index as an HNSW index
+ *   hnsw::index_params hnsw_params;
+ *   auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index);
  * @endcode
  */
 std::unique_ptr<index<int8_t>> from_cagra(
-  raft::resources const& res, const cuvs::neighbors::cagra::index<int8_t, uint32_t>& cagra_index);
+  raft::resources const& res,
+  const index_params& params,
+  const cuvs::neighbors::cagra::index<int8_t, uint32_t>& cagra_index,
+  std::optional<raft::host_matrix_view<const int8_t, int64_t, raft::row_major>> dataset =
+    std::nullopt);
+
+/**@}*/
+
+/**
+ * @defgroup hnsw_cpp_index_extend Extend HNSW index with additional vectors
+ * @{
+ */
+
+/**
+ * @brief Add new vectors to an HNSW index
+ * NOTE: The HNSW index can only be extended when the `hnsw::index_params.hierarchy` is `CPU`
+ *       when converting from a CAGRA index.
+ *
+ * @param[in] res raft resources
+ * @param[in] params configure the extend
+ * @param[in] additional_dataset a host matrix view to a row-major matrix [n_rows, index->dim()]
+ * @param[inout] idx HNSW index to extend
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // Build a CAGRA index
+ *   using namespace cuvs::neighbors;
+ *   cagra::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = cagra::build(res, index_params, dataset);
+ *
+ *   // Load CAGRA index as an HNSW index
+ *   hnsw::index_params hnsw_params;
+ *   hnsw_params.hierarchy = hnsw::HnswHierarchy::CPU;
+ *   auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index);
+ *
+ *   // Extend the HNSW index with additional vectors
+ *   auto additional_dataset = raft::make_host_matrix<float>(res, add_size, index->dim());
+ *   hnsw::extend_params extend_params;
+ *   hnsw::extend(res, extend_params, additional_dataset, *hnsw_index.get());
+ */
+void extend(raft::resources const& res,
+            const extend_params& params,
+            raft::host_matrix_view<const float, int64_t, raft::row_major> additional_dataset,
+            index<float>& idx);
+
+/**
+ * @brief Add new vectors to an HNSW index
+ * NOTE: The HNSW index can only be extended when the `hnsw::index_params.hierarchy` is `CPU`
+ *       when converting from a CAGRA index.
+ *
+ * @param[in] res raft resources
+ * @param[in] params configure the extend
+ * @param[in] additional_dataset a host matrix view to a row-major matrix [n_rows, index->dim()]
+ * @param[inout] idx HNSW index to extend
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // Build a CAGRA index
+ *   using namespace cuvs::neighbors;
+ *   cagra::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = cagra::build(res, index_params, dataset);
+ *
+ *   // Load CAGRA index as an HNSW index
+ *   hnsw::index_params hnsw_params;
+ *   hnsw_params.hierarchy = hnsw::HnswHierarchy::CPU;
+ *   auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index);
+ *
+ *   // Extend the HNSW index with additional vectors
+ *   auto additional_dataset = raft::make_host_matrix<float>(res, add_size, index->dim());
+ *   hnsw::extend_params extend_params;
+ *   hnsw::extend(res, extend_params, additional_dataset, *hnsw_index.get());
+ */
+void extend(raft::resources const& res,
+            const extend_params& params,
+            raft::host_matrix_view<const uint8_t, int64_t, raft::row_major> additional_dataset,
+            index<uint8_t>& idx);
+
+/**
+ * @brief Add new vectors to an HNSW index
+ * NOTE: The HNSW index can only be extended when the `hnsw::index_params.hierarchy` is `CPU`
+ *       when converting from a CAGRA index.
+ *
+ * @param[in] res raft resources
+ * @param[in] params configure the extend
+ * @param[in] additional_dataset a host matrix view to a row-major matrix [n_rows, index->dim()]
+ * @param[inout] idx HNSW index to extend
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // Build a CAGRA index
+ *   using namespace cuvs::neighbors;
+ *   cagra::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = cagra::build(res, index_params, dataset);
+ *
+ *   // Load CAGRA index as an HNSW index
+ *   hnsw::index_params hnsw_params;
+ *   hnsw_params.hierarchy = hnsw::HnswHierarchy::CPU;
+ *   auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index);
+ *
+ *   // Extend the HNSW index with additional vectors
+ *   auto additional_dataset = raft::make_host_matrix<float>(res, add_size, index->dim());
+ *   hnsw::extend_params extend_params;
+ *   hnsw::extend(res, extend_params, additional_dataset, *hnsw_index.get());
+ */
+void extend(raft::resources const& res,
+            const extend_params& params,
+            raft::host_matrix_view<const int8_t, int64_t, raft::row_major> additional_dataset,
+            index<int8_t>& idx);
+
+/**@} */
+
+/**
+ * @defgroup hnsw_cpp_search_params Build CAGRA index and search with hnswlib
+ * @{
+ */
+
+struct search_params : cuvs::neighbors::search_params {
+  int ef;               // size of the candidate list
+  int num_threads = 0;  // number of host threads to use for concurrent searches. Value of 0
+                        // automatically maximizes parallelism
+};
 
 /**@}*/
 
@@ -181,9 +364,9 @@ std::unique_ptr<index<int8_t>> from_cagra(
  */
 
 /**
- * @brief Search hnswlib base-layer-only index constructed from a CAGRA index
- * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS,
- *       as the format is not compatible with the original hnswlib.
+ * @brief Search HNSW index constructed from a CAGRA index
+ * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS when the hierarchy is
+ * `NONE`, as the format is not compatible with the original hnswlib.
  *
  * @param[in] res raft resources
  * @param[in] params configure the search
@@ -201,10 +384,11 @@ std::unique_ptr<index<int8_t>> from_cagra(
  *   // use default index parameters
  *   cagra::index_params index_params;
  *   // create and fill the index from a [N, D] dataset
- *   auto index = cagra::build<float, uint32_t>(res, index_params, dataset);
+ *   auto index = cagra::build(res, index_params, dataset);
  *
- *   // Load CAGRA index as a base-layer HNSW index using the filesystem
- *   auto hnsw_index = hnsw::from_cagra(res, index);
+ *   // Load CAGRA index as an HNSW index
+ *   hnsw::index_params hnsw_params;
+ *   auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index);
  *
  *   // Search K nearest neighbors as an hnswlib index
  *   // using host threads for concurrency
@@ -224,9 +408,9 @@ void search(raft::resources const& res,
             raft::host_matrix_view<float, int64_t, raft::row_major> distances);
 
 /**
- * @brief Search hnswlib base-layer-only index constructed from a CAGRA index
- * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS,
- *       as the format is not compatible with the original hnswlib.
+ * @brief Search HNSWindex constructed from a CAGRA index
+ * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS when the hierarchy is
+ * `NONE`, as the format is not compatible with the original hnswlib.
  *
  * @param[in] res raft resources
  * @param[in] params configure the search
@@ -244,10 +428,11 @@ void search(raft::resources const& res,
  *   // use default index parameters
  *   cagra::index_params index_params;
  *   // create and fill the index from a [N, D] dataset
- *   auto index = cagra::build<uint8_t, uint32_t>(res, index_params, dataset);
+ *   auto index = cagra::build(res, index_params, dataset);
  *
- *   // Load CAGRA index as a base-layer HNSW index using the filesystem
- *   auto hnsw_index = hnsw::from_cagra(res, index);
+ *   // Load CAGRA index as an HNSW index
+ *   hnsw::index_params hnsw_params;
+ *   auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index);
  *
  *   // Search K nearest neighbors as an hnswlib index
  *   // using host threads for concurrency
@@ -267,9 +452,9 @@ void search(raft::resources const& res,
             raft::host_matrix_view<float, int64_t, raft::row_major> distances);
 
 /**
- * @brief Search hnswlib base-layer-only index constructed from a CAGRA index
- * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS,
- *       as the format is not compatible with the original hnswlib.
+ * @brief Search HNSW index constructed from a CAGRA index
+ * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS when the hierarchy is
+ * `NONE`, as the format is not compatible with the original hnswlib.
  *
  * @param[in] res raft resources
  * @param[in] params configure the search
@@ -287,10 +472,11 @@ void search(raft::resources const& res,
  *   // use default index parameters
  *   cagra::index_params index_params;
  *   // create and fill the index from a [N, D] dataset
- *   auto index = cagra::build<int8_t, uint32_t>(res, index_params, dataset);
+ *   auto index = cagra::build(res, index_params, dataset);
  *
- *   // Load CAGRA index as a base-layer HNSW index using the filesystem
- *   auto hnsw_index = hnsw::from_cagra(res, index);
+ *   // Load CAGRA index as an HNSW index
+ *   hnsw::index_params hnsw_params;
+ *   auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index);
  *
  *   // Search K nearest neighbors as an hnswlib index
  *   // using host threads for concurrency
@@ -312,16 +498,106 @@ void search(raft::resources const& res,
 /**@}*/
 
 /**
- * @defgroup hnsw_cpp_index_deserialize Deserialize CAGRA index as hnswlib index
+ * @defgroup hnsw_cpp_index_serialize Deserialize CAGRA index as hnswlib index
  * @{
  */
 
+/**
+ * @brief Serialize a CAGRA index to a file as an hnswlib index
+ * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the
+ * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib.
+ * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib
+ * library.
+ *
+ * @param[in] res raft resources
+ * @param[in] filename path to the file to save the serialized CAGRA index
+ * @param[in] idx cagra index
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // Build a CAGRA index
+ *   using namespace cuvs::neighbors;
+ *   // use default index parameters
+ *   cagra::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = cagra::build(res, index_params, dataset);
+ *
+ *   // Load CAGRA index as an HNSW index
+ *   hnsw::index_params hnsw_params;
+ *   auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index);
+ *   // Save the index
+ *   hnsw::serialize(res, "index.bin", index);
+ * @endcode
+ */
+void serialize(raft::resources const& res, const std::string& filename, const index<float>& idx);
+
+/**
+ * @brief Serialize a CAGRA index to a file as an hnswlib index
+ * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the
+ * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib.
+ * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib
+ * library.
+ *
+ * @param[in] res raft resources
+ * @param[in] filename path to the file to save the serialized CAGRA index
+ * @param[in] idx cagra index
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // Build a CAGRA index
+ *   using namespace cuvs::neighbors;
+ *   // use default index parameters
+ *   cagra::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = cagra::build(res, index_params, dataset);
+ *
+ *   // Load CAGRA index as an HNSW index
+ *   hnsw::index_params hnsw_params;
+ *   auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index);
+ *   // Save the index
+ *   hnsw::serialize(res, "index.bin", index);
+ * @endcode
+ */
+void serialize(raft::resources const& res, const std::string& filename, const index<uint8_t>& idx);
+
+/**
+ * @brief Serialize a CAGRA index to a file as an hnswlib index
+ * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the
+ * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib.
+ * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib
+ * library.
+ *
+ * @param[in] res raft resources
+ * @param[in] filename path to the file to save the serialized CAGRA index
+ * @param[in] idx cagra index
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // Build a CAGRA index
+ *   using namespace cuvs::neighbors;
+ *   // use default index parameters
+ *   cagra::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = cagra::build(res, index_params, dataset);
+ *
+ *   // Load CAGRA index as an HNSW index
+ *   hnsw::index_params hnsw_params;
+ *   auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index);
+ *   // Save the index
+ *   hnsw::serialize(res, "index.bin", index);
+ * @endcode
+ */
+void serialize(raft::resources const& res, const std::string& filename, const index<int8_t>& idx);
+
 /**
  * @brief De-serialize a CAGRA index saved to a file as an hnswlib index
- * NOTE: The loaded hnswlib index is immutable, and only be read by the
+ * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the
  * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib.
+ * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib
+ * library.
  *
  * @param[in] res raft resources
+ * @param[in] params hnsw index parameters
  * @param[in] filename path to the file containing the serialized CAGRA index
  * @param[in] dim dimensions of the training dataset
  * @param[in] metric distance metric to search. Supported metrics ("L2Expanded", "InnerProduct")
@@ -334,19 +610,23 @@ void search(raft::resources const& res,
  *   // use default index parameters
  *   cagra::index_params index_params;
  *   // create and fill the index from a [N, D] dataset
- *   auto index = cagra::build<float, uint32_t>(res, index_params, dataset);
+ *   auto index = cagra::build(res, index_params, dataset);
  *
- *   // save a CAGRA index to a file
- *   cagra::serialize(res, index, "index.bin");
- *   // De-serialize a CAGRA index as a base-layer HNSW index using the filesystem
- *   index<float>* hnsw_index = nullptr;
- *   hnsw::deserialize(res, "index.bin", index->dim(), index->metric(), &hnsw_index);
+ *   // Load CAGRA index as an HNSW index
+ *  hnsw::index_params hnsw_params;
+ *  auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index);
+ *  // save HNSW index to a file
+ *  hnsw::serialize(res, "index.bin", hnsw_index);
+ *  // De-serialize the HNSW index
+ *  index<float>* hnsw_index = nullptr;
+ *  hnsw::deserialize(res, hnsw_params, "index.bin", index->dim(), index->metric(), &hnsw_index);
  *
  *   // Delete index after use
  *   delete hnsw_index;
  * @endcode
  */
 void deserialize(raft::resources const& res,
+                 const index_params& params,
                  const std::string& filename,
                  int dim,
                  cuvs::distance::DistanceType metric,
@@ -354,10 +634,13 @@ void deserialize(raft::resources const& res,
 
 /**
  * @brief De-serialize a CAGRA index saved to a file as an hnswlib index
- * NOTE: The loaded hnswlib index is immutable, and only be read by the
+ * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the
  * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib.
+ * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib
+ * library.
  *
  * @param[in] res raft resources
+ * @param[in] params hnsw index parameters
  * @param[in] filename path to the file containing the serialized CAGRA index
  * @param[in] dim dimensions of the training dataset
  * @param[in] metric distance metric to search. Supported metrics ("L2Expanded", "InnerProduct")
@@ -370,19 +653,23 @@ void deserialize(raft::resources const& res,
  *   // use default index parameters
  *   cagra::index_params index_params;
  *   // create and fill the index from a [N, D] dataset
- *   auto index = cagra::build<uint8_t, uint32_t>(res, index_params, dataset);
+ *   auto index = cagra::build(res, index_params, dataset);
  *
- *   // save a CAGRA index to a file
- *   cagra::serialize(res, index, "index.bin");
- *   // De-serialize a CAGRA index as a base-layer HNSW index using the filesystem
- *   index<uint8_t>* hnsw_index = nullptr;
- *   hnsw::deserialize(res, "index.bin", index->dim(), index->metric(), &hnsw_index);
+ *   // Load CAGRA index as an HNSW index
+ *  hnsw::index_params hnsw_params;
+ *  auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index);
+ *  // save HNSW index to a file
+ *  hnsw::serialize(res, "index.bin", hnsw_index);
+ *  // De-serialize the HNSW index
+ *  index<uint8_t>* hnsw_index = nullptr;
+ *  hnsw::deserialize(res, hnsw_params, "index.bin", index->dim(), index->metric(), &hnsw_index);
  *
  *   // Delete index after use
  *   delete hnsw_index;
  * @endcode
  */
 void deserialize(raft::resources const& res,
+                 const index_params& params,
                  const std::string& filename,
                  int dim,
                  cuvs::distance::DistanceType metric,
@@ -390,10 +677,13 @@ void deserialize(raft::resources const& res,
 
 /**
  * @brief De-serialize a CAGRA index saved to a file as an hnswlib index
- * NOTE: The loaded hnswlib index is immutable, and only be read by the
+ * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the
  * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib.
+ * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib
+ * library.
  *
  * @param[in] res raft resources
+ * @param[in] params hnsw index parameters
  * @param[in] filename path to the file containing the serialized CAGRA index
  * @param[in] dim dimensions of the training dataset
  * @param[in] metric distance metric to search. Supported metrics ("L2Expanded", "InnerProduct")
@@ -406,19 +696,23 @@ void deserialize(raft::resources const& res,
  *   // use default index parameters
  *   cagra::index_params index_params;
  *   // create and fill the index from a [N, D] dataset
- *   auto index = cagra::build<int8_t, uint32_t>(res, index_params, dataset);
+ *   auto index = cagra::build(res, index_params, dataset);
  *
- *   // save a CAGRA index to a file
- *   cagra::serialize(res, index, "index.bin");
- *   // De-serialize a CAGRA index as a base-layer HNSW index using the filesystem
- *   index<int8_t>* hnsw_index = nullptr;
- *   hnsw::deserialize(res, "index.bin", index->dim(), index->metric(), &hnsw_index);
+ *   // Load CAGRA index as an HNSW index
+ *  hnsw::index_params hnsw_params;
+ *  auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index);
+ *  // save HNSW index to a file
+ *  hnsw::serialize(res, "index.bin", hnsw_index);
+ *  // De-serialize the HNSW index
+ *  index<int8_t>* hnsw_index = nullptr;
+ *  hnsw::deserialize(res, hnsw_params, "index.bin", index->dim(), index->metric(), &hnsw_index);
  *
  *   // Delete index after use
  *   delete hnsw_index;
  * @endcode
  */
 void deserialize(raft::resources const& res,
+                 const index_params& params,
                  const std::string& filename,
                  int dim,
                  cuvs::distance::DistanceType metric,
diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp
index ce1e03264..e129d23e8 100644
--- a/cpp/src/neighbors/detail/hnsw.hpp
+++ b/cpp/src/neighbors/detail/hnsw.hpp
@@ -22,9 +22,63 @@
 #include <hnswlib/hnswlib.h>
 #include <memory>
 #include <random>
+#include <thread>
 
 namespace cuvs::neighbors::hnsw::detail {
 
+// Multithreaded executor
+// The helper function is copied from the hnswlib repository
+// as for some reason, adding vectors to the hnswlib index does not
+// work well with omp parallel for
+template <class Function>
+inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn)
+{
+  if (numThreads <= 0) { numThreads = std::thread::hardware_concurrency(); }
+
+  if (numThreads == 1) {
+    for (size_t id = start; id < end; id++) {
+      fn(id, 0);
+    }
+  } else {
+    std::vector<std::thread> threads;
+    std::atomic<size_t> current(start);
+
+    // keep track of exceptions in threads
+    // https://stackoverflow.com/a/32428427/1713196
+    std::exception_ptr lastException = nullptr;
+    std::mutex lastExceptMutex;
+
+    for (size_t threadId = 0; threadId < numThreads; ++threadId) {
+      threads.push_back(std::thread([&, threadId] {
+        while (true) {
+          size_t id = current.fetch_add(1);
+
+          if (id >= end) { break; }
+
+          try {
+            fn(id, threadId);
+          } catch (...) {
+            std::unique_lock<std::mutex> lastExcepLock(lastExceptMutex);
+            lastException = std::current_exception();
+            /*
+             * This will work even when current is the largest value that
+             * size_t can fit, because fetch_add returns the previous value
+             * before the increment (what will result in overflow
+             * and produce 0 instead of current + 1).
+             */
+            current = end;
+            break;
+          }
+        }
+      }));
+    }
+    for (auto& thread : threads) {
+      thread.join();
+    }
+    if (lastException) { std::rethrow_exception(lastException); }
+  }
+}
+
 template <typename T>
 struct hnsw_dist_t {
   using type = void;
@@ -54,9 +108,10 @@ struct index_impl : index<T> {
    * @param[in] filepath path to the index
    * @param[in] dim dimensions of the training dataset
    * @param[in] metric distance metric to search. Supported metrics ("L2Expanded", "InnerProduct")
+   * @param[in] hierarchy hierarchy used for upper HNSW layers
    */
-  index_impl(const std::string& filepath, int dim, cuvs::distance::DistanceType metric)
-    : index<T>{dim, metric}
+  index_impl(int dim, cuvs::distance::DistanceType metric, HnswHierarchy hierarchy)
+    : index<T>{dim, metric, hierarchy}
   {
     if constexpr (std::is_same_v<T, float>) {
       if (metric == cuvs::distance::DistanceType::L2Expanded) {
@@ -71,11 +126,6 @@ struct index_impl : index<T> {
     }
 
     RAFT_EXPECTS(space_ != nullptr, "Unsupported metric type was used");
-
-    appr_alg_ = std::make_unique<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>>(
-      space_.get(), filepath);
-
-    appr_alg_->base_layer_only = true;
   }
 
   /**
@@ -88,14 +138,32 @@ struct index_impl : index<T> {
   */
   void set_ef(int ef) const override { appr_alg_->ef_ = ef; }
 
+  /**
+  @brief Set index
+   */
+  void set_index(std::unique_ptr<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>>&& index)
+  {
+    appr_alg_ = std::move(index);
+  }
+
+  /**
+  @brief Get space
+   */
+  auto get_space() const -> hnswlib::SpaceInterface<typename hnsw_dist_t<T>::type>*
+  {
+    return space_.get();
+  }
+
  private:
   std::unique_ptr<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>> appr_alg_;
   std::unique_ptr<hnswlib::SpaceInterface<typename hnsw_dist_t<T>::type>> space_;
 };
 
-template <typename T>
-std::unique_ptr<index<T>> from_cagra(raft::resources const& res,
-                                     const cuvs::neighbors::cagra::index<T, uint32_t>& cagra_index)
+template <typename T, HnswHierarchy hierarchy>
+std::enable_if_t<hierarchy == HnswHierarchy::NONE, std::unique_ptr<index<T>>> from_cagra(
+  raft::resources const& res,
+  const index_params& params,
+  const cuvs::neighbors::cagra::index<T, uint32_t>& cagra_index)
 {
   std::random_device dev;
   std::mt19937 rng(dev());
@@ -103,13 +171,125 @@ std::unique_ptr<index<T>> from_cagra(raft::resources const& res,
   auto uuid            = std::to_string(dist(rng));
   std::string filepath = "/tmp/" + uuid + ".bin";
   cuvs::neighbors::cagra::serialize_to_hnswlib(res, filepath, cagra_index);
+
   index<T>* hnsw_index = nullptr;
   cuvs::neighbors::hnsw::deserialize(
-    res, filepath, cagra_index.dim(), cagra_index.metric(), &hnsw_index);
+    res, params, filepath, cagra_index.dim(), cagra_index.metric(), &hnsw_index);
   std::filesystem::remove(filepath);
   return std::unique_ptr<index<T>>(hnsw_index);
 }
 
+template <typename T, HnswHierarchy hierarchy>
+std::enable_if_t<hierarchy == HnswHierarchy::CPU, std::unique_ptr<index<T>>> from_cagra(
+  raft::resources const& res,
+  const index_params& params,
+  const cuvs::neighbors::cagra::index<T, uint32_t>& cagra_index,
+  std::optional<raft::host_matrix_view<const T, int64_t, raft::row_major>> dataset)
+{
+  // auto host_dataset = raft::make_host_matrix<T, int64_t>(dataset.extent(0), dataset.extent(1));
+  auto host_dataset = raft::make_host_matrix<T, int64_t>(0, 0);
+  raft::host_matrix_view<const T, int64_t, raft::row_major> host_dataset_view(
+    host_dataset.data_handle(), host_dataset.extent(0), host_dataset.extent(1));
+  if (dataset.has_value()) {
+    host_dataset_view = dataset.value();
+  } else {
+    // move dataset to host, remove padding
+    auto cagra_dataset = cagra_index.dataset();
+    host_dataset =
+      raft::make_host_matrix<T, int64_t>(cagra_dataset.extent(0), cagra_dataset.extent(1));
+    RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(),
+                                    sizeof(T) * host_dataset.extent(1),
+                                    cagra_dataset.data_handle(),
+                                    sizeof(T) * cagra_dataset.stride(0),
+                                    sizeof(T) * host_dataset.extent(1),
+                                    cagra_dataset.extent(0),
+                                    cudaMemcpyDefault,
+                                    raft::resource::get_cuda_stream(res)));
+    raft::resource::sync_stream(res);
+    host_dataset_view = host_dataset.view();
+  }
+  // build upper layers of hnsw index
+  auto hnsw_index =
+    std::make_unique<index_impl<T>>(cagra_index.dim(), cagra_index.metric(), hierarchy);
+  auto appr_algo = std::make_unique<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>>(
+    hnsw_index->get_space(),
+    host_dataset_view.extent(0),
+    cagra_index.graph().extent(1) / 2,
+    params.ef_construction);
+  appr_algo->base_layer_init = false;  // tell hnswlib to build upper layers only
+  ParallelFor(0, host_dataset_view.extent(0), params.num_threads, [&](size_t i, size_t threadId) {
+    appr_algo->addPoint((void*)(host_dataset_view.data_handle() + i * host_dataset_view.extent(1)),
+                        i);
+  });
+  appr_algo->base_layer_init = true;  // reset to true to allow addition of new points
+
+  // move cagra graph to host
+  auto graph = cagra_index.graph();
+  auto host_graph =
+    raft::make_host_matrix<uint32_t, int64_t, raft::row_major>(graph.extent(0), graph.extent(1));
+  raft::copy(host_graph.data_handle(),
+             graph.data_handle(),
+             graph.size(),
+             raft::resource::get_cuda_stream(res));
+  raft::resource::sync_stream(res);
+
+// copy cagra graph to hnswlib base layer
+#pragma omp parallel for
+  for (size_t i = 0; i < static_cast<size_t>(host_graph.extent(0)); ++i) {
+    auto ll_i = appr_algo->get_linklist0(i);
+    appr_algo->setListCount(ll_i, host_graph.extent(1));
+    auto* data = (uint32_t*)(ll_i + 1);
+    for (size_t j = 0; j < static_cast<size_t>(host_graph.extent(1)); ++j) {
+      data[j] = host_graph(i, j);
+    }
+  }
+
+  hnsw_index->set_index(std::move(appr_algo));
+  return hnsw_index;
+}
+
+template <typename T>
+std::unique_ptr<index<T>> from_cagra(
+  raft::resources const& res,
+  const index_params& params,
+  const cuvs::neighbors::cagra::index<T, uint32_t>& cagra_index,
+  std::optional<raft::host_matrix_view<const T, int64_t, raft::row_major>> dataset)
+{
+  if (params.hierarchy == HnswHierarchy::NONE) {
+    return from_cagra<T, HnswHierarchy::NONE>(res, params, cagra_index);
+  } else if (params.hierarchy == HnswHierarchy::CPU) {
+    return from_cagra<T, HnswHierarchy::CPU>(res, params, cagra_index, dataset);
+  }
+  {
+    RAFT_FAIL("Unsupported hierarchy type");
+  }
+}
+
+template <typename T>
+void extend(raft::resources const& res,
+            const extend_params& params,
+            raft::host_matrix_view<const T, int64_t, raft::row_major> additional_dataset,
+            index<T>& idx)
+{
+  auto* hnswlib_index = reinterpret_cast<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>*>(
+    const_cast<void*>(idx.get_index()));
+  auto current_element_count = hnswlib_index->getCurrentElementCount();
+  auto new_element_count     = additional_dataset.extent(0);
+  auto num_threads           = params.num_threads == 0 ? std::thread::hardware_concurrency()
+                                                       : static_cast<size_t>(params.num_threads);
+
+  hnswlib_index->resizeIndex(current_element_count + new_element_count);
+  ParallelFor(current_element_count,
+              current_element_count + new_element_count,
+              num_threads,
+              [&](size_t i, size_t threadId) {
+                hnswlib_index->addPoint(
+                  (void*)(additional_dataset.data_handle() +
+                          (i - current_element_count) * additional_dataset.extent(1)),
+                  i);
+              });
+}
+
 template <typename T>
 void get_search_knn_results(hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type> const* idx,
                             const T* query,
@@ -171,14 +351,28 @@ void search(raft::resources const& res,
   }
 }
 
+template <typename T>
+void serialize(raft::resources const& res, const std::string& filename, const index<T>& idx)
+{
+  auto* hnswlib_index = reinterpret_cast<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>*>(
+    const_cast<void*>(idx.get_index()));
+  hnswlib_index->saveIndex(filename);
+}
+
 template <typename T>
 void deserialize(raft::resources const& res,
+                 const index_params& params,
                  const std::string& filename,
                  int dim,
                  cuvs::distance::DistanceType metric,
                  index<T>** idx)
 {
-  *idx = new detail::index_impl<T>(filename, dim, metric);
+  auto hnsw_index = std::make_unique<index_impl<T>>(dim, metric, params.hierarchy);
+  auto appr_algo  = std::make_unique<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>>(
+    hnsw_index->get_space(), filename);
+  if (params.hierarchy == HnswHierarchy::NONE) { appr_algo->base_layer_only = true; }
+  hnsw_index->set_index(std::move(appr_algo));
+  *idx = hnsw_index.release();
 }
 
 }  // namespace cuvs::neighbors::hnsw::detail
diff --git a/cpp/src/neighbors/hnsw.cpp b/cpp/src/neighbors/hnsw.cpp
index e6f3fbcc7..f165176ec 100644
--- a/cpp/src/neighbors/hnsw.cpp
+++ b/cpp/src/neighbors/hnsw.cpp
@@ -21,11 +21,14 @@
 
 namespace cuvs::neighbors::hnsw {
 
-#define CUVS_INST_HNSW_FROM_CAGRA(T)                                                           \
-  std::unique_ptr<index<T>> from_cagra(                                                        \
-    raft::resources const& res, const cuvs::neighbors::cagra::index<T, uint32_t>& cagra_index) \
-  {                                                                                            \
-    return detail::from_cagra<T>(res, cagra_index);                                            \
+#define CUVS_INST_HNSW_FROM_CAGRA(T)                                                  \
+  std::unique_ptr<index<T>> from_cagra(                                               \
+    raft::resources const& res,                                                       \
+    const index_params& params,                                                       \
+    const cuvs::neighbors::cagra::index<T, uint32_t>& cagra_index,                    \
+    std::optional<raft::host_matrix_view<const T, int64_t, raft::row_major>> dataset) \
+  {                                                                                   \
+    return detail::from_cagra<T>(res, params, cagra_index, dataset);                  \
   }
 
 CUVS_INST_HNSW_FROM_CAGRA(float);
@@ -34,6 +37,21 @@ CUVS_INST_HNSW_FROM_CAGRA(int8_t);
 
 #undef CUVS_INST_HNSW_FROM_CAGRA
 
+#define CUVS_INST_HNSW_EXTEND(T)                                                            \
+  void extend(raft::resources const& res,                                                   \
+              const extend_params& params,                                                  \
+              raft::host_matrix_view<const T, int64_t, raft::row_major> additional_dataset, \
+              index<T>& idx)                                                                \
+  {                                                                                         \
+    detail::extend<T>(res, params, additional_dataset, idx);                                \
+  }
+
+CUVS_INST_HNSW_EXTEND(float);
+CUVS_INST_HNSW_EXTEND(uint8_t);
+CUVS_INST_HNSW_EXTEND(int8_t);
+
+#undef CUVS_INST_HNSW_EXTEND
+
 #define CUVS_INST_HNSW_SEARCH(T)                                                    \
   void search(raft::resources const& res,                                           \
               const search_params& params,                                          \
@@ -51,20 +69,25 @@ CUVS_INST_HNSW_SEARCH(int8_t);
 
 #undef CUVS_INST_HNSW_SEARCH
 
-#define CUVS_INST_HNSW_DESERIALIZE(T)                        \
-  void deserialize(raft::resources const& res,               \
-                   const std::string& filename,              \
-                   int dim,                                  \
-                   cuvs::distance::DistanceType metric,      \
-                   index<T>** idx)                           \
-  {                                                          \
-    detail::deserialize<T>(res, filename, dim, metric, idx); \
+#define CUVS_INST_HNSW_SERIALIZE(T)                                                            \
+  void serialize(raft::resources const& res, const std::string& filename, const index<T>& idx) \
+  {                                                                                            \
+    detail::serialize<T>(res, filename, idx);                                                  \
+  }                                                                                            \
+  void deserialize(raft::resources const& res,                                                 \
+                   const index_params& params,                                                 \
+                   const std::string& filename,                                                \
+                   int dim,                                                                    \
+                   cuvs::distance::DistanceType metric,                                        \
+                   index<T>** idx)                                                             \
+  {                                                                                            \
+    detail::deserialize<T>(res, params, filename, dim, metric, idx);                           \
   }
 
-CUVS_INST_HNSW_DESERIALIZE(float);
-CUVS_INST_HNSW_DESERIALIZE(uint8_t);
-CUVS_INST_HNSW_DESERIALIZE(int8_t);
+CUVS_INST_HNSW_SERIALIZE(float);
+CUVS_INST_HNSW_SERIALIZE(uint8_t);
+CUVS_INST_HNSW_SERIALIZE(int8_t);
 
-#undef CUVS_INST_HNSW_DESERIALIZE
+#undef CUVS_INST_HNSW_SERIALIZE
 
 }  // namespace cuvs::neighbors::hnsw
diff --git a/cpp/src/neighbors/hnsw_c.cpp b/cpp/src/neighbors/hnsw_c.cpp
index a19875641..0233a510a 100644
--- a/cpp/src/neighbors/hnsw_c.cpp
+++ b/cpp/src/neighbors/hnsw_c.cpp
@@ -31,6 +31,44 @@
 #include <cuvs/neighbors/hnsw.hpp>
 
 namespace {
+
+template <typename T>
+void _from_cagra(cuvsResources_t res,
+                 cuvsHnswIndexParams_t params,
+                 cuvsCagraIndex_t cagra_index,
+                 cuvsHnswIndex_t hnsw_index)
+{
+  auto res_ptr = reinterpret_cast<raft::resources*>(res);
+  auto index   = reinterpret_cast<cuvs::neighbors::cagra::index<T, uint32_t>*>(cagra_index->addr);
+  auto cpp_params            = cuvs::neighbors::hnsw::index_params();
+  cpp_params.hierarchy       = static_cast<cuvs::neighbors::hnsw::HnswHierarchy>(params->hierarchy);
+  cpp_params.ef_construction = params->ef_construction;
+  cpp_params.num_threads     = params->num_threads;
+  std::optional<raft::host_matrix_view<const T, int64_t, raft::row_major>> dataset = std::nullopt;
+
+  auto hnsw_index_unique_ptr =
+    cuvs::neighbors::hnsw::from_cagra(*res_ptr, cpp_params, *index, dataset);
+  auto hnsw_index_ptr = hnsw_index_unique_ptr.release();
+  hnsw_index->addr    = reinterpret_cast<uintptr_t>(hnsw_index_ptr);
+}
+
+template <typename T>
+void _extend(cuvsResources_t res,
+             cuvsHnswExtendParams_t params,
+             DLManagedTensor* additional_dataset,
+             cuvsHnswIndex index)
+{
+  auto res_ptr           = reinterpret_cast<raft::resources*>(res);
+  auto index_ptr         = reinterpret_cast<cuvs::neighbors::hnsw::index<T>*>(index.addr);
+  auto cpp_params        = cuvs::neighbors::hnsw::extend_params();
+  cpp_params.num_threads = params->num_threads;
+
+  using additional_dataset_mdspan_type = raft::host_matrix_view<T const, int64_t, raft::row_major>;
+  auto additional_dataset_mds =
+    cuvs::core::from_dlpack<additional_dataset_mdspan_type>(additional_dataset);
+  cuvs::neighbors::hnsw::extend(*res_ptr, cpp_params, additional_dataset_mds, *index_ptr);
+}
+
 template <typename T>
 void _search(cuvsResources_t res,
              cuvsHnswSearchParams params,
@@ -44,7 +82,7 @@ void _search(cuvsResources_t res,
 
   auto search_params        = cuvs::neighbors::hnsw::search_params();
   search_params.ef          = params.ef;
-  search_params.num_threads = params.numThreads;
+  search_params.num_threads = params.num_threads;
 
   using queries_mdspan_type   = raft::host_matrix_view<T const, int64_t, raft::row_major>;
   using neighbors_mdspan_type = raft::host_matrix_view<uint64_t, int64_t, raft::row_major>;
@@ -57,26 +95,42 @@ void _search(cuvsResources_t res,
 }
 
 template <typename T>
-void* _deserialize(cuvsResources_t res, const char* filename, int dim, cuvsDistanceType metric)
+void _serialize(cuvsResources_t res, const char* filename, cuvsHnswIndex index)
+{
+  auto res_ptr   = reinterpret_cast<raft::resources*>(res);
+  auto index_ptr = reinterpret_cast<cuvs::neighbors::hnsw::index<T>*>(index.addr);
+  cuvs::neighbors::hnsw::serialize(*res_ptr, std::string(filename), *index_ptr);
+}
+
+template <typename T>
+void* _deserialize(cuvsResources_t res,
+                   cuvsHnswIndexParams_t params,
+                   const char* filename,
+                   int dim,
+                   cuvsDistanceType metric)
 {
   auto res_ptr                           = reinterpret_cast<raft::resources*>(res);
   cuvs::neighbors::hnsw::index<T>* index = nullptr;
-  cuvs::neighbors::hnsw::deserialize(*res_ptr, std::string(filename), dim, metric, &index);
+  auto cpp_params                        = cuvs::neighbors::hnsw::index_params();
+  cpp_params.hierarchy = static_cast<cuvs::neighbors::hnsw::HnswHierarchy>(params->hierarchy);
+  cuvs::neighbors::hnsw::deserialize(
+    *res_ptr, cpp_params, std::string(filename), dim, metric, &index);
   return index;
 }
 }  // namespace
 
-extern "C" cuvsError_t cuvsHnswSearchParamsCreate(cuvsHnswSearchParams_t* params)
+extern "C" cuvsError_t cuvsHnswIndexParamsCreate(cuvsHnswIndexParams_t* params)
 {
-  return cuvs::core::translate_exceptions(
-    [=] { *params = new cuvsHnswSearchParams{.ef = 200, .numThreads = 0}; });
+  return cuvs::core::translate_exceptions([=] {
+    *params = new cuvsHnswIndexParams{
+      .hierarchy = cuvsHnswHierarchy::NONE, .ef_construction = 200, .num_threads = 2};
+  });
 }
 
-extern "C" cuvsError_t cuvsHnswSearchParamsDestroy(cuvsHnswSearchParams_t params)
+extern "C" cuvsError_t cuvsHnswIndexParamsDestroy(cuvsHnswIndexParams_t params)
 {
   return cuvs::core::translate_exceptions([=] { delete params; });
 }
-
 extern "C" cuvsError_t cuvsHnswIndexCreate(cuvsHnswIndex_t* index)
 {
   return cuvs::core::translate_exceptions([=] { *index = new cuvsHnswIndex{}; });
@@ -101,6 +155,66 @@ extern "C" cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index_c_ptr)
   });
 }
 
+extern "C" cuvsError_t cuvsHnswExtendParamsCreate(cuvsHnswExtendParams_t* params)
+{
+  return cuvs::core::translate_exceptions(
+    [=] { *params = new cuvsHnswExtendParams{.num_threads = 0}; });
+}
+
+extern "C" cuvsError_t cuvsHnswExtendParamsDestroy(cuvsHnswExtendParams_t params)
+{
+  return cuvs::core::translate_exceptions([=] { delete params; });
+}
+
+extern "C" cuvsError_t cuvsHnswFromCagra(cuvsResources_t res,
+                                         cuvsHnswIndexParams_t params,
+                                         cuvsCagraIndex_t cagra_index,
+                                         cuvsHnswIndex_t hnsw_index)
+{
+  return cuvs::core::translate_exceptions([=] {
+    auto index        = *cagra_index;
+    hnsw_index->dtype = index.dtype;
+    if (index.dtype.code == kDLFloat) {
+      _from_cagra<float>(res, params, cagra_index, hnsw_index);
+    } else if (index.dtype.code == kDLUInt) {
+      _from_cagra<uint8_t>(res, params, cagra_index, hnsw_index);
+    } else if (index.dtype.code == kDLInt) {
+      _from_cagra<int8_t>(res, params, cagra_index, hnsw_index);
+    } else {
+      RAFT_FAIL("Unsupported dtype: %d", index.dtype.code);
+    }
+  });
+}
+
+extern "C" cuvsError_t cuvsHnswExtend(cuvsResources_t res,
+                                      cuvsHnswExtendParams_t params,
+                                      DLManagedTensor* additional_dataset,
+                                      cuvsHnswIndex_t index)
+{
+  return cuvs::core::translate_exceptions([=] {
+    if (index->dtype.code == kDLFloat) {
+      _extend<float>(res, params, additional_dataset, *index);
+    } else if (index->dtype.code == kDLUInt) {
+      _extend<uint8_t>(res, params, additional_dataset, *index);
+    } else if (index->dtype.code == kDLInt) {
+      _extend<int8_t>(res, params, additional_dataset, *index);
+    } else {
+      RAFT_FAIL("Unsupported dtype: %d", index->dtype.code);
+    }
+  });
+}
+
+extern "C" cuvsError_t cuvsHnswSearchParamsCreate(cuvsHnswSearchParams_t* params)
+{
+  return cuvs::core::translate_exceptions(
+    [=] { *params = new cuvsHnswSearchParams{.ef = 200, .num_threads = 0}; });
+}
+
+extern "C" cuvsError_t cuvsHnswSearchParamsDestroy(cuvsHnswSearchParams_t params)
+{
+  return cuvs::core::translate_exceptions([=] { delete params; });
+}
+
 extern "C" cuvsError_t cuvsHnswSearch(cuvsResources_t res,
                                       cuvsHnswSearchParams_t params,
                                       cuvsHnswIndex_t index_c_ptr,
@@ -140,7 +254,25 @@ extern "C" cuvsError_t cuvsHnswSearch(cuvsResources_t res,
   });
 }
 
+extern "C" cuvsError_t cuvsHnswSerialize(cuvsResources_t res,
+                                         const char* filename,
+                                         cuvsHnswIndex_t index)
+{
+  return cuvs::core::translate_exceptions([=] {
+    if (index->dtype.code == kDLFloat) {
+      _serialize<float>(res, filename, *index);
+    } else if (index->dtype.code == kDLInt) {
+      _serialize<int8_t>(res, filename, *index);
+    } else if (index->dtype.code == kDLUInt) {
+      _serialize<uint8_t>(res, filename, *index);
+    } else {
+      RAFT_FAIL("Unsupported index dtype: %d and bits: %d", index->dtype.code, index->dtype.bits);
+    }
+  });
+}
+
 extern "C" cuvsError_t cuvsHnswDeserialize(cuvsResources_t res,
+                                           cuvsHnswIndexParams_t params,
                                            const char* filename,
                                            int dim,
                                            cuvsDistanceType metric,
@@ -148,11 +280,14 @@ extern "C" cuvsError_t cuvsHnswDeserialize(cuvsResources_t res,
 {
   return cuvs::core::translate_exceptions([=] {
     if (index->dtype.code == kDLFloat && index->dtype.bits == 32) {
-      index->addr = reinterpret_cast<uintptr_t>(_deserialize<float>(res, filename, dim, metric));
+      index->addr =
+        reinterpret_cast<uintptr_t>(_deserialize<float>(res, params, filename, dim, metric));
     } else if (index->dtype.code == kDLUInt && index->dtype.bits == 8) {
-      index->addr = reinterpret_cast<uintptr_t>(_deserialize<uint8_t>(res, filename, dim, metric));
+      index->addr =
+        reinterpret_cast<uintptr_t>(_deserialize<uint8_t>(res, params, filename, dim, metric));
     } else if (index->dtype.code == kDLInt && index->dtype.bits == 8) {
-      index->addr = reinterpret_cast<uintptr_t>(_deserialize<int8_t>(res, filename, dim, metric));
+      index->addr =
+        reinterpret_cast<uintptr_t>(_deserialize<int8_t>(res, params, filename, dim, metric));
     } else {
       RAFT_FAIL("Unsupported dtype in file %s", filename);
     }
diff --git a/cpp/src/neighbors/iface/iface.hpp b/cpp/src/neighbors/iface/iface.hpp
index 9b3da75a4..98ef3fdd3 100644
--- a/cpp/src/neighbors/iface/iface.hpp
+++ b/cpp/src/neighbors/iface/iface.hpp
@@ -20,6 +20,7 @@
 #include <cuvs/neighbors/common.hpp>
 #include <cuvs/neighbors/ivf_flat.hpp>
 #include <cuvs/neighbors/ivf_pq.hpp>
+#include <fstream>
 #include <raft/core/device_resources.hpp>
 
 #include <fstream>
diff --git a/cpp/test/neighbors/ann_hnsw_c.cu b/cpp/test/neighbors/ann_hnsw_c.cu
index fc740b924..2a6401b1d 100644
--- a/cpp/test/neighbors/ann_hnsw_c.cu
+++ b/cpp/test/neighbors/ann_hnsw_c.cu
@@ -111,7 +111,9 @@ TEST(CagraHnswC, BuildSearch)
   cuvsHnswIndex_t hnsw_index;
   cuvsHnswIndexCreate(&hnsw_index);
   hnsw_index->dtype = index->dtype;
-  cuvsHnswDeserialize(res, "/tmp/cagra_hnswlib.index", 2, L2Expanded, hnsw_index);
+  cuvsHnswIndexParams_t hnsw_params;
+  cuvsHnswIndexParamsCreate(&hnsw_params);
+  cuvsHnswDeserialize(res, hnsw_params, "/tmp/cagra_hnswlib.index", 2, L2Expanded, hnsw_index);
 
   // search index
   cuvsHnswSearchParams_t search_params;
diff --git a/cpp/test/neighbors/hnsw.cu b/cpp/test/neighbors/hnsw.cu
index 9fb88be05..20ee83a11 100644
--- a/cpp/test/neighbors/hnsw.cu
+++ b/cpp/test/neighbors/hnsw.cu
@@ -108,7 +108,8 @@ class AnnHNSWTest : public ::testing::TestWithParam<AnnHNSWInputs> {
 
       cuvs::neighbors::hnsw::search_params search_params;
       search_params.ef = ps.ef;
-      auto hnsw_index  = cuvs::neighbors::hnsw::from_cagra(handle_, index);
+      cuvs::neighbors::hnsw::index_params hnsw_params;
+      auto hnsw_index = cuvs::neighbors::hnsw::from_cagra(handle_, hnsw_params, index);
       auto queries_HNSW_view =
         raft::make_host_matrix_view<DataT, int64_t>(queries_h.data(), ps.n_queries, ps.dim);
       auto indices_HNSW_view =
diff --git a/dependencies.yaml b/dependencies.yaml
index e909ad0dc..80a7d2024 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -470,7 +470,6 @@ dependencies:
     common:
       - output_types: [conda, pyproject, requirements]
         packages:
-          - hnswlib=0.6.2
           - nlohmann_json>=3.11.2
           - glog>=0.6.0
           - h5py>=3.8.0
diff --git a/docs/source/c_api/neighbors_hnsw_c.rst b/docs/source/c_api/neighbors_hnsw_c.rst
index 988e5b6f3..22ffc236d 100644
--- a/docs/source/c_api/neighbors_hnsw_c.rst
+++ b/docs/source/c_api/neighbors_hnsw_c.rst
@@ -26,6 +26,28 @@ Index
     :members:
     :content-only:
 
+Index extend parameters
+-----------------------
+
+.. doxygengroup:: hnsw_c_extend_params
+    :project: cuvs
+    :members:
+    :content-only:
+
+Index extend
+------------
+.. doxygengroup:: hnsw_c_index_extend
+    :project: cuvs
+    :members:
+    :content-only:
+
+Index load
+----------
+.. doxygengroup:: hnsw_c_index_load
+    :project: cuvs
+    :members:
+    :content-only:
+
 Index search
 ------------
 
diff --git a/docs/source/cpp_api/neighbors_hnsw.rst b/docs/source/cpp_api/neighbors_hnsw.rst
index b0af88af0..00dd3a213 100644
--- a/docs/source/cpp_api/neighbors_hnsw.rst
+++ b/docs/source/cpp_api/neighbors_hnsw.rst
@@ -27,10 +27,25 @@ Index
     :members:
     :content-only:
 
-Index load
+Index extend parameters
+-----------------------
+
+.. doxygengroup:: hnsw_cpp_extend_params
+    :project: cuvs
+    :members:
+    :content-only:
+
+Index extend
 ------------
+.. doxygengroup:: hnsw_cpp_index_extend
+    :project: cuvs
+    :members:
+    :content-only:
 
-.. doxygengroup:: hnsw_cpp_index_search
+Index load
+----------
+
+.. doxygengroup:: hnsw_cpp_index_load
     :project: cuvs
     :members:
     :content-only:
@@ -43,10 +58,10 @@ Index search
     :members:
     :content-only:
 
-Index deserialize
+Index serialize
 ---------------
 
-.. doxygengroup:: hnsw_cpp_index_deserialize
+.. doxygengroup:: hnsw_cpp_index_serialize
     :project: cuvs
     :members:
     :content-only:
diff --git a/python/cuvs/cuvs/neighbors/hnsw/__init__.py b/python/cuvs/cuvs/neighbors/hnsw/__init__.py
index 5efcdf68b..fafff7d03 100644
--- a/python/cuvs/cuvs/neighbors/hnsw/__init__.py
+++ b/python/cuvs/cuvs/neighbors/hnsw/__init__.py
@@ -13,10 +13,23 @@
 # limitations under the License.
 
 
-from .hnsw import Index, SearchParams, from_cagra, load, save, search
+from .hnsw import (
+    ExtendParams,
+    Index,
+    IndexParams,
+    SearchParams,
+    extend,
+    from_cagra,
+    load,
+    save,
+    search,
+)
 
 __all__ = [
+    "IndexParams",
     "Index",
+    "ExtendParams",
+    "extend",
     "SearchParams",
     "load",
     "save",
diff --git a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd
index 1cdc97406..e0c517933 100644
--- a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd
+++ b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd
@@ -20,14 +20,25 @@ from libc.stdint cimport int32_t, uintptr_t
 from cuvs.common.c_api cimport cuvsError_t, cuvsResources_t
 from cuvs.common.cydlpack cimport DLDataType, DLManagedTensor
 from cuvs.distance_type cimport cuvsDistanceType
+from cuvs.neighbors.cagra.cagra cimport cuvsCagraIndex_t
 
 
 cdef extern from "cuvs/neighbors/hnsw.h" nogil:
-    ctypedef struct cuvsHnswSearchParams:
-        int32_t ef
-        int32_t numThreads
 
-    ctypedef cuvsHnswSearchParams* cuvsHnswSearchParams_t
+    ctypedef enum cuvsHnswHierarchy:
+        NONE
+        CPU
+
+    ctypedef struct cuvsHnswIndexParams:
+        cuvsHnswHierarchy hierarchy
+        int32_t ef_construction
+        int32_t num_threads
+
+    ctypedef cuvsHnswIndexParams* cuvsHnswIndexParams_t
+
+    cuvsError_t cuvsHnswIndexParamsCreate(cuvsHnswIndexParams_t* params)
+
+    cuvsError_t cuvsHnswIndexParamsDestroy(cuvsHnswIndexParams_t params)
 
     ctypedef struct cuvsHnswIndex:
         uintptr_t addr
@@ -39,6 +50,31 @@ cdef extern from "cuvs/neighbors/hnsw.h" nogil:
 
     cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index)
 
+    ctypedef struct cuvsHnswExtendParams:
+        int32_t num_threads
+
+    ctypedef cuvsHnswExtendParams* cuvsHnswExtendParams_t
+
+    cuvsError_t cuvsHnswExtendParamsCreate(cuvsHnswExtendParams_t* params)
+
+    cuvsError_t cuvsHnswExtendParamsDestroy(cuvsHnswExtendParams_t params)
+
+    cuvsError_t cuvsHnswFromCagra(cuvsResources_t res,
+                                  cuvsHnswIndexParams_t params,
+                                  cuvsCagraIndex_t cagra_index,
+                                  cuvsHnswIndex_t hnsw_index) except +
+
+    cuvsError_t cuvsHnswExtend(cuvsResources_t res,
+                               cuvsHnswExtendParams_t params,
+                               DLManagedTensor* data,
+                               cuvsHnswIndex_t index) except +
+
+    ctypedef struct cuvsHnswSearchParams:
+        int32_t ef
+        int32_t num_threads
+
+    ctypedef cuvsHnswSearchParams* cuvsHnswSearchParams_t
+
     cuvsError_t cuvsHnswSearch(cuvsResources_t res,
                                cuvsHnswSearchParams* params,
                                cuvsHnswIndex_t index,
@@ -46,7 +82,12 @@ cdef extern from "cuvs/neighbors/hnsw.h" nogil:
                                DLManagedTensor* neighbors,
                                DLManagedTensor* distances) except +
 
+    cuvsError_t cuvsHnswSerialize(cuvsResources_t res,
+                                  const char * filename,
+                                  cuvsHnswIndex_t index) except +
+
     cuvsError_t cuvsHnswDeserialize(cuvsResources_t res,
+                                    cuvsHnswIndexParams_t params,
                                     const char * filename,
                                     int32_t dim,
                                     cuvsDistanceType metric,
diff --git a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx
index bcfaf167e..4c44350e8 100644
--- a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx
+++ b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx
@@ -39,41 +39,63 @@ from pylibraft.common.cai_wrapper import wrap_array
 from pylibraft.common.interruptible import cuda_interruptible
 
 
-cdef class SearchParams:
+cdef class IndexParams:
     """
-    HNSW search parameters
+    Parameters to build index for HNSW nearest neighbor search
 
     Parameters
     ----------
-    ef: int, default = 200
-        Maximum number of candidate list size used during search.
-    num_threads: int, default = 0
-        Number of CPU threads used to increase search parallelism.
-        When set to 0, the number of threads is automatically determined
-        using OpenMP's `omp_get_max_threads()`.
+    hierarchy : string, default = "none" (optional)
+        The hierarchy of the HNSW index. Valid values are ["none", "cpu"].
+        - "none": No hierarchy is built.
+        - "cpu": Hierarchy is built using CPU.
+    ef_construction : int, default = 200 (optional)
+        Maximum number of candidate list size used during construction
+        when hierarchy is `cpu`.
+    num_threads : int, default = 2 (optional)
+        Number of CPU threads used to increase construction parallelism
+        when hierarchy is `cpu`.
+        NOTE: Constructing the hierarchy when converting from a CAGRA graph
+        is highly sensitive to parallelism, and increasing the number of
+        threads can reduce the quality of the index.
     """
 
-    cdef cuvsHnswSearchParams params
+    cdef cuvsHnswIndexParams* params
+
+    def __cinit__(self):
+        check_cuvs(cuvsHnswIndexParamsCreate(&self.params))
+
+    def __dealloc__(self):
+        check_cuvs(cuvsHnswIndexParamsDestroy(self.params))
 
     def __init__(self, *,
-                 ef=200,
-                 num_threads=0):
-        self.params.ef = ef
-        self.params.numThreads = num_threads
+                 hierarchy="none",
+                 ef_construction=200,
+                 num_threads=2):
+        if hierarchy == "none":
+            self.params.hierarchy = cuvsHnswHierarchy.NONE
+        elif hierarchy == "cpu":
+            self.params.hierarchy = cuvsHnswHierarchy.CPU
+        else:
+            raise ValueError("Invalid hierarchy type."
+                             " Valid values are 'none' and 'cpu'.")
+        self.params.ef_construction = ef_construction
+        self.params.num_threads = num_threads
 
-    def __repr__(self):
-        attr_str = [attr + "=" + str(getattr(self, attr))
-                    for attr in [
-                        "ef", "num_threads"]]
-        return "SearchParams(type=HNSW, " + (", ".join(attr_str)) + ")"
+    @property
+    def hierarchy(self):
+        if self.params.hierarchy == cuvsHnswHierarchy.NONE:
+            return "none"
+        elif self.params.hierarchy == cuvsHnswHierarchy.CPU:
+            return "cpu"
 
     @property
-    def ef(self):
-        return self.params.ef
+    def ef_construction(self):
+        return self.params.ef_construction
 
     @property
     def num_threads(self):
-        return self.params.numThreads
+        return self.params.num_threads
 
 
 cdef class Index:
@@ -103,13 +125,44 @@ cdef class Index:
         return "Index(type=HNSW, metric=L2" + (", ".join(attr_str)) + ")"
 
 
+cdef class ExtendParams:
+    """
+    Parameters to extend the HNSW index with new data
+
+    Parameters
+    ----------
+    num_threads : int, default = 0 (optional)
+        Number of CPU threads used to increase construction parallelism.
+        When set to 0, the number of threads is automatically determined.
+    """
+
+    cdef cuvsHnswExtendParams* params
+
+    def __cinit__(self):
+        check_cuvs(cuvsHnswExtendParamsCreate(&self.params))
+
+    def __dealloc__(self):
+        check_cuvs(cuvsHnswExtendParamsDestroy(self.params))
+
+    def __init__(self, *,
+                 num_threads=0):
+        self.params.num_threads = num_threads
+
+    @property
+    def num_threads(self):
+        return self.params.num_threads
+
+
 @auto_sync_resources
-def save(filename, cagra.Index index, resources=None):
+def save(filename, Index index, resources=None):
     """
     Saves the CAGRA index to a file as an hnswlib index.
-    The saved index is immutable and can only be searched by the hnswlib
-    wrapper in cuVS, as the format is not compatible with the original
-    hnswlib.
+    If the index was constructed with `hnsw.IndexParams(hierarchy="none")`,
+    then the saved index is immutable and can only be searched by the hnswlib
+    wrapper in cuVS, as the format is not compatible with the original hnswlib.
+    However, if the index was constructed with
+    `hnsw.IndexParams(hierarchy="cpu")`, then the saved index is mutable and
+    compatible with the original hnswlib.
 
     Saving / loading the index is experimental. The serialization format is
     subject to change.
@@ -119,7 +172,7 @@ def save(filename, cagra.Index index, resources=None):
     filename : string
         Name of the file.
     index : Index
-        Trained CAGRA index.
+        Trained HNSW index.
     {resources_docstring}
 
     Examples
@@ -131,23 +184,28 @@ def save(filename, cagra.Index index, resources=None):
     >>> dataset = cp.random.random_sample((n_samples, n_features),
     ...                                   dtype=cp.float32)
     >>> # Build index
-    >>> index = cagra.build(cagra.IndexParams(), dataset)
+    >>> cagra_index = cagra.build(cagra.IndexParams(), dataset)
     >>> # Serialize and deserialize the cagra index built
-    >>> hnsw.save("my_index.bin", index)
+    >>> hnsw_index = hnsw.from_cagra(hnsw.IndexParams(), cagra_index)
+    >>> hnsw.save("my_index.bin", hnsw_index)
     """
     cdef string c_filename = filename.encode('utf-8')
     cdef cuvsResources_t res = <cuvsResources_t>resources.get_c_obj()
-    check_cuvs(cagra.cuvsCagraSerializeToHnswlib(res,
-                                                 c_filename.c_str(),
-                                                 index.index))
+    check_cuvs(cuvsHnswSerialize(res,
+                                 c_filename.c_str(),
+                                 index.index))
 
 
 @auto_sync_resources
-def load(filename, dim, dtype, metric="sqeuclidean", resources=None):
+def load(IndexParams index_params, filename, dim, dtype, metric="sqeuclidean",
+         resources=None):
     """
-    Loads base-layer-only hnswlib index from file, which was originally
-    saved as a built CAGRA index. The loaded index is immutable and can only
-    be searched by the hnswlib wrapper in cuVS, as the format is not
+    Loads an HNSW index.
+    If the index was constructed with `hnsw.IndexParams(hierarchy="none")`,
+    then the loaded index is immutable and can only be searched by the hnswlib
+    wrapper in cuVS, as the format is not compatible with the original hnswlib.
+    However, if the index was constructed with
+    `hnsw.IndexParams(hierarchy="cpu")`, then the loaded index is mutable and
     compatible with the original hnswlib.
 
     Saving / loading the index is experimental. The serialization format is
@@ -156,6 +214,8 @@ def load(filename, dim, dtype, metric="sqeuclidean", resources=None):
 
     Parameters
     ----------
+    index_params : IndexParams
+        Parameters that were used to convert CAGRA index to HNSW index.
     filename : string
         Name of the file.
     dim : int
@@ -214,6 +274,7 @@ def load(filename, dim, dtype, metric="sqeuclidean", resources=None):
 
     check_cuvs(cuvsHnswDeserialize(
         res,
+        index_params.params,
         c_filename.c_str(),
         dim,
         distance_type,
@@ -224,26 +285,30 @@ def load(filename, dim, dtype, metric="sqeuclidean", resources=None):
 
 
 @auto_sync_resources
-def from_cagra(cagra.Index index, temporary_index_path=None, resources=None):
+def from_cagra(IndexParams index_params, cagra.Index cagra_index,
+               temporary_index_path=None, resources=None):
     """
-    Returns an hnsw base-layer-only index from a CAGRA index.
-
-    NOTE: This method uses the filesystem to write the CAGRA index in
-          `/tmp/<random_number>.bin` or the parameter `temporary_index_path`
-          if not None before reading it as an hnsw index,
-          then deleting the temporary file. The returned index is immutable
-          and can only be searched by the hnsw wrapper in cuVS, as the
-          format is not compatible with the original hnswlib library.
-          By `base_layer_only`, we mean that the hnsw index is created
-          without the additional layers that are used for the hierarchical
-          search in hnswlib. Instead, the base layer is used for the search.
+    Returns an HNSW index from a CAGRA index.
+
+    NOTE: When `index_params.hierarchy` is:
+          1. `NONE`: This method uses the filesystem to write the CAGRA index
+                     in `/tmp/<random_number>.bin` before reading it as an
+                     hnswlib index, then deleting the temporary file. The
+                     returned index is immutable and can only be searched by
+                     the hnswlib wrapper in cuVS, as the format is not
+                    compatible with the original hnswlib.
+          2. `CPU`: The returned index is mutable and can be extended with
+                    additional vectors. The serialized index is also compatible
+                    with the original hnswlib library.
 
     Saving / loading the index is experimental. The serialization format is
     subject to change.
 
     Parameters
     ----------
-    index : Index
+    index_params : IndexParams
+        Parameters to convert the CAGRA index to HNSW index.
+    cagra_index : cagra.Index
         Trained CAGRA index.
     temporary_index_path : string, default = None
         Path to save the temporary index file. If None, the temporary file
@@ -262,18 +327,107 @@ def from_cagra(cagra.Index index, temporary_index_path=None, resources=None):
     >>> # Build index
     >>> index = cagra.build(cagra.IndexParams(), dataset)
     >>> # Serialize the CAGRA index to hnswlib base layer only index format
-    >>> hnsw_index = hnsw.from_cagra(index)
+    >>> hnsw_index = hnsw.from_cagra(hnsw.IndexParams(), index)
     """
-    uuid_num = uuid.uuid4()
-    filename = temporary_index_path if temporary_index_path else \
-        f"/tmp/{uuid_num}.bin"
-    save(filename, index, resources=resources)
-    hnsw_index = load(filename, index.dim, np.dtype(index.active_index_type),
-                      "sqeuclidean", resources=resources)
-    os.remove(filename)
+
+    cdef Index hnsw_index = Index()
+    cdef cuvsResources_t res = <cuvsResources_t>resources.get_c_obj()
+    check_cuvs(cuvsHnswFromCagra(
+        res,
+        index_params.params,
+        cagra_index.index,
+        hnsw_index.index
+    ))
+
+    hnsw_index.trained = True
     return hnsw_index
 
 
+@auto_sync_resources
+def extend(ExtendParams extend_params, Index index, data, resources=None):
+    """
+    Extends the HNSW index with new data.
+
+    Parameters
+    ----------
+    extend_params : ExtendParams
+    index : Index
+        Trained HNSW index.
+    data : Host array interface compliant matrix shape (n_samples, dim)
+        Supported dtype [float32, int8, uint8]
+    {resources_docstring}
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from cuvs.neighbors import hnsw, cagra
+    >>>
+    >>> n_samples = 50000
+    >>> n_features = 50
+    >>> dataset = np.random.random_sample((n_samples, n_features))
+    >>>
+    >>> # Build index
+    >>> index = cagra.build(hnsw.IndexParams(), dataset)
+    >>> # Load index
+    >>> hnsw_index = hnsw.from_cagra(hnsw.IndexParams(hierarchy="cpu"), index)
+    >>> # Extend the index with new data
+    >>> new_data = np.random.random_sample((n_samples, n_features))
+    >>> hnsw.extend(hnsw.ExtendParams(), hnsw_index, new_data)
+    """
+
+    data_ai = wrap_array(data)
+    _check_input_array(data_ai, [np.dtype('float32'),
+                                 np.dtype('uint8'),
+                                 np.dtype('int8')])
+
+    cdef cydlpack.DLManagedTensor* data_dlpack = cydlpack.dlpack_c(data_ai)
+    cdef cuvsResources_t res = <cuvsResources_t>resources.get_c_obj()
+
+    check_cuvs(cuvsHnswExtend(
+        res,
+        extend_params.params,
+        data_dlpack,
+        index.index
+    ))
+
+
+cdef class SearchParams:
+    """
+    HNSW search parameters
+
+    Parameters
+    ----------
+    ef: int, default = 200
+        Maximum number of candidate list size used during search.
+    num_threads: int, default = 0
+        Number of CPU threads used to increase search parallelism.
+        When set to 0, the number of threads is automatically determined
+        using OpenMP's `omp_get_max_threads()`.
+    """
+
+    cdef cuvsHnswSearchParams params
+
+    def __init__(self, *,
+                 ef=200,
+                 num_threads=0):
+        self.params.ef = ef
+        self.params.num_threads = num_threads
+
+    def __repr__(self):
+        attr_str = [attr + "=" + str(getattr(self, attr))
+                    for attr in [
+                        "ef", "num_threads"]]
+        return "SearchParams(type=HNSW, " + (", ".join(attr_str)) + ")"
+
+    @property
+    def ef(self):
+        return self.params.ef
+
+    @property
+    def num_threads(self):
+        return self.params.num_threads
+
+
 @auto_sync_resources
 @auto_convert_output
 def search(SearchParams search_params,
@@ -290,15 +444,15 @@ def search(SearchParams search_params,
     ----------
     search_params : SearchParams
     index : Index
-        Trained CAGRA index.
-    queries : CUDA array interface compliant matrix shape (n_samples, dim)
+        Trained HNSW index.
+    queries : CPU array interface compliant matrix shape (n_samples, dim)
         Supported dtype [float, int]
     k : int
         The number of neighbors.
-    neighbors : Optional CUDA array interface compliant matrix shape
+    neighbors : Optional CPU array interface compliant matrix shape
                 (n_queries, k), dtype uint64_t. If supplied, neighbor
                 indices will be written here in-place. (default None)
-    distances : Optional CUDA array interface compliant matrix shape
+    distances : Optional CPU array interface compliant matrix shape
                 (n_queries, k) If supplied, the distances to the
                 neighbors will be written here in-place. (default None)
     {resources_docstring}
@@ -323,7 +477,7 @@ def search(SearchParams search_params,
     ...     num_threads=0
     ... )
     >>> # Convert CAGRA index to HNSW
-    >>> hnsw_index = hnsw.from_cagra(index)
+    >>> hnsw_index = hnsw.from_cagra(hnsw.IndexParams(), index)
     >>> # Using a pooling allocator reduces overhead of temporary array
     >>> # creation during search. This is useful if multiple searches
     >>> # are performed with same query size.
diff --git a/python/cuvs/cuvs/test/test_hnsw.py b/python/cuvs/cuvs/test/test_hnsw.py
index 20a35401e..20f583ae8 100644
--- a/python/cuvs/cuvs/test/test_hnsw.py
+++ b/python/cuvs/cuvs/test/test_hnsw.py
@@ -32,6 +32,7 @@ def run_hnsw_build_search_test(
     build_algo="ivf_pq",
     intermediate_graph_degree=128,
     graph_degree=64,
+    hierarchy="none",
     search_params={},
 ):
     dataset = generate_data((n_rows, n_cols), dtype)
@@ -53,7 +54,8 @@ def run_hnsw_build_search_test(
 
     assert index.trained
 
-    hnsw_index = hnsw.from_cagra(index)
+    hnsw_params = hnsw.IndexParams(hierarchy=hierarchy, num_threads=1)
+    hnsw_index = hnsw.from_cagra(hnsw_params, index)
 
     queries = generate_data((n_queries, n_cols), dtype)
 
@@ -83,10 +85,93 @@ def run_hnsw_build_search_test(
 @pytest.mark.parametrize("num_threads", [2, 4])
 @pytest.mark.parametrize("metric", ["sqeuclidean", "inner_product"])
 @pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"])
-def test_hnsw(dtype, k, ef, num_threads, metric, build_algo):
+@pytest.mark.parametrize("hierarchy", ["none", "cpu"])
+def test_hnsw(dtype, k, ef, num_threads, metric, build_algo, hierarchy):
     # Note that inner_product tests use normalized input which we cannot
     # represent in int8, therefore we test only sqeuclidean metric here.
     run_hnsw_build_search_test(
+        dtype=dtype,
+        k=k,
+        metric=metric,
+        build_algo=build_algo,
+        hierarchy=hierarchy,
+        search_params={"ef": ef, "num_threads": num_threads},
+    )
+
+
+def run_hnsw_extend_test(
+    n_rows=10000,
+    add_rows=2000,
+    n_cols=10,
+    n_queries=100,
+    k=10,
+    dtype=np.float32,
+    metric="sqeuclidean",
+    build_algo="ivf_pq",
+    intermediate_graph_degree=128,
+    graph_degree=64,
+    search_params={},
+):
+    dataset = generate_data((n_rows, n_cols), dtype)
+    add_dataset = generate_data((add_rows, n_cols), dtype)
+    if metric == "inner_product":
+        dataset = normalize(dataset, norm="l2", axis=1)
+        add_dataset = normalize(add_dataset, norm="l2", axis=1)
+        if dtype in [np.int8, np.uint8]:
+            pytest.skip(
+                "inner_product metric is not supported for int8/uint8 data"
+            )
+        if build_algo == "nn_descent":
+            pytest.skip("inner_product metric is not supported for nn_descent")
+
+    build_params = cagra.IndexParams(
+        metric=metric,
+        intermediate_graph_degree=intermediate_graph_degree,
+        graph_degree=graph_degree,
+        build_algo=build_algo,
+    )
+
+    index = cagra.build(build_params, dataset)
+
+    assert index.trained
+
+    hnsw_params = hnsw.IndexParams(hierarchy="cpu", num_threads=1)
+    hnsw_index = hnsw.from_cagra(hnsw_params, index)
+    hnsw.extend(hnsw.ExtendParams(), hnsw_index, add_dataset)
+
+    queries = generate_data((n_queries, n_cols), dtype)
+
+    search_params = hnsw.SearchParams(**search_params)
+
+    out_dist, out_idx = hnsw.search(search_params, hnsw_index, queries, k)
+
+    # Calculate reference values with sklearn
+    skl_metric = {
+        "sqeuclidean": "sqeuclidean",
+        "inner_product": "cosine",
+        "euclidean": "euclidean",
+    }[metric]
+    nn_skl = NearestNeighbors(
+        n_neighbors=k, algorithm="brute", metric=skl_metric
+    )
+    nn_skl.fit(np.vstack([dataset, add_dataset]))
+    skl_dist, skl_idx = nn_skl.kneighbors(queries, return_distance=True)
+
+    recall = calc_recall(out_idx, skl_idx)
+    print(recall)
+    assert recall > 0.95
+
+
+@pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8])
+@pytest.mark.parametrize("k", [10, 20])
+@pytest.mark.parametrize("ef", [30, 40])
+@pytest.mark.parametrize("num_threads", [2, 4])
+@pytest.mark.parametrize("metric", ["sqeuclidean"])
+@pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"])
+def test_hnsw_extend(dtype, k, ef, num_threads, metric, build_algo):
+    # Note that inner_product tests use normalized input which we cannot
+    # represent in int8, therefore we test only sqeuclidean metric here.
+    run_hnsw_extend_test(
         dtype=dtype,
         k=k,
         metric=metric,
diff --git a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml
index f1a7f272c..90a561bca 100644
--- a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml
+++ b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml
@@ -4,8 +4,11 @@ constraints:
 groups:
   base:
     build:
-      graph_degree: [32, 64, 128, 256]
+      graph_degree: [32, 64, 96, 128]
       intermediate_graph_degree: [32, 64, 96, 128]
       graph_build_algo: ["NN_DESCENT"]
+      hierarchy: ["none", "cpu"]
+      ef_construction: [64, 128, 256, 512]
+      num_threads: [2, 5, 10]
     search:
       ef: [10, 20, 40, 60, 80, 120, 200, 400, 600, 800]

From e0aebfd0c499189585319f5a5fbf46dfb9ce04f9 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Tue, 3 Dec 2024 14:41:47 -0600
Subject: [PATCH 37/47] add a README for wheels (#504)

Wheel-building CI jobs are failing like this:

> Checking
final_dist/cuvs_cu12-25.2.0a26-cp310-cp310-manylinux_2_28_aarch64.whl:
FAILED due to warnings
> WARNING `long_description` missing.
> Error: Process completed with exit code 1.

([build
link](https://github.com/rapidsai/cuvs/actions/runs/12133882036))

Looks like the root cause is a combination of the following:

* there was a new `twine` release (6.x) 3 days ago:
https://pypi.org/project/twine/#history
* it contains https://github.com/pypa/twine/pull/1168, which makes
`twine check --strict` fail if the wheel's `long_description` is empty
* the `cuvs` wheel README (used as the wheel `long_description`) is
empty

This proposes adding a small README, with just 2 sentences copied from
the project's root-level README, to get past that check.

## Notes for Reviewers

The `long_description` becomes the project homepage when a project is
hosted on PyPI. The wheels produced from this repo aren't currently
being published to pypi.org so this change won't be seen there, but a
more user-friendly README should be added if/when we decide to publish
`cuvs-cu{11,12}` to pypi.org.

ref: https://github.com/rapidsai/build-planning/issues/70
---
 python/cuvs/README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/cuvs/README.md b/python/cuvs/README.md
index e69de29bb..27b494811 100644
--- a/python/cuvs/README.md
+++ b/python/cuvs/README.md
@@ -0,0 +1,3 @@
+# cuVS
+
+cuVS contains state-of-the-art implementations of several algorithms for running approximate nearest neighbors and clustering on the GPU. It can be used directly or through the various databases and other libraries that have integrated it. The primary goal of cuVS is to simplify the use of GPUs for vector similarity search and clustering.

From fbbca0570db27d476b500ef021c03482b0d989e2 Mon Sep 17 00:00:00 2001
From: Micka <mide@nvidia.com>
Date: Wed, 4 Dec 2024 00:57:52 +0100
Subject: [PATCH 38/47] Add Question Retrieval notebook using Milvus (#451)

This notebook is adapting the Question Retrieval nb to use Milvus.
It can serve as a good example on how to do Bulk ingest, how to use cuVS, and especially CAGRA+HNSW on Milvus

Authors:
  - Micka (https://github.com/lowener)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/451
---
 ...ectorSearch_QuestionRetrieval_Milvus.ipynb | 732 ++++++++++++++++++
 1 file changed, 732 insertions(+)
 create mode 100644 notebooks/VectorSearch_QuestionRetrieval_Milvus.ipynb

diff --git a/notebooks/VectorSearch_QuestionRetrieval_Milvus.ipynb b/notebooks/VectorSearch_QuestionRetrieval_Milvus.ipynb
new file mode 100644
index 000000000..09a6cca43
--- /dev/null
+++ b/notebooks/VectorSearch_QuestionRetrieval_Milvus.ipynb
@@ -0,0 +1,732 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "f5499b54",
+   "metadata": {},
+   "source": [
+    "\n",
+    "# Similar Questions Retrieval - Milvus - CAGRA-HNSW\n",
+    "\n",
+    "This notebook is inspired by the [similar search example of Sentence-Transformers](https://www.sbert.net/examples/applications/semantic-search/README.html#similar-questions-retrieval), and adapted to be used with [Milvus](https://milvus.io) and [cuVS](https://rapids.ai/cuvs/).\n",
+    "\n",
+    "The model was pre-trained on the [Natural Questions dataset](https://ai.google.com/research/NaturalQuestions). It consists of about 100k real Google search queries, together with an annotated passage from Wikipedia that provides the answer. It is an example of an asymmetric search task. As corpus, we use the smaller [Simple English Wikipedia](http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz) so that it fits easily into memory.\n",
+    "\n",
+    "The steps to install the latest Milvus package are available in the [Milvus documentation](https://milvus.io/docs/quickstart.md)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e8d55ede",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-11-08T14:47:21.149465Z",
+     "iopub.status.busy": "2024-11-08T14:47:21.149218Z",
+     "iopub.status.idle": "2024-11-08T14:47:23.440275Z",
+     "shell.execute_reply": "2024-11-08T14:47:23.439436Z"
+    },
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "!pip install sentence_transformers torch pymilvus pymilvus[bulk_writer] dask dask[distributed]\n",
+    "\n",
+    "# Note: if you have a Hopper based GPU, like an H100, use these to install:\n",
+    "# pip install torch --index-url https://download.pytorch.org/whl/cu118\n",
+    "# pip install sentence_transformers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eb1e81c3",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-11-08T14:47:23.444058Z",
+     "iopub.status.busy": "2024-11-08T14:47:23.443683Z",
+     "iopub.status.idle": "2024-11-08T14:47:24.219903Z",
+     "shell.execute_reply": "2024-11-08T14:47:24.219228Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "!nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ee4c5cc0",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-11-08T14:47:24.223131Z",
+     "iopub.status.busy": "2024-11-08T14:47:24.222874Z",
+     "iopub.status.idle": "2024-11-08T14:47:34.024085Z",
+     "shell.execute_reply": "2024-11-08T14:47:34.023435Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import dask.array as da\n",
+    "import gzip\n",
+    "import json\n",
+    "import math\n",
+    "import numpy as np\n",
+    "import os\n",
+    "import pymilvus\n",
+    "import time\n",
+    "import torch\n",
+    "\n",
+    "from minio import Minio\n",
+    "from multiprocessing import Process\n",
+    "from sentence_transformers import SentenceTransformer, CrossEncoder, util\n",
+    "from typing import List\n",
+    "\n",
+    "\n",
+    "from pymilvus import (\n",
+    "    connections, utility\n",
+    ")\n",
+    "from pymilvus.bulk_writer import LocalBulkWriter, BulkFileType  # pip install pymilvus[bulk_writer]\n",
+    "\n",
+    "if not torch.cuda.is_available():\n",
+    "  print(\"Warning: No GPU found. Please add GPU to your notebook\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "47cabaca",
+   "metadata": {},
+   "source": [
+    "# Setup Milvus Collection"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5fcd259c",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-11-08T14:47:34.027677Z",
+     "iopub.status.busy": "2024-11-08T14:47:34.027288Z",
+     "iopub.status.idle": "2024-11-08T14:47:34.109212Z",
+     "shell.execute_reply": "2024-11-08T14:47:34.108609Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "DIM = 768\n",
+    "MILVUS_PORT = 30004\n",
+    "MILVUS_HOST = f\"http://localhost:{MILVUS_PORT}\"\n",
+    "ID_FIELD=\"id\"\n",
+    "EMBEDDING_FIELD=\"embedding\"\n",
+    "\n",
+    "collection_name = \"simple_wiki\"\n",
+    "\n",
+    "def get_milvus_client():\n",
+    "    return pymilvus.MilvusClient(uri=MILVUS_HOST)\n",
+    "\n",
+    "client = get_milvus_client()\n",
+    "\n",
+    "fields = [\n",
+    "    pymilvus.FieldSchema(name=ID_FIELD, dtype=pymilvus.DataType.INT64, is_primary=True),\n",
+    "    pymilvus.FieldSchema(name=EMBEDDING_FIELD, dtype=pymilvus.DataType.FLOAT_VECTOR, dim=DIM)\n",
+    "]\n",
+    "\n",
+    "schema = pymilvus.CollectionSchema(fields)\n",
+    "schema.verify()\n",
+    "\n",
+    "if collection_name in client.list_collections():\n",
+    "    print(f\"Collection '{collection_name}' already exists. Deleting collection...\")\n",
+    "    client.drop_collection(collection_name)\n",
+    "\n",
+    "client.create_collection(collection_name, schema=schema, dimension=DIM, vector_field_name=EMBEDDING_FIELD)\n",
+    "collection = pymilvus.Collection(name=collection_name, using=client._using)\n",
+    "collection.release()\n",
+    "collection.drop_index()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "00bd20f5",
+   "metadata": {},
+   "source": [
+    "# Setup Sentence Transformer model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0a1a6307",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-11-08T14:47:34.111782Z",
+     "iopub.status.busy": "2024-11-08T14:47:34.111556Z",
+     "iopub.status.idle": "2024-11-08T14:47:39.654323Z",
+     "shell.execute_reply": "2024-11-08T14:47:39.653386Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# We use the Bi-Encoder to encode all passages, so that we can use it with semantic search\n",
+    "model_name = 'nq-distilbert-base-v1'\n",
+    "bi_encoder = SentenceTransformer(model_name)\n",
+    "\n",
+    "# As dataset, we use Simple English Wikipedia. Compared to the full English wikipedia, it has only\n",
+    "# about 170k articles. We split these articles into paragraphs and encode them with the bi-encoder\n",
+    "\n",
+    "wikipedia_filepath = 'data/simplewiki-2020-11-01.jsonl.gz'\n",
+    "\n",
+    "if not os.path.exists(wikipedia_filepath):\n",
+    "    util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath)\n",
+    "\n",
+    "passages = []\n",
+    "with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as fIn:\n",
+    "    for line in fIn:\n",
+    "        data = json.loads(line.strip())\n",
+    "        for paragraph in data['paragraphs']:\n",
+    "            # We encode the passages as [title, text]\n",
+    "            passages.append([data['title'], paragraph])\n",
+    "\n",
+    "# If you like, you can also limit the number of passages you want to use\n",
+    "print(\"Passages:\", len(passages))\n",
+    "\n",
+    "# To speed things up, pre-computed embeddings are downloaded.\n",
+    "# The provided file encoded the passages with the model 'nq-distilbert-base-v1'\n",
+    "if model_name == 'nq-distilbert-base-v1':\n",
+    "    embeddings_filepath = 'simplewiki-2020-11-01-nq-distilbert-base-v1.pt'\n",
+    "    if not os.path.exists(embeddings_filepath):\n",
+    "        util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01-nq-distilbert-base-v1.pt', embeddings_filepath)\n",
+    "\n",
+    "    corpus_embeddings = torch.load(embeddings_filepath, map_location='cpu', weights_only=True).float()  # Convert embedding file to float\n",
+    "    #if torch.cuda.is_available():\n",
+    "    #    corpus_embeddings = corpus_embeddings.to('cuda')\n",
+    "else:  # Here, we compute the corpus_embeddings from scratch (which can take a while depending on the GPU)\n",
+    "    corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True).to('cpu')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1f4e9b9d",
+   "metadata": {},
+   "source": [
+    "# Vector Search using Milvus and RAPIDS cuVS \n",
+    "Now that our embeddings are ready to be indexed and that the model has been loaded, we can use Milvus and RAPIDS cuVS to do our vector search.\n",
+    "\n",
+    "This is done in 3 steps: First we ingest all the vectors in the Milvus collection, then we build the Milvus index, to finally search it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "563751c1",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-11-08T14:47:39.658832Z",
+     "iopub.status.busy": "2024-11-08T14:47:39.658374Z",
+     "iopub.status.idle": "2024-11-08T14:49:47.244768Z",
+     "shell.execute_reply": "2024-11-08T14:49:47.244162Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# minio\n",
+    "MINIO_PORT = 30009\n",
+    "MINIO_URL = f\"localhost:{MINIO_PORT}\"\n",
+    "MINIO_SECRET_KEY = \"minioadmin\"\n",
+    "MINIO_ACCESS_KEY = \"minioadmin\"\n",
+    "\n",
+    "def upload_to_minio(file_paths: List[List[str]], remote_paths: List[List[str]], bucket_name=\"milvus-bucket\"):\n",
+    "    minio_client = Minio(endpoint=MINIO_URL, access_key=MINIO_ACCESS_KEY, secret_key=MINIO_SECRET_KEY, secure=False)\n",
+    "    if not minio_client.bucket_exists(bucket_name):\n",
+    "        minio_client.make_bucket(bucket_name)\n",
+    "\n",
+    "    for local_batch, remote_batch in zip(file_paths, remote_paths):\n",
+    "        for local_file, remote_file in zip(local_batch, remote_batch):\n",
+    "            minio_client.fput_object(bucket_name, \n",
+    "                                     object_name=remote_file,\n",
+    "                                     file_path=local_file,\n",
+    "                                     part_size=512 * 1024 * 1024,\n",
+    "                                     num_parallel_uploads=5)\n",
+    "     \n",
+    "    \n",
+    "def ingest_data_bulk(collection_name, vectors, schema: pymilvus.CollectionSchema, log_times=True, bulk_writer_type=\"milvus\", debug=False):\n",
+    "    print(f\"-  Ingesting {len(vectors) // 1000}k vectors, Bulk\")\n",
+    "    tic = time.perf_counter()\n",
+    "    collection = pymilvus.Collection(collection_name, using=get_milvus_client()._using)\n",
+    "    remote_path = None\n",
+    "\n",
+    "    if bulk_writer_type == 'milvus':\n",
+    "        # # Prepare source data for faster ingestion\n",
+    "        writer = LocalBulkWriter(\n",
+    "            schema=schema,\n",
+    "            local_path='bulk_data',\n",
+    "            segment_size=512 * 1024 * 1024, # Default value\n",
+    "            file_type=BulkFileType.NPY\n",
+    "        )\n",
+    "        for id, vec in enumerate(vectors):\n",
+    "            writer.append_row({ID_FIELD: id, EMBEDDING_FIELD: vec})\n",
+    "\n",
+    "        if debug:\n",
+    "            print(writer.batch_files)\n",
+    "        def callback(file_list):\n",
+    "            if debug:\n",
+    "                print(f\"  -  Commit successful\")\n",
+    "                print(file_list)\n",
+    "        writer.commit(call_back=callback)\n",
+    "        files_to_upload = writer.batch_files\n",
+    "    elif bulk_writer_type == 'dask':\n",
+    "        # Prepare source data for faster ingestion\n",
+    "        if not os.path.isdir(\"bulk_data\"):\n",
+    "            os.mkdir(\"bulk_data\")\n",
+    "\n",
+    "        from dask.distributed import Client, LocalCluster\n",
+    "        cluster = LocalCluster(n_workers=1, threads_per_worker=1)\n",
+    "        client = Client(cluster)\n",
+    "\n",
+    "        chunk_size = 100000\n",
+    "        da_vectors = da.from_array(vectors, chunks=(chunk_size, vectors.shape[1]))\n",
+    "        da_ids = da.arange(len(vectors), chunks=(chunk_size,))\n",
+    "        da.to_npy_stack(\"bulk_data/da_embedding/\", da_vectors)\n",
+    "        da.to_npy_stack(\"bulk_data/da_id/\", da_ids)\n",
+    "        files_to_upload = []\n",
+    "        remote_path = []\n",
+    "        for chunk_nb in range(math.ceil(len(vectors) / chunk_size)):\n",
+    "            files_to_upload.append([f\"bulk_data/da_embedding/{chunk_nb}.npy\", f\"bulk_data/da_id/{chunk_nb}.npy\"])\n",
+    "            remote_path.append([f\"bulk_data/da_{chunk_nb}/embedding.npy\", f\"bulk_data/da__{chunk_nb}/id.npy\"])\n",
+    "\n",
+    "    elif bulk_writer_type == 'numpy':\n",
+    "        # Directly save NPY files\n",
+    "        np.save(\"bulk_data/embedding.npy\", vectors)\n",
+    "        np.save(\"bulk_data/id.npy\", np.arange(len(vectors)))\n",
+    "        files_to_upload = [[\"bulk_data/embedding.npy\", \"bulk_data/id.npy\"]]\n",
+    "    else:\n",
+    "        raise ValueError(\"Invalid bulk writer type\")\n",
+    "    \n",
+    "    toc = time.perf_counter()\n",
+    "    if log_times:\n",
+    "        print(f\"  -  File save time: {toc - tic:.2f} seconds\")\n",
+    "    # Import data\n",
+    "    if remote_path is None:\n",
+    "        remote_path = files_to_upload\n",
+    "    upload_to_minio(files_to_upload, remote_path)\n",
+    "    \n",
+    "    job_ids = [utility.do_bulk_insert(collection_name, batch, using=get_milvus_client()._using) for batch in remote_path]\n",
+    "\n",
+    "    while True:\n",
+    "        tasks = [utility.get_bulk_insert_state(job_id, using=get_milvus_client()._using) for job_id in job_ids]\n",
+    "        success = all(task.state_name == \"Completed\" for task in tasks)\n",
+    "        failure = any(task.state_name == \"Failed\" for task in tasks)\n",
+    "        for i in range(len(tasks)):\n",
+    "            task = tasks[i]\n",
+    "            if debug:\n",
+    "                print(f\"  -  Task {i}/{len(tasks)} state: {task.state_name}, Progress percent: {task.infos['progress_percent']}, Imported row count: {task.row_count}\")\n",
+    "            if task.state_name == \"Failed\":\n",
+    "                print(task)\n",
+    "        if success or failure:\n",
+    "            break\n",
+    "        time.sleep(2)\n",
+    "\n",
+    "    added_entities = str(sum([task.row_count for task in tasks]))\n",
+    "    failure = failure or added_entities != str(len(vectors))\n",
+    "    if failure:\n",
+    "        print(f\"-  Ingestion failed. Added entities: {added_entities}\")\n",
+    "    toc = time.perf_counter()\n",
+    "    if log_times:\n",
+    "        datasize = vectors.nbytes / 1024 / 1024\n",
+    "        print(f\"-  Ingestion time: {toc - tic:.2f} seconds. ({(datasize / (toc-tic)):.2f}MB/s)\")\n",
+    "\n",
+    "ingest_data_bulk(collection_name, np.array(corpus_embeddings), schema, bulk_writer_type='dask', log_times=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ad90b4be",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-11-08T14:49:47.247498Z",
+     "iopub.status.busy": "2024-11-08T14:49:47.247268Z",
+     "iopub.status.idle": "2024-11-08T14:50:00.737502Z",
+     "shell.execute_reply": "2024-11-08T14:50:00.736808Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Setups the IVFPQ index\n",
+    "\n",
+    "index_params = dict(\n",
+    "    index_type=\"GPU_IVF_PQ\",\n",
+    "    metric_type=\"L2\",\n",
+    "    params={\"nlist\": 150, # Number of clusters\n",
+    "            \"m\": 96})      # Product Quantization dimension\n",
+    "\n",
+    "# Drop the index if it exists\n",
+    "if collection.has_index():\n",
+    "    collection.release()\n",
+    "    collection.drop_index()\n",
+    "\n",
+    "# Create the index\n",
+    "tic = time.perf_counter()\n",
+    "collection.create_index(field_name=EMBEDDING_FIELD, index_params=index_params)\n",
+    "collection.load()\n",
+    "toc = time.perf_counter()\n",
+    "print(f\"-  Index creation time: {toc - tic:.4f} seconds. ({index_params})\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "c75acea7",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-11-08T14:50:00.740443Z",
+     "iopub.status.busy": "2024-11-08T14:50:00.740142Z",
+     "iopub.status.idle": "2024-11-08T14:50:00.745403Z",
+     "shell.execute_reply": "2024-11-08T14:50:00.744672Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Search the index\n",
+    "def search_cuvs_pq(query, top_k = 5, n_probe = 30):\n",
+    "    # Encode the query using the bi-encoder and find potentially relevant passages\n",
+    "    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)\n",
+    "\n",
+    "    search_params = {\"nprobe\": n_probe}\n",
+    "    tic = time.perf_counter()\n",
+    "    hits = collection.search(\n",
+    "                data=np.array(question_embedding[None].cpu()), anns_field=EMBEDDING_FIELD, param=search_params, limit=top_k\n",
+    "            )\n",
+    "    toc = time.perf_counter()\n",
+    "\n",
+    "    # Output of top-k hits\n",
+    "    print(\"Input question:\", query)\n",
+    "    print(\"Results (after {:.3f} ms):\".format((toc - tic)*1000))\n",
+    "    for k in range(top_k):\n",
+    "        print(\"\\t{:.3f}\\t{}\".format(hits[0][k].distance, passages[hits[0][k].id]))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "07935bca",
+   "metadata": {},
+   "source": [
+    "The ideal use-case for the IVF-PQ algorithm is when there is a need to reduce the memory footprint while keeping a good accuracy."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c27d4715",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-11-08T14:50:00.748001Z",
+     "iopub.status.busy": "2024-11-08T14:50:00.747783Z",
+     "iopub.status.idle": "2024-11-08T14:50:01.785914Z",
+     "shell.execute_reply": "2024-11-08T14:50:01.785223Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "search_cuvs_pq(query=\"Who was Grace Hopper?\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bc375518",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-11-08T14:50:01.788877Z",
+     "iopub.status.busy": "2024-11-08T14:50:01.788640Z",
+     "iopub.status.idle": "2024-11-08T14:50:01.813820Z",
+     "shell.execute_reply": "2024-11-08T14:50:01.813153Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "search_cuvs_pq(query=\"Who was Alan Turing?\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ab154181",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-11-08T14:50:01.816625Z",
+     "iopub.status.busy": "2024-11-08T14:50:01.816362Z",
+     "iopub.status.idle": "2024-11-08T14:50:01.839593Z",
+     "shell.execute_reply": "2024-11-08T14:50:01.838986Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "search_cuvs_pq(query = \"What is creating tides?\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "836344ec",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-11-08T14:50:01.842319Z",
+     "iopub.status.busy": "2024-11-08T14:50:01.842022Z",
+     "iopub.status.idle": "2024-11-08T14:50:15.969324Z",
+     "shell.execute_reply": "2024-11-08T14:50:15.968562Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Drop the current index if it exists\n",
+    "if collection.has_index():\n",
+    "    collection.release()\n",
+    "    collection.drop_index()\n",
+    "\n",
+    "# Create the IVF Flat index\n",
+    "index_params = dict(\n",
+    "    index_type=\"GPU_IVF_FLAT\",\n",
+    "    metric_type=\"L2\",\n",
+    "    params={\"nlist\": 150}) # Number of clusters)\n",
+    "tic = time.perf_counter()\n",
+    "collection.create_index(field_name=EMBEDDING_FIELD, index_params=index_params)\n",
+    "collection.load()\n",
+    "toc = time.perf_counter()\n",
+    "print(f\"-  Index creation time: {toc - tic:.4f} seconds. ({index_params})\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "2d6017ed",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-11-08T14:50:15.972764Z",
+     "iopub.status.busy": "2024-11-08T14:50:15.972368Z",
+     "iopub.status.idle": "2024-11-08T14:50:15.977806Z",
+     "shell.execute_reply": "2024-11-08T14:50:15.977064Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def search_cuvs_flat(query, top_k = 5, n_probe = 30):\n",
+    "    # Encode the query using the bi-encoder and find potentially relevant passages\n",
+    "    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)\n",
+    "    \n",
+    "    search_params = {\"nprobe\": n_probe}\n",
+    "    tic = time.perf_counter()\n",
+    "    hits = collection.search(\n",
+    "                data=np.array(question_embedding[None].cpu()), anns_field=EMBEDDING_FIELD, param=search_params, limit=top_k\n",
+    "            )\n",
+    "    toc = time.perf_counter()\n",
+    "\n",
+    "    # Output of top-k hits\n",
+    "    print(\"Input question:\", query)\n",
+    "    print(\"Results (after {:.3f} ms):\".format((toc - tic)*1000))\n",
+    "    for k in range(top_k):\n",
+    "        print(\"\\t{:.3f}\\t{}\".format(hits[0][k].distance, passages[hits[0][k].id]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f5cfb644",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-11-08T14:50:15.980796Z",
+     "iopub.status.busy": "2024-11-08T14:50:15.980408Z",
+     "iopub.status.idle": "2024-11-08T14:50:16.009271Z",
+     "shell.execute_reply": "2024-11-08T14:50:16.008579Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "search_cuvs_flat(query=\"Who was Grace Hopper?\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b5694d00",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-11-08T14:50:16.012253Z",
+     "iopub.status.busy": "2024-11-08T14:50:16.011924Z",
+     "iopub.status.idle": "2024-11-08T14:50:16.043432Z",
+     "shell.execute_reply": "2024-11-08T14:50:16.042751Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "search_cuvs_flat(query=\"Who was Alan Turing?\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fcfc3c5b",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-11-08T14:50:16.046439Z",
+     "iopub.status.busy": "2024-11-08T14:50:16.046093Z",
+     "iopub.status.idle": "2024-11-08T14:50:16.071322Z",
+     "shell.execute_reply": "2024-11-08T14:50:16.070614Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "search_cuvs_flat(query = \"What is creating tides?\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a59d7b32-0832-4c3a-864e-aeb2e6e7fe1f",
+   "metadata": {},
+   "source": [
+    "## Using CAGRA: Hybrid GPU-CPU graph-based Vector Search\n",
+    "\n",
+    "CAGRA is a graph-based nearest neighbors implementation with state-of-the art performance for both small- and large-batch sized vector searches. \n",
+    "\n",
+    "CAGRA follows the same steps as IVF-FLAT and IVF-PQ in Milvus, but is also able to be adapted for querying on CPU.\n",
+    "This means that CAGRA is able to profit from a high training speed on GPU, as well as a low inference time on CPU, that minimize latency even on the smallest queries."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e5ce4dab",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-11-08T14:50:16.074449Z",
+     "iopub.status.busy": "2024-11-08T14:50:16.074128Z",
+     "iopub.status.idle": "2024-11-08T14:50:30.479027Z",
+     "shell.execute_reply": "2024-11-08T14:50:30.478265Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Drop the current index if it exists\n",
+    "if collection.has_index():\n",
+    "    collection.release()\n",
+    "    collection.drop_index()\n",
+    "\n",
+    "# Create the IVF Flat index\n",
+    "index_params = dict(\n",
+    "    index_type=\"GPU_CAGRA\",\n",
+    "    metric_type=\"L2\",\n",
+    "    params={\"graph_degree\": 64, \"intermediate_graph_degree\": 128, \"build_algo\": \"NN_DESCENT\", \"adapt_for_cpu\": True})\n",
+    "tic = time.perf_counter()\n",
+    "collection.create_index(field_name=EMBEDDING_FIELD, index_params=index_params)\n",
+    "collection.load()\n",
+    "toc = time.perf_counter()\n",
+    "print(f\"-  Index creation time: {toc - tic:.4f} seconds. ({index_params})\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "df229e21-f6b6-4d6c-ad54-2724f8738934",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-11-08T14:50:30.481748Z",
+     "iopub.status.busy": "2024-11-08T14:50:30.481474Z",
+     "iopub.status.idle": "2024-11-08T14:50:30.486324Z",
+     "shell.execute_reply": "2024-11-08T14:50:30.485696Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def search_cuvs_cagra(query, top_k = 5, itopk = 32):\n",
+    "    # Encode the query using the bi-encoder and find potentially relevant passages\n",
+    "    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)\n",
+    "\n",
+    "    search_params = {\"params\": {\"itopk\": itopk, \"ef\": 35}}\n",
+    "    tic = time.perf_counter()\n",
+    "    hits = collection.search(\n",
+    "                data=np.array(question_embedding[None].cpu()), anns_field=EMBEDDING_FIELD, param=search_params, limit=top_k\n",
+    "            )\n",
+    "    toc = time.perf_counter()\n",
+    "\n",
+    "    # Output of top-k hits\n",
+    "    print(\"Input question:\", query)\n",
+    "    print(\"Results (after {:.3f} ms):\".format((toc - tic)*1000))\n",
+    "    for k in range(top_k):\n",
+    "        print(\"\\t{:.3f}\\t{}\".format(hits[0][k].distance, passages[hits[0][k].id]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b5e862fd-b7e5-4423-8fbf-36918f02c8f3",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-11-08T14:50:30.489077Z",
+     "iopub.status.busy": "2024-11-08T14:50:30.488790Z",
+     "iopub.status.idle": "2024-11-08T14:50:30.513998Z",
+     "shell.execute_reply": "2024-11-08T14:50:30.513319Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "search_cuvs_cagra(query=\"Who was Grace Hopper?\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cb8a5b7b",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-11-08T14:50:30.516748Z",
+     "iopub.status.busy": "2024-11-08T14:50:30.516521Z",
+     "iopub.status.idle": "2024-11-08T14:50:30.538982Z",
+     "shell.execute_reply": "2024-11-08T14:50:30.538269Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "search_cuvs_cagra(query=\"Who was Alan Turing?\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c89810a",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-11-08T14:50:30.541508Z",
+     "iopub.status.busy": "2024-11-08T14:50:30.541287Z",
+     "iopub.status.idle": "2024-11-08T14:50:30.562722Z",
+     "shell.execute_reply": "2024-11-08T14:50:30.562085Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "search_cuvs_cagra(query=\"What is creating tides?\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From acbd097ed15afe186367b5a46a5d4b366ac9d804 Mon Sep 17 00:00:00 2001
From: tsuki <12711693+enp1s0@users.noreply.github.com>
Date: Wed, 4 Dec 2024 18:06:17 +0900
Subject: [PATCH 39/47] [BUG] Fix CAGRA filter (#489)

Ref : https://github.com/rapidsai/cuvs/issues/472

## The cause of the bug
The bitonic sort was used on an array that was not a power of 2 long. In the current search implementation, the bitonic sort is used to move the invalid elements to the end of the buffer as:
https://github.com/rapidsai/cuvs/blob/5062594138a40231475299c7bac61083b0669fd1/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh#L758-L763
https://github.com/rapidsai/cuvs/blob/5062594138a40231475299c7bac61083b0669fd1/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh#L644-L649

The problem is that the (max) array length (=`MAX_ITOPK + MAX_CANDIDATES`) is not always the power of two.
These bitonic sorts are called even if no elements are filtered out unless `cuvs::neighbors::filtering::none_sample_filter` is specified as the filter, so #472 occurs.

## Fix
This PR changes the filtering process so that the bitonic sort is not used to move the invalid elements to the end of the buffer.

Authors:
  - tsuki (https://github.com/enp1s0)

Approvers:
  - Artem M. Chirkin (https://github.com/achirkin)

URL: https://github.com/rapidsai/cuvs/pull/489
---
 .../detail/cagra/search_single_cta.cuh        |  16 +-
 .../cagra/search_single_cta_kernel-inl.cuh    | 182 +++++++++++++-----
 cpp/test/neighbors/ann_cagra.cuh              |   6 +-
 3 files changed, 153 insertions(+), 51 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh
index 2bed19009..fa71dbaf9 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh
@@ -129,17 +129,27 @@ struct search : search_plan_impl<DataT, IndexT, DistanceT, SAMPLE_FILTER_T> {
       (sizeof(INDEX_T) + sizeof(DISTANCE_T)) * result_buffer_size_32 +
       sizeof(INDEX_T) * hashmap::get_size(small_hash_bitlen) + sizeof(INDEX_T) * search_width +
       sizeof(std::uint32_t) * topk_ws_size + sizeof(std::uint32_t);
-    smem_size = base_smem_size;
+
+    std::uint32_t additional_smem_size = 0;
     if (num_itopk_candidates > 256) {
       // Tentatively calculate the required share memory size when radix
       // sort based topk is used, assuming the block size is the maximum.
       if (itopk_size <= 256) {
-        smem_size += topk_by_radix_sort<256, INDEX_T>::smem_size * sizeof(std::uint32_t);
+        additional_smem_size += topk_by_radix_sort<256, INDEX_T>::smem_size * sizeof(std::uint32_t);
       } else {
-        smem_size += topk_by_radix_sort<512, INDEX_T>::smem_size * sizeof(std::uint32_t);
+        additional_smem_size += topk_by_radix_sort<512, INDEX_T>::smem_size * sizeof(std::uint32_t);
       }
     }
 
+    if (!std::is_same_v<SAMPLE_FILTER_T, cuvs::neighbors::filtering::none_sample_filter>) {
+      // For filtering postprocess
+      using scan_op_t = cub::WarpScan<unsigned>;
+      additional_smem_size =
+        std::max<std::uint32_t>(additional_smem_size, sizeof(scan_op_t::TempStorage));
+    }
+
+    smem_size = base_smem_size + additional_smem_size;
+
     uint32_t block_size = thread_block_size;
     if (block_size == 0) {
       block_size = min_block_size;
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
index 79cb6bc10..678ed0cb4 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
@@ -111,7 +111,7 @@ RAFT_DEVICE_INLINE_FUNCTION void pickup_next_parents(std::uint32_t* const termin
 }
 
 template <unsigned MAX_CANDIDATES, class IdxT = void>
-RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_1st(
+RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_and_full(
   float* candidate_distances,  // [num_candidates]
   IdxT* candidate_indices,     // [num_candidates]
   const std::uint32_t num_candidates,
@@ -215,7 +215,7 @@ RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_1st(
 }
 
 template <unsigned MAX_ITOPK, class IdxT = void>
-RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_2nd(
+RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_and_merge(
   float* itopk_distances,  // [num_itopk]
   IdxT* itopk_indices,     // [num_itopk]
   const std::uint32_t num_itopk,
@@ -424,7 +424,7 @@ RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_2nd(
 template <unsigned MAX_ITOPK,
           unsigned MAX_CANDIDATES,
           class IdxT>
-RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort(
+RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_and_merge(
   float* itopk_distances,  // [num_itopk]
   IdxT* itopk_indices,     // [num_itopk]
   const std::uint32_t num_itopk,
@@ -437,20 +437,62 @@ RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort(
   const unsigned MULTI_WARPS_2)
 {
   // The results in candidate_distances/indices are sorted by bitonic sort.
-  topk_by_bitonic_sort_1st<MAX_CANDIDATES, IdxT>(
+  topk_by_bitonic_sort_and_full<MAX_CANDIDATES, IdxT>(
     candidate_distances, candidate_indices, num_candidates, num_itopk, MULTI_WARPS_1);
 
   // The results sorted above are merged with the internal intermediate top-k
   // results so far using bitonic merge.
-  topk_by_bitonic_sort_2nd<MAX_ITOPK, IdxT>(itopk_distances,
-                                            itopk_indices,
-                                            num_itopk,
-                                            candidate_distances,
-                                            candidate_indices,
-                                            num_candidates,
-                                            work_buf,
-                                            first,
-                                            MULTI_WARPS_2);
+  topk_by_bitonic_sort_and_merge<MAX_ITOPK, IdxT>(itopk_distances,
+                                                  itopk_indices,
+                                                  num_itopk,
+                                                  candidate_distances,
+                                                  candidate_indices,
+                                                  num_candidates,
+                                                  work_buf,
+                                                  first,
+                                                  MULTI_WARPS_2);
+}
+
+// This function move the invalid index element to the end of the itopk list.
+// Require : array_length % 32 == 0 && The invalid entry is only one.
+template <class IdxT>
+RAFT_DEVICE_INLINE_FUNCTION void move_invalid_to_end_of_list(IdxT* const index_array,
+                                                             float* const distance_array,
+                                                             const std::uint32_t array_length)
+{
+  constexpr std::uint32_t warp_size     = 32;
+  constexpr std::uint32_t invalid_index = utils::get_max_value<IdxT>();
+  const std::uint32_t lane_id           = threadIdx.x % warp_size;
+
+  if (threadIdx.x >= warp_size) { return; }
+
+  bool found_invalid = false;
+  if (array_length % warp_size == 0) {
+    for (std::uint32_t i = lane_id; i < array_length; i += warp_size) {
+      const auto index    = index_array[i];
+      const auto distance = distance_array[i];
+
+      if (found_invalid) {
+        index_array[i - 1]    = index;
+        distance_array[i - 1] = distance;
+      } else {
+        // Check if the index is invalid
+        const auto I_found_invalid = (index == invalid_index);
+        const auto who_has_invalid = raft::ballot(I_found_invalid);
+        // if a value that is loaded by a smaller lane id thread, shift the array
+        if (who_has_invalid << (warp_size - lane_id)) {
+          index_array[i - 1]    = index;
+          distance_array[i - 1] = distance;
+        }
+
+        found_invalid = who_has_invalid;
+      }
+    }
+  }
+  if (lane_id == 0) {
+    index_array[array_length - 1]    = invalid_index;
+    distance_array[array_length - 1] = utils::get_max_value<float>();
+  }
 }
 
 template <class INDEX_T>
@@ -589,10 +631,10 @@ __device__ void search_core(
     // sort
     if constexpr (TOPK_BY_BITONIC_SORT) {
       // [Notice]
-      // It is good to use multiple warps in topk_by_bitonic_sort() when
+      // It is good to use multiple warps in topk_by_bitonic_sort_and_merge() when
       // batch size is small (short-latency), but it might not be always good
       // when batch size is large (high-throughput).
-      // topk_by_bitonic_sort() consists of two operations:
+      // topk_by_bitonic_sort_and_merge() consists of two operations:
       // if MAX_CANDIDATES is greater than 128, the first operation uses two warps;
       // if MAX_ITOPK is greater than 256, the second operation used two warps.
       const unsigned multi_warps_1 = ((blockDim.x >= 64) && (MAX_CANDIDATES > 128)) ? 1 : 0;
@@ -601,9 +643,9 @@ __device__ void search_core(
       // reset small-hash table.
       if ((iter + 1) % small_hash_reset_interval == 0) {
         // Depending on the block size and the number of warps used in
-        // topk_by_bitonic_sort(), determine which warps are used to reset
+        // topk_by_bitonic_sort_and_merge(), determine which warps are used to reset
         // the small hash and whether they are performed in overlap with
-        // topk_by_bitonic_sort().
+        // topk_by_bitonic_sort_and_merge().
         _CLK_START();
         unsigned hash_start_tid;
         if (blockDim.x == 32) {
@@ -627,28 +669,28 @@ __device__ void search_core(
 
       // topk with bitonic sort
       _CLK_START();
-      if (std::is_same<SAMPLE_FILTER_T, cuvs::neighbors::filtering::none_sample_filter>::value ||
-          *filter_flag == 0) {
-        topk_by_bitonic_sort<MAX_ITOPK, MAX_CANDIDATES>(result_distances_buffer,
-                                                        result_indices_buffer,
-                                                        internal_topk,
-                                                        result_distances_buffer + internal_topk,
-                                                        result_indices_buffer + internal_topk,
-                                                        search_width * graph_degree,
-                                                        topk_ws,
-                                                        (iter == 0),
-                                                        multi_warps_1,
-                                                        multi_warps_2);
-        __syncthreads();
-      } else {
-        topk_by_bitonic_sort_1st<MAX_ITOPK + MAX_CANDIDATES>(
-          result_distances_buffer,
-          result_indices_buffer,
-          internal_topk + search_width * graph_degree,
-          internal_topk,
-          false);
+      if (!(std::is_same<SAMPLE_FILTER_T, cuvs::neighbors::filtering::none_sample_filter>::value ||
+            *filter_flag == 0)) {
+        // Move the filtered out index to the end of the itopk list
+        for (unsigned i = 0; i < search_width; i++) {
+          move_invalid_to_end_of_list(
+            result_indices_buffer, result_distances_buffer, internal_topk);
+        }
+
         if (threadIdx.x == 0) { *terminate_flag = 0; }
       }
+      topk_by_bitonic_sort_and_merge<MAX_ITOPK, MAX_CANDIDATES>(
+        result_distances_buffer,
+        result_indices_buffer,
+        internal_topk,
+        result_distances_buffer + internal_topk,
+        result_indices_buffer + internal_topk,
+        search_width * graph_degree,
+        topk_ws,
+        (iter == 0),
+        multi_warps_1,
+        multi_warps_2);
+      __syncthreads();
       _CLK_REC(clk_topk);
     } else {
       _CLK_START();
@@ -755,12 +797,66 @@ __device__ void search_core(
     }
 
     __syncthreads();
-    topk_by_bitonic_sort_1st<MAX_ITOPK + MAX_CANDIDATES>(
-      result_distances_buffer,
-      result_indices_buffer,
-      internal_topk + search_width * graph_degree,
-      top_k,
-      false);
+    // Move invalid index items to the end of the buffer without sorting the entire buffer
+    using scan_op_t    = cub::WarpScan<unsigned>;
+    auto& temp_storage = *reinterpret_cast<typename scan_op_t::TempStorage*>(smem_work_ptr);
+
+    constexpr std::uint32_t warp_size = 32;
+    if (threadIdx.x < warp_size) {
+      std::uint32_t num_found_valid = 0;
+      for (std::uint32_t buffer_offset = 0; buffer_offset < internal_topk;
+           buffer_offset += warp_size) {
+        // Calculate the new buffer index
+        const auto src_position = buffer_offset + threadIdx.x;
+        const std::uint32_t is_valid_index =
+          (result_indices_buffer[src_position] & (~index_msb_1_mask)) == invalid_index ? 0 : 1;
+        std::uint32_t new_position;
+        scan_op_t(temp_storage).InclusiveSum(is_valid_index, new_position);
+        if (is_valid_index) {
+          const auto dst_position               = num_found_valid + (new_position - 1);
+          result_indices_buffer[dst_position]   = result_indices_buffer[src_position];
+          result_distances_buffer[dst_position] = result_distances_buffer[src_position];
+        }
+
+        // Calculate the largest valid position within a warp and bcast it for the next iteration
+        num_found_valid += new_position;
+        for (std::uint32_t offset = (warp_size >> 1); offset > 0; offset >>= 1) {
+          const auto v = raft::shfl_xor(num_found_valid, offset);
+          if ((threadIdx.x & offset) == 0) { num_found_valid = v; }
+        }
+
+        // If the enough number of items are found, do early termination
+        if (num_found_valid >= top_k) { break; }
+      }
+
+      if (num_found_valid < top_k) {
+        // Fill the remaining buffer with invalid values so that `topk_by_bitonic_sort_and_merge` is
+        // usable in the next step
+        for (std::uint32_t i = num_found_valid + threadIdx.x; i < internal_topk; i += warp_size) {
+          result_indices_buffer[i]   = invalid_index;
+          result_distances_buffer[i] = utils::get_max_value<DISTANCE_T>();
+        }
+      }
+    }
+
+    // If the sufficient number of valid indexes are not in the internal topk, pick up from the
+    // candidate list.
+    if (top_k > internal_topk || result_indices_buffer[top_k - 1] == invalid_index) {
+      __syncthreads();
+      const unsigned multi_warps_1 = ((blockDim.x >= 64) && (MAX_CANDIDATES > 128)) ? 1 : 0;
+      const unsigned multi_warps_2 = ((blockDim.x >= 64) && (MAX_ITOPK > 256)) ? 1 : 0;
+      topk_by_bitonic_sort_and_merge<MAX_ITOPK, MAX_CANDIDATES>(
+        result_distances_buffer,
+        result_indices_buffer,
+        internal_topk,
+        result_distances_buffer + internal_topk,
+        result_indices_buffer + internal_topk,
+        search_width * graph_degree,
+        topk_ws,
+        (iter == 0),
+        multi_warps_1,
+        multi_warps_2);
+    }
     __syncthreads();
   }
 
diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh
index 660246c67..8d5701439 100644
--- a/cpp/test/neighbors/ann_cagra.cuh
+++ b/cpp/test/neighbors/ann_cagra.cuh
@@ -758,11 +758,7 @@ class AnnCagraFilterTest : public ::testing::TestWithParam<AnnCagraInputs> {
         search_params.algo        = ps.algo;
         search_params.max_queries = ps.max_queries;
         search_params.team_size   = ps.team_size;
-
-        // TODO: setting search_params.itopk_size here breaks the filter tests, but is required for
-        // k>1024 skip these tests until fixed
-        if (ps.k >= 1024) { GTEST_SKIP(); }
-        // search_params.itopk_size   = ps.itopk_size;
+        search_params.itopk_size  = ps.itopk_size;
 
         auto database_view = raft::make_device_matrix_view<const DataT, int64_t>(
           (const DataT*)database.data(), ps.n_rows, ps.dim);

From a96b72086320ff1dab7b843c67a3c96352a7563d Mon Sep 17 00:00:00 2001
From: Ajit Mistry <55892788+ajit283@users.noreply.github.com>
Date: Wed, 4 Dec 2024 17:18:45 +0100
Subject: [PATCH 40/47] [WIP] Add pinned memory resource to C API (#311)

Let me know if this is out of scope for cuVS!

Authors:
  - Ajit Mistry (https://github.com/ajit283)
  - Ben Frederickson (https://github.com/benfred)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Ben Frederickson (https://github.com/benfred)

URL: https://github.com/rapidsai/cuvs/pull/311
---
 cpp/include/cuvs/core/c_api.h | 16 ++++++++++++++++
 cpp/src/core/c_api.cpp        | 16 ++++++++++++++++
 cpp/test/core/c_api.c         |  9 +++++++++
 3 files changed, 41 insertions(+)

diff --git a/cpp/include/cuvs/core/c_api.h b/cpp/include/cuvs/core/c_api.h
index c8c8d3934..400d162ad 100644
--- a/cpp/include/cuvs/core/c_api.h
+++ b/cpp/include/cuvs/core/c_api.h
@@ -151,6 +151,22 @@ cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_percent,
  */
 cuvsError_t cuvsRMMMemoryResourceReset();
 
+/**
+ * @brief Allocates pinned memory on the host using RMM
+ * @param[out] ptr Pointer to allocated host memory
+ * @param[in] bytes Size in bytes to allocate
+ * @return cuvsError_t
+ */
+cuvsError_t cuvsRMMHostAlloc(void** ptr, size_t bytes);
+
+/**
+ * @brief Deallocates pinned memory on the host using RMM
+ * @param[in] ptr Pointer to allocated host memory to free
+ * @param[in] bytes Size in bytes to deallocate
+ * @return cuvsError_t
+ */
+cuvsError_t cuvsRMMHostFree(void* ptr, size_t bytes);
+
 /** @} */
 
 #ifdef __cplusplus
diff --git a/cpp/src/core/c_api.cpp b/cpp/src/core/c_api.cpp
index cfbeed2d5..4333bff0c 100644
--- a/cpp/src/core/c_api.cpp
+++ b/cpp/src/core/c_api.cpp
@@ -26,6 +26,7 @@
 #include <rmm/mr/device/owning_wrapper.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
+#include <rmm/mr/host/pinned_memory_resource.hpp>
 #include <thread>
 
 extern "C" cuvsError_t cuvsResourcesCreate(cuvsResources_t* res)
@@ -130,6 +131,21 @@ extern "C" cuvsError_t cuvsRMMMemoryResourceReset()
   });
 }
 
+thread_local std::unique_ptr<rmm::mr::pinned_memory_resource> pinned_mr;
+
+extern "C" cuvsError_t cuvsRMMHostAlloc(void** ptr, size_t bytes)
+{
+  return cuvs::core::translate_exceptions([=] {
+    if (pinned_mr == nullptr) { pinned_mr = std::make_unique<rmm::mr::pinned_memory_resource>(); }
+    *ptr = pinned_mr->allocate(bytes);
+  });
+}
+
+extern "C" cuvsError_t cuvsRMMHostFree(void* ptr, size_t bytes)
+{
+  return cuvs::core::translate_exceptions([=] { pinned_mr->deallocate(ptr, bytes); });
+}
+
 thread_local std::string last_error_text = "";
 
 extern "C" const char* cuvsGetLastErrorText()
diff --git a/cpp/test/core/c_api.c b/cpp/test/core/c_api.c
index a3dae6004..a51824d2b 100644
--- a/cpp/test/core/c_api.c
+++ b/cpp/test/core/c_api.c
@@ -73,6 +73,15 @@ int main()
   error = cuvsRMMMemoryResourceReset();
   if (error == CUVS_ERROR) { exit(EXIT_FAILURE); }
 
+  // Alloc memory on host (pinned)
+  void* ptr3;
+  cuvsError_t alloc_error_pinned = cuvsRMMHostAlloc(&ptr3, 1024);
+  if (alloc_error_pinned == CUVS_ERROR) { exit(EXIT_FAILURE); }
+
+  // Free memory
+  cuvsError_t free_error_pinned = cuvsRMMHostFree(ptr3, 1024);
+  if (free_error_pinned == CUVS_ERROR) { exit(EXIT_FAILURE); }
+
   // Destroy resources
   error = cuvsResourcesDestroy(res);
   if (error == CUVS_ERROR) { exit(EXIT_FAILURE); }

From 9fb21adc82e625deb7cc0f20b68c0f42902246f1 Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Wed, 4 Dec 2024 17:38:05 +0100
Subject: [PATCH 41/47] Dynamic Batching (#261)

Non-blocking / stream-ordered dynamic batching as a new index type.

## API

This PR implements dynamic batching as a new index type, mirroring the API of other indices.

  * [_building is wrapping_] Building the index means creating a lightweight wrapper on top of an existing index and initializing necessary components, such as IO batch buffers and synchronization primitives.
  * [_type erasure_] The underlying/upstream index type is erased once the dynamic_batching wrapper is created, i.e. there's no way to recover the original search index type or parameters.
  * [_explicit control over batching_] To allow multiple user requests group into a dynamic batch request, the users must use copies of the same dynamic batching index (the user-facing index type is a thin wrapper on top of a shared pointer, hence the copy is shallow and cheap). The search function is thread-safe.

## Feature:  stream-ordered dynamic batching

Non-blocking / stream-ordered dynamic batching means the batching does not involve synchronizing with a GPU stream. The control is returned to the user as soon as the necessary work is submitted to the GPU. This entails a few good-to-know features:

1. The dynamic batching index has the same blocking properties as the upstream index: if the upstream index does not involve stream sync during search, that the dynamic batching index does not involve it as well (otherwise, the dynamic batching search obviously waits till the upstream search synchronizes under the hood).
2. It's responsibility of the user to synchronize the stream before getting the results back - even if the upstream index search does not need it (the batch results are scattered back to the request threads in a post-processing kernel).
3. If the upstream index does not synchronize during search, the dynamic batching index can group the queries even in a single-threaded application (_try it with --no-lap-sync option in the ann-bench benchmarks_).

Overall, stream-ordered dynamic batching makes it easy to modify existing cuVS indexes, because the wrapped index has the same execution behavior as the upstream index.

## Work-in-progress TODO

- [x] Add dynamic batching option to more indices in ann-bench
- [x] Add tests
- [x] **(postponed to 25.02)** Do proper benchmarking and possibly fine-tune the inter-thread communication
- [x] Review the API side (`cpp/include/cuvs/neighbors/dynamic_batching.hpp`) [ready for review CC @cjnolet]
- [x] Review the algorithm side (`cpp/src/neighbors/detail/dynamic_batching.cuh`) [ready for preliminary review: requests for algorithm docsting/clarifications are especially welcome]

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/261
---
 cpp/CMakeLists.txt                            |    1 +
 .../src/cuvs/cuvs_ann_bench_param_parser.h    |   26 +
 cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h   |   97 +-
 cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h  |   40 +-
 cpp/include/cuvs/neighbors/cagra.hpp          |    4 +
 .../cuvs/neighbors/dynamic_batching.hpp       |  290 ++++
 cpp/include/cuvs/neighbors/ivf_flat.hpp       |    4 +
 cpp/include/cuvs/neighbors/ivf_pq.hpp         |    3 +
 cpp/src/neighbors/detail/dynamic_batching.cuh | 1197 +++++++++++++++++
 cpp/src/neighbors/dynamic_batching.cu         |   91 ++
 cpp/test/CMakeLists.txt                       |   13 +
 cpp/test/neighbors/dynamic_batching.cuh       |  292 ++++
 .../neighbors/dynamic_batching/test_cagra.cu  |   84 ++
 .../dynamic_batching/test_ivf_flat.cu         |   44 +
 .../neighbors/dynamic_batching/test_ivf_pq.cu |   41 +
 docs/source/cpp_api/neighbors.rst             |    1 +
 .../cpp_api/neighbors_dynamic_batching.rst    |   45 +
 examples/cpp/CMakeLists.txt                   |    4 +
 examples/cpp/src/dynamic_batching_example.cu  |  282 ++++
 19 files changed, 2539 insertions(+), 20 deletions(-)
 create mode 100644 cpp/include/cuvs/neighbors/dynamic_batching.hpp
 create mode 100644 cpp/src/neighbors/detail/dynamic_batching.cuh
 create mode 100644 cpp/src/neighbors/dynamic_batching.cu
 create mode 100644 cpp/test/neighbors/dynamic_batching.cuh
 create mode 100644 cpp/test/neighbors/dynamic_batching/test_cagra.cu
 create mode 100644 cpp/test/neighbors/dynamic_batching/test_ivf_flat.cu
 create mode 100644 cpp/test/neighbors/dynamic_batching/test_ivf_pq.cu
 create mode 100644 docs/source/cpp_api/neighbors_dynamic_batching.rst
 create mode 100644 examples/cpp/src/dynamic_batching_example.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 34b7cb898..6af423bd5 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -397,6 +397,7 @@ if(BUILD_SHARED_LIBS)
     src/neighbors/iface/iface_pq_uint8_t_int64_t.cu
     src/neighbors/detail/cagra/cagra_build.cpp
     src/neighbors/detail/cagra/topk_for_cagra/topk.cu
+    src/neighbors/dynamic_batching.cu
     $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:src/neighbors/hnsw.cpp>
     src/neighbors/ivf_flat_index.cpp
     src/neighbors/ivf_flat/ivf_flat_build_extend_float_int64_t.cu
diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
index 57d5b1910..7617bfa66 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
@@ -56,6 +56,26 @@ extern template class cuvs::bench::cuvs_cagra<int8_t, uint32_t>;
 #include "cuvs_mg_cagra_wrapper.h"
 #endif
 
+template <typename ParamT>
+void parse_dynamic_batching_params(const nlohmann::json& conf, ParamT& param)
+{
+  if (!conf.value("dynamic_batching", false)) { return; }
+  param.dynamic_batching = true;
+  if (conf.contains("dynamic_batching_max_batch_size")) {
+    param.dynamic_batching_max_batch_size = conf.at("dynamic_batching_max_batch_size");
+  }
+  param.dynamic_batching_conservative_dispatch =
+    conf.value("dynamic_batching_conservative_dispatch", false);
+  if (conf.contains("dynamic_batching_dispatch_timeout_ms")) {
+    param.dynamic_batching_dispatch_timeout_ms = conf.at("dynamic_batching_dispatch_timeout_ms");
+  }
+  if (conf.contains("dynamic_batching_n_queues")) {
+    param.dynamic_batching_n_queues = conf.at("dynamic_batching_n_queues");
+  }
+  param.dynamic_batching_k =
+    uint32_t(uint32_t(conf.at("k")) * float(conf.value("refine_ratio", 1.0f)));
+}
+
 #if defined(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT) || defined(CUVS_ANN_BENCH_USE_CUVS_MG)
 template <typename T, typename IdxT>
 void parse_build_param(const nlohmann::json& conf,
@@ -138,6 +158,9 @@ void parse_search_param(const nlohmann::json& conf,
     param.refine_ratio = conf.at("refine_ratio");
     if (param.refine_ratio < 1.0f) { throw std::runtime_error("refine_ratio should be >= 1.0"); }
   }
+
+  // enable dynamic batching
+  parse_dynamic_batching_params(conf, param);
 }
 #endif
 
@@ -291,5 +314,8 @@ void parse_search_param(const nlohmann::json& conf,
   }
   // Same ratio as in IVF-PQ
   param.refine_ratio = conf.value("refine_ratio", 1.0f);
+
+  // enable dynamic batching
+  parse_dynamic_batching_params(conf, param);
 }
 #endif
diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h
index f6d3d60fc..8c9cb2d4f 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h
@@ -24,6 +24,7 @@
 #include <cuvs/distance/distance.hpp>
 #include <cuvs/neighbors/cagra.hpp>
 #include <cuvs/neighbors/common.hpp>
+#include <cuvs/neighbors/dynamic_batching.hpp>
 #include <cuvs/neighbors/ivf_pq.hpp>
 #include <cuvs/neighbors/nn_descent.hpp>
 #include <raft/core/device_mdspan.hpp>
@@ -63,6 +64,13 @@ class cuvs_cagra : public algo<T>, public algo_gpu {
     AllocatorType graph_mem   = AllocatorType::kDevice;
     AllocatorType dataset_mem = AllocatorType::kDevice;
     [[nodiscard]] auto needs_dataset() const -> bool override { return true; }
+    /* Dynamic batching */
+    bool dynamic_batching = false;
+    int64_t dynamic_batching_k;
+    int64_t dynamic_batching_max_batch_size     = 4;
+    double dynamic_batching_dispatch_timeout_ms = 0.01;
+    size_t dynamic_batching_n_queues            = 8;
+    bool dynamic_batching_conservative_dispatch = false;
   };
 
   struct build_param {
@@ -173,6 +181,12 @@ class cuvs_cagra : public algo<T>, public algo_gpu {
   std::shared_ptr<raft::device_matrix<T, int64_t, raft::row_major>> dataset_;
   std::shared_ptr<raft::device_matrix_view<const T, int64_t, raft::row_major>> input_dataset_v_;
 
+  std::shared_ptr<cuvs::neighbors::dynamic_batching::index<T, IdxT>> dynamic_batcher_;
+  cuvs::neighbors::dynamic_batching::search_params dynamic_batcher_sp_{};
+  int64_t dynamic_batching_max_batch_size_;
+  size_t dynamic_batching_n_queues_;
+  bool dynamic_batching_conservative_dispatch_;
+
   inline rmm::device_async_resource_ref get_mr(AllocatorType mem_type)
   {
     switch (mem_type) {
@@ -216,26 +230,33 @@ inline auto allocator_to_string(AllocatorType mem_type) -> std::string
 template <typename T, typename IdxT>
 void cuvs_cagra<T, IdxT>::set_search_param(const search_param_base& param)
 {
-  auto sp        = dynamic_cast<const search_param&>(param);
-  search_params_ = sp.p;
-  refine_ratio_  = sp.refine_ratio;
+  auto sp = dynamic_cast<const search_param&>(param);
+  bool needs_dynamic_batcher_update =
+    (dynamic_batching_max_batch_size_ != sp.dynamic_batching_max_batch_size) ||
+    (dynamic_batching_n_queues_ != sp.dynamic_batching_n_queues) ||
+    (dynamic_batching_conservative_dispatch_ != sp.dynamic_batching_conservative_dispatch);
+  dynamic_batching_max_batch_size_        = sp.dynamic_batching_max_batch_size;
+  dynamic_batching_n_queues_              = sp.dynamic_batching_n_queues;
+  dynamic_batching_conservative_dispatch_ = sp.dynamic_batching_conservative_dispatch;
+  search_params_                          = sp.p;
+  refine_ratio_                           = sp.refine_ratio;
   if (sp.graph_mem != graph_mem_) {
     // Move graph to correct memory space
     graph_mem_ = sp.graph_mem;
     RAFT_LOG_DEBUG("moving graph to new memory space: %s", allocator_to_string(graph_mem_).c_str());
     // We create a new graph and copy to it from existing graph
-    auto mr        = get_mr(graph_mem_);
-    auto new_graph = raft::make_device_mdarray<IdxT, int64_t>(
+    auto mr = get_mr(graph_mem_);
+    *graph_ = raft::make_device_mdarray<IdxT, int64_t>(
       handle_, mr, raft::make_extents<int64_t>(index_->graph().extent(0), index_->graph_degree()));
 
-    raft::copy(new_graph.data_handle(),
+    raft::copy(graph_->data_handle(),
                index_->graph().data_handle(),
                index_->graph().size(),
                raft::resource::get_cuda_stream(handle_));
 
-    index_->update_graph(handle_, make_const_mdspan(new_graph.view()));
-    // update_graph() only stores a view in the index. We need to keep the graph object alive.
-    *graph_ = std::move(new_graph);
+    // NB: update_graph() only stores a view in the index. We need to keep the graph object alive.
+    index_->update_graph(handle_, make_const_mdspan(graph_->view()));
+    needs_dynamic_batcher_update = true;
   }
 
   if (sp.dataset_mem != dataset_mem_ || need_dataset_update_) {
@@ -256,7 +277,26 @@ void cuvs_cagra<T, IdxT>::set_search_param(const search_param_base& param)
       dataset_->data_handle(), dataset_->extent(0), this->dim_, dataset_->extent(1));
     index_->update_dataset(handle_, dataset_view);
 
-    need_dataset_update_ = false;
+    need_dataset_update_         = false;
+    needs_dynamic_batcher_update = true;
+  }
+
+  // dynamic batching
+  if (sp.dynamic_batching) {
+    if (!dynamic_batcher_ || needs_dynamic_batcher_update) {
+      dynamic_batcher_ = std::make_shared<cuvs::neighbors::dynamic_batching::index<T, IdxT>>(
+        handle_,
+        cuvs::neighbors::dynamic_batching::index_params{{},
+                                                        sp.dynamic_batching_k,
+                                                        sp.dynamic_batching_max_batch_size,
+                                                        sp.dynamic_batching_n_queues,
+                                                        sp.dynamic_batching_conservative_dispatch},
+        *index_,
+        search_params_);
+    }
+    dynamic_batcher_sp_.dispatch_timeout_ms = sp.dynamic_batching_dispatch_timeout_ms;
+  } else {
+    if (dynamic_batcher_) { dynamic_batcher_.reset(); }
   }
 }
 
@@ -306,7 +346,7 @@ void cuvs_cagra<T, IdxT>::load(const std::string& file)
 template <typename T, typename IdxT>
 std::unique_ptr<algo<T>> cuvs_cagra<T, IdxT>::copy()
 {
-  return std::make_unique<cuvs_cagra<T, IdxT>>(*this);  // use copy constructor
+  return std::make_unique<cuvs_cagra<T, IdxT>>(std::cref(*this));  // use copy constructor
 }
 
 template <typename T, typename IdxT>
@@ -330,8 +370,17 @@ void cuvs_cagra<T, IdxT>::search_base(const T* queries,
     raft::make_device_matrix_view<IdxT, int64_t>(neighbors_idx_t, batch_size, k);
   auto distances_view = raft::make_device_matrix_view<float, int64_t>(distances, batch_size, k);
 
-  cuvs::neighbors::cagra::search(
-    handle_, search_params_, *index_, queries_view, neighbors_view, distances_view);
+  if (dynamic_batcher_) {
+    cuvs::neighbors::dynamic_batching::search(handle_,
+                                              dynamic_batcher_sp_,
+                                              *dynamic_batcher_,
+                                              queries_view,
+                                              neighbors_view,
+                                              distances_view);
+  } else {
+    cuvs::neighbors::cagra::search(
+      handle_, search_params_, *index_, queries_view, neighbors_view, distances_view);
+  }
 
   if constexpr (sizeof(IdxT) != sizeof(algo_base::index_type)) {
     if (raft::get_device_for_address(neighbors) < 0 &&
@@ -367,11 +416,23 @@ void cuvs_cagra<T, IdxT>::search(
   const raft::resources& res    = handle_;
   auto mem_type =
     raft::get_device_for_address(neighbors) >= 0 ? MemoryType::kDevice : MemoryType::kHostPinned;
-  auto& tmp_buf = get_tmp_buffer_from_global_pool(
-    ((disable_refinement ? 0 : (sizeof(float) + sizeof(algo_base::index_type))) +
-     (kNeedsIoMapping ? sizeof(IdxT) : 0)) *
-    batch_size * k0);
-  auto* candidates_ptr = reinterpret_cast<algo_base::index_type*>(tmp_buf.data(mem_type));
+
+  // If dynamic batching is used and there's no sync between benchmark laps, multiple sequential
+  // requests can group together. The data is copied asynchronously, and if the same intermediate
+  // buffer is used for multiple requests, they can override each other's data. Hence, we need to
+  // allocate as much space as required by the maximum number of sequential requests.
+  auto max_dyn_grouping = dynamic_batcher_ ? raft::div_rounding_up_safe<int64_t>(
+                                               dynamic_batching_max_batch_size_, batch_size) *
+                                               dynamic_batching_n_queues_
+                                           : 1;
+  auto tmp_buf_size = ((disable_refinement ? 0 : (sizeof(float) + sizeof(algo_base::index_type))) +
+                       (kNeedsIoMapping ? sizeof(IdxT) : 0)) *
+                      batch_size * k0;
+  auto& tmp_buf = get_tmp_buffer_from_global_pool(tmp_buf_size * max_dyn_grouping);
+  thread_local static int64_t group_id = 0;
+  auto* candidates_ptr                 = reinterpret_cast<algo_base::index_type*>(
+    reinterpret_cast<uint8_t*>(tmp_buf.data(mem_type)) + tmp_buf_size * group_id);
+  group_id = (group_id + 1) % max_dyn_grouping;
   auto* candidate_dists_ptr =
     reinterpret_cast<float*>(candidates_ptr + (disable_refinement ? 0 : batch_size * k0));
   auto* neighbors_idx_t =
diff --git a/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h
index 4c8a91f23..dac766669 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h
@@ -19,7 +19,9 @@
 #include "cuvs_ann_bench_utils.h"
 
 #include <cuvs/distance/distance.hpp>
+#include <cuvs/neighbors/dynamic_batching.hpp>
 #include <cuvs/neighbors/ivf_pq.hpp>
+
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/device_resources.hpp>
@@ -46,6 +48,13 @@ class cuvs_ivf_pq : public algo<T>, public algo_gpu {
     cuvs::neighbors::ivf_pq::search_params pq_param;
     float refine_ratio = 1.0f;
     [[nodiscard]] auto needs_dataset() const -> bool override { return refine_ratio > 1.0f; }
+    /* Dynamic batching */
+    bool dynamic_batching = false;
+    int64_t dynamic_batching_k;
+    int64_t dynamic_batching_max_batch_size     = 128;
+    double dynamic_batching_dispatch_timeout_ms = 0.01;
+    size_t dynamic_batching_n_queues            = 3;
+    bool dynamic_batching_conservative_dispatch = true;
   };
 
   using build_param = cuvs::neighbors::ivf_pq::index_params;
@@ -98,6 +107,9 @@ class cuvs_ivf_pq : public algo<T>, public algo_gpu {
   int dimension_;
   float refine_ratio_ = 1.0;
   raft::device_matrix_view<const T, IdxT> dataset_;
+
+  std::shared_ptr<cuvs::neighbors::dynamic_batching::index<T, IdxT>> dynamic_batcher_;
+  cuvs::neighbors::dynamic_batching::search_params dynamic_batcher_sp_{};
 };
 
 template <typename T, typename IdxT>
@@ -138,6 +150,21 @@ void cuvs_ivf_pq<T, IdxT>::set_search_param(const search_param_base& param)
   search_params_ = sp.pq_param;
   refine_ratio_  = sp.refine_ratio;
   assert(search_params_.n_probes <= index_params_.n_lists);
+
+  if (sp.dynamic_batching) {
+    dynamic_batcher_ = std::make_shared<cuvs::neighbors::dynamic_batching::index<T, IdxT>>(
+      handle_,
+      cuvs::neighbors::dynamic_batching::index_params{{},
+                                                      sp.dynamic_batching_k,
+                                                      sp.dynamic_batching_max_batch_size,
+                                                      sp.dynamic_batching_n_queues,
+                                                      sp.dynamic_batching_conservative_dispatch},
+      *index_,
+      search_params_);
+    dynamic_batcher_sp_.dispatch_timeout_ms = sp.dynamic_batching_dispatch_timeout_ms;
+  } else {
+    dynamic_batcher_.reset();
+  }
 }
 
 template <typename T, typename IdxT>
@@ -168,8 +195,17 @@ void cuvs_ivf_pq<T, IdxT>::search_base(
     raft::make_device_matrix_view<IdxT, uint32_t>(neighbors_idx_t, batch_size, k);
   auto distances_view = raft::make_device_matrix_view<float, uint32_t>(distances, batch_size, k);
 
-  cuvs::neighbors::ivf_pq::search(
-    handle_, search_params_, *index_, queries_view, neighbors_view, distances_view);
+  if (dynamic_batcher_) {
+    cuvs::neighbors::dynamic_batching::search(handle_,
+                                              dynamic_batcher_sp_,
+                                              *dynamic_batcher_,
+                                              queries_view,
+                                              neighbors_view,
+                                              distances_view);
+  } else {
+    cuvs::neighbors::ivf_pq::search(
+      handle_, search_params_, *index_, queries_view, neighbors_view, distances_view);
+  }
 
   if constexpr (sizeof(IdxT) != sizeof(algo_base::index_type)) {
     raft::linalg::unaryOp(neighbors,
diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp
index 5ceb3010e..a4684ce26 100644
--- a/cpp/include/cuvs/neighbors/cagra.hpp
+++ b/cpp/include/cuvs/neighbors/cagra.hpp
@@ -272,6 +272,10 @@ static_assert(std::is_aggregate_v<search_params>);
  */
 template <typename T, typename IdxT>
 struct index : cuvs::neighbors::index {
+  using index_params_type  = cagra::index_params;
+  using search_params_type = cagra::search_params;
+  using index_type         = IdxT;
+  using value_type         = T;
   static_assert(!raft::is_narrowing_v<uint32_t, IdxT>,
                 "IdxT must be able to represent all values of uint32_t");
 
diff --git a/cpp/include/cuvs/neighbors/dynamic_batching.hpp b/cpp/include/cuvs/neighbors/dynamic_batching.hpp
new file mode 100644
index 000000000..410800357
--- /dev/null
+++ b/cpp/include/cuvs/neighbors/dynamic_batching.hpp
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuvs/neighbors/common.hpp>
+
+namespace cuvs::neighbors::dynamic_batching {
+
+namespace detail {
+template <typename T, typename IdxT>
+class batch_runner;
+}
+
+/**
+ * @defgroup dynamic_batching_cpp_index_params Dynamic Batching index parameters
+ * @{
+ */
+struct index_params : cuvs::neighbors::index_params {
+  /** The number of neighbors to search is fixed at construction time. */
+  int64_t k;
+  /** Maximum size of the batch to submit to the upstream index. */
+  int64_t max_batch_size = 100;
+  /**
+   * The number of independent request queues.
+   *
+   * Each queue is associated with a unique CUDA stream and IO device buffers. If the number of
+   * concurrent requests is high, using multiple queues allows to fill-in data and prepare the batch
+   * while the other queue is busy. Moreover, the queues are submitted concurrently; this allows to
+   * better utilize the GPU by hiding the kernel launch latencies, which helps to improve the
+   * throughput.
+   */
+  size_t n_queues = 3;
+  /**
+   * By default (`conservative_dispatch = false`) the first CPU thread to commit a query to a batch
+   * dispatches the upstream search function as soon as possible (before the batch is full). In that
+   * case, it does not know the final batch size at the time of calling the upstream search and thus
+   * runs the upstream search with the maximum batch size every time, even if only one valid query
+   * is present in the batch. This reduces the latency at the cost of wasted GPU resources.
+   *
+   * The alternative behavaior (`conservative_dispatch = true`) is more conservative: the dispatcher
+   * thread starts the kernel that gathers input queries, but waits till the batch is full or the
+   * waiting time is exceeded. Only then it acquires the actual batch size and launches the upstream
+   * search. As a result, less GPU resources are wasted at the cost of exposing upstream search
+   * latency.
+   *
+   * *Rule of Thumb*:
+   *    for a large `max_batch_size` set `conservative_dispatch = true`, otherwise keep it disabled.
+   */
+  bool conservative_dispatch = false;
+};
+/** @} */
+
+/**
+ * @defgroup dynamic_batching_cpp_search_params Dynamic Batching search parameters
+ * @{
+ */
+struct search_params : cuvs::neighbors::search_params {
+  /**
+   * How long a request can stay in the queue (milliseconds).
+   * Note, this only affects the dispatch time and does not reflect full request latency;
+   * the latter depends on the upstream search parameters and the batch size.
+   */
+  double dispatch_timeout_ms = 1.0;
+};
+/** @} */
+
+/**
+ * @defgroup dynamic_batching_cpp_index Dynamic Batching index type
+ * @{
+ */
+
+/**
+ * @brief Lightweight dynamic batching index wrapper
+ *
+ * @tparam T data type
+ * @tparam IdxT index type
+ *
+ * One lightweight dynamic batching index manages a single index and a single search parameter set.
+ * This structure should be shared among multiple users via copy semantics: access to the
+ * underlying implementation is managed via a shared pointer, and concurrent search among the
+ * participants is thread-safe.
+ *
+ * __Usage example__
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   // When creating a dynamic batching index, k parameter has to be passed explicitly.
+ *   // The first empty braces default-initialize the parent `neighbors::index_params` (unused).
+ *   dynamic_batching::index_params dynb_index_params{{}, k};
+ *   // Construct the index by wrapping the upstream index and search parameters.
+ *   dynamic_batching::index<float, uint32_t> index{
+ *       res, dynb_index_params, upstream_index, upstream_search_params
+ *   };
+ *   // Use default search parameters
+ *   dynamic_batching::search_params search_params;
+ *   // Search K nearest neighbours
+ *   auto neighbors = raft::make_device_matrix<uint32_t>(res, n_queries, k);
+ *   auto distances = raft::make_device_matrix<float>(res, n_queries, k);
+ *   dynamic_batching::search(
+ *       res, search_params, index, queries, neighbors.view(), distances.view()
+ *   );
+ * @endcode
+ *
+ *
+ * __Priority queues__
+ *
+ * The dynamic batching index has a limited support for prioritizing individual requests.
+ * There's only one pool of queues in the batcher and no functionality to prioritize one bach over
+ * the other. The `search_params::dispatch_timeout_ms` parameters passed in each request are
+ * aggregated internally and the batch is dispatched no later than any of the timeouts is exceeded.
+ * In this logic, a high-priority request can never be processed earlier than any lower-priority
+ * requests submitted earlier.
+ *
+ * However, dynamic batching indexes are lightweight and do not contain any global or static state.
+ * This means it's easy to combine multiple batchers.
+ * As an example, you can construct one batching index per priority class:
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   // Large batch size (128), couple queues (2),
+ *   //   enabled conservative dispatch - all for better throughput
+ *   dynamic_batching::index_params low_priority_params{{}, k, 128, 2, true};
+ *   // Small batch size (16), more queues (4),
+ *   //   disabled conservative dispatch - to minimize latency with reasonable throughput
+ *   dynamic_batching::index_params high_priority_params{{}, k, 16, 4, false};
+ *   // Construct the indexes by wrapping the upstream index and search parameters.
+ *   dynamic_batching::index<float, uint32_t> low_priority_index{
+ *       res, low_priority_params, upstream_index, upstream_search_params
+ *   };
+ *   dynamic_batching::index<float, uint32_t> high_priority_index{
+ *       res, high_priority_params, upstream_index, upstream_search_params
+ *   };
+ *   // Define a combined search function with priority selection
+ *   double high_priority_threshold_ms = 0.1;
+ *   auto search_function =
+ *      [low_priority_index, high_priority_index, high_priority_threshold_ms](
+ *        raft::resources const &res,
+ *        dynamic_batching::search_params search_params,
+ *        raft::device_matrix_view<const float, int64_t> queries,
+ *        raft::device_matrix_view<uint32_t, int64_t> neighbors,
+ *        raft::device_matrix_view<float, int64_t> distances) {
+ *      dynamic_batching::search(
+ *          res,
+ *          search_params,
+ *          search_params.dispatch_timeout_ms < high_priority_threshold_ms
+ *            ? high_priority_index : low_priority_index,
+ *          queries,
+ *          neighbors,
+ *          distances
+ *      );
+ *   };
+ * @endcode
+ */
+template <typename T, typename IdxT>
+struct index : cuvs::neighbors::index {
+  std::shared_ptr<detail::batch_runner<T, IdxT>> runner;
+
+  /**
+   * @brief Construct a dynamic batching index by wrapping the upstream index.
+   *
+   * @tparam Upstream the upstream index type
+   *
+   * @param[in] res raft resources
+   * @param[in] params dynamic batching parameters
+   * @param[in] upstream_index the original index to perform the search
+   *     (the reference must be alive for the lifetime of the dynamic batching index)
+   * @param[in] upstream_params the original index search parameters for all queries in a batch
+   *     (the parameters are captured by value for the lifetime of the dynamic batching index)
+   * @param[in] sample_filter
+   *     filtering function, if any, must be the same for all requests in a batch
+   *     (the pointer must be alive for the lifetime of the dynamic batching index)
+   */
+  template <typename Upstream>
+  index(const raft::resources& res,
+        const cuvs::neighbors::dynamic_batching::index_params& params,
+        const Upstream& upstream_index,
+        const typename Upstream::search_params_type& upstream_params,
+        const cuvs::neighbors::filtering::base_filter* sample_filter = nullptr);
+};
+/** @} */
+
+/**
+ *
+ * @defgroup dynamic_batching_cpp_search Dynamic Batching search
+ *
+ * @{
+ */
+
+/**
+ * @brief Search ANN using a dynamic batching index.
+ *
+ * The search parameters of the upstream index and the optional filtering function are configured at
+ * the dynamic batching index construction time.
+ *
+ * Like with many other indexes, the dynamic batching search has the stream-ordered semantics: the
+ * host function may return the control before the results are ready. Synchronize with the main CUDA
+ * stream in the given resource object to wait for arrival of the search results.
+ *
+ * Dynamic batching search is thread-safe: call the search function with copies of the same index in
+ * multiple threads to increase the occupancy of the batches.
+ *
+ * @param[in] res
+ * @param[in] params query-specific batching parameters, such as the maximum waiting time
+ * @param[in] index a dynamic batching index
+ * @param[in] queries a device matrix view to a row-major matrix
+ *               [n_queries, dim]
+ * @param[out] neighbors a device matrix view to the indices of the neighbors in the source dataset
+ *               [n_queries, k]
+ * @param[out] distances a device matrix view to the distances to the selected neighbors
+ *               [n_queries, k]
+ *
+ */
+void search(raft::resources const& res,
+            cuvs::neighbors::dynamic_batching::search_params const& params,
+            dynamic_batching::index<float, uint32_t> const& index,
+            raft::device_matrix_view<const float, int64_t, raft::row_major> queries,
+            raft::device_matrix_view<uint32_t, int64_t, raft::row_major> neighbors,
+            raft::device_matrix_view<float, int64_t, raft::row_major> distances);
+
+/** @copydoc search */
+void search(raft::resources const& res,
+            cuvs::neighbors::dynamic_batching::search_params const& params,
+            dynamic_batching::index<half, uint32_t> const& index,
+            raft::device_matrix_view<const half, int64_t, raft::row_major> queries,
+            raft::device_matrix_view<uint32_t, int64_t, raft::row_major> neighbors,
+            raft::device_matrix_view<float, int64_t, raft::row_major> distances);
+
+/** @copydoc search */
+void search(raft::resources const& res,
+            cuvs::neighbors::dynamic_batching::search_params const& params,
+            dynamic_batching::index<int8_t, uint32_t> const& index,
+            raft::device_matrix_view<const int8_t, int64_t, raft::row_major> queries,
+            raft::device_matrix_view<uint32_t, int64_t, raft::row_major> neighbors,
+            raft::device_matrix_view<float, int64_t, raft::row_major> distances);
+
+/** @copydoc search */
+void search(raft::resources const& res,
+            cuvs::neighbors::dynamic_batching::search_params const& params,
+            dynamic_batching::index<uint8_t, uint32_t> const& index,
+            raft::device_matrix_view<const uint8_t, int64_t, raft::row_major> queries,
+            raft::device_matrix_view<uint32_t, int64_t, raft::row_major> neighbors,
+            raft::device_matrix_view<float, int64_t, raft::row_major> distances);
+
+/** @copydoc search */
+void search(raft::resources const& res,
+            cuvs::neighbors::dynamic_batching::search_params const& params,
+            dynamic_batching::index<float, int64_t> const& index,
+            raft::device_matrix_view<const float, int64_t, raft::row_major> queries,
+            raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,
+            raft::device_matrix_view<float, int64_t, raft::row_major> distances);
+
+/** @copydoc search */
+void search(raft::resources const& res,
+            cuvs::neighbors::dynamic_batching::search_params const& params,
+            dynamic_batching::index<half, int64_t> const& index,
+            raft::device_matrix_view<const half, int64_t, raft::row_major> queries,
+            raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,
+            raft::device_matrix_view<float, int64_t, raft::row_major> distances);
+
+/** @copydoc search */
+void search(raft::resources const& res,
+            cuvs::neighbors::dynamic_batching::search_params const& params,
+            dynamic_batching::index<int8_t, int64_t> const& index,
+            raft::device_matrix_view<const int8_t, int64_t, raft::row_major> queries,
+            raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,
+            raft::device_matrix_view<float, int64_t, raft::row_major> distances);
+
+/** @copydoc search */
+void search(raft::resources const& res,
+            cuvs::neighbors::dynamic_batching::search_params const& params,
+            dynamic_batching::index<uint8_t, int64_t> const& index,
+            raft::device_matrix_view<const uint8_t, int64_t, raft::row_major> queries,
+            raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,
+            raft::device_matrix_view<float, int64_t, raft::row_major> distances);
+
+/** @} */
+
+}  // namespace cuvs::neighbors::dynamic_batching
diff --git a/cpp/include/cuvs/neighbors/ivf_flat.hpp b/cpp/include/cuvs/neighbors/ivf_flat.hpp
index 7f852d635..e017946d9 100644
--- a/cpp/include/cuvs/neighbors/ivf_flat.hpp
+++ b/cpp/include/cuvs/neighbors/ivf_flat.hpp
@@ -138,6 +138,10 @@ using list_data = ivf::list<list_spec, SizeT, ValueT, IdxT>;
  */
 template <typename T, typename IdxT>
 struct index : cuvs::neighbors::index {
+  using index_params_type  = ivf_flat::index_params;
+  using search_params_type = ivf_flat::search_params;
+  using index_type         = IdxT;
+  using value_type         = T;
   static_assert(!raft::is_narrowing_v<uint32_t, IdxT>,
                 "IdxT must be able to represent all values of uint32_t");
 
diff --git a/cpp/include/cuvs/neighbors/ivf_pq.hpp b/cpp/include/cuvs/neighbors/ivf_pq.hpp
index ae543c9e9..d85753b7f 100644
--- a/cpp/include/cuvs/neighbors/ivf_pq.hpp
+++ b/cpp/include/cuvs/neighbors/ivf_pq.hpp
@@ -319,6 +319,9 @@ using list_data = ivf::list<list_spec, SizeT, IdxT>;
  */
 template <typename IdxT>
 struct index : cuvs::neighbors::index {
+  using index_params_type  = ivf_pq::index_params;
+  using search_params_type = ivf_pq::search_params;
+  using index_type         = IdxT;
   static_assert(!raft::is_narrowing_v<uint32_t, IdxT>,
                 "IdxT must be able to represent all values of uint32_t");
 
diff --git a/cpp/src/neighbors/detail/dynamic_batching.cuh b/cpp/src/neighbors/detail/dynamic_batching.cuh
new file mode 100644
index 000000000..5c6b1654e
--- /dev/null
+++ b/cpp/src/neighbors/detail/dynamic_batching.cuh
@@ -0,0 +1,1197 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../sample_filter.cuh"
+
+#include <cuvs/neighbors/dynamic_batching.hpp>
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/pinned_mdarray.hpp>
+#include <raft/core/pinned_mdspan.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
+#include <raft/core/resources.hpp>
+
+#include <cooperative_groups.h>
+#include <cuda/atomic>
+#include <cuda/std/atomic>
+#include <rmm/mr/pinned_host_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
+
+#include <chrono>
+#include <limits>
+#include <memory>
+#include <variant>
+#include <vector>
+
+#ifndef CUVS_SYSTEM_LITTLE_ENDIAN
+#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define CUVS_SYSTEM_LITTLE_ENDIAN 0
+#else
+#define CUVS_SYSTEM_LITTLE_ENDIAN 1
+#endif
+#endif
+
+namespace cuvs::neighbors::dynamic_batching::detail {
+
+using raft::RAFT_NAME;  // TODO: a workaround for RAFT_LOG_XXX macros
+
+/**
+ * A helper to make the requester threads more cooperative when busy-spinning.
+ * It is used in the wait loops across this file to reduce the CPU usage.
+ *
+ * Ideally, we should be using atomics notify/wait feature, but that is not always possible
+ * (e.g. waiting on multiple things or waiting on GPU volatile stores).
+ */
+struct local_waiter {
+  static constexpr inline int64_t kNonSleepIterations = 10;
+
+  explicit local_waiter(std::chrono::nanoseconds base_sleep_time,
+                        int64_t start_iteration = 0) noexcept
+    : base_sleep_time_{base_sleep_time}, iteration_{start_iteration}
+  {
+  }
+
+  inline void wait() noexcept
+  {
+    if (iteration_ < 2) {
+      // Don't wait for the first few iterations:
+      // maybe there's a weak CAS op in the loop, or something else that could return quickly
+    } else if (iteration_ < kNonSleepIterations) {
+      std::this_thread::yield();
+    } else {
+      auto k = iteration_ + 1 - kNonSleepIterations;
+      std::this_thread::sleep_for(base_sleep_time_ * k);
+    }
+    ++iteration_;
+  }
+
+  inline void reset(int64_t start_iteration = 0) noexcept { iteration_ = start_iteration; }
+
+ private:
+  std::chrono::nanoseconds base_sleep_time_;
+  int64_t iteration_;
+};
+
+class cuda_event {
+ public:
+  cuda_event(cuda_event&&)            = default;
+  cuda_event& operator=(cuda_event&&) = default;
+  ~cuda_event()                       = default;
+  cuda_event(cuda_event const&)       = delete;  // Copying disallowed: one event one owner
+  cuda_event& operator=(cuda_event&)  = delete;
+
+  cuda_event()
+    : event_{[]() {
+               cudaEvent_t* e = new cudaEvent_t;
+               RAFT_CUDA_TRY(cudaEventCreateWithFlags(e, cudaEventDisableTiming));
+               return e;
+             }(),
+             [](cudaEvent_t* e) {
+               RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(*e));
+               delete e;
+             }}
+  {
+  }
+
+  cudaEvent_t value() const { return *event_; }
+
+ private:
+  std::unique_ptr<cudaEvent_t, std::function<void(cudaEvent_t*)>> event_;
+};
+
+template <typename MdSpanOrArray>
+struct get_accessor_type_t {
+  using type = typename MdSpanOrArray::accessor_type;
+};
+
+template <typename ElementType, typename Extents, typename LayoutPolicy, typename ContainerPolicy>
+struct get_accessor_type_t<raft::mdarray<ElementType, Extents, LayoutPolicy, ContainerPolicy>> {
+  using mdarray_type = raft::mdarray<ElementType, Extents, LayoutPolicy, ContainerPolicy>;
+  using view_type    = typename mdarray_type::view_type;
+  using type         = typename view_type::accessor_type;
+};
+
+template <typename MdSpanOrArray>
+using get_accessor_type = typename get_accessor_type_t<MdSpanOrArray>::type;
+
+template <typename Source3DT>
+constexpr inline auto slice_3d(typename Source3DT::index_type i,
+                               const Source3DT& source3d,
+                               typename Source3DT::index_type n_rows = 0)
+{
+  using element_type  = typename Source3DT::element_type;
+  using index_type    = typename Source3DT::index_type;
+  using layout_type   = typename Source3DT::layout_type;
+  using accessor_type = get_accessor_type<Source3DT>;
+  auto extent2d =
+    raft::make_extents<index_type>(n_rows == 0 ? source3d.extent(1) : n_rows, source3d.extent(2));
+  auto stride = uint64_t(source3d.extent(1)) * uint64_t(source3d.extent(2));
+  return raft::mdspan<element_type, decltype(extent2d), layout_type, accessor_type>{
+    const_cast<element_type*>(source3d.data_handle()) + stride * i, extent2d};
+}
+
+template <typename Source2DT>
+constexpr inline auto slice_2d(typename Source2DT::index_type i, const Source2DT& source2d)
+{
+  using element_type  = typename Source2DT::element_type;
+  using index_type    = typename Source2DT::index_type;
+  using layout_type   = typename Source2DT::layout_type;
+  using accessor_type = get_accessor_type<Source2DT>;
+  auto extent1d       = raft::make_extents<index_type>(source2d.extent(1));
+  auto stride         = uint64_t(extent1d.extent(0));
+  return raft::mdspan<element_type, decltype(extent1d), layout_type, accessor_type>{
+    const_cast<element_type*>(source2d.data_handle()) + stride * i, extent1d};
+}
+
+// ---------------------------------------------
+
+constexpr size_t kCacheLineBytes = 64;
+
+template <typename Upstream, typename T, typename IdxT>
+using upstream_search_type_const = void(raft::resources const&,
+                                        typename Upstream::search_params_type const&,
+                                        Upstream const&,
+                                        raft::device_matrix_view<const T, int64_t, raft::row_major>,
+                                        raft::device_matrix_view<IdxT, int64_t, raft::row_major>,
+                                        raft::device_matrix_view<float, int64_t, raft::row_major>,
+                                        const cuvs::neighbors::filtering::base_filter&);
+
+template <typename Upstream, typename T, typename IdxT>
+using upstream_search_type = void(raft::resources const&,
+                                  typename Upstream::search_params_type const&,
+                                  Upstream&,
+                                  raft::device_matrix_view<const T, int64_t, raft::row_major>,
+                                  raft::device_matrix_view<IdxT, int64_t, raft::row_major>,
+                                  raft::device_matrix_view<float, int64_t, raft::row_major>,
+                                  const cuvs::neighbors::filtering::base_filter&);
+
+template <typename T, typename IdxT>
+using function_search_type = void(raft::resources const&,
+                                  raft::device_matrix_view<const T, int64_t, raft::row_major>,
+                                  raft::device_matrix_view<IdxT, int64_t, raft::row_major>,
+                                  raft::device_matrix_view<float, int64_t, raft::row_major>);
+
+/**
+ * State of the batch token slot.
+ *
+ * In a nutshell, there are only two batch slot states that matter: empty or full.
+ * Initially, all slots are empty. The host threads can commit (i.e. subscribe) to a batch slot even
+ * if it's empty (when they know it will be filled-in at some point in future). With this logic, we
+ * smooth out the bottleneck that occurs when many threads try to submit their work using a single
+ * atomic counter (the batch queue head).
+ *
+ * Once a GPU IO buffer is available, its owner returns the buffer to the queue by marking a slot as
+ * full. By that time, it may be partially or fully committed (i.e. several host threads are
+ * committed to submit a certain number of queries).
+ *
+ * If we had an infinite buffer, these two states would suffice. However, we have a finite ring
+ * buffer, so the used-up slots must be emptied again, so that they are usable in the following
+ * rounds through the ring buffer.
+ *
+ * The slot state depends not only on the value stored in it, but on the accessing thread as well
+ * (see `batch_queue_t::batch_status` below). The accessing thread may be ahead or behind the others
+ * (as defined by the sequential order id below). Depending on the accessor state, it may view the
+ * slot as being emptied/filled in the future, current, or previous rounds. This affects the
+ * decision whether the slot can be used and whether the thread has the right to advance tail or
+ * head counters of the batch queue.
+ *
+ */
+enum struct slot_state : int32_t {
+  /** The slot is empty, cleared-up in this round (hence the head should be past it). */
+  kEmptyPast = 1025,
+  /** The slot is empty, cleared-up in previous round. */
+  kEmpty = 1024,
+  /** The slot is empty, cleared-up two round ago and cannot be used yet (due to be filled). */
+  kEmptyBusy = 1023,
+  /** The current thread has been sleeping for too long and is way behind the others. */
+  kFullPast = 1,
+  /** The slot is full, filled-in in this round. */
+  kFull = 0,
+  /** This state is considered full, filled-in in previous round.  */
+  kFullBusy = -1
+  /** The rest of the values are impossible states indicating an error in the algo. */
+};
+
+/**
+ * Identifies the batch and its job-commit state.
+ * Should be in the pinned memory for fast shared access on CPU and GPU side.
+ *
+ * The batch token packs the IO buffer address (id) and a number of committed queries in a single
+ * 64-bit atomic. This is to allow conflict-free atomic updates of both values.
+ *
+ */
+struct batch_token {
+  uint64_t value = 0;
+
+  constexpr inline batch_token() {}
+  explicit constexpr inline batch_token(uint32_t buffer_id) { id() = buffer_id; }
+
+  /**
+   * Sequential id of the batch in the array of batches.
+   *
+   * The `id` field, in practice, stores not only the IO buffer address, but also an extra
+   * sequential "round" id. The latter identifies how many rounds through the batch ring buffer has
+   * already been done (computed from the the `seq_order_id` counter in the batch queue) and is used
+   * by `batch_queue_t::batch_status` below to compute the `slot_state`. This is to avoid the ABA
+   * atomic updates problem when using the ring buffer.
+   *
+   * There cannot be more IO buffers than the size of the ring buffer. The size of the ring buffer
+   * is always a power-of-two. Hence the IO buffer address needs only `log2(Size)` bits, and the
+   * rest is used for the ring buffer round id (see `batch_queue_t::make_seq_batch_id`).
+   *
+   */
+  RAFT_INLINE_FUNCTION auto id() noexcept -> uint32_t&
+  {
+    return *(reinterpret_cast<uint32_t*>(&value) + kOffsetOfId);
+  }
+  /**
+   * How many queries are promised by the participating CPU threads (requesters).
+   *
+   * The CPU threads atomically increment this counter until its size reaches `max_batch_size`.
+   *
+   * Any (CPU or GPU thread) may atomically write to the highest byte of this value, which indicates
+   * that no one can commit to this batch anymore (e.g. the wait timeout is exceeded).
+   * Hence, the actual number of committed queries is `size_committed % 0x00ffffff`.
+   *
+   * The gather kernel cannot finish while `size_committed < max_batch_size`.
+   *
+   * NB: we use the trick of writing to the highest byte to allow GPU write atomically to the pinned
+   * host memory. This way, we don't need to use device RMW atomics on host memory, which are not
+   * available on a broad class of GPUs. If not this workaround, we could simply do atomic add/or
+   * with value 0x01000000.
+   */
+  RAFT_INLINE_FUNCTION auto size_committed() noexcept -> uint32_t&
+  {
+    return *(reinterpret_cast<uint32_t*>(&value) + kOffsetOfSC);
+  }
+
+ private:
+  /** Offset of the `id()` value in the token if it's interpreted as uint32_t[2]. */
+  static constexpr inline uint32_t kOffsetOfId = CUVS_SYSTEM_LITTLE_ENDIAN;
+  /** Offset of the `size_committed()` value in the token if it's interpreted as uint32_t[2]. */
+  static constexpr inline uint32_t kOffsetOfSC = 1 - kOffsetOfId;
+};
+static_assert(sizeof(batch_token) == sizeof(uint64_t));
+static_assert(cuda::std::atomic<batch_token>::is_always_lock_free);
+
+/**
+ * The batch queue consists of several ring buffers and two counters determining where are the head
+ * and the tail of the queue in those buffers.
+ *
+ * There is an internal sequentially consistent order in the queue, defined by `seq_order_id`
+ * counter. The head and tail members define where the participants should look for full and
+ * empty slots in the queue respectively.
+ *
+ * The slots in the queue have their own states (see `slot_state` above). The states are updated
+ * concurrently in many threads, so the head and tail counters do not always accurately represent
+ * the actual compound state of the queue.
+ *
+ * `.head()` is where a host thread starts looking for a batch token. All slots earlier than
+ * returned by this method are not usable anymore (they batches are either "fully committed",
+ * dispatched, or emptied earlier). If a host thread determines that the current slot is not usable
+ * anymore, it increments the counter by calling `.pop()`.
+ *
+ * The tail is where a host thread reserves an empty slot to be filled-in by a GPU worker thread
+ * once it releases the owned IO buffer. There's no `.tail()` method, but `.push()` method returns
+ * the tail position (before advancing it). `.push()` blocks the host thread until it knows the slot
+ * isn't used by any other threads anymore (i.e. cleaned-up from the previous round).
+ *
+ * There's no strict relation between the head and the tail.
+ * Normally there is a single batch in the ring buffer being partially filled. It is followed by
+ * contiguous list of empty idle batches and reserved empty slots. The head and the tail loosely
+ * correspond to the beginning and the end of this sequence.
+ *
+ * Sometimes, the head can go further than the tail. This means all batches are busy and there are
+ * more threads committed to the slots that are not populated with the batches (and not even
+ * reserved for filling-in yet).
+ *
+ *
+ */
+template <uint32_t Size>
+struct batch_queue_t {
+  static constexpr uint32_t kSize        = Size;
+  static constexpr uint32_t kMinElemSize = sizeof(uint32_t);
+  static_assert(cuda::std::atomic<batch_token>::is_always_lock_free,
+                "The value type must be lock-free.");
+  static_assert(cuda::std::atomic<uint32_t>::is_always_lock_free,
+                "The value type must be lock-free.");
+  static_assert(cuda::std::atomic<int32_t>::is_always_lock_free,
+                "The value type must be lock-free.");
+  static_assert(raft::is_a_power_of_two(kSize), "The size must be a power-of-two for efficiency.");
+
+  static constexpr auto kMemOrder = cuda::std::memory_order_relaxed;
+
+  /** Type-safe synonym for the internal head & tail counters. */
+  struct seq_order_id {
+    uint32_t value;
+  };
+
+  explicit batch_queue_t(const raft::resources& res, bool use_batch_sizes) noexcept
+    : tokens_{raft::make_pinned_vector<cuda::atomic<batch_token, cuda::thread_scope_system>,
+                                       uint32_t>(res, kSize)},
+      rem_time_us_{
+        raft::make_pinned_vector<cuda::atomic<int32_t, cuda::thread_scope_system>, uint32_t>(
+          res, kSize)},
+      dispatch_sequence_id_(kSize),
+      batch_sizes_{
+        use_batch_sizes
+          ? std::make_optional(
+              raft::make_pinned_vector<cuda::atomic<uint32_t, cuda::thread_scope_system>, uint32_t>(
+                res, kSize))
+          : std::nullopt}
+  {
+    tail_.store(0, kMemOrder);
+    head_.store(0, kMemOrder);
+    auto past_seq_id = seq_order_id{static_cast<uint32_t>(-1)};
+    for (uint32_t i = 0; i < kSize; i++) {
+      rem_time_us_(i).store(std::numeric_limits<int32_t>::max(), kMemOrder);
+      if (batch_sizes_.has_value()) { batch_sizes_.value()(i).store(0, kMemOrder); }
+      dispatch_sequence_id_[i].store(past_seq_id.value, kMemOrder);
+      tokens_(i).store(make_empty_token(past_seq_id), kMemOrder);
+    }
+  }
+
+  /**
+   * Advance the tail position, ensure the slot is empty, and return the reference to the new slot.
+   * The calling side is responsible for filling-in the slot with an actual value at a later time.
+   *
+   * Conceptually, this method reserves a ring buffer slot on the host side, so that the GPU worker
+   * thread can return the IO buffer (filling the token slot) asynchronously.
+   */
+  inline auto push() -> seq_order_id
+  {
+    seq_order_id seq_id{tail_.fetch_add(1, kMemOrder)};
+    auto& loc = token(seq_id);
+    auto ss   = batch_status(loc.load(kMemOrder), seq_id);
+    /* [Note: very small waiting time]
+
+    Only a few (dispatcher) threads are going to call this function at the same time as opposed to
+    potentially any number of threads waiting on new batches to arrive.
+    This is a performance-critical code path.
+
+    Hence the small base sleep time.
+    */
+    local_waiter till_empty{std::chrono::nanoseconds{1000}};
+    while (ss == slot_state::kFull || ss == slot_state::kFullBusy || ss == slot_state::kEmptyBusy) {
+      // Wait till the slot becomes empty (doesn't matter future or past).
+      // The batch id is only ever updated in the scatter/gather kernels, which are the only source
+      // of truth whether a batch buffer is currently used by the GPU.
+      till_empty.wait();
+      ss = batch_status(loc.load(kMemOrder), seq_id);
+    }
+    return seq_id;
+  }
+
+  /**
+   * Return the offset of the given w.r.t. the tail of the queue.
+   * Negative value means the given slot is in the body of the queue and should be dispatched soon.
+   * Positive value means the given slot is ahead of the queue and should wait longer.
+   *
+   * That is the lower the value the higher the priority.
+   */
+  [[nodiscard]] inline auto niceness(seq_order_id id) const noexcept -> int32_t
+  {
+    return static_cast<int32_t>(id.value - tail_.load(kMemOrder));
+  }
+
+  /** Get the reference to the first element in the queue. */
+  inline auto head() noexcept -> seq_order_id
+  {
+    auto h = head_.load(kMemOrder);
+    // The head cannot go ahead of the tail by more than the queue buffer size.
+    // If the head is ahead by not more than kSize elements though, everything is fine;
+    // the slots too far ahead are protected by busy tokens.
+    local_waiter for_tail(std::chrono::nanoseconds{100000});
+    while (static_cast<int32_t>(h - tail_.load(kMemOrder)) >= static_cast<int32_t>(kSize)) {
+      for_tail.wait();
+      h = head_.load(kMemOrder);
+    }
+    return seq_order_id{h};
+  }
+
+  /** Batch commit state and IO buffer id (see `batch_token`) */
+  inline auto token(seq_order_id id) -> cuda::atomic<batch_token, cuda::thread_scope_system>&
+  {
+    return tokens_(cache_friendly_idx(id.value));
+  }
+
+  /**
+   * How much time has this batch left for waiting.
+   * It is an approximate value by design - to minimize the synchronization between CPU and GPU.
+   *
+   * The clocks on GPU and CPU may have different values, so the running kernel and the CPU thread
+   * have different ideas on how much time is left. Rather than trying to synchronize the clocks, we
+   * maintain independent timers and accept the uncertainty.
+   *
+   * Access pattern: CPU write-only (producer); GPU read-only (consumer).
+   */
+  inline auto rem_time_us(seq_order_id id) -> cuda::atomic<int32_t, cuda::thread_scope_system>&
+  {
+    return rem_time_us_(cache_friendly_idx(id.value));
+  }
+
+  /**
+   * The actual batch size - the final number of committed queries.
+   * This is only used if `conservative_dispatch = true`.
+   */
+  inline auto batch_size(seq_order_id id) noexcept
+    -> cuda::atomic<uint32_t, cuda::thread_scope_system>*
+  {
+    if (batch_sizes_.has_value()) { return &batch_sizes_.value()(cache_friendly_idx(id.value)); }
+    return nullptr;
+  }
+
+  /**
+   * This value is updated by the host thread after it submits the job completion event to indicate
+   * to other threads can wait on the event to get the results back.
+   * Other threads get the value from the batch queue and compare that value against this atomic.
+   *
+   * Access pattern: CPU-only; dispatching thread writes the id once, other threads wait on it.
+   */
+  inline auto dispatch_sequence_id(seq_order_id id) -> cuda::std::atomic<uint32_t>&
+  {
+    return dispatch_sequence_id_[cache_friendly_idx(id.value)];
+  }
+
+  /**
+   * An `atomicMax` on the queue head in disguise.
+   * This makes the given batch slot and all prior slots unreachable (not possible to commit).
+   */
+  inline void pop(seq_order_id id) noexcept
+  {
+    const auto desired = id.value + 1;
+    auto observed      = id.value;
+    while (observed < desired &&
+           !head_.compare_exchange_weak(observed, desired, kMemOrder, kMemOrder)) {}
+  }
+
+  static constexpr inline auto batch_id(batch_token token) noexcept -> uint32_t
+  {
+    return token.id() & kCounterLocMask;
+  }
+
+  /**
+   * Construct a token that is interpreted as having been emptied in the current round
+   * (the round is derived from seq_id).
+   *
+   * NB: "round" is the number of times the queue counters went over the whole ring buffer.
+   *     It's used to avoid the ABA problem for atomic token updates.
+   */
+  static constexpr inline auto make_empty_token(seq_order_id seq_id) noexcept -> batch_token
+  {
+    // Modify the seq_id to identify that the token slot is empty
+    auto empty_round    = static_cast<uint32_t>(slot_state::kEmptyPast) * kSize;
+    auto empty_round_id = seq_order_id{seq_id.value + empty_round};
+    // Id of empty slot is ignored and can be anything
+    auto empty_id = kCounterLocMask;
+    return batch_token{make_seq_batch_id(empty_round_id, empty_id)};
+  }
+
+  /**
+   * Construct a sequential batch id by combining the current round and the real batch id.
+   *
+   * The "round" part gives a hint when the token slot was filled-in to avoid the ABA problem
+   *  (see above).
+   */
+  static constexpr inline auto make_seq_batch_id(seq_order_id seq_id, uint32_t batch_id) noexcept
+    -> uint32_t
+  {
+    return seq_round(seq_id) | batch_id;
+  }
+
+  /**
+   * Get the state of the batch slot w.r.t. the given seq_order_id counter.
+   * This gives the information whether the slot is emptied/filled by another thread and whether
+   * that thread is ahead or behind the current thread.
+   * By introducing these future/past flavours of states we solve the ABA problem for atomic updates
+   * of the ring buffer slots.
+   */
+  static inline auto batch_status(batch_token token, seq_order_id seq_id) -> slot_state
+  {
+    /*
+    The "round" part of the id is just a seq_id without the low bits.
+    Essentially, we comparing here seq_ids of two threads: the one that wrote to the slot in the
+    past and the one reads from it now.
+
+    `kSize` determines the number of bits we use for the IO buffer id and for the round id.
+      */
+    auto v =
+      static_cast<int32_t>(seq_round(token) - seq_round(seq_id)) / static_cast<int32_t>(kSize);
+    if (v < static_cast<int32_t>(slot_state::kFullBusy)) { RAFT_FAIL("Invalid batch state %d", v); }
+    if (v < static_cast<int32_t>(slot_state::kEmptyBusy)) {
+      return static_cast<slot_state>(std::min(v, static_cast<int32_t>(slot_state::kFullPast)));
+    }
+    return static_cast<slot_state>(std::min(v, static_cast<int32_t>(slot_state::kEmptyPast)));
+  }
+
+ private:
+  alignas(kCacheLineBytes) cuda::std::atomic<uint32_t> tail_{};
+  alignas(kCacheLineBytes) cuda::std::atomic<uint32_t> head_{};
+
+  alignas(kCacheLineBytes)
+    raft::pinned_vector<cuda::atomic<batch_token, cuda::thread_scope_system>, uint32_t> tokens_;
+  raft::pinned_vector<cuda::atomic<int32_t, cuda::thread_scope_system>, uint32_t> rem_time_us_;
+  std::vector<cuda::std::atomic<uint32_t>> dispatch_sequence_id_;
+  std::optional<raft::pinned_vector<cuda::atomic<uint32_t, cuda::thread_scope_system>, uint32_t>>
+    batch_sizes_;
+
+  /* [Note: cache-friendly indexing]
+     To avoid false sharing, the queue pushes and pops values not sequentially, but with an
+     increment that is larger than the cache line size.
+     Hence we introduce the `kCounterIncrement > kCacheLineBytes`.
+     However, to make sure all indices are used, we choose the increment to be coprime with the
+     buffer size. We also require that the buffer size is a power-of-two for two reasons:
+       1) Fast modulus operation - reduces to binary `and` (with `kCounterLocMask`).
+       2) Easy to ensure GCD(kCounterIncrement, kSize) == 1 by construction
+          (see the definition below).
+   */
+  static constexpr uint32_t kElemsPerCacheLine =
+    raft::div_rounding_up_safe<uint32_t>(kCacheLineBytes, kMinElemSize);
+  static constexpr uint32_t kCounterIncrement = raft::bound_by_power_of_two(kElemsPerCacheLine) + 1;
+  static constexpr uint32_t kCounterLocMask   = kSize - 1;
+  // These props hold by design, but we add them here as a documentation and a sanity check.
+  static_assert(
+    kCounterIncrement * kMinElemSize >= kCacheLineBytes,
+    "The counter increment should be larger than the cache line size to avoid false sharing.");
+  static_assert(
+    std::gcd(kCounterIncrement, kSize) == 1,
+    "The counter increment and the size must be coprime to allow using all of the queue slots.");
+  /** Map the sequential index onto cache-friendly strided index. */
+  static constexpr inline auto cache_friendly_idx(uint32_t source_idx) noexcept -> uint32_t
+  {
+    return (source_idx * kCounterIncrement) & kCounterLocMask;
+  }
+
+  /** The "round": the number of times the queue counter went over the whole ring buffer. */
+  static constexpr inline auto seq_round(seq_order_id id) noexcept -> uint32_t
+  {
+    return id.value & ~kCounterLocMask;
+  }
+
+  /** The "round": the number of times the queue counter went over the whole ring buffer. */
+  static constexpr inline auto seq_round(batch_token token) noexcept -> uint32_t
+  {
+    return token.id() & ~kCounterLocMask;
+  }
+};
+
+template <typename T, typename IdxT>
+struct alignas(kCacheLineBytes) request_pointers {
+  /**
+   * A pointer to `dim` values of a single query (input).
+   *
+   * Serves as a synchronization point between the CPU thread (producer) and a GPU block in the
+   * `gather_inputs` kernel (consumer).
+   */
+  cuda::atomic<const T*, cuda::thread_scope_system> query{nullptr};
+  /** A pointer to `k` nearest neighbors (output) */
+  IdxT* neighbors{nullptr};
+  /** A pointer to distances of `k` nearest neighbors (output) */
+  float* distances{nullptr};
+};
+
+/**
+ * Check the current timestamp at the moment of construction and repeatedly compare the elapsed time
+ * to the timeout value provided by the host (passed via an atomic).
+ *
+ * This is used in the gather inputs kernel to make it stop waiting for new queries in a batch
+ * once the deadline is reached.
+ */
+struct gpu_time_keeper {
+  /**
+   * @param[in] cpu_provided_remaining_time_us
+   *   a pointer to a shared atomic, represent the remaining waiting time in microseconds.
+   *   Note, the remaining time is updated atomically by each participating host thread in their
+   *   "private coordinate systems". That's ok, we don't expect a single reference time for all host
+   *   and device threads.
+   *   We tolerate the errors coming from the time difference between the host thread writing their
+   *   remaining waiting time and the GPU thread reading that value.
+   */
+  RAFT_DEVICE_INLINE_FUNCTION explicit gpu_time_keeper(
+    cuda::atomic<int32_t, cuda::thread_scope_system>* cpu_provided_remaining_time_us)
+    : cpu_provided_remaining_time_us_{cpu_provided_remaining_time_us}
+  {
+    update_timestamp();
+  }
+
+  /**
+   * Check whether the deadline is not reached yet:
+   * 1) Compare the internal clock against the last-read deadline value
+   * 2) Read the deadline value from the host-visible atomic and check the internal clock again.
+   */
+  RAFT_DEVICE_INLINE_FUNCTION auto has_time() noexcept -> bool
+  {
+    if (timeout) { return false; }
+    update_local_remaining_time();
+    if (local_remaining_time_us_ <= 0) {
+      timeout = true;
+      return false;
+    }
+    update_cpu_provided_remaining_time();
+    if (local_remaining_time_us_ <= 0) {
+      timeout = true;
+      return false;
+    }
+    return true;
+  }
+
+ private:
+  cuda::atomic<int32_t, cuda::thread_scope_system>* cpu_provided_remaining_time_us_;
+  uint64_t timestamp_ns_           = 0;
+  int32_t local_remaining_time_us_ = std::numeric_limits<int32_t>::max();
+  bool timeout                     = false;
+
+  RAFT_DEVICE_INLINE_FUNCTION void update_timestamp() noexcept
+  {
+    asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(timestamp_ns_));
+  }
+
+  RAFT_DEVICE_INLINE_FUNCTION void update_local_remaining_time() noexcept
+  {
+    auto prev_timestamp = timestamp_ns_;
+    update_timestamp();
+    // subtract the time passed since the last check
+    // (assuming local time is updated every time timestamp is read)
+    local_remaining_time_us_ -= static_cast<int32_t>((timestamp_ns_ - prev_timestamp) / 1000ull);
+  }
+
+  RAFT_DEVICE_INLINE_FUNCTION void update_cpu_provided_remaining_time() noexcept
+  {
+    local_remaining_time_us_ =
+      std::min<int32_t>(local_remaining_time_us_,
+                        cpu_provided_remaining_time_us_->load(cuda::std::memory_order_relaxed));
+  }
+};
+
+/**
+ * Copy the queries from the submitted pointers to the batch store, one query per block.
+ * Upon completion of this kernel, the submitted queries are all in the contiguous buffer
+ * `batch_queries`.
+ *
+ * Block size: (n, 1, 1) any number of threads copying a single row of data.
+ * Grid size: (max_batch_size, 1, 1) - one block per query
+ *
+ * Note, we view the incoming queries and the batch as going through multiple stages:
+ *   1) A host thread "commits" a query: it reserves a slot for the query in the batch and promises
+ *      to fill-in the corresponding query pointer.
+ *   2) A host thread "submits" the query: it fills-in the pointer to the query data in the reserved
+ *      slot.
+ *   3) This kernel copies the query data to the contiguous query buffer owned by the batch.
+ *
+ * The batch is "fully committed" when the number of committed queries reaches the maximum batch
+ * size (all slots are reserved). Committing, submitting, and copying of the queries is somewhat
+ * overlapped among multiple host and device threads. Only the copying happens in a CUDA stream in
+ * this kernel, and the upstream search is dispatched right after this kernel (in the same stream).
+ *
+ */
+template <typename T, typename IdxT>
+RAFT_KERNEL gather_inputs(
+  raft::device_matrix_view<T, uint32_t, raft::row_major> batch_queries,
+  raft::pinned_vector_view<request_pointers<T, IdxT>, uint32_t> request_ptrs,
+  /* The remaining time may be updated on the host side: a thread with a tighter deadline may reduce
+     it (but not increase). */
+  cuda::atomic<int32_t, cuda::thread_scope_system>* remaining_time_us,
+  /* The token contains the current number of queries committed and is cleared in this kernel. */
+  cuda::atomic<batch_token, cuda::thread_scope_system>* batch_token_ptr,
+  /* The host-visible batch size counter (used in `conservative_dispatch`). */
+  cuda::atomic<uint32_t, cuda::thread_scope_system>* batch_size_out,
+  /**
+   * The token value considered empty depends on the round over the ring buffer
+   * (which is defined by the seq_order_id)
+   */
+  batch_token empty_token_value,
+  /**
+   * The counter is used to find the last CTA to finish and to share the batch size with the
+   * scatter_inputs kernel.
+   */
+  cuda::atomic<uint32_t, cuda::std::thread_scope_device>* kernel_progress_counter)
+{
+  const uint32_t query_id = blockIdx.x;
+  __shared__ const T* query_ptr;
+
+  if (threadIdx.x == 0) {
+    query_ptr = nullptr;
+
+    // NB: we have to read/write to `batch_token_ptr`, `bs_committed`, and `batch_fully_committed`
+    // using volatile assembly ops, because otherwise the compiler seems to fail to understand that
+    // this is the same location in memory. The order of reads in writes here is extremely
+    // important, as it involves multiple host and device threads (the host threads do RMW atomic
+    // increments on the commit counter).
+    volatile uint32_t* bs_committed =
+      reinterpret_cast<volatile uint32_t*>(batch_token_ptr) + 1 - CUVS_SYSTEM_LITTLE_ENDIAN;
+    volatile uint8_t* batch_fully_committed =
+      reinterpret_cast<volatile uint8_t*>(bs_committed) + (CUVS_SYSTEM_LITTLE_ENDIAN * 3);
+
+    gpu_time_keeper runtime{remaining_time_us};
+    bool committed          = false;  // if the query is committed, we have to wait for it to arrive
+    auto& request_query_ptr = request_ptrs(query_id).query;
+    while (true) {
+      query_ptr = request_query_ptr.load(cuda::std::memory_order_acquire);
+      if (query_ptr != nullptr) {
+        // The query is submitted to this block's slot; erase the pointer buffer for future use and
+        // exit the loop.
+        request_query_ptr.store(nullptr, cuda::std::memory_order_relaxed);
+        break;
+      }
+      // The query hasn't been submitted, but is already committed; other checks may be skipped
+      if (committed) { continue; }
+      // Check if the query is committed
+      uint32_t committed_count;
+      asm volatile("ld.volatile.global.u32 %0, [%1];"
+                   : "=r"(committed_count)
+                   : "l"(bs_committed)
+                   : "memory");
+      committed = (committed_count & 0x00ffffff) > query_id;
+      if (committed) { continue; }
+      // If the query is not committed, but the batch is past the deadline, we exit without copying
+      // the query
+      if (committed_count > 0x00ffffff) { break; }
+      // The query hasn't been submitted yet; check if we're past the deadline
+      if (runtime.has_time()) { continue; }
+      // Otherwise, let the others know time is out
+      // Set the highest byte of the commit counter to 1 (thus avoiding RMW atomic)
+      // This prevents any more CPU threads from committing to this batch.
+      asm volatile("st.volatile.global.u8 [%0], %1;"
+                   :
+                   : "l"(batch_fully_committed), "r"(1)
+                   : "memory");
+      asm volatile("ld.volatile.global.u32 %0, [%1];"
+                   : "=r"(committed_count)
+                   : "l"(bs_committed)
+                   : "memory");
+      committed = (committed_count & 0x00ffffff) > query_id;
+      if (committed) { continue; }
+      break;
+    }
+    auto progress = kernel_progress_counter->fetch_add(1, cuda::std::memory_order_acq_rel) + 1;
+    if (progress >= gridDim.x) {
+      // read the last value of the committed count to know the batch size for sure
+      uint32_t committed_count;
+      asm volatile("ld.volatile.global.u32 %0, [%1];"
+                   : "=r"(committed_count)
+                   : "l"(bs_committed)
+                   : "memory");
+      committed_count &= 0x00ffffff;  // Clear the timeout bit
+      if (batch_size_out != nullptr) {
+        // Inform the dispatcher about the final batch size if `conservative_dispatch` is enabled
+        batch_size_out->store(committed_count, cuda::std::memory_order_relaxed);
+      }
+      // store the batch size in the progress counter, so we can read it in the scatter kernel
+      kernel_progress_counter->store(committed_count, cuda::std::memory_order_relaxed);
+      // Clear the batch token slot, so it can be re-used by others
+      asm volatile("st.volatile.global.u64 [%0], %1;"
+                   :
+                   : "l"(reinterpret_cast<uint64_t*>(batch_token_ptr)),
+                     "l"(reinterpret_cast<uint64_t&>(empty_token_value))
+                   : "memory");
+    }
+  }
+  // The block waits till the leading thread gets the query pointer
+  cooperative_groups::this_thread_block().sync();
+  auto query_ptr_local = query_ptr;
+  if (query_ptr_local == nullptr) { return; }
+  // block-wide copy input query
+  auto dim = batch_queries.extent(1);
+  for (uint32_t i = threadIdx.x; i < dim; i += blockDim.x) {
+    batch_queries(query_id, i) = query_ptr_local[i];
+  }
+}
+
+/** Copy the results of the search back to the requesters. */
+template <typename T, typename IdxT>
+RAFT_KERNEL scatter_outputs(
+  raft::pinned_vector_view<request_pointers<T, IdxT>, uint32_t> request_ptrs,
+  raft::device_matrix_view<const IdxT, uint32_t> batch_neighbors,
+  raft::device_matrix_view<const float, uint32_t> batch_distances,
+  cuda::atomic<uint32_t, cuda::std::thread_scope_device>* kernel_progress_counter,
+  cuda::atomic<batch_token, cuda::thread_scope_system>* next_token,
+  uint32_t batch_id)
+{
+  __shared__ uint32_t batch_size;
+  if (threadIdx.x == 0 && threadIdx.y == 0) {
+    batch_size = kernel_progress_counter->exchange(0, cuda::std::memory_order_relaxed);
+  }
+  // Copy output
+  cooperative_groups::this_thread_block().sync();
+  auto k = batch_neighbors.extent(1);
+  for (uint32_t i = threadIdx.y; i < batch_size; i += blockDim.y) {
+    auto* request_neighbors = request_ptrs(i).neighbors;
+    auto* request_distances = request_ptrs(i).distances;
+    for (uint32_t j = threadIdx.x; j < k; j += blockDim.x) {
+      request_neighbors[j] = batch_neighbors(i, j);
+      request_distances[j] = batch_distances(i, j);
+    }
+  }
+  // Clear the batch state after all threads copied the data, so the batch can be reused
+  cuda::atomic_thread_fence(cuda::std::memory_order_release, cuda::thread_scope_system);
+  cooperative_groups::this_thread_block().sync();
+  if (threadIdx.x != 0 || threadIdx.y != 0) { return; }
+  reinterpret_cast<cuda::atomic<uint32_t, cuda::thread_scope_system>*>(
+    &reinterpret_cast<batch_token*>(next_token)->id())
+    ->store(batch_id, cuda::std::memory_order_relaxed);
+}
+
+/**
+ * Batch runner is shared among the users of the `dynamic_batching::index` (i.e. the index can be
+ * copied, but the copies hold shared pointers to a single batch runner).
+ *
+ * Constructor and destructor of this class do not need to be thread-safe, as their execution is
+ * guaranteed to happen in one thread by the holding shared pointer.
+ *
+ * The search function must be thread-safe. We only have to pay attention to the `mutable` members
+ * though, because the function is marked const.
+ */
+template <typename T, typename IdxT>
+class batch_runner {
+ public:
+  constexpr static uint32_t kMaxNumQueues = 256;
+
+  using batch_queue  = batch_queue_t<kMaxNumQueues>;
+  using seq_order_id = typename batch_queue::seq_order_id;
+
+  // Save the parameters and the upstream batched search function to invoke
+  template <typename Upstream>
+  batch_runner(const raft::resources& res,
+               const dynamic_batching::index_params& params,
+               const Upstream& upstream_index,
+               const typename Upstream::search_params_type& upstream_params,
+               upstream_search_type_const<Upstream, T, IdxT>* upstream_search,
+               const cuvs::neighbors::filtering::base_filter* sample_filter)
+    : res_{res},
+      upstream_search_{[&upstream_index, upstream_search, upstream_params, sample_filter](
+                         raft::resources const& res,
+                         raft::device_matrix_view<const T, int64_t, raft::row_major> queries,
+                         raft::device_matrix_view<IdxT, int64_t, raft::row_major> neighbors,
+                         raft::device_matrix_view<float, int64_t, raft::row_major> distances) {
+        /* Note: passing sample_filter by pointer
+
+        Ideally, dynamic batching would capture the filter by value. Unfortunately, one cannot use
+        the copy constructor of the `base_filter` (it would erase the actual filter type).
+        Therefore, we can only pass the filter by pointer or reference and require the user to keep
+        the filter alive for the lifetime of the dynamic batching index.
+        This, however, may lead to a segfault when the user doesn't provide the filter argument and
+        the argument is passed by reference: the lifetime of the none_sample_filter default argument
+        is limited to the search function call, so it is destroyed while the dynamic batching index
+        is still alive.
+        Hence the solution is to pass the filter by pointer and default it to nullptr.
+        */
+        if (sample_filter == nullptr) {
+          using base_filter_type = cuvs::neighbors::filtering::base_filter;
+          const auto none_filter = cuvs::neighbors::filtering::none_sample_filter{};
+          return upstream_search(res,
+                                 upstream_params,
+                                 upstream_index,
+                                 queries,
+                                 neighbors,
+                                 distances,
+                                 static_cast<const base_filter_type&>(none_filter));
+
+        } else {
+          return upstream_search(
+            res, upstream_params, upstream_index, queries, neighbors, distances, *sample_filter);
+        }
+      }},
+      k_{uint32_t(params.k)},
+      dim_{uint32_t(upstream_index.dim())},
+      max_batch_size_{uint32_t(params.max_batch_size)},
+      n_queues_{uint32_t(params.n_queues)},
+      batch_queue_{res_, params.conservative_dispatch},
+      completion_events_(n_queues_),
+      input_extents_{n_queues_, max_batch_size_, dim_},
+      output_extents_{n_queues_, max_batch_size_, k_},
+      queries_{raft::make_device_mdarray<T>(res_, input_extents_)},
+      neighbors_{raft::make_device_mdarray<IdxT>(res_, output_extents_)},
+      distances_{raft::make_device_mdarray<float>(res_, output_extents_)},
+      kernel_progress_counters_{
+        raft::make_device_vector<cuda::atomic<uint32_t, cuda::std::thread_scope_device>>(
+          res_, n_queues_)},
+      request_ptrs_{raft::make_pinned_matrix<request_pointers<T, IdxT>, uint32_t>(
+        res_, n_queues_, max_batch_size_)}
+  {
+    RAFT_CUDA_TRY(cudaMemsetAsync(
+      kernel_progress_counters_.data_handle(),
+      0,
+      sizeof(*kernel_progress_counters_.data_handle()) * kernel_progress_counters_.size(),
+      raft::resource::get_cuda_stream(res_)));
+    // Make sure to initialize the atomic values in the batch_state structs.
+    for (uint32_t i = 0; i < n_queues_; i++) {
+      auto seq_id = batch_queue_.push();
+      batch_queue_.token(seq_id).store(batch_token{batch_queue::make_seq_batch_id(seq_id, i)});
+      // Make sure to initialize query pointers, because they are used for synchronization
+      for (uint32_t j = 0; j < max_batch_size_; j++) {
+        new (&request_ptrs_(i, j)) request_pointers<T, IdxT>{};
+      }
+    }
+  }
+
+  // A workaround for algos, which have non-const `index` type in their arguments
+  template <typename Upstream>
+  batch_runner(const raft::resources& res,
+               const dynamic_batching::index_params& params,
+               const Upstream& upstream_index,
+               const typename Upstream::search_params_type& upstream_params,
+               upstream_search_type<Upstream, T, IdxT>* upstream_search,
+               const cuvs::neighbors::filtering::base_filter* sample_filter)
+    : batch_runner{
+        res,
+        params,
+        upstream_index,
+        upstream_params,
+        reinterpret_cast<upstream_search_type_const<Upstream, T, IdxT>*>(upstream_search),
+        sample_filter}
+  {
+  }
+
+  void search(raft::resources const& res,
+              cuvs::neighbors::dynamic_batching::search_params const& params,
+              raft::device_matrix_view<const T, int64_t, raft::row_major> queries,
+              raft::device_matrix_view<IdxT, int64_t, raft::row_major> neighbors,
+              raft::device_matrix_view<float, int64_t, raft::row_major> distances) const
+  {
+    uint32_t n_queries = queries.extent(0);
+    if (n_queries >= max_batch_size_) {
+      return upstream_search_(res, queries, neighbors, distances);
+    }
+
+    if (neighbors.extent(1) != int64_t(k_)) {
+      // TODO: the check can be relaxed to `neighbors.extent(1) > int64_t(k_)`;
+      //       this, however, would require an extra bounds check per-query in the scatter kernel.
+      RAFT_LOG_WARN(
+        "The requested number of neighbors (%zd) doesn't match the configured "
+        "dynamic_batching::index_params::k (%u); dynamic batching is disabled for the request.",
+        neighbors.extent(1),
+        k_);
+      return upstream_search_(res, queries, neighbors, distances);
+    }
+
+    auto deadline = std::chrono::system_clock::now() +
+                    std::chrono::nanoseconds(size_t(params.dispatch_timeout_ms * 1000000.0));
+
+    int64_t local_io_offset = 0;
+    batch_token batch_token_observed{0};
+    local_waiter to_commit{std::chrono::nanoseconds(size_t(params.dispatch_timeout_ms * 3e5)),
+                           local_waiter::kNonSleepIterations};
+    while (true) {
+      const auto seq_id        = batch_queue_.head();
+      const auto commit_result = try_commit(seq_id, n_queries);
+      // The bool (busy or not) returned if no queries were committed:
+      if (std::holds_alternative<bool>(commit_result)) {
+        // Pause if the system is busy
+        // (otherwise the progress is guaranteed due to update of the head counter)
+        if (std::get<bool>(commit_result)) { to_commit.wait(); }
+        continue;  // Try to get a new batch token
+      }
+      batch_token_observed           = std::get<batch_token>(std::get<0>(commit_result));
+      const auto queries_committed   = std::get<uint32_t>(std::get<0>(commit_result));
+      const auto batch_offset        = batch_token_observed.size_committed();
+      auto& batch_token_ref          = batch_queue_.token(seq_id);
+      auto& rem_time_us_ref          = batch_queue_.rem_time_us(seq_id);
+      auto& dispatch_sequence_id_ref = batch_queue_.dispatch_sequence_id(seq_id);
+      auto* batch_size_ptr           = batch_queue_.batch_size(seq_id);
+      // sleep for 1/10 of deadline time or more
+      //   (if couldn't get the value in the first few iterations).
+      local_waiter till_full{std::chrono::nanoseconds(size_t(params.dispatch_timeout_ms * 1e5)),
+                             batch_queue_.niceness(seq_id)};
+      while (batch_queue::batch_status(batch_token_observed, seq_id) != slot_state::kFull) {
+        /* Note: waiting for batch IO buffers
+        The CPU threads can commit to the incoming batches in the queue in advance (this happens in
+        try_commit).
+        In this loop, a thread waits for the batch IO buffer to be released by a running search on
+        the GPU side (scatter_outputs kernel). Hence, this loop is engaged only if all buffers are
+        currently used, which suggests that the GPU is busy (or there's not enough IO buffers).
+        This also means the current search is not likely to meet the deadline set by the user.
+
+        The scatter kernel returns its buffer id into an acquired slot in the batch queue; in this
+        loop we wait for that id to arrive.
+
+        Generally, we want to waste as little as possible CPU cycles here to let other threads wait
+        on dispatch_sequence_id_ref below more efficiently. At the same time, we shouldn't use
+        `.wait()` here, because `.notify_all()` would have to come from GPU.
+        */
+        till_full.wait();
+        batch_token_observed = batch_token_ref.load(cuda::std::memory_order_acquire);
+      }
+      // Whether this thread is responsible for dispatching the batch.
+      bool is_dispatcher = batch_offset == 0;
+      auto stream        = raft::resource::get_cuda_stream(res);
+      auto batch_id      = batch_queue::batch_id(batch_token_observed);
+      auto request_ptrs  = slice_2d(batch_id, request_ptrs_);
+
+      if (is_dispatcher) {
+        // Conservatively initialize the remaining time
+        // TODO (achirkin): this initialization may happen after the other requesters update the
+        //                  time and thus erase their deadlines.
+        rem_time_us_ref.store(static_cast<int32_t>(params.dispatch_timeout_ms * 1000),
+                              cuda::std::memory_order_relaxed);
+        // run the gather kernel before submitting the data to reduce the latency
+        gather_inputs<T, IdxT><<<max_batch_size_, 32, 0, stream>>>(
+          slice_3d(batch_id, queries_),
+          request_ptrs,
+          &rem_time_us_ref,
+          &batch_token_ref,
+          batch_size_ptr,
+          // This indicates the empty token slot, which can only be used in the following round
+          batch_queue::make_empty_token(seq_id),
+          kernel_progress_counters_.data_handle() + batch_id);
+      }
+
+      // *** Set the pointers to queries, neighbors, distances - query-by-query
+      for (uint32_t i = 0; i < queries_committed; i++) {
+        const auto o   = local_io_offset + i;
+        auto& ptrs     = request_ptrs(batch_offset + i);
+        ptrs.neighbors = neighbors.data_handle() + o * k_;
+        ptrs.distances = distances.data_handle() + o * k_;
+        ptrs.query.store(queries.data_handle() + o * dim_, cuda::std::memory_order_release);
+      }
+
+      // Submit estimated remaining time
+      {
+        auto rem_time_us = static_cast<int32_t>(
+          std::max<int64_t>(0, (deadline - std::chrono::system_clock::now()).count()) / 1000);
+        rem_time_us_ref.fetch_min(rem_time_us, cuda::std::memory_order_relaxed);
+      }
+
+      if (is_dispatcher) {
+        uint32_t batch_size = max_batch_size_;
+        if (batch_size_ptr != nullptr) {
+          // Block until the real batch size is available if conservative dispatch is used.
+          local_waiter for_dispatch{
+            std::chrono::nanoseconds(size_t(params.dispatch_timeout_ms * 1e5))};
+          batch_size = batch_size_ptr->load(cuda::std::memory_order_relaxed);
+          while (batch_size == 0) {
+            for_dispatch.wait();
+            batch_size = batch_size_ptr->load(cuda::std::memory_order_relaxed);
+          }
+          batch_size_ptr->store(0, cuda::std::memory_order_relaxed);
+        }
+        auto batch_neighbors = slice_3d(batch_id, neighbors_, batch_size);
+        auto batch_distances = slice_3d(batch_id, distances_, batch_size);
+        upstream_search_(
+          res, slice_3d(batch_id, queries_, batch_size), batch_neighbors, batch_distances);
+        auto next_seq_id     = batch_queue_.push();
+        auto& next_token_ref = batch_queue_.token(next_seq_id);
+        // next_batch_token);
+        auto bs = dim3(128, 8, 1);
+        scatter_outputs<T, IdxT>
+          <<<1, bs, 0, stream>>>(request_ptrs,
+                                 batch_neighbors,
+                                 batch_distances,
+                                 kernel_progress_counters_.data_handle() + batch_id,
+                                 &next_token_ref,
+                                 batch_queue::make_seq_batch_id(next_seq_id, batch_id));
+        RAFT_CUDA_TRY(cudaEventRecord(completion_events_[batch_id].value(), stream));
+        dispatch_sequence_id_ref.store(seq_id.value, cuda::std::memory_order_release);
+        dispatch_sequence_id_ref.notify_all();
+
+      } else {
+        // Wait till the dispatch_sequence_id counter is updated, which means the event is recorded
+        auto dispatched_id_observed =
+          dispatch_sequence_id_ref.load(cuda::std::memory_order_acquire);
+        while (static_cast<int32_t>(seq_id.value - dispatched_id_observed) > 0) {
+          dispatch_sequence_id_ref.wait(dispatched_id_observed, cuda::std::memory_order_relaxed);
+          dispatched_id_observed = dispatch_sequence_id_ref.load(cuda::std::memory_order_acquire);
+        }
+        // Now we can safely record the event
+        RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, completion_events_[batch_id].value()));
+      }
+
+      n_queries -= queries_committed;
+
+      if (n_queries == 0) { return; }
+      // If not all queries were committed, continue in the loop.
+      // TODO: it could potentially be more efficient to first commit everything and only then
+      //        submit the work/wait for the event
+      local_io_offset += queries_committed;
+      to_commit.reset(
+        local_waiter::kNonSleepIterations);  // reset the waiter for the next iteration.
+    }
+  }
+
+ private:
+  raft::resources res_;  // Sic! Store by value to copy the resource.
+  std::function<function_search_type<T, IdxT>> upstream_search_;
+  uint32_t k_;
+  uint32_t dim_;
+  uint32_t max_batch_size_;
+  uint32_t n_queues_;
+
+  mutable batch_queue batch_queue_;
+  std::vector<cuda_event> completion_events_;
+
+  using batch_extents = raft::extent_3d<uint32_t>;
+  batch_extents input_extents_;
+  batch_extents output_extents_;
+
+  mutable raft::device_mdarray<T, batch_extents, raft::row_major> queries_;
+  mutable raft::device_mdarray<IdxT, batch_extents, raft::row_major> neighbors_;
+  mutable raft::device_mdarray<float, batch_extents, raft::row_major> distances_;
+  mutable raft::device_vector<cuda::atomic<uint32_t, cuda::std::thread_scope_device>>
+    kernel_progress_counters_;
+
+  mutable raft::pinned_matrix<request_pointers<T, IdxT>, uint32_t, raft::row_major> request_ptrs_;
+
+  /**
+   * Try to commit n_queries at most; returns the last observed batch_token (where `size_committed`
+   * represents offset at which new queries are committed if successful), the number of committed
+   * queries, or whether the ring buffer appears to be busy (on unsuccessful commit).
+   */
+  auto try_commit(seq_order_id seq_id, uint32_t n_queries) const
+    -> std::variant<std::tuple<batch_token, uint32_t>, bool>
+  {
+    auto& batch_token_ref            = batch_queue_.token(seq_id);
+    batch_token batch_token_observed = batch_token_ref.load(cuda::std::memory_order_relaxed);
+    batch_token batch_token_updated;
+    slot_state token_status;
+    do {
+      // The interpretation of the token status depends on the current seq_order_id and a similar
+      // counter in the token. This is to prevent conflicts when too many parallel requests wrap
+      // over the whole ring buffer (batch_queue_t).
+      token_status = batch_queue::batch_status(batch_token_observed, seq_id);
+      // Busy status means the current thread is a whole ring buffer ahead of the token.
+      // The thread should wait for the rest of the system.
+      if (token_status == slot_state::kFullBusy || token_status == slot_state::kEmptyBusy) {
+        return true;
+      }
+      // This branch checks if the token was recently filled or dispatched.
+      // This means the head counter of the ring buffer is slightly outdated.
+      if (token_status == slot_state::kEmptyPast || token_status == slot_state::kFullPast ||
+          batch_token_observed.size_committed() >= max_batch_size_) {
+        batch_queue_.pop(seq_id);
+        return false;
+      }
+      batch_token_updated = batch_token_observed;
+      batch_token_updated.size_committed() =
+        std::min(batch_token_observed.size_committed() + n_queries, max_batch_size_);
+    } while (!batch_token_ref.compare_exchange_weak(batch_token_observed,
+                                                    batch_token_updated,
+                                                    cuda::std::memory_order_acq_rel,
+                                                    cuda::std::memory_order_relaxed));
+    if (batch_token_updated.size_committed() >= max_batch_size_) {
+      // The batch is already full, let's try to pop it from the queue
+      //                                 (if nobody has done so already)
+      batch_queue_.pop(seq_id);
+    }
+    return std::make_tuple(
+      batch_token_observed,
+      batch_token_updated.size_committed() - batch_token_observed.size_committed());
+  }
+};
+
+}  // namespace cuvs::neighbors::dynamic_batching::detail
diff --git a/cpp/src/neighbors/dynamic_batching.cu b/cpp/src/neighbors/dynamic_batching.cu
new file mode 100644
index 000000000..6be70353b
--- /dev/null
+++ b/cpp/src/neighbors/dynamic_batching.cu
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "detail/dynamic_batching.cuh"
+
+#include <cuvs/neighbors/cagra.hpp>
+#include <cuvs/neighbors/ivf_flat.hpp>
+#include <cuvs/neighbors/ivf_pq.hpp>
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/resources.hpp>
+
+namespace cuvs::neighbors::dynamic_batching {
+
+// NB: the (template) index parameter should be the last; it may contain the spaces and so split
+//       into multiple preprocessor token. Then it is consumed as __VA_ARGS__
+//
+#define CUVS_INST_DYNAMIC_BATCHING_INDEX(T, IdxT, Namespace, ...)                         \
+  template <>                                                                             \
+  template <>                                                                             \
+  index<T, IdxT>::index<Namespace ::__VA_ARGS__>(                                         \
+    const raft::resources& res,                                                           \
+    const cuvs::neighbors::dynamic_batching::index_params& params,                        \
+    const Namespace ::__VA_ARGS__& upstream_index,                                        \
+    const typename Namespace ::__VA_ARGS__::search_params_type& upstream_params,          \
+    const cuvs::neighbors::filtering::base_filter* sample_filter)                         \
+    : runner{new detail::batch_runner<T, IdxT>(                                           \
+        res, params, upstream_index, upstream_params, Namespace ::search, sample_filter)} \
+  {                                                                                       \
+  }
+
+#define CUVS_INST_DYNAMIC_BATCHING_SEARCH(T, IdxT)                                 \
+  void search(raft::resources const& res,                                          \
+              cuvs::neighbors::dynamic_batching::search_params const& params,      \
+              cuvs::neighbors::dynamic_batching::index<T, IdxT> const& index,      \
+              raft::device_matrix_view<const T, int64_t, raft::row_major> queries, \
+              raft::device_matrix_view<IdxT, int64_t, raft::row_major> neighbors,  \
+              raft::device_matrix_view<float, int64_t, raft::row_major> distances) \
+  {                                                                                \
+    return index.runner->search(res, params, queries, neighbors, distances);       \
+  }
+
+CUVS_INST_DYNAMIC_BATCHING_INDEX(float, uint32_t, cuvs::neighbors::cagra, index<float, uint32_t>);
+CUVS_INST_DYNAMIC_BATCHING_INDEX(half, uint32_t, cuvs::neighbors::cagra, index<half, uint32_t>);
+CUVS_INST_DYNAMIC_BATCHING_INDEX(int8_t, uint32_t, cuvs::neighbors::cagra, index<int8_t, uint32_t>);
+CUVS_INST_DYNAMIC_BATCHING_INDEX(uint8_t,
+                                 uint32_t,
+                                 cuvs::neighbors::cagra,
+                                 index<uint8_t, uint32_t>);
+
+CUVS_INST_DYNAMIC_BATCHING_INDEX(float, int64_t, cuvs::neighbors::ivf_pq, index<int64_t>);
+CUVS_INST_DYNAMIC_BATCHING_INDEX(half, int64_t, cuvs::neighbors::ivf_pq, index<int64_t>);
+CUVS_INST_DYNAMIC_BATCHING_INDEX(int8_t, int64_t, cuvs::neighbors::ivf_pq, index<int64_t>);
+CUVS_INST_DYNAMIC_BATCHING_INDEX(uint8_t, int64_t, cuvs::neighbors::ivf_pq, index<int64_t>);
+
+CUVS_INST_DYNAMIC_BATCHING_INDEX(float, int64_t, cuvs::neighbors::ivf_flat, index<float, int64_t>);
+CUVS_INST_DYNAMIC_BATCHING_INDEX(int8_t,
+                                 int64_t,
+                                 cuvs::neighbors::ivf_flat,
+                                 index<int8_t, int64_t>);
+CUVS_INST_DYNAMIC_BATCHING_INDEX(uint8_t,
+                                 int64_t,
+                                 cuvs::neighbors::ivf_flat,
+                                 index<uint8_t, int64_t>);
+
+CUVS_INST_DYNAMIC_BATCHING_SEARCH(float, int64_t);
+CUVS_INST_DYNAMIC_BATCHING_SEARCH(half, int64_t);
+CUVS_INST_DYNAMIC_BATCHING_SEARCH(int8_t, int64_t);
+CUVS_INST_DYNAMIC_BATCHING_SEARCH(uint8_t, int64_t);
+CUVS_INST_DYNAMIC_BATCHING_SEARCH(float, uint32_t);  // uint32_t index type is needed for CAGRA
+CUVS_INST_DYNAMIC_BATCHING_SEARCH(half, uint32_t);
+CUVS_INST_DYNAMIC_BATCHING_SEARCH(int8_t, uint32_t);
+CUVS_INST_DYNAMIC_BATCHING_SEARCH(uint8_t, uint32_t);
+
+#undef CUVS_INST_DYNAMIC_BATCHING_INDEX
+#undef CUVS_INST_DYNAMIC_BATCHING_SEARCH
+
+}  // namespace cuvs::neighbors::dynamic_batching
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 286d721d7..1c8de2ad0 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -175,6 +175,19 @@ if(BUILD_TESTS)
     100
   )
 
+  ConfigureTest(
+    NAME
+    NEIGHBORS_DYNAMIC_BATCHING_TEST
+    PATH
+    neighbors/dynamic_batching/test_cagra.cu
+    neighbors/dynamic_batching/test_ivf_flat.cu
+    neighbors/dynamic_batching/test_ivf_pq.cu
+    GPUS
+    1
+    PERCENT
+    100
+  )
+
   if(BUILD_CAGRA_HNSWLIB)
     ConfigureTest(NAME NEIGHBORS_HNSW_TEST PATH neighbors/hnsw.cu GPUS 1 PERCENT 100)
     target_link_libraries(NEIGHBORS_HNSW_TEST PRIVATE hnswlib::hnswlib)
diff --git a/cpp/test/neighbors/dynamic_batching.cuh b/cpp/test/neighbors/dynamic_batching.cuh
new file mode 100644
index 000000000..b64c5b01e
--- /dev/null
+++ b/cpp/test/neighbors/dynamic_batching.cuh
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "ann_utils.cuh"
+
+#include <gtest/gtest.h>
+
+#include <cuvs/neighbors/dynamic_batching.hpp>
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/random/rng.cuh>
+
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <cstdint>
+#include <future>
+#include <vector>
+
+namespace cuvs::neighbors::dynamic_batching {
+
+struct dynamic_batching_spec {
+  int64_t n_queries                   = 1000;
+  int64_t n_rows                      = 100000;
+  int64_t dim                         = 128;
+  int64_t k                           = 10;
+  int64_t max_batch_size              = 64;
+  size_t n_queues                     = 3;
+  bool conservative_dispatch          = false;
+  cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded;
+  int64_t max_concurrent_threads      = 128;
+};
+
+inline ::std::ostream& operator<<(::std::ostream& os, const dynamic_batching_spec& p)
+{
+  os << "{n_queries=" << p.n_queries;
+  os << ", dataset shape=" << p.n_rows << "x" << p.dim;
+  os << ", metric=" << print_metric{p.metric};
+  os << ", k=" << p.k;
+  os << ", max_batch_size=" << p.max_batch_size;
+  os << ", n_queues=" << p.n_queues;
+  os << ", conservative_dispatch=" << p.conservative_dispatch;
+  os << '}' << std::endl;
+  return os;
+}
+
+template <typename DataT, typename IdxT, typename UpstreamT>
+using build_function = UpstreamT(const raft::resources&,
+                                 const typename UpstreamT::index_params_type&,
+                                 raft::device_matrix_view<const DataT, int64_t, raft::row_major>);
+
+template <typename DataT, typename IdxT, typename UpstreamT>
+using search_function = void(const raft::resources&,
+                             const typename UpstreamT::search_params_type& params,
+                             const UpstreamT& index,
+                             raft::device_matrix_view<const DataT, int64_t, raft::row_major>,
+                             raft::device_matrix_view<IdxT, int64_t, raft::row_major>,
+                             raft::device_matrix_view<float, int64_t, raft::row_major>,
+                             const cuvs::neighbors::filtering::base_filter&);
+
+template <typename DataT,
+          typename IdxT,
+          typename UpstreamT,
+          build_function<DataT, IdxT, UpstreamT> UpstreamBuildF,
+          search_function<DataT, IdxT, UpstreamT> UpstreamSearchF>
+struct dynamic_batching_test : public ::testing::TestWithParam<dynamic_batching_spec> {
+  using distance_type = float;
+  using data_type     = DataT;
+  using index_type    = IdxT;
+  using upstream_type = UpstreamT;
+
+  dynamic_batching_spec ps = ::testing::TestWithParam<dynamic_batching_spec>::GetParam();
+  raft::resources res;
+
+  // input data
+  std::optional<raft::device_matrix<data_type, int64_t>> dataset            = std::nullopt;
+  std::optional<raft::device_matrix<data_type, int64_t>> queries            = std::nullopt;
+  std::optional<raft::device_matrix<index_type, int64_t>> neighbors_upsm    = std::nullopt;
+  std::optional<raft::device_matrix<index_type, int64_t>> neighbors_dynb    = std::nullopt;
+  std::optional<raft::device_matrix<distance_type, int64_t>> distances_upsm = std::nullopt;
+  std::optional<raft::device_matrix<distance_type, int64_t>> distances_dynb = std::nullopt;
+
+  // build parameters
+  cuvs::neighbors::index_params build_params_base{ps.metric};
+  typename upstream_type::index_params_type build_params_upsm{build_params_base};
+  dynamic_batching::index_params build_params_dynb{
+    build_params_base, ps.k, ps.max_batch_size, ps.n_queues, ps.conservative_dispatch};
+
+  // search parameters
+  typename upstream_type::search_params_type search_params_upsm{};
+  dynamic_batching::search_params search_params_dynb{};
+
+  // indexes
+  std::optional<upstream_type> index_upsm                                  = std::nullopt;
+  std::optional<dynamic_batching::index<data_type, index_type>> index_dynb = std::nullopt;
+
+  void build_all()
+  {
+    index_dynb.reset();
+    index_upsm.reset();
+    index_upsm = UpstreamBuildF(res, build_params_upsm, dataset->view());
+    index_dynb.emplace(res, build_params_dynb, index_upsm.value(), search_params_upsm);
+  }
+
+  void search_all()
+  {
+    // Search using upstream index - all queries at once
+    UpstreamSearchF(res,
+                    search_params_upsm,
+                    index_upsm.value(),
+                    queries->view(),
+                    neighbors_upsm->view(),
+                    distances_upsm->view(),
+                    filtering::none_sample_filter{});
+    raft::resource::sync_stream(res);
+
+    // Search with dynamic batching
+    // Streaming scenario: prepare concurrent resources
+    rmm::cuda_stream_pool worker_streams(ps.max_concurrent_threads);
+    std::vector<std::future<void>> futures(ps.max_concurrent_threads);
+    std::vector<raft::resources> resource_pool(0);
+    for (int64_t i = 0; i < ps.max_concurrent_threads; i++) {
+      resource_pool.push_back(res);  // copies the resource
+      raft::resource::set_cuda_stream(resource_pool[i], worker_streams.get_stream(i));
+    }
+
+    // Try multiple batch sizes in a round-robin to improve test coverage
+    std::vector<int64_t> minibatch_sizes{1, 3, 7, 10};
+    auto get_bs = [&minibatch_sizes](auto i) {
+      return minibatch_sizes[i % minibatch_sizes.size()];
+    };
+    int64_t i = 0;
+    for (int64_t offset = 0; offset < ps.n_queries; offset += get_bs(i++)) {
+      auto bs = std::min<int64_t>(get_bs(i), ps.n_queries - offset);
+      auto j  = i % ps.max_concurrent_threads;
+      // wait for previous job in the same slot to finish
+      if (i >= ps.max_concurrent_threads) { futures[j].wait(); }
+      // submit a new job
+      futures[j] = std::async(
+        std::launch::async,
+        [&res       = resource_pool[j],
+         &params    = search_params_dynb,
+         index      = index_dynb.value(),
+         query_view = raft::make_device_matrix_view<data_type, int64_t>(
+           queries->data_handle() + offset * ps.dim, bs, ps.dim),
+         neighbors_view = raft::make_device_matrix_view<index_type, int64_t>(
+           neighbors_dynb->data_handle() + offset * ps.k, bs, ps.k),
+         distances_view = raft::make_device_matrix_view<distance_type, int64_t>(
+           distances_dynb->data_handle() + offset * ps.k, bs, ps.k)]() {
+          dynamic_batching::search(res, params, index, query_view, neighbors_view, distances_view);
+        });
+    }
+
+    // finalize all resources
+    for (int64_t j = 0; j < ps.max_concurrent_threads && j < i; j++) {
+      futures[j].wait();
+      raft::resource::sync_stream(resource_pool[j]);
+    }
+    raft::resource::sync_stream(res);
+  }
+
+  /*
+    Check the dynamic batching generated neighbors against the upstream index. They both may be
+    imperfect w.r.t. the ground truth, but they shouldn't differ too much.
+   */
+  void check_neighbors()
+  {
+    auto stream         = raft::resource::get_cuda_stream(res);
+    size_t queries_size = ps.n_queries * ps.k;
+    std::vector<index_type> neighbors_upsm_host(queries_size);
+    std::vector<index_type> neighbors_dynb_host(queries_size);
+    std::vector<distance_type> distances_upsm_host(queries_size);
+    std::vector<distance_type> distances_dynb_host(queries_size);
+    raft::copy(neighbors_upsm_host.data(), neighbors_upsm->data_handle(), queries_size, stream);
+    raft::copy(neighbors_dynb_host.data(), neighbors_dynb->data_handle(), queries_size, stream);
+    raft::copy(distances_upsm_host.data(), distances_upsm->data_handle(), queries_size, stream);
+    raft::copy(distances_dynb_host.data(), distances_dynb->data_handle(), queries_size, stream);
+    raft::resource::sync_stream(res);
+    ASSERT_TRUE(eval_neighbours(neighbors_upsm_host,
+                                neighbors_dynb_host,
+                                distances_upsm_host,
+                                distances_dynb_host,
+                                ps.n_queries,
+                                ps.k,
+                                0.001,
+                                0.9))
+      << ps;
+  }
+
+  void SetUp() override
+  {
+    dataset.emplace(raft::make_device_matrix<data_type, int64_t>(res, ps.n_rows, ps.dim));
+    queries.emplace(raft::make_device_matrix<data_type, int64_t>(res, ps.n_queries, ps.dim));
+    neighbors_upsm.emplace(raft::make_device_matrix<index_type, int64_t>(res, ps.n_queries, ps.k));
+    neighbors_dynb.emplace(raft::make_device_matrix<index_type, int64_t>(res, ps.n_queries, ps.k));
+    distances_upsm.emplace(
+      raft::make_device_matrix<distance_type, int64_t>(res, ps.n_queries, ps.k));
+    distances_dynb.emplace(
+      raft::make_device_matrix<distance_type, int64_t>(res, ps.n_queries, ps.k));
+
+    raft::random::RngState rng(666ULL);
+    if constexpr (std::is_same_v<data_type, float> || std::is_same_v<data_type, half>) {
+      raft::random::uniform(
+        res, rng, dataset->data_handle(), dataset->size(), data_type(0.1), data_type(2.0));
+      raft::random::uniform(
+        res, rng, queries->data_handle(), queries->size(), data_type(0.1), data_type(2.0));
+    } else {
+      raft::random::uniformInt(
+        res, rng, dataset->data_handle(), dataset->size(), data_type(1), data_type(20));
+      raft::random::uniformInt(
+        res, rng, queries->data_handle(), queries->size(), data_type(1), data_type(20));
+    }
+    raft::resource::sync_stream(res);
+  }
+
+  void TearDown() override
+  {
+    index_dynb.reset();
+    index_upsm.reset();
+    dataset.reset();
+    queries.reset();
+    neighbors_upsm.reset();
+    neighbors_dynb.reset();
+    distances_upsm.reset();
+    distances_dynb.reset();
+    raft::resource::sync_stream(res);
+  }
+};
+
+inline std::vector<dynamic_batching_spec> generate_inputs()
+{
+  std::vector<dynamic_batching_spec> inputs{dynamic_batching_spec{}};
+
+  for (auto alt_n_queries : {10, 50, 100}) {
+    dynamic_batching_spec input{};
+    input.n_queries = alt_n_queries;
+    inputs.push_back(input);
+  }
+
+  for (auto alt_k : {100, 200}) {
+    dynamic_batching_spec input{};
+    input.k = alt_k;
+    inputs.push_back(input);
+  }
+
+  for (auto alt_max_batch_size : {4, 16, 128, 256, 512, 1024}) {
+    dynamic_batching_spec input{};
+    input.max_batch_size = alt_max_batch_size;
+    inputs.push_back(input);
+  }
+
+  for (auto alt_n_queues : {1, 2, 16, 32}) {
+    dynamic_batching_spec input{};
+    input.n_queues = alt_n_queues;
+    inputs.push_back(input);
+  }
+
+  for (auto alt_max_concurrent_threads : {1, 2, 16, 32}) {
+    dynamic_batching_spec input{};
+    input.max_concurrent_threads = alt_max_concurrent_threads;
+    inputs.push_back(input);
+  }
+
+  {
+    auto n = inputs.size();
+    for (size_t i = 0; i < n; i++) {
+      auto input                  = inputs[i];
+      input.conservative_dispatch = !input.conservative_dispatch;
+      inputs.push_back(input);
+    }
+  }
+
+  return inputs;
+}
+
+const std::vector<dynamic_batching_spec> inputs = generate_inputs();
+
+}  // namespace cuvs::neighbors::dynamic_batching
diff --git a/cpp/test/neighbors/dynamic_batching/test_cagra.cu b/cpp/test/neighbors/dynamic_batching/test_cagra.cu
new file mode 100644
index 000000000..604fc29cf
--- /dev/null
+++ b/cpp/test/neighbors/dynamic_batching/test_cagra.cu
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "../dynamic_batching.cuh"
+
+#include <cuvs/neighbors/cagra.hpp>
+
+namespace cuvs::neighbors::dynamic_batching {
+
+using cagra_F32 = dynamic_batching_test<float,
+                                        uint32_t,
+                                        cagra::index<float, uint32_t>,
+                                        cagra::build,
+                                        cagra::search>;
+
+using cagra_U8 = dynamic_batching_test<uint8_t,
+                                       uint32_t,
+                                       cagra::index<uint8_t, uint32_t>,
+                                       cagra::build,
+                                       cagra::search>;
+
+template <typename fixture>
+static void set_default_cagra_params(fixture& that)
+{
+  that.build_params_upsm.intermediate_graph_degree = 128;
+  that.build_params_upsm.graph_degree              = 64;
+  that.search_params_upsm.itopk_size =
+    std::clamp<int64_t>(raft::bound_by_power_of_two(that.ps.k) * 16, 128, 512);
+}
+
+TEST_P(cagra_F32, single_cta)
+{
+  set_default_cagra_params(*this);
+  search_params_upsm.algo = cagra::search_algo::SINGLE_CTA;
+  build_all();
+  search_all();
+  check_neighbors();
+}
+
+TEST_P(cagra_F32, multi_cta)
+{
+  set_default_cagra_params(*this);
+  search_params_upsm.algo = cagra::search_algo::MULTI_CTA;
+  build_all();
+  search_all();
+  check_neighbors();
+}
+
+TEST_P(cagra_F32, multi_kernel)
+{
+  set_default_cagra_params(*this);
+  search_params_upsm.algo = cagra::search_algo::MULTI_KERNEL;
+  build_all();
+  search_all();
+  check_neighbors();
+}
+
+TEST_P(cagra_U8, defaults)
+{
+  set_default_cagra_params(*this);
+  build_all();
+  search_all();
+  check_neighbors();
+}
+
+INSTANTIATE_TEST_CASE_P(dynamic_batching, cagra_F32, ::testing::ValuesIn(inputs));
+INSTANTIATE_TEST_CASE_P(dynamic_batching, cagra_U8, ::testing::ValuesIn(inputs));
+
+}  // namespace cuvs::neighbors::dynamic_batching
diff --git a/cpp/test/neighbors/dynamic_batching/test_ivf_flat.cu b/cpp/test/neighbors/dynamic_batching/test_ivf_flat.cu
new file mode 100644
index 000000000..4922cffa3
--- /dev/null
+++ b/cpp/test/neighbors/dynamic_batching/test_ivf_flat.cu
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "../dynamic_batching.cuh"
+
+#include <cuvs/neighbors/ivf_flat.hpp>
+
+namespace cuvs::neighbors::dynamic_batching {
+
+using ivf_flat_i8 = dynamic_batching_test<uint8_t,
+                                          int64_t,
+                                          ivf_flat::index<uint8_t, int64_t>,
+                                          ivf_flat::build,
+                                          ivf_flat::search>;
+
+TEST_P(ivf_flat_i8, defaults)
+{
+  build_params_upsm.n_lists = std::round(std::sqrt(ps.n_rows));
+  search_params_upsm.n_probes =
+    std::max<uint32_t>(std::min<uint32_t>(build_params_upsm.n_lists, 10),
+                       raft::div_rounding_up_safe<uint32_t>(build_params_upsm.n_lists, 50));
+  build_all();
+  search_all();
+  check_neighbors();
+}
+
+INSTANTIATE_TEST_CASE_P(dynamic_batching, ivf_flat_i8, ::testing::ValuesIn(inputs));
+
+}  // namespace cuvs::neighbors::dynamic_batching
diff --git a/cpp/test/neighbors/dynamic_batching/test_ivf_pq.cu b/cpp/test/neighbors/dynamic_batching/test_ivf_pq.cu
new file mode 100644
index 000000000..ec57e0b57
--- /dev/null
+++ b/cpp/test/neighbors/dynamic_batching/test_ivf_pq.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "../dynamic_batching.cuh"
+
+#include <cuvs/neighbors/ivf_pq.hpp>
+
+namespace cuvs::neighbors::dynamic_batching {
+
+using ivf_pq_f16 =
+  dynamic_batching_test<half, int64_t, ivf_pq::index<int64_t>, ivf_pq::build, ivf_pq::search>;
+
+TEST_P(ivf_pq_f16, defaults)
+{
+  build_params_upsm.n_lists = std::round(std::sqrt(ps.n_rows));
+  search_params_upsm.n_probes =
+    std::max<uint32_t>(std::min<uint32_t>(build_params_upsm.n_lists, 10),
+                       raft::div_rounding_up_safe<uint32_t>(build_params_upsm.n_lists, 50));
+  build_all();
+  search_all();
+  check_neighbors();
+}
+
+INSTANTIATE_TEST_CASE_P(dynamic_batching, ivf_pq_f16, ::testing::ValuesIn(inputs));
+
+}  // namespace cuvs::neighbors::dynamic_batching
diff --git a/docs/source/cpp_api/neighbors.rst b/docs/source/cpp_api/neighbors.rst
index d55d58eb0..ab810ab53 100644
--- a/docs/source/cpp_api/neighbors.rst
+++ b/docs/source/cpp_api/neighbors.rst
@@ -11,6 +11,7 @@ Nearest Neighbors
 
    neighbors_bruteforce.rst
    neighbors_cagra.rst
+   neighbors_dynamic_batching.rst
    neighbors_hnsw.rst
    neighbors_ivf_flat.rst
    neighbors_ivf_pq.rst
diff --git a/docs/source/cpp_api/neighbors_dynamic_batching.rst b/docs/source/cpp_api/neighbors_dynamic_batching.rst
new file mode 100644
index 000000000..adc5cb56a
--- /dev/null
+++ b/docs/source/cpp_api/neighbors_dynamic_batching.rst
@@ -0,0 +1,45 @@
+Dynamic Batching
+================
+
+Dynamic Batching allows grouping small search requests into batches to increase the device occupancy and throughput while keeping the latency within limits.
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+``#include <cuvs/neighbors/dynamic_batching.hpp>``
+
+namespace *cuvs::neighbors::dynamic_batching*
+
+Index build parameters
+----------------------
+
+.. doxygengroup:: dynamic_batching_cpp_index_params
+    :project: cuvs
+    :members:
+    :content-only:
+
+Index search parameters
+-----------------------
+
+.. doxygengroup:: dynamic_batching_cpp_search_params
+    :project: cuvs
+    :members:
+    :content-only:
+
+Index
+-----
+
+.. doxygengroup:: dynamic_batching_cpp_index
+    :project: cuvs
+    :members:
+    :content-only:
+
+
+Index search
+------------
+
+.. doxygengroup:: dynamic_batching_cpp_search
+    :project: cuvs
+    :members:
+    :content-only:
diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt
index 092b65ed9..951e0ad0c 100644
--- a/examples/cpp/CMakeLists.txt
+++ b/examples/cpp/CMakeLists.txt
@@ -38,6 +38,7 @@ include(../cmake/thirdparty/get_cuvs.cmake)
 # -------------- compile tasks ----------------- #
 add_executable(CAGRA_EXAMPLE src/cagra_example.cu)
 add_executable(CAGRA_PERSISTENT_EXAMPLE src/cagra_persistent_example.cu)
+add_executable(DYNAMIC_BATCHING_EXAMPLE src/dynamic_batching_example.cu)
 add_executable(IVF_FLAT_EXAMPLE src/ivf_flat_example.cu)
 add_executable(IVF_PQ_EXAMPLE src/ivf_pq_example.cu)
 add_executable(VAMANA_EXAMPLE src/vamana_example.cu)
@@ -48,6 +49,9 @@ target_link_libraries(CAGRA_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:c
 target_link_libraries(
   CAGRA_PERSISTENT_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> Threads::Threads
 )
+target_link_libraries(
+  DYNAMIC_BATCHING_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> Threads::Threads
+)
 target_link_libraries(IVF_PQ_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env>)
 target_link_libraries(IVF_FLAT_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env>)
 target_link_libraries(VAMANA_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env>)
diff --git a/examples/cpp/src/dynamic_batching_example.cu b/examples/cpp/src/dynamic_batching_example.cu
new file mode 100644
index 000000000..95f66a454
--- /dev/null
+++ b/examples/cpp/src/dynamic_batching_example.cu
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common.cuh"
+
+#include <cuvs/neighbors/cagra.hpp>
+#include <cuvs/neighbors/dynamic_batching.hpp>
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/cuda_stream_pool.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/random/make_blobs.cuh>
+#include <rmm/cuda_stream_pool.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <array>
+#include <chrono>
+#include <cstdint>
+#include <future>
+
+// A helper to split the dataset into chunks
+template <typename DeviceMatrixOrView>
+auto slice_matrix(const DeviceMatrixOrView &source,
+                  typename DeviceMatrixOrView::index_type offset_rows,
+                  typename DeviceMatrixOrView::index_type count_rows) {
+  auto n_cols = source.extent(1);
+  return raft::make_device_matrix_view<
+      typename DeviceMatrixOrView::element_type,
+      typename DeviceMatrixOrView::index_type>(
+      const_cast<typename DeviceMatrixOrView::element_type *>(
+          source.data_handle()) +
+          offset_rows * n_cols,
+      count_rows, n_cols);
+}
+
+// A helper to measure the execution time of a function
+template <typename F, typename... Args>
+void time_it(std::string label, F f, Args &&...xs) {
+  auto start = std::chrono::system_clock::now();
+  f(std::forward<Args>(xs)...);
+  auto end = std::chrono::system_clock::now();
+  auto t = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+  auto t_ms = double(t.count()) / 1000.0;
+  std::cout << "[" << label << "] execution time: " << t_ms << " ms"
+            << std::endl;
+}
+
+/**
+ * Wrap waiting on a stream work into an async C++ future object.
+ * This is similar to recording and waiting on CUDA events, but in C++11 API.
+ */
+struct cuda_work_completion_promise {
+
+  cuda_work_completion_promise(const raft::resources &res) {
+    auto *promise = new std::promise<void>;
+    RAFT_CUDA_TRY(cudaLaunchHostFunc(raft::resource::get_cuda_stream(res),
+                                     completion_callback,
+                                     reinterpret_cast<void *>(promise)));
+    value_ = promise->get_future();
+  }
+
+  /**
+   * Waiting on the produced `future` object has the same effect as
+   * cudaEventSynchronize if an event was recorded at the time of creation of
+   * this promise object.
+   */
+  auto get_future() -> std::future<void> && { return std::move(value_); }
+
+private:
+  std::future<void> value_;
+
+  static void completion_callback(void *ptr) {
+    auto *promise = reinterpret_cast<std::promise<void> *>(ptr);
+    promise->set_value();
+    delete promise;
+  }
+};
+
+void dynamic_batching_example(
+    raft::resources const &res,
+    raft::device_matrix_view<const float, int64_t> dataset,
+    raft::device_matrix_view<const float, int64_t> queries) {
+  using namespace cuvs::neighbors;
+
+  // Number of neighbors to search
+  int64_t topk = 100;
+
+  // Streaming scenario: maximum number of requests in-flight
+  constexpr int64_t kMaxJobs = 1000;
+  // Streaming scenario: number of concurrent CUDA streams
+  constexpr int64_t kNumWorkerStreams = 5;
+
+  // Split the queries into two subsets to run every experiment twice and thus
+  // surface any initialization overheads.
+  int64_t n_queries_a = queries.extent(0) / 2;
+  int64_t n_queries_b = queries.extent(0) - n_queries_a;
+
+  auto queries_a = slice_matrix(queries, 0, n_queries_a);
+  auto queries_b = slice_matrix(queries, n_queries_a, n_queries_b);
+
+  // create output arrays
+  auto neighbors =
+      raft::make_device_matrix<uint32_t>(res, queries.extent(0), topk);
+  auto distances =
+      raft::make_device_matrix<float>(res, queries.extent(0), topk);
+  // slice them same as queries
+  auto neighbors_a = slice_matrix(neighbors, 0, n_queries_a);
+  auto distances_a = slice_matrix(distances, 0, n_queries_a);
+  auto neighbors_b = slice_matrix(neighbors, n_queries_a, n_queries_b);
+  auto distances_b = slice_matrix(distances, n_queries_a, n_queries_b);
+
+  // use default index parameters
+  cagra::index_params orig_index_params;
+
+  std::cout << "Building CAGRA index (search graph)" << std::endl;
+  auto orig_index = cagra::build(res, orig_index_params, dataset);
+
+  std::cout << "CAGRA index has " << orig_index.size() << " vectors"
+            << std::endl;
+  std::cout << "CAGRA graph has degree " << orig_index.graph_degree()
+            << ", graph size [" << orig_index.graph().extent(0) << ", "
+            << orig_index.graph().extent(1) << "]" << std::endl;
+
+  // use default search parameters
+  cagra::search_params orig_search_params;
+  // get a decent recall by increasing the internal topk list
+  orig_search_params.itopk_size = 512;
+  orig_search_params.algo = cagra::search_algo::SINGLE_CTA;
+
+  // Set up dynamic batching parameters
+  dynamic_batching::index_params dynb_index_params{
+      /* default-initializing the parent `neighbors::index_params`
+         (not used anyway) */
+      {},
+      /* Set the K in advance (the batcher needs to allocate buffers) */
+      topk,
+      /* Configure the number and the size of IO buffers */
+      64,
+      kNumWorkerStreams};
+
+  // "build" the index (it's a low-cost index wrapping),
+  //  that is we need to pass the original index and its search params here
+  dynamic_batching::index<float, uint32_t> dynb_index(
+      res, dynb_index_params, orig_index, orig_search_params);
+
+  // You can implement job priorities by varying the deadlines of individual
+  // requests
+  dynamic_batching::search_params dynb_search_params;
+  dynb_search_params.dispatch_timeout_ms = 0.1;
+
+  // Define the big-batch setting as a baseline for measuring the throughput.
+  auto search_batch_orig =
+      [&res, &orig_index, &orig_search_params](
+          raft::device_matrix_view<const float, int64_t> queries,
+          raft::device_matrix_view<uint32_t, int64_t> neighbors,
+          raft::device_matrix_view<float, int64_t> distances) {
+        cagra::search(res, orig_search_params, orig_index, queries, neighbors,
+                      distances);
+        raft::resource::sync_stream(res);
+      };
+
+  // Launch the baseline search: check the big-batch performance
+  time_it("standard/batch A", search_batch_orig, queries_a, neighbors_a,
+          distances_a);
+  time_it("standard/batch B", search_batch_orig, queries_b, neighbors_b,
+          distances_b);
+
+  // Streaming scenario: prepare concurrent resources
+  rmm::cuda_stream_pool worker_streams{kNumWorkerStreams};
+  std::vector<raft::resources> resource_pool(0);
+  for (int64_t i = 0; i < kNumWorkerStreams; i++) {
+    resource_pool.push_back(res);
+    raft::resource::set_cuda_stream(resource_pool[i],
+                                    worker_streams.get_stream(i));
+  }
+
+  // Streaming scenario:
+  // send queries one-by-one, with a maximum kMaxJobs in-flight
+  auto search_async_orig =
+      [&resource_pool, &orig_index, &orig_search_params](
+          raft::device_matrix_view<const float, int64_t> queries,
+          raft::device_matrix_view<uint32_t, int64_t> neighbors,
+          raft::device_matrix_view<float, int64_t> distances) {
+        auto work_size = queries.extent(0);
+        std::array<std::future<void>, kMaxJobs> futures;
+        for (int64_t i = 0; i < work_size + kMaxJobs; i++) {
+          // wait for previous job in the same slot to finish
+          if (i >= kMaxJobs) {
+            futures[i % kMaxJobs].wait();
+          }
+          // submit a new job
+          if (i < work_size) {
+            auto &res = resource_pool[i % kNumWorkerStreams];
+            cagra::search(res, orig_search_params, orig_index,
+                          slice_matrix(queries, i, 1),
+                          slice_matrix(neighbors, i, 1),
+                          slice_matrix(distances, i, 1));
+            futures[i % kMaxJobs] =
+                cuda_work_completion_promise(res).get_future();
+          }
+        }
+      };
+
+  // Streaming scenario with dynamic batching:
+  // send queries one-by-one, with a maximum kMaxJobs in-flight,
+  // yet allow grouping the sequential requests (subject to deadlines)
+  auto search_async_dynb =
+      [&resource_pool, &dynb_index, &dynb_search_params](
+          raft::device_matrix_view<const float, int64_t> queries,
+          raft::device_matrix_view<uint32_t, int64_t> neighbors,
+          raft::device_matrix_view<float, int64_t> distances) {
+        auto work_size = queries.extent(0);
+        std::array<std::future<void>, kMaxJobs> futures;
+        for (int64_t i = 0; i < work_size + kMaxJobs; i++) {
+          // wait for previous job in the same slot to finish
+          if (i >= kMaxJobs) {
+            futures[i % kMaxJobs].wait();
+          }
+          // submit a new job
+          if (i < work_size) {
+            auto &res = resource_pool[i % kNumWorkerStreams];
+            dynamic_batching::search(res, dynb_search_params, dynb_index,
+                                     slice_matrix(queries, i, 1),
+                                     slice_matrix(neighbors, i, 1),
+                                     slice_matrix(distances, i, 1));
+            futures[i % kMaxJobs] =
+                cuda_work_completion_promise(res).get_future();
+          }
+        }
+      };
+
+  // Try to handle the same amount of work in the async setting using the
+  // standard implementation.
+  time_it("standard/async A", search_async_orig, queries_a, neighbors_a,
+          distances_a);
+  time_it("standard/async B", search_async_orig, queries_b, neighbors_b,
+          distances_b);
+
+  // Do the same using dynamic batching
+  time_it("dynamic_batching/async A", search_async_dynb, queries_a, neighbors_a,
+          distances_a);
+  time_it("dynamic_batching/async B", search_async_dynb, queries_b, neighbors_b,
+          distances_b);
+}
+
+int main() {
+  raft::device_resources res;
+
+  // Set the raft resource to use a pool for internal memory allocations
+  // (workspace) and limit the available workspace size.
+  raft::resource::set_workspace_to_pool_resource(res,
+                                                 12ull * 1024 * 1024 * 1024ull);
+
+  // Create input arrays.
+  int64_t n_samples = 1000000;
+  int64_t n_dim = 128;
+  int64_t n_queries = 10000;
+  auto dataset =
+      raft::make_device_matrix<float, int64_t>(res, n_samples, n_dim);
+  auto queries =
+      raft::make_device_matrix<float, int64_t>(res, n_queries, n_dim);
+  generate_dataset(res, dataset.view(), queries.view());
+
+  // run the interesting part of the program
+  dynamic_batching_example(res, raft::make_const_mdspan(dataset.view()),
+                           raft::make_const_mdspan(queries.view()));
+}

From b051f805129fab36ee5da7299ed0fb98850fa44c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Malte=20F=C3=B6rster?=
 <97973773+mfoerste4@users.noreply.github.com>
Date: Thu, 5 Dec 2024 06:27:33 +0100
Subject: [PATCH 42/47] Add C++ API scalar quantization (#494)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

First draft for scalar quantization.

WIP status:
* only int8_t target type
* quantile computation inefficient (via sampling & sorting)

Authors:
  - Malte Förster (https://github.com/mfoerste4)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/494
---
 cpp/CMakeLists.txt                            |   1 +
 .../cuvs/preprocessing/quantize/scalar.hpp    | 489 ++++++++++++++++++
 .../preprocessing/quantize/detail/scalar.cuh  | 227 ++++++++
 cpp/src/preprocessing/quantize/scalar.cu      |  74 +++
 cpp/test/CMakeLists.txt                       |   5 +
 cpp/test/preprocessing/scalar_quantization.cu | 291 +++++++++++
 docs/source/cpp_api.rst                       |   1 +
 docs/source/cpp_api/preprocessing.rst         |  12 +
 .../source/cpp_api/preprocessing_quantize.rst |  20 +
 9 files changed, 1120 insertions(+)
 create mode 100644 cpp/include/cuvs/preprocessing/quantize/scalar.hpp
 create mode 100644 cpp/src/preprocessing/quantize/detail/scalar.cuh
 create mode 100644 cpp/src/preprocessing/quantize/scalar.cu
 create mode 100644 cpp/test/preprocessing/scalar_quantization.cu
 create mode 100644 docs/source/cpp_api/preprocessing.rst
 create mode 100644 docs/source/cpp_api/preprocessing_quantize.rst

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 6af423bd5..199bb232d 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -459,6 +459,7 @@ if(BUILD_SHARED_LIBS)
     src/neighbors/vamana_serialize_float.cu
     src/neighbors/vamana_serialize_uint8.cu
     src/neighbors/vamana_serialize_int8.cu
+    src/preprocessing/quantize/scalar.cu
     src/selection/select_k_float_int64_t.cu
     src/selection/select_k_float_int32_t.cu
     src/selection/select_k_float_uint32_t.cu
diff --git a/cpp/include/cuvs/preprocessing/quantize/scalar.hpp b/cpp/include/cuvs/preprocessing/quantize/scalar.hpp
new file mode 100644
index 000000000..49b4bb7a6
--- /dev/null
+++ b/cpp/include/cuvs/preprocessing/quantize/scalar.hpp
@@ -0,0 +1,489 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/handle.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/host_mdspan.hpp>
+
+#include <cuda_fp16.h>
+
+namespace cuvs::preprocessing::quantize::scalar {
+
+/**
+ * @defgroup scalar Scalar quantizer utilities
+ * @{
+ */
+
+/**
+ * @brief quantizer parameters.
+ */
+struct params {
+  /*
+   * specifies how many outliers at top & bottom will be ignored
+   * needs to be within range of (0, 1]
+   */
+  float quantile = 0.99;
+};
+
+/**
+ * @brief Defines and stores scalar for quantisation upon training
+ *
+ * The quantization is performed by a linear mapping of an interval in the
+ * float data type to the full range of the quantized int type.
+ *
+ * @tparam T data element type
+ *
+ */
+template <typename T>
+struct quantizer {
+  T min_;
+  T max_;
+};
+
+/**
+ * @brief Initializes a scalar quantizer to be used later for quantizing the dataset.
+ *
+ * Usage example:
+ * @code{.cpp}
+ * raft::handle_t handle;
+ * cuvs::preprocessing::quantize::scalar::params params;
+ * auto quantizer = cuvs::preprocessing::quantize::scalar::train<double, int8_t>(handle, params,
+ * dataset);
+ * @endcode
+ *
+ * @param[in] res raft resource
+ * @param[in] params configure scalar quantizer, e.g. quantile
+ * @param[in] dataset a row-major matrix view on device
+ *
+ * @return quantizer
+ */
+quantizer<double> train(raft::resources const& res,
+                        const params params,
+                        raft::device_matrix_view<const double, int64_t> dataset);
+
+/**
+ * @brief Initializes a scalar quantizer to be used later for quantizing the dataset.
+ *
+ * Usage example:
+ * @code{.cpp}
+ * raft::handle_t handle;
+ * cuvs::preprocessing::quantize::scalar::params params;
+ * auto quantizer = cuvs::preprocessing::quantize::scalar::train<double, int8_t>(handle, params,
+ * dataset);
+ * @endcode
+ *
+ * @param[in] res raft resource
+ * @param[in] params configure scalar quantizer, e.g. quantile
+ * @param[in] dataset a row-major matrix view on host
+ *
+ * @return quantizer
+ */
+quantizer<double> train(raft::resources const& res,
+                        const params params,
+                        raft::host_matrix_view<const double, int64_t> dataset);
+
+/**
+ * @brief Applies quantization transform to given dataset
+ *
+ * Usage example:
+ * @code{.cpp}
+ * raft::handle_t handle;
+ * cuvs::preprocessing::quantize::scalar::params params;
+ * auto quantizer = cuvs::preprocessing::quantize::scalar::train<double, int8_t>(handle, params,
+ * dataset); auto quantized_dataset = raft::make_device_matrix<int8_t, int64_t>(handle, samples,
+ * features); cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset,
+ * quantized_dataset.view());
+ * @endcode
+ *
+ * @param[in] res raft resource
+ * @param[in] quantizer a scalar quantizer
+ * @param[in] dataset a row-major matrix view on device
+ * @param[out] out a row-major matrix view on device
+ *
+ */
+void transform(raft::resources const& res,
+               const quantizer<double>& quantizer,
+               raft::device_matrix_view<const double, int64_t> dataset,
+               raft::device_matrix_view<int8_t, int64_t> out);
+
+/**
+ * @brief Applies quantization transform to given dataset
+ *
+ * Usage example:
+ * @code{.cpp}
+ * raft::handle_t handle;
+ * cuvs::preprocessing::quantize::scalar::params params;
+ * auto quantizer = cuvs::preprocessing::quantize::scalar::train<double, int8_t>(handle, params,
+ * dataset); auto quantized_dataset = raft::make_host_matrix<int8_t, int64_t>(samples, features);
+ * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset,
+ * quantized_dataset.view());
+ * @endcode
+ *
+ * @param[in] res raft resource
+ * @param[in] quantizer a scalar quantizer
+ * @param[in] dataset a row-major matrix view on host
+ * @param[out] out a row-major matrix view on host
+ *
+ */
+void transform(raft::resources const& res,
+               const quantizer<double>& quantizer,
+               raft::host_matrix_view<const double, int64_t> dataset,
+               raft::host_matrix_view<int8_t, int64_t> out);
+
+/**
+ * @brief Perform inverse quantization step on previously quantized dataset
+ *
+ * Note that depending on the chosen data types train dataset the conversion is
+ * not lossless.
+ *
+ * Usage example:
+ * @code{.cpp}
+ * auto quantized_dataset = raft::make_device_matrix<int8_t, int64_t>(handle, samples, features);
+ * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset,
+ * quantized_dataset.view()); auto dataset_revert = raft::make_device_matrix<double,
+ * int64_t>(handle, samples, features);
+ * cuvs::preprocessing::quantize::scalar::inverse_transform(handle, quantizer,
+ * dataset_revert.view());
+ * @endcode
+ *
+ * @param[in] res raft resource
+ * @param[in] quantizer a scalar quantizer
+ * @param[in] dataset a row-major matrix view on device
+ * @param[out] out a row-major matrix view on device
+ *
+ */
+void inverse_transform(raft::resources const& res,
+                       const quantizer<double>& quantizer,
+                       raft::device_matrix_view<const int8_t, int64_t> dataset,
+                       raft::device_matrix_view<double, int64_t> out);
+
+/**
+ * @brief Perform inverse quantization step on previously quantized dataset
+ *
+ * Note that depending on the chosen data types train dataset the conversion is
+ * not lossless.
+ *
+ * Usage example:
+ * @code{.cpp}
+ * auto quantized_dataset = raft::make_host_matrix<int8_t, int64_t>(samples, features);
+ * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset,
+ * quantized_dataset.view()); auto dataset_revert = raft::make_host_matrix<double, int64_t>(samples,
+ * features); cuvs::preprocessing::quantize::scalar::inverse_transform(handle, quantizer,
+ * dataset_revert.view());
+ * @endcode
+ *
+ * @param[in] res raft resource
+ * @param[in] quantizer a scalar quantizer
+ * @param[in] dataset a row-major matrix view on host
+ * @param[out] out a row-major matrix view on host
+ *
+ */
+void inverse_transform(raft::resources const& res,
+                       const quantizer<double>& quantizer,
+                       raft::host_matrix_view<const int8_t, int64_t> dataset,
+                       raft::host_matrix_view<double, int64_t> out);
+
+/**
+ * @brief Initializes a scalar quantizer to be used later for quantizing the dataset.
+ *
+ * Usage example:
+ * @code{.cpp}
+ * raft::handle_t handle;
+ * cuvs::preprocessing::quantize::scalar::params params;
+ * auto quantizer = cuvs::preprocessing::quantize::scalar::train<float, int8_t>(handle, params,
+ * dataset);
+ * @endcode
+ *
+ * @param[in] res raft resource
+ * @param[in] params configure scalar quantizer, e.g. quantile
+ * @param[in] dataset a row-major matrix view on device
+ *
+ * @return quantizer
+ */
+quantizer<float> train(raft::resources const& res,
+                       const params params,
+                       raft::device_matrix_view<const float, int64_t> dataset);
+
+/**
+ * @brief Initializes a scalar quantizer to be used later for quantizing the dataset.
+ *
+ * Usage example:
+ * @code{.cpp}
+ * raft::handle_t handle;
+ * cuvs::preprocessing::quantize::scalar::params params;
+ * auto quantizer = cuvs::preprocessing::quantize::scalar::train<float, int8_t>(handle, params,
+ * dataset);
+ * @endcode
+ *
+ * @param[in] res raft resource
+ * @param[in] params configure scalar quantizer, e.g. quantile
+ * @param[in] dataset a row-major matrix view on host
+ *
+ * @return quantizer
+ */
+quantizer<float> train(raft::resources const& res,
+                       const params params,
+                       raft::host_matrix_view<const float, int64_t> dataset);
+
+/**
+ * @brief Applies quantization transform to given dataset
+ *
+ * Usage example:
+ * @code{.cpp}
+ * raft::handle_t handle;
+ * cuvs::preprocessing::quantize::scalar::params params;
+ * auto quantizer = cuvs::preprocessing::quantize::scalar::train<float, int8_t>(handle, params,
+ * dataset); auto quantized_dataset = raft::make_device_matrix<int8_t, int64_t>(handle, samples,
+ * features); cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset,
+ * quantized_dataset.view());
+ * @endcode
+ *
+ * @param[in] res raft resource
+ * @param[in] quantizer a scalar quantizer
+ * @param[in] dataset a row-major matrix view on device
+ * @param[out] out a row-major matrix view on device
+ *
+ */
+void transform(raft::resources const& res,
+               const quantizer<float>& quantizer,
+               raft::device_matrix_view<const float, int64_t> dataset,
+               raft::device_matrix_view<int8_t, int64_t> out);
+
+/**
+ * @brief Applies quantization transform to given dataset
+ *
+ * Usage example:
+ * @code{.cpp}
+ * raft::handle_t handle;
+ * cuvs::preprocessing::quantize::scalar::params params;
+ * auto quantizer = cuvs::preprocessing::quantize::scalar::train<float, int8_t>(handle, params,
+ * dataset); auto quantized_dataset = raft::make_host_matrix<int8_t, int64_t>(samples, features);
+ * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset,
+ * quantized_dataset.view());
+ * @endcode
+ *
+ * @param[in] res raft resource
+ * @param[in] quantizer a scalar quantizer
+ * @param[in] dataset a row-major matrix view on host
+ * @param[out] out a row-major matrix view on host
+ *
+ */
+void transform(raft::resources const& res,
+               const quantizer<float>& quantizer,
+               raft::host_matrix_view<const float, int64_t> dataset,
+               raft::host_matrix_view<int8_t, int64_t> out);
+
+/**
+ * @brief Perform inverse quantization step on previously quantized dataset
+ *
+ * Note that depending on the chosen data types train dataset the conversion is
+ * not lossless.
+ *
+ * Usage example:
+ * @code{.cpp}
+ * auto quantized_dataset = raft::make_device_matrix<int8_t, int64_t>(handle, samples, features);
+ * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset,
+ * quantized_dataset.view()); auto dataset_revert = raft::make_device_matrix<float, int64_t>(handle,
+ * samples, features); cuvs::preprocessing::quantize::scalar::inverse_transform(handle, quantizer,
+ * dataset_revert.view());
+ * @endcode
+ *
+ * @param[in] res raft resource
+ * @param[in] quantizer a scalar quantizer
+ * @param[in] dataset a row-major matrix view on device
+ * @param[out] out a row-major matrix view on device
+ *
+ */
+void inverse_transform(raft::resources const& res,
+                       const quantizer<float>& quantizer,
+                       raft::device_matrix_view<const int8_t, int64_t> dataset,
+                       raft::device_matrix_view<float, int64_t> out);
+
+/**
+ * @brief Perform inverse quantization step on previously quantized dataset
+ *
+ * Note that depending on the chosen data types train dataset the conversion is
+ * not lossless.
+ *
+ * Usage example:
+ * @code{.cpp}
+ * auto quantized_dataset = raft::make_host_matrix<int8_t, int64_t>(samples, features);
+ * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset,
+ * quantized_dataset.view()); auto dataset_revert = raft::make_host_matrix<float, int64_t>(samples,
+ * features); cuvs::preprocessing::quantize::scalar::inverse_transform(handle, quantizer,
+ * dataset_revert.view());
+ * @endcode
+ *
+ * @param[in] res raft resource
+ * @param[in] quantizer a scalar quantizer
+ * @param[in] dataset a row-major matrix view on host
+ * @param[out] out a row-major matrix view on host
+ *
+ */
+void inverse_transform(raft::resources const& res,
+                       const quantizer<float>& quantizer,
+                       raft::host_matrix_view<const int8_t, int64_t> dataset,
+                       raft::host_matrix_view<float, int64_t> out);
+
+/**
+ * @brief Initializes a scalar quantizer to be used later for quantizing the dataset.
+ *
+ * Usage example:
+ * @code{.cpp}
+ * raft::handle_t handle;
+ * cuvs::preprocessing::quantize::scalar::params params;
+ * auto quantizer = cuvs::preprocessing::quantize::scalar::train<half, int8_t>(handle, params,
+ * dataset);
+ * @endcode
+ *
+ * @param[in] res raft resource
+ * @param[in] params configure scalar quantizer, e.g. quantile
+ * @param[in] dataset a row-major matrix view on device
+ *
+ * @return quantizer
+ */
+quantizer<half> train(raft::resources const& res,
+                      const params params,
+                      raft::device_matrix_view<const half, int64_t> dataset);
+
+/**
+ * @brief Initializes a scalar quantizer to be used later for quantizing the dataset.
+ *
+ * Usage example:
+ * @code{.cpp}
+ * raft::handle_t handle;
+ * cuvs::preprocessing::quantize::scalar::params params;
+ * auto quantizer = cuvs::preprocessing::quantize::scalar::train<half, int8_t>(handle, params,
+ * dataset);
+ * @endcode
+ *
+ * @param[in] res raft resource
+ * @param[in] params configure scalar quantizer, e.g. quantile
+ * @param[in] dataset a row-major matrix view on host
+ *
+ * @return quantizer
+ */
+quantizer<half> train(raft::resources const& res,
+                      const params params,
+                      raft::host_matrix_view<const half, int64_t> dataset);
+
+/**
+ * @brief Applies quantization transform to given dataset
+ *
+ * Usage example:
+ * @code{.cpp}
+ * raft::handle_t handle;
+ * cuvs::preprocessing::quantize::scalar::params params;
+ * auto quantizer = cuvs::preprocessing::quantize::scalar::train<half, int8_t>(handle, params,
+ * dataset); auto quantized_dataset = raft::make_device_matrix<int8_t, int64_t>(handle, samples,
+ * features); cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset,
+ * quantized_dataset.view());
+ * @endcode
+ *
+ * @param[in] res raft resource
+ * @param[in] quantizer a scalar quantizer
+ * @param[in] dataset a row-major matrix view on device
+ * @param[out] out a row-major matrix view on device
+ *
+ */
+void transform(raft::resources const& res,
+               const quantizer<half>& quantizer,
+               raft::device_matrix_view<const half, int64_t> dataset,
+               raft::device_matrix_view<int8_t, int64_t> out);
+
+/**
+ * @brief Applies quantization transform to given dataset
+ *
+ * Usage example:
+ * @code{.cpp}
+ * raft::handle_t handle;
+ * cuvs::preprocessing::quantize::scalar::params params;
+ * auto quantizer = cuvs::preprocessing::quantize::scalar::train<half, int8_t>(handle, params,
+ * dataset); auto quantized_dataset = raft::make_host_matrix<int8_t, int64_t>(samples, features);
+ * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset,
+ * quantized_dataset.view());
+ * @endcode
+ *
+ * @param[in] res raft resource
+ * @param[in] quantizer a scalar quantizer
+ * @param[in] dataset a row-major matrix view on host
+ * @param[out] out a row-major matrix view on host
+ *
+ */
+void transform(raft::resources const& res,
+               const quantizer<half>& quantizer,
+               raft::host_matrix_view<const half, int64_t> dataset,
+               raft::host_matrix_view<int8_t, int64_t> out);
+
+/**
+ * @brief Perform inverse quantization step on previously quantized dataset
+ *
+ * Note that depending on the chosen data types train dataset the conversion is
+ * not lossless.
+ *
+ * Usage example:
+ * @code{.cpp}
+ * auto quantized_dataset = raft::make_device_matrix<int8_t, int64_t>(handle, samples, features);
+ * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset,
+ * quantized_dataset.view()); auto dataset_revert = raft::make_device_matrix<half, int64_t>(handle,
+ * samples, features); cuvs::preprocessing::quantize::scalar::inverse_transform(handle, quantizer,
+ * dataset_revert.view());
+ * @endcode
+ *
+ * @param[in] res raft resource
+ * @param[in] quantizer a scalar quantizer
+ * @param[in] dataset a row-major matrix view on device
+ * @param[out] out a row-major matrix view on device
+ *
+ */
+void inverse_transform(raft::resources const& res,
+                       const quantizer<half>& quantizer,
+                       raft::device_matrix_view<const int8_t, int64_t> dataset,
+                       raft::device_matrix_view<half, int64_t> out);
+
+/**
+ * @brief Perform inverse quantization step on previously quantized dataset
+ *
+ * Note that depending on the chosen data types train dataset the conversion is
+ * not lossless.
+ *
+ * Usage example:
+ * @code{.cpp}
+ * auto quantized_dataset = raft::make_host_matrix<int8_t, int64_t>(samples, features);
+ * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset,
+ * quantized_dataset.view()); auto dataset_revert = raft::make_host_matrix<half, int64_t>(samples,
+ * features); cuvs::preprocessing::quantize::scalar::inverse_transform(handle, quantizer,
+ * dataset_revert.view());
+ * @endcode
+ *
+ * @param[in] res raft resource
+ * @param[in] quantizer a scalar quantizer
+ * @param[in] dataset a row-major matrix view on host
+ * @param[out] out a row-major matrix view on host
+ *
+ */
+void inverse_transform(raft::resources const& res,
+                       const quantizer<half>& quantizer,
+                       raft::host_matrix_view<const int8_t, int64_t> dataset,
+                       raft::host_matrix_view<half, int64_t> out);
+
+/** @} */  // end of group scalar
+
+}  // namespace cuvs::preprocessing::quantize::scalar
diff --git a/cpp/src/preprocessing/quantize/detail/scalar.cuh b/cpp/src/preprocessing/quantize/detail/scalar.cuh
new file mode 100644
index 000000000..fc132eb7f
--- /dev/null
+++ b/cpp/src/preprocessing/quantize/detail/scalar.cuh
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuvs/preprocessing/quantize/scalar.hpp>
+#include <raft/core/operators.hpp>
+#include <raft/linalg/unary_op.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/random/sample_without_replacement.cuh>
+#include <thrust/execution_policy.h>
+#include <thrust/sort.h>
+#include <thrust/system/omp/execution_policy.h>
+
+namespace cuvs::preprocessing::quantize::detail {
+
+template <class T>
+_RAFT_HOST_DEVICE bool fp_lt(const T& a, const T& b)
+{
+  return a < b;
+}
+
+template <>
+_RAFT_HOST_DEVICE bool fp_lt(const half& a, const half& b)
+{
+  return static_cast<float>(a) < static_cast<float>(b);
+}
+
+template <typename T, typename QuantI, typename TempT = double>
+struct quantize_op {
+  const T min_;
+  const T max_;
+  const QuantI q_type_min_ = std::numeric_limits<QuantI>::min();
+  const QuantI q_type_max_ = std::numeric_limits<QuantI>::max();
+  const TempT scalar_;
+  const TempT offset_;
+
+  constexpr explicit quantize_op(T min, T max)
+    : min_(min),
+      max_(max),
+      scalar_(static_cast<TempT>(max_) > static_cast<TempT>(min_)
+                ? ((static_cast<TempT>(q_type_max_) - static_cast<TempT>(q_type_min_)) /
+                   (static_cast<TempT>(max_) - static_cast<TempT>(min_)))
+                : static_cast<TempT>(1)),
+      offset_(static_cast<TempT>(q_type_min_) - static_cast<TempT>(min_) * scalar_)
+  {
+  }
+
+  constexpr RAFT_INLINE_FUNCTION QuantI operator()(const T& x) const
+  {
+    if (!fp_lt(min_, x)) return q_type_min_;
+    if (!fp_lt(x, max_)) return q_type_max_;
+    return static_cast<QuantI>(lroundf(scalar_ * static_cast<TempT>(x) + offset_));
+  }
+
+  constexpr RAFT_INLINE_FUNCTION T operator()(const QuantI& x) const
+  {
+    return static_cast<T>((static_cast<TempT>(x) - offset_) / scalar_);
+  }
+};
+
+template <typename T>
+std::tuple<T, T> quantile_min_max(raft::resources const& res,
+                                  raft::device_matrix_view<const T, int64_t> dataset,
+                                  double quantile)
+{
+  // settings for quantile approximation
+  constexpr size_t max_num_samples = 1000000;
+  constexpr int seed               = 137;
+
+  cudaStream_t stream = raft::resource::get_cuda_stream(res);
+
+  // select subsample
+  raft::random::RngState rng(seed);
+  size_t n_elements  = dataset.extent(0) * dataset.extent(1);
+  size_t subset_size = std::min(max_num_samples, n_elements);
+  auto subset        = raft::make_device_vector<T>(res, subset_size);
+  auto dataset_view  = raft::make_device_vector_view<const T>(dataset.data_handle(), n_elements);
+  raft::random::sample_without_replacement(
+    res, rng, dataset_view, std::nullopt, subset.view(), std::nullopt);
+
+  // quantile / sort and pick for now
+  thrust::sort(raft::resource::get_thrust_policy(res),
+               subset.data_handle(),
+               subset.data_handle() + subset_size);
+
+  double half_quantile_pos = (0.5 + 0.5 * quantile) * subset_size;
+  int pos_max              = std::ceil(half_quantile_pos) - 1;
+  int pos_min              = subset_size - pos_max - 1;
+
+  T minmax_h[2];
+  raft::update_host(&(minmax_h[0]), subset.data_handle() + pos_min, 1, stream);
+  raft::update_host(&(minmax_h[1]), subset.data_handle() + pos_max, 1, stream);
+  raft::resource::sync_stream(res);
+
+  return {minmax_h[0], minmax_h[1]};
+}
+
+template <typename T>
+std::tuple<T, T> quantile_min_max(raft::resources const& res,
+                                  raft::host_matrix_view<const T, int64_t> dataset,
+                                  double quantile)
+{
+  // settings for quantile approximation
+  constexpr size_t max_num_samples = 1000000;
+  constexpr int seed               = 137;
+
+  // select subsample
+  std::mt19937 rng(seed);
+  size_t n_elements  = dataset.extent(0) * dataset.extent(1);
+  size_t subset_size = std::min(max_num_samples, n_elements);
+  std::vector<T> subset;
+  std::sample(dataset.data_handle(),
+              dataset.data_handle() + n_elements,
+              std::back_inserter(subset),
+              subset_size,
+              rng);
+
+  // quantile / sort and pick for now
+  thrust::sort(thrust::omp::par, subset.data(), subset.data() + subset_size, fp_lt<T>);
+  double half_quantile_pos = (0.5 + 0.5 * quantile) * subset_size;
+  int pos_max              = std::ceil(half_quantile_pos) - 1;
+  int pos_min              = subset_size - pos_max - 1;
+
+  return {subset[pos_min], subset[pos_max]};
+}
+
+template <typename T>
+cuvs::preprocessing::quantize::scalar::quantizer<T> train(
+  raft::resources const& res,
+  const cuvs::preprocessing::quantize::scalar::params params,
+  raft::device_matrix_view<const T, int64_t> dataset)
+{
+  RAFT_EXPECTS(params.quantile > 0.0 && params.quantile <= 1.0,
+               "quantile for scalar quantization needs to be within (0, 1] but is %f",
+               params.quantile);
+
+  auto [min, max] = detail::quantile_min_max(res, dataset, params.quantile);
+
+  RAFT_LOG_DEBUG("quantizer train min=%lf max=%lf.", double(min), double(max));
+
+  return cuvs::preprocessing::quantize::scalar::quantizer<T>{min, max};
+}
+
+template <typename T>
+cuvs::preprocessing::quantize::scalar::quantizer<T> train(
+  raft::resources const& res,
+  const cuvs::preprocessing::quantize::scalar::params params,
+  raft::host_matrix_view<const T, int64_t> dataset)
+{
+  RAFT_EXPECTS(params.quantile > 0.0 && params.quantile <= 1.0,
+               "quantile for scalar quantization needs to be within (0, 1] but is %f",
+               params.quantile);
+
+  auto [min, max] = detail::quantile_min_max(res, dataset, params.quantile);
+
+  RAFT_LOG_DEBUG("quantizer train min=%lf max=%lf.", double(min), double(max));
+
+  return cuvs::preprocessing::quantize::scalar::quantizer<T>{min, max};
+}
+
+template <typename T, typename QuantI = int8_t>
+void transform(raft::resources const& res,
+               const cuvs::preprocessing::quantize::scalar::quantizer<T>& quantizer,
+               raft::device_matrix_view<const T, int64_t> dataset,
+               raft::device_matrix_view<QuantI, int64_t> out)
+{
+  cudaStream_t stream = raft::resource::get_cuda_stream(res);
+
+  raft::linalg::map(res, out, quantize_op<T, QuantI>(quantizer.min_, quantizer.max_), dataset);
+}
+
+template <typename T, typename QuantI = int8_t>
+void transform(raft::resources const& res,
+               const cuvs::preprocessing::quantize::scalar::quantizer<T>& quantizer,
+               raft::host_matrix_view<const T, int64_t> dataset,
+               raft::host_matrix_view<QuantI, int64_t> out)
+{
+  auto main_op      = quantize_op<T, QuantI>(quantizer.min_, quantizer.max_);
+  size_t n_elements = dataset.extent(0) * dataset.extent(1);
+
+#pragma omp parallel for
+  for (size_t i = 0; i < n_elements; ++i) {
+    out.data_handle()[i] = main_op(dataset.data_handle()[i]);
+  }
+}
+
+template <typename T, typename QuantI = int8_t>
+void inverse_transform(raft::resources const& res,
+                       const cuvs::preprocessing::quantize::scalar::quantizer<T>& quantizer,
+                       raft::device_matrix_view<const QuantI, int64_t> dataset,
+                       raft::device_matrix_view<T, int64_t> out)
+{
+  cudaStream_t stream = raft::resource::get_cuda_stream(res);
+
+  raft::linalg::map(res, out, quantize_op<T, QuantI>(quantizer.min_, quantizer.max_), dataset);
+}
+
+template <typename T, typename QuantI = int8_t>
+void inverse_transform(raft::resources const& res,
+                       const cuvs::preprocessing::quantize::scalar::quantizer<T>& quantizer,
+                       raft::host_matrix_view<const QuantI, int64_t> dataset,
+                       raft::host_matrix_view<T, int64_t> out)
+{
+  auto main_op      = quantize_op<T, QuantI>(quantizer.min_, quantizer.max_);
+  size_t n_elements = dataset.extent(0) * dataset.extent(1);
+
+#pragma omp parallel for
+  for (size_t i = 0; i < n_elements; ++i) {
+    out.data_handle()[i] = main_op(dataset.data_handle()[i]);
+  }
+}
+
+}  // namespace cuvs::preprocessing::quantize::detail
diff --git a/cpp/src/preprocessing/quantize/scalar.cu b/cpp/src/preprocessing/quantize/scalar.cu
new file mode 100644
index 000000000..9624ad4fe
--- /dev/null
+++ b/cpp/src/preprocessing/quantize/scalar.cu
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "./detail/scalar.cuh"
+
+#include <cuvs/preprocessing/quantize/scalar.hpp>
+
+namespace cuvs::preprocessing::quantize::scalar {
+
+#define CUVS_INST_QUANTIZATION(T, QuantI)                                         \
+  auto train(raft::resources const& res,                                          \
+             const params params,                                                 \
+             raft::device_matrix_view<const T, int64_t> dataset)                  \
+    ->quantizer<T>                                                                \
+  {                                                                               \
+    return detail::train(res, params, dataset);                                   \
+  }                                                                               \
+  auto train(raft::resources const& res,                                          \
+             const params params,                                                 \
+             raft::host_matrix_view<const T, int64_t> dataset)                    \
+    ->quantizer<T>                                                                \
+  {                                                                               \
+    return detail::train(res, params, dataset);                                   \
+  }                                                                               \
+  void transform(raft::resources const& res,                                      \
+                 const quantizer<T>& quantizer,                                   \
+                 raft::device_matrix_view<const T, int64_t> dataset,              \
+                 raft::device_matrix_view<QuantI, int64_t> out)                   \
+  {                                                                               \
+    detail::transform(res, quantizer, dataset, out);                              \
+  }                                                                               \
+  void transform(raft::resources const& res,                                      \
+                 const quantizer<T>& quantizer,                                   \
+                 raft::host_matrix_view<const T, int64_t> dataset,                \
+                 raft::host_matrix_view<QuantI, int64_t> out)                     \
+  {                                                                               \
+    detail::transform(res, quantizer, dataset, out);                              \
+  }                                                                               \
+  void inverse_transform(raft::resources const& res,                              \
+                         const quantizer<T>& quantizer,                           \
+                         raft::device_matrix_view<const QuantI, int64_t> dataset, \
+                         raft::device_matrix_view<T, int64_t> out)                \
+  {                                                                               \
+    detail::inverse_transform(res, quantizer, dataset, out);                      \
+  }                                                                               \
+  void inverse_transform(raft::resources const& res,                              \
+                         const quantizer<T>& quantizer,                           \
+                         raft::host_matrix_view<const QuantI, int64_t> dataset,   \
+                         raft::host_matrix_view<T, int64_t> out)                  \
+  {                                                                               \
+    detail::inverse_transform(res, quantizer, dataset, out);                      \
+  }                                                                               \
+  template struct quantizer<T>;
+
+CUVS_INST_QUANTIZATION(double, int8_t);
+CUVS_INST_QUANTIZATION(float, int8_t);
+CUVS_INST_QUANTIZATION(half, int8_t);
+
+#undef CUVS_INST_QUANTIZATION
+
+}  // namespace cuvs::preprocessing::quantize::scalar
\ No newline at end of file
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 1c8de2ad0..0ecac6ec2 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -226,6 +226,11 @@ if(BUILD_TESTS)
     PERCENT
     100
   )
+
+  ConfigureTest(
+    NAME PREPROCESSING_TEST PATH preprocessing/scalar_quantization.cu GPUS 1 PERCENT 100
+  )
+
   ConfigureTest(
     NAME STATS_TEST PATH stats/trustworthiness.cu stats/silhouette_score.cu GPUS 1 PERCENT 100
   )
diff --git a/cpp/test/preprocessing/scalar_quantization.cu b/cpp/test/preprocessing/scalar_quantization.cu
new file mode 100644
index 000000000..2fdfe7555
--- /dev/null
+++ b/cpp/test/preprocessing/scalar_quantization.cu
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.cuh"
+#include <cuvs/preprocessing/quantize/scalar.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/linalg/transpose.cuh>
+#include <raft/matrix/init.cuh>
+#include <raft/stats/stddev.cuh>
+#include <thrust/execution_policy.h>
+#include <thrust/sort.h>
+
+namespace cuvs::preprocessing::quantize::scalar {
+
+template <typename T>
+struct QuantizationInputs {
+  cuvs::preprocessing::quantize::scalar::params quantization_params;
+  int rows;
+  int cols;
+  T min            = T(-1.0);
+  T max            = T(1.0);
+  double threshold = 2e-2;
+};
+
+template <typename T>
+std::ostream& operator<<(std::ostream& os, const QuantizationInputs<T>& inputs)
+{
+  return os << "quantization_quantile:<" << inputs.quantization_params.quantile
+            << "> rows:" << inputs.rows << " cols:" << inputs.cols << " min:" << (double)inputs.min
+            << " max:" << (double)inputs.max;
+}
+
+template <typename T, typename QuantI>
+class QuantizationTest : public ::testing::TestWithParam<QuantizationInputs<T>> {
+ public:
+  QuantizationTest()
+    : params_(::testing::TestWithParam<QuantizationInputs<T>>::GetParam()),
+      stream(raft::resource::get_cuda_stream(handle)),
+      input_(0, stream)
+  {
+  }
+
+  double getRelativeErrorStddev(const T* array_a, const T* array_b, size_t size, float quantile)
+  {
+    // relative error elementwise
+    rmm::device_uvector<double> relative_error(size, stream);
+    raft::linalg::binaryOp(
+      relative_error.data(),
+      array_a,
+      array_b,
+      size,
+      [] __device__(double a, double b) {
+        return a != b ? (raft::abs(a - b) / raft::max(raft::abs(a), raft::abs(b))) : 0;
+      },
+      stream);
+
+    // sort by size --> remove largest errors to account for quantile chosen
+    thrust::sort(raft::resource::get_thrust_policy(handle),
+                 relative_error.data(),
+                 relative_error.data() + size);
+    int elements_to_consider =
+      std::ceil(double(params_.quantization_params.quantile) * double(size));
+
+    rmm::device_uvector<double> mu(1, stream);
+    RAFT_CUDA_TRY(cudaMemsetAsync(mu.data(), 0, sizeof(double), stream));
+
+    rmm::device_uvector<double> error_stddev(1, stream);
+    raft::stats::stddev(error_stddev.data(),
+                        relative_error.data(),
+                        mu.data(),
+                        1,
+                        elements_to_consider,
+                        false,
+                        true,
+                        stream);
+
+    double error_stddev_h;
+    raft::update_host(&error_stddev_h, error_stddev.data(), 1, stream);
+    raft::resource::sync_stream(handle, stream);
+    return error_stddev_h;
+  }
+
+ protected:
+  void testScalarQuantization()
+  {
+    // dataset identical on host / device
+    auto dataset = raft::make_device_matrix_view<const T, int64_t, raft::row_major>(
+      (const T*)(input_.data()), rows_, cols_);
+    auto dataset_h = raft::make_host_matrix_view<const T, int64_t, raft::row_major>(
+      (const T*)(host_input_.data()), rows_, cols_);
+
+    size_t print_size = std::min(input_.size(), 20ul);
+
+    // train quantizer_1 on device
+    auto quantizer_1 =
+      cuvs::preprocessing::quantize::scalar::train(handle, params_.quantization_params, dataset);
+    std::cerr << "Q1: min = " << (double)quantizer_1.min_ << ", max = " << (double)quantizer_1.max_
+              << std::endl;
+
+    {
+      auto quantized_input_h = raft::make_host_matrix<QuantI, int64_t>(rows_, cols_);
+      auto quantized_input_d = raft::make_device_matrix<QuantI, int64_t>(handle, rows_, cols_);
+      cuvs::preprocessing::quantize::scalar::transform(
+        handle, quantizer_1, dataset, quantized_input_d.view());
+      cuvs::preprocessing::quantize::scalar::transform(
+        handle, quantizer_1, dataset_h, quantized_input_h.view());
+
+      {
+        raft::print_device_vector("Input array: ", input_.data(), print_size, std::cerr);
+
+        rmm::device_uvector<int> quantization_for_print(print_size, stream);
+        raft::linalg::unaryOp(quantization_for_print.data(),
+                              quantized_input_d.data_handle(),
+                              print_size,
+                              raft::cast_op<int>{},
+                              stream);
+        raft::resource::sync_stream(handle, stream);
+        raft::print_device_vector(
+          "Quantized array 1: ", quantization_for_print.data(), print_size, std::cerr);
+      }
+
+      // test (inverse) transform host/device equal
+      ASSERT_TRUE(devArrMatchHost(quantized_input_h.data_handle(),
+                                  quantized_input_d.data_handle(),
+                                  input_.size(),
+                                  cuvs::Compare<QuantI>(),
+                                  stream));
+
+      auto quantized_input_h_const_view = raft::make_host_matrix_view<const QuantI, int64_t>(
+        quantized_input_h.data_handle(), rows_, cols_);
+      auto re_transformed_input_h = raft::make_host_matrix<T, int64_t>(rows_, cols_);
+      cuvs::preprocessing::quantize::scalar::inverse_transform(
+        handle, quantizer_1, quantized_input_h_const_view, re_transformed_input_h.view());
+
+      auto quantized_input_d_const_view = raft::make_device_matrix_view<const QuantI, int64_t>(
+        quantized_input_d.data_handle(), rows_, cols_);
+      auto re_transformed_input_d = raft::make_device_matrix<T, int64_t>(handle, rows_, cols_);
+      cuvs::preprocessing::quantize::scalar::inverse_transform(
+        handle, quantizer_1, quantized_input_d_const_view, re_transformed_input_d.view());
+      raft::print_device_vector(
+        "re-transformed array: ", re_transformed_input_d.data_handle(), print_size, std::cerr);
+
+      {
+        double l2_error = getRelativeErrorStddev(dataset.data_handle(),
+                                                 re_transformed_input_d.data_handle(),
+                                                 input_.size(),
+                                                 params_.quantization_params.quantile);
+        std::cerr << "error stddev = " << l2_error << ", threshold = " << params_.threshold
+                  << std::endl;
+        // test (inverse) transform close to original dataset
+        ASSERT_TRUE(l2_error < params_.threshold);
+      }
+    }
+
+    // train quantizer_2 on host
+    auto quantizer_2 =
+      cuvs::preprocessing::quantize::scalar::train(handle, params_.quantization_params, dataset_h);
+    std::cerr << "Q2: min = " << (double)quantizer_2.min_ << ", max = " << (double)quantizer_2.max_
+              << std::endl;
+
+    // check both quantizers are the same (valid if sampling is identical)
+    if (input_.size() <= 1000000) {
+      ASSERT_TRUE((double)quantizer_1.min_ == (double)quantizer_2.min_);
+      ASSERT_TRUE((double)quantizer_1.max_ == (double)quantizer_2.max_);
+    }
+
+    {
+      // test transform host/device equal
+      auto quantized_input_h = raft::make_host_matrix<QuantI, int64_t>(rows_, cols_);
+      auto quantized_input_d = raft::make_device_matrix<QuantI, int64_t>(handle, rows_, cols_);
+      cuvs::preprocessing::quantize::scalar::transform(
+        handle, quantizer_2, dataset, quantized_input_d.view());
+      cuvs::preprocessing::quantize::scalar::transform(
+        handle, quantizer_2, dataset_h, quantized_input_h.view());
+
+      {
+        rmm::device_uvector<int> quantization_for_print(print_size, stream);
+        raft::linalg::unaryOp(quantization_for_print.data(),
+                              quantized_input_d.data_handle(),
+                              print_size,
+                              raft::cast_op<int>{},
+                              stream);
+        raft::resource::sync_stream(handle, stream);
+        raft::print_device_vector(
+          "Quantized array 2: ", quantization_for_print.data(), print_size, std::cerr);
+      }
+
+      ASSERT_TRUE(devArrMatchHost(quantized_input_h.data_handle(),
+                                  quantized_input_d.data_handle(),
+                                  input_.size(),
+                                  cuvs::Compare<QuantI>(),
+                                  stream));
+    }
+
+    // sort_by_key (input, quantization) -- check <= on result
+    {
+      auto quantized_input = raft::make_device_matrix<QuantI, int64_t>(handle, rows_, cols_);
+      cuvs::preprocessing::quantize::scalar::transform(
+        handle, quantizer_1, dataset, quantized_input.view());
+      thrust::sort_by_key(raft::resource::get_thrust_policy(handle),
+                          input_.data(),
+                          input_.data() + input_.size(),
+                          quantized_input.data_handle());
+      std::vector<QuantI> quantized_input_sorted_host(input_.size());
+      raft::update_host(
+        quantized_input_sorted_host.data(), quantized_input.data_handle(), input_.size(), stream);
+      raft::resource::sync_stream(handle, stream);
+
+      for (size_t i = 0; i < input_.size() - 1; ++i) {
+        ASSERT_TRUE(quantized_input_sorted_host[i] <= quantized_input_sorted_host[i + 1]);
+      }
+    }
+  }
+
+  void SetUp() override
+  {
+    rows_ = params_.rows;
+    cols_ = params_.cols;
+
+    int n_elements = rows_ * cols_;
+    input_.resize(n_elements, stream);
+    host_input_.resize(n_elements);
+
+    // random input
+    unsigned long long int seed = 1234ULL;
+    raft::random::RngState r(seed);
+    uniform(handle, r, input_.data(), input_.size(), params_.min, params_.max);
+
+    raft::update_host(host_input_.data(), input_.data(), input_.size(), stream);
+
+    raft::resource::sync_stream(handle, stream);
+  }
+
+ private:
+  raft::resources handle;
+  cudaStream_t stream;
+
+  QuantizationInputs<T> params_;
+  int rows_;
+  int cols_;
+  rmm::device_uvector<T> input_;
+  std::vector<T> host_input_;
+};
+
+template <typename T>
+const std::vector<QuantizationInputs<T>> inputs = {
+  {{1.0}, 5, 5, T(0.0), T(1.0)},
+  {{0.98}, 10, 20, T(0.0), T(1.0)},
+  {{0.90}, 1000, 1500, T(-500.0), T(100.0)},
+  {{0.59}, 100, 200},
+  {{0.1}, 1, 1, T(0.0), T(1.0)},
+  {{0.01}, 50, 50, T(0.0), T(1.0)},
+  {{0.94}, 10, 20, T(-1.0), T(0.0)},
+  {{0.95}, 10, 2, T(50.0), T(100.0)},
+  {{0.95}, 10, 20, T(-500.0), T(-100.0)},
+  {{0.95}, 10, 20, T(5.0), T(5.0)},
+};
+
+typedef QuantizationTest<float, int8_t> QuantizationTest_float_int8t;
+TEST_P(QuantizationTest_float_int8t, ScalarQuantizationTest) { this->testScalarQuantization(); }
+
+typedef QuantizationTest<double, int8_t> QuantizationTest_double_int8t;
+TEST_P(QuantizationTest_double_int8t, ScalarQuantizationTest) { this->testScalarQuantization(); }
+
+typedef QuantizationTest<half, int8_t> QuantizationTest_half_int8t;
+TEST_P(QuantizationTest_half_int8t, ScalarQuantizationTest) { this->testScalarQuantization(); }
+
+INSTANTIATE_TEST_CASE_P(QuantizationTest,
+                        QuantizationTest_float_int8t,
+                        ::testing::ValuesIn(inputs<float>));
+INSTANTIATE_TEST_CASE_P(QuantizationTest,
+                        QuantizationTest_double_int8t,
+                        ::testing::ValuesIn(inputs<double>));
+INSTANTIATE_TEST_CASE_P(QuantizationTest,
+                        QuantizationTest_half_int8t,
+                        ::testing::ValuesIn(inputs<half>));
+
+}  // namespace cuvs::preprocessing::quantize::scalar
diff --git a/docs/source/cpp_api.rst b/docs/source/cpp_api.rst
index 49732dc92..34f48a88f 100644
--- a/docs/source/cpp_api.rst
+++ b/docs/source/cpp_api.rst
@@ -10,5 +10,6 @@ C++ API Documentation
    cpp_api/cluster.rst
    cpp_api/distance.rst
    cpp_api/neighbors.rst
+   cpp_api/preprocessing.rst
    cpp_api/selection.rst
    cpp_api/stats.rst
diff --git a/docs/source/cpp_api/preprocessing.rst b/docs/source/cpp_api/preprocessing.rst
new file mode 100644
index 000000000..1c2b0f051
--- /dev/null
+++ b/docs/source/cpp_api/preprocessing.rst
@@ -0,0 +1,12 @@
+Preprocessing
+=============
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+   preprocessing_quantize.rst
diff --git a/docs/source/cpp_api/preprocessing_quantize.rst b/docs/source/cpp_api/preprocessing_quantize.rst
new file mode 100644
index 000000000..b660c61c5
--- /dev/null
+++ b/docs/source/cpp_api/preprocessing_quantize.rst
@@ -0,0 +1,20 @@
+Quantize
+========
+
+This page provides C++ class references for the publicly-exposed elements of the
+`cuvs/preprocessing/quantize` package. 
+
+.. role:: py(code)
+   :language: c++
+   :class: highlight
+
+Scalar
+------
+
+``#include <cuvs/preprocessing/quantize/scalar.hpp>``
+
+namespace *cuvs::preprocessing::quantize::scalar*
+
+.. doxygengroup:: scalar
+   :project: cuvs
+

From c9b38623932039722214caf02a516ce12883c9a4 Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Thu, 5 Dec 2024 09:44:33 +0100
Subject: [PATCH 43/47] Skip IVF-PQ packing test for lists with not enough data
 (#512)

Skip some checks involving hard-coded offsets into the data when the number of records in the checked PQ list is smaller than needed.

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)

URL: https://github.com/rapidsai/cuvs/pull/512
---
 cpp/test/neighbors/ann_ivf_pq.cuh | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index fd4e330db..3a92b5e3d 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -379,7 +379,14 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     // Pack a few vectors back to the list.
     int row_offset = 5;
     int n_vec      = 3;
-    ASSERT_TRUE(row_offset + n_vec < n_rows);
+    if (static_cast<decltype(n_rows)>(row_offset + n_vec) > n_rows) {
+      RAFT_LOG_INFO(
+        "Skipping IVF-PQ check_packing/pack test for label %u due to insufficient data (%u "
+        "records)",
+        label,
+        uint32_t(n_rows));
+      return;
+    }
     size_t offset      = row_offset * index->pq_dim();
     auto codes_to_pack = raft::make_device_matrix_view<const uint8_t, uint32_t>(
       codes.data_handle() + offset, n_vec, index->pq_dim());
@@ -393,7 +400,14 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     // Another test with the API that take list_data directly
     [[maybe_unused]] auto list_data = index->lists()[label]->data.view();
     uint32_t n_take                 = 4;
-    ASSERT_TRUE(row_offset + n_take < n_rows);
+    if (static_cast<decltype(n_rows)>(row_offset + n_take) > n_rows) {
+      RAFT_LOG_INFO(
+        "Skipping IVF-PQ check_packing/take test for label %u due to insufficient data (%u "
+        "records)",
+        label,
+        uint32_t(n_rows));
+      return;
+    }
     auto codes2 = raft::make_device_matrix<uint8_t>(handle_, n_take, index->pq_dim());
     ivf_pq::helpers::codepacker::unpack(
       handle_, list_data, index->pq_bits(), row_offset, codes2.view());

From c5e03f2eaf5e30053a248a866428249909e99180 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 5 Dec 2024 07:51:00 -0800
Subject: [PATCH 44/47] Update cuvs to match raft's cutlass changes (#516)

Due to the tight integration between cuvs and raft, we need to ensure that cuvs is updated for rapidsai/raft#2503 or builds of cuvs that rely on cloning raft will get an incompatible version of cutlass due to raft's update.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Ben Frederickson (https://github.com/benfred)

URL: https://github.com/rapidsai/cuvs/pull/516
---
 .pre-commit-config.yaml                      |  5 ++-
 cpp/cmake/patches/cutlass/build-export.patch | 27 +++++++++++++++
 cpp/cmake/patches/cutlass_override.json      | 16 +++++++++
 cpp/cmake/thirdparty/get_cutlass.cmake       | 35 ++++++++++----------
 4 files changed, 64 insertions(+), 19 deletions(-)
 create mode 100644 cpp/cmake/patches/cutlass/build-export.patch
 create mode 100644 cpp/cmake/patches/cutlass_override.json

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f4fdf202e..5e53abd92 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -91,7 +91,10 @@ repos:
               - id: codespell
                 additional_dependencies: [tomli]
                 args: ["--toml", "pyproject.toml"]
-                exclude: (?x)^(^CHANGELOG.md$)
+                exclude: |
+                  (?x)
+                    ^CHANGELOG[.]md$|
+                    ^cpp/cmake/patches/cutlass/build-export[.]patch$
       - repo: https://github.com/pre-commit/pre-commit-hooks
         rev: v4.5.0
         hooks:
diff --git a/cpp/cmake/patches/cutlass/build-export.patch b/cpp/cmake/patches/cutlass/build-export.patch
new file mode 100644
index 000000000..a6423e9c0
--- /dev/null
+++ b/cpp/cmake/patches/cutlass/build-export.patch
@@ -0,0 +1,27 @@
+From e0a9597946257a01ae8444200f836ee51d5597ba Mon Sep 17 00:00:00 2001
+From: Kyle Edwards <kyedwards@nvidia.com>
+Date: Wed, 20 Nov 2024 16:37:38 -0500
+Subject: [PATCH] Remove erroneous include directories
+
+These directories are left over from when CuTe was a separate
+CMake project. Remove them.
+---
+ CMakeLists.txt | 2 --
+ 1 file changed, 2 deletions(-)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 7419bdf5e..545384d82 100755
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -665,8 +665,6 @@ target_include_directories(
+   $<INSTALL_INTERFACE:include>
+   $<BUILD_INTERFACE:${CUTLASS_INCLUDE_DIR}>
+   $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>
+-  $<BUILD_INTERFACE:${cute_SOURCE_DIR}/include>
+-  $<BUILD_INTERFACE:${cute_SOURCE_DIR}/examples>
+   )
+ 
+ # Mark CTK headers as system to supress warnings from them
+-- 
+2.34.1
+
diff --git a/cpp/cmake/patches/cutlass_override.json b/cpp/cmake/patches/cutlass_override.json
new file mode 100644
index 000000000..7bf818987
--- /dev/null
+++ b/cpp/cmake/patches/cutlass_override.json
@@ -0,0 +1,16 @@
+{
+  "packages" : {
+    "cutlass" : {
+      "version": "3.5.1",
+      "git_url": "https://github.com/NVIDIA/cutlass.git",
+      "git_tag": "v${version}",
+      "patches" : [
+        {
+          "file" : "${current_json_dir}/cutlass/build-export.patch",
+          "issue" : "Fix build directory export",
+          "fixed_in" : ""
+        }
+      ]
+    }
+  }
+}
diff --git a/cpp/cmake/thirdparty/get_cutlass.cmake b/cpp/cmake/thirdparty/get_cutlass.cmake
index 61065318b..71bd2d26c 100644
--- a/cpp/cmake/thirdparty/get_cutlass.cmake
+++ b/cpp/cmake/thirdparty/get_cutlass.cmake
@@ -13,10 +13,11 @@
 # =============================================================================
 
 function(find_and_configure_cutlass)
-  set(oneValueArgs VERSION REPOSITORY PINNED_TAG)
+  set(options)
+  set(oneValueArgs)
+  set(multiValueArgs)
   cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
-  # if(RAFT_ENABLE_DIST_DEPENDENCIES OR RAFT_COMPILE_LIBRARIES)
   set(CUTLASS_ENABLE_HEADERS_ONLY
       ON
       CACHE BOOL "Enable only the header library"
@@ -34,13 +35,22 @@ function(find_and_configure_cutlass)
     set(CUDART_LIBRARY "${CUDA_cudart_static_LIBRARY}" CACHE FILEPATH "fixing cutlass cmake code" FORCE)
   endif()
 
+  include("${rapids-cmake-dir}/cpm/package_override.cmake")
+  rapids_cpm_package_override("${CMAKE_CURRENT_FUNCTION_LIST_DIR}/../patches/cutlass_override.json")
+
+  include("${rapids-cmake-dir}/cpm/detail/package_details.cmake")
+  rapids_cpm_package_details(cutlass version repository tag shallow exclude)
+
+  include("${rapids-cmake-dir}/cpm/detail/generate_patch_command.cmake")
+  rapids_cpm_generate_patch_command(cutlass ${version} patch_command)
+
   rapids_cpm_find(
-    NvidiaCutlass ${PKG_VERSION}
+    NvidiaCutlass ${version}
     GLOBAL_TARGETS nvidia::cutlass::cutlass
     CPM_ARGS
-    GIT_REPOSITORY ${PKG_REPOSITORY}
-    GIT_TAG ${PKG_PINNED_TAG}
-    GIT_SHALLOW TRUE
+    GIT_REPOSITORY ${repository}
+    GIT_TAG ${tag}
+    GIT_SHALLOW ${shallow} ${patch_command}
     OPTIONS "CUDAToolkit_ROOT ${CUDAToolkit_LIBRARY_DIR}"
   )
 
@@ -56,7 +66,6 @@ function(find_and_configure_cutlass)
       NAMESPACE nvidia::cutlass::
     )
   endif()
-  # endif()
 
   # We generate the cutlass-config files when we built cutlass locally, so always do
   # `find_dependency`
@@ -79,14 +88,4 @@ function(find_and_configure_cutlass)
   )
 endfunction()
 
-if(NOT RAFT_CUTLASS_GIT_TAG)
-  set(RAFT_CUTLASS_GIT_TAG v2.10.0)
-endif()
-
-if(NOT RAFT_CUTLASS_GIT_REPOSITORY)
-  set(RAFT_CUTLASS_GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git)
-endif()
-
-find_and_configure_cutlass(
-  VERSION 2.10.0 REPOSITORY ${RAFT_CUTLASS_GIT_REPOSITORY} PINNED_TAG ${RAFT_CUTLASS_GIT_TAG}
-)
+find_and_configure_cutlass()

From 007c3d2c9efddec6dc46549a5aa2e9f48d4d1612 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Thu, 5 Dec 2024 15:48:41 -0500
Subject: [PATCH 45/47] Moving spectral embedding and kernel gramm APIs to cuVS
 (#463)

Partially addresses #455

Authors:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Ben Frederickson (https://github.com/benfred)

Approvers:
  - Ben Frederickson (https://github.com/benfred)
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cuvs/pull/463
---
 build.sh                                      |  12 +-
 cpp/CMakeLists.txt                            |   4 +
 cpp/include/cuvs/cluster/agglomerative.hpp    |   1 +
 cpp/include/cuvs/distance/grammian.hpp        | 665 +++++++++++++++
 cpp/include/cuvs/embed/spectral.hpp           |  40 +
 .../distance/detail/kernels/gram_matrix.cu    | 481 +++++++++++
 .../distance/detail/kernels/gram_matrix.cuh   | 488 -----------
 .../distance/detail/kernels/kernel_factory.cu |  61 ++
 .../detail/kernels/kernel_factory.cuh         |  65 --
 .../detail/kernels/kernel_matrices.cu         | 726 ++++++++++++++++
 .../detail/kernels/kernel_matrices.cuh        | 777 ------------------
 .../distance/detail/kernels/rbf_fin_op.cuh    |   4 +-
 .../detail/pairwise_matrix/dispatch-ext.cuh   |   4 +-
 .../detail/pairwise_matrix/dispatch_rbf.cu    |   6 +-
 cpp/src/distance/distance-ext.cuh             |   4 +-
 cpp/src/distance/distance.cu                  |   4 +-
 cpp/src/embed/spectral.cu                     |  53 ++
 cpp/src/sparse/cluster/cluster_solvers.cuh    | 100 +++
 cpp/src/sparse/cluster/detail/spectral.cuh    | 111 +++
 .../spectral/modularity_maximization.hpp      | 176 ++++
 .../cluster/detail/spectral/partition.hpp     | 188 +++++
 .../cluster/detail/spectral/spectral_util.cuh | 181 ++++
 cpp/src/sparse/cluster/eigen_solvers.cuh      | 107 +++
 .../cluster/modularity_maximization.cuh       |  86 ++
 cpp/src/sparse/cluster/partition.cuh          |  95 +++
 cpp/test/CMakeLists.txt                       |   6 +
 cpp/test/distance/gram.cu                     | 174 ++++
 cpp/test/distance/gram_base.cuh               |  91 ++
 cpp/test/sparse/cluster/cluster_solvers.cu    | 105 +++
 cpp/test/sparse/cluster/eigen_solvers.cu      | 119 +++
 cpp/test/sparse/cluster/spectral.cu           | 109 +++
 cpp/test/sparse/cluster/spectral_matrix.cu    |  84 ++
 cpp/test/sparse/gram.cu                       | 330 ++++++++
 33 files changed, 4105 insertions(+), 1352 deletions(-)
 create mode 100644 cpp/include/cuvs/distance/grammian.hpp
 create mode 100644 cpp/include/cuvs/embed/spectral.hpp
 create mode 100644 cpp/src/distance/detail/kernels/gram_matrix.cu
 delete mode 100644 cpp/src/distance/detail/kernels/gram_matrix.cuh
 create mode 100644 cpp/src/distance/detail/kernels/kernel_factory.cu
 delete mode 100644 cpp/src/distance/detail/kernels/kernel_factory.cuh
 create mode 100644 cpp/src/distance/detail/kernels/kernel_matrices.cu
 delete mode 100644 cpp/src/distance/detail/kernels/kernel_matrices.cuh
 create mode 100644 cpp/src/embed/spectral.cu
 create mode 100644 cpp/src/sparse/cluster/cluster_solvers.cuh
 create mode 100644 cpp/src/sparse/cluster/detail/spectral.cuh
 create mode 100644 cpp/src/sparse/cluster/detail/spectral/modularity_maximization.hpp
 create mode 100644 cpp/src/sparse/cluster/detail/spectral/partition.hpp
 create mode 100644 cpp/src/sparse/cluster/detail/spectral/spectral_util.cuh
 create mode 100644 cpp/src/sparse/cluster/eigen_solvers.cuh
 create mode 100644 cpp/src/sparse/cluster/modularity_maximization.cuh
 create mode 100644 cpp/src/sparse/cluster/partition.cuh
 create mode 100644 cpp/test/distance/gram.cu
 create mode 100644 cpp/test/distance/gram_base.cuh
 create mode 100644 cpp/test/sparse/cluster/cluster_solvers.cu
 create mode 100644 cpp/test/sparse/cluster/eigen_solvers.cu
 create mode 100644 cpp/test/sparse/cluster/spectral.cu
 create mode 100644 cpp/test/sparse/cluster/spectral_matrix.cu
 create mode 100644 cpp/test/sparse/gram.cu

diff --git a/build.sh b/build.sh
index c08c2900e..bd5fa649b 100755
--- a/build.sh
+++ b/build.sh
@@ -76,8 +76,8 @@ BUILD_REPORT_METRICS=""
 BUILD_REPORT_INCL_CACHE_STATS=OFF
 BUILD_SHARED_LIBS=ON
 
-TEST_TARGETS="NEIGHBORS_ANN_CAGRA_TEST"
-ANN_BENCH_TARGETS="CUVS_ANN_BENCH_ALL"
+TEST_TARGETS=""
+ANN_BENCH_TARGETS=""
 
 CACHE_ARGS=""
 NVTX=ON
@@ -273,14 +273,6 @@ fi
 if hasArg tests || (( ${NUMARGS} == 0 )); then
     BUILD_TESTS=ON
     CMAKE_TARGET="${CMAKE_TARGET};${TEST_TARGETS}"
-
-    # Force compile library when needed test targets are specified
-    if [[ $CMAKE_TARGET == *"CAGRA_C_TEST"* || \
-          $CMAKE_TARGET == *"INTEROP_TEST"* || \
-          $CMAKE_TARGET == *"NEIGHBORS_ANN_CAGRA_TEST"* ]]; then
-      echo "-- Enabling compiled lib for gtests"
-      COMPILE_LIBRARY=ON
-    fi
 fi
 
 if hasArg bench-ann || (( ${NUMARGS} == 0 )); then
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 199bb232d..95fb7e63b 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -324,6 +324,9 @@ if(BUILD_SHARED_LIBS)
     src/cluster/kmeans_transform_float.cu
     src/cluster/single_linkage_float.cu
     src/core/bitset.cu
+    src/distance/detail/kernels/gram_matrix.cu
+    src/distance/detail/kernels/kernel_factory.cu
+    src/distance/detail/kernels/kernel_matrices.cu
     src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu
     src/distance/detail/pairwise_matrix/dispatch_canberra_half_float_float_int.cu
     src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu
@@ -370,6 +373,7 @@ if(BUILD_SHARED_LIBS)
     src/distance/distance.cu
     src/distance/pairwise_distance.cu
     src/distance/sparse_distance.cu
+    src/embed/spectral.cu
     src/neighbors/brute_force.cu
     src/neighbors/brute_force_serialize.cu
     src/neighbors/cagra_build_float.cu
diff --git a/cpp/include/cuvs/cluster/agglomerative.hpp b/cpp/include/cuvs/cluster/agglomerative.hpp
index e1da04085..8f7e8675a 100644
--- a/cpp/include/cuvs/cluster/agglomerative.hpp
+++ b/cpp/include/cuvs/cluster/agglomerative.hpp
@@ -18,6 +18,7 @@
 
 #include <cuvs/distance/distance.hpp>
 #include <optional>
+
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resources.hpp>
 
diff --git a/cpp/include/cuvs/distance/grammian.hpp b/cpp/include/cuvs/distance/grammian.hpp
new file mode 100644
index 000000000..0c904d493
--- /dev/null
+++ b/cpp/include/cuvs/distance/grammian.hpp
@@ -0,0 +1,665 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cublas_v2.h>
+#include <cuvs/distance/distance.hpp>
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/mdspan.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+
+namespace cuvs::distance::kernels {
+
+template <typename math_t>
+using dense_input_matrix_view_t = raft::device_matrix_view<const math_t, int, raft::layout_stride>;
+template <typename math_t>
+using dense_output_matrix_view_t = raft::device_matrix_view<math_t, int, raft::layout_stride>;
+template <typename math_t>
+using csr_input_matrix_view_t = raft::device_csr_matrix_view<const math_t, int, int, int>;
+
+/**
+ * Base class for general Gram matrices
+ * A Gram matrix is the Hermitian matrix of inner probucts G_ik = <x_i, x_k>
+ * Here, the  inner product is evaluated for all elements from vectors sets X1,
+ * and X2.
+ *
+ * To be more precise, on exit the output buffer will store:
+ * - if is_row_major == true: out[j+k*n1] = <x1_j, x2_k>,
+ * - if is_row_major == false: out[j*n2 + k] = <x1_j, x2_k>,
+ * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector
+ * from the x2 set.
+ */
+template <typename math_t>
+class GramMatrixBase {
+ protected:
+  cublasHandle_t cublas_handle;
+  bool legacy_interface;
+
+ public:
+  GramMatrixBase() : legacy_interface(false){};
+  [[deprecated]] GramMatrixBase(cublasHandle_t cublas_handle)
+    : cublas_handle(cublas_handle), legacy_interface(true){};
+
+  virtual ~GramMatrixBase(){};
+
+  /** Convenience function to evaluate the Gram matrix for two vector sets.
+   *  Vector sets are provided in Matrix format
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 dense device matrix view, size [n1*n_cols]
+   * @param [in] x2 dense device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+   * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+   */
+  void operator()(raft::resources const& handle,
+                  dense_input_matrix_view_t<math_t> x1,
+                  dense_input_matrix_view_t<math_t> x2,
+                  dense_output_matrix_view_t<math_t> out,
+                  math_t* norm_x1 = nullptr,
+                  math_t* norm_x2 = nullptr);
+
+  /** Convenience function to evaluate the Gram matrix for two vector sets.
+   *  Vector sets are provided in Matrix format
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 csr device matrix view, size [n1*n_cols]
+   * @param [in] x2 dense device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+   * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+   */
+  void operator()(raft::resources const& handle,
+                  csr_input_matrix_view_t<math_t> x1,
+                  dense_input_matrix_view_t<math_t> x2,
+                  dense_output_matrix_view_t<math_t> out,
+                  math_t* norm_x1 = nullptr,
+                  math_t* norm_x2 = nullptr);
+
+  /** Convenience function to evaluate the Gram matrix for two vector sets.
+   *  Vector sets are provided in Matrix format
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 csr device matrix view, size [n1*n_cols]
+   * @param [in] x2 csr device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+   * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+   */
+  void operator()(raft::resources const& handle,
+                  csr_input_matrix_view_t<math_t> x1,
+                  csr_input_matrix_view_t<math_t> x2,
+                  dense_output_matrix_view_t<math_t> out,
+                  math_t* norm_x1 = nullptr,
+                  math_t* norm_x2 = nullptr);
+
+  // unfortunately, 'evaluate' cannot be templatized as it needs to be virtual
+
+  /** Evaluate the Gram matrix for two vector sets using simple dot product.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 dense device matrix view, size [n1*n_cols]
+   * @param [in] x2 dense device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 unused.
+   * @param norm_x2 unused.
+   */
+  virtual void evaluate(raft::resources const& handle,
+                        dense_input_matrix_view_t<math_t> x1,
+                        dense_input_matrix_view_t<math_t> x2,
+                        dense_output_matrix_view_t<math_t> out,
+                        math_t* norm_x1,
+                        math_t* norm_x2);
+
+  /** Evaluate the Gram matrix for two vector sets using simple dot product.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 csr device matrix view, size [n1*n_cols]
+   * @param [in] x2 dense device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 unused.
+   * @param norm_x2 unused.
+   */
+  virtual void evaluate(raft::resources const& handle,
+                        csr_input_matrix_view_t<math_t> x1,
+                        dense_input_matrix_view_t<math_t> x2,
+                        dense_output_matrix_view_t<math_t> out,
+                        math_t* norm_x1,
+                        math_t* norm_x2);
+
+  /** Evaluate the Gram matrix for two vector sets using simple dot product.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 csr device matrix view, size [n1*n_cols]
+   * @param [in] x2 csr device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 unused.
+   * @param norm_x2 unused.
+   */
+  virtual void evaluate(raft::resources const& handle,
+                        csr_input_matrix_view_t<math_t> x1,
+                        csr_input_matrix_view_t<math_t> x2,
+                        dense_output_matrix_view_t<math_t> out,
+                        math_t* norm_x1,
+                        math_t* norm_x2);
+
+  /** Evaluate the Gram matrix for two vector sets using simple dot product.
+   *
+   * @param [in] x1 device array of vectors, size [n1*n_cols]
+   * @param [in] n1 number vectors in x1
+   * @param [in] n_cols number of columns (features) in x1 and x2
+   * @param [in] x2 device array of vectors, size [n2*n_cols]
+   * @param [in] n2 number vectors in x2
+   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+   * @param [in] is_row_major whether the input and output matrices are in row
+   *        major format
+   * @param [in] stream cuda stream
+   * @param ld1 leading dimension of x1 (usually it is n1)
+   * @param ld2 leading dimension of x2 (usually it is n2)
+   * @param ld_out leading dimension of out (usually it is n1)
+   */
+  [[deprecated]] virtual void evaluate(const math_t* x1,
+                                       int n1,
+                                       int n_cols,
+                                       const math_t* x2,
+                                       int n2,
+                                       math_t* out,
+                                       bool is_row_major,
+                                       cudaStream_t stream,
+                                       int ld1,
+                                       int ld2,
+                                       int ld_out);
+
+  /** Convenience function to evaluate the Gram matrix for two vector sets.
+   *
+   * @param [in] x1 device array of vectors, size [n1*n_cols]
+   * @param [in] n1 number vectors in x1
+   * @param [in] n_cols number of columns (features) in x1 and x2
+   * @param [in] x2 device array of vectors, size [n2*n_cols]
+   * @param [in] n2 number vectors in x2
+   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+   * @param [in] is_row_major whether the input and output matrices are in row
+   *        major format
+   * @param [in] stream cuda stream
+   * @param ld1 leading dimension of x1
+   * @param ld2 leading dimension of x2
+   * @param ld_out leading dimension of out
+   */
+  [[deprecated]] void operator()(const math_t* x1,
+                                 int n1,
+                                 int n_cols,
+                                 const math_t* x2,
+                                 int n2,
+                                 math_t* out,
+                                 bool is_row_major,
+                                 cudaStream_t stream,
+                                 int ld1    = 0,
+                                 int ld2    = 0,
+                                 int ld_out = 0);
+
+ protected:
+  /** Calculates the Gram matrix using simple dot product between vector sets.
+   *
+   * out = x1 * x2
+   *
+   * Can be used as a building block for more complex kernel functions.
+   *
+   * @param [in] x1 device array of vectors, size [n1*n_cols]
+   * @param [in] n1 number vectors in x1
+   * @param [in] n_cols number of columns (features) in x1 and x2
+   * @param [in] x2 device array of vectors, size [n2*n_cols]
+   * @param [in] n2 number vectors in x2
+   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+   * @param [in] is_row_major whether the input and output matrices are in row
+   *        major format
+   * @param [in] stream cuda stream
+   * @param ld1 leading dimension of x1
+   * @param ld2 leading dimension of x2
+   * @param ld_out leading dimension of out
+   */
+  [[deprecated]] void linear(const math_t* x1,
+                             int n1,
+                             int n_cols,
+                             const math_t* x2,
+                             int n2,
+                             math_t* out,
+                             bool is_row_major,
+                             cudaStream_t stream,
+                             int ld1,
+                             int ld2,
+                             int ld_out);
+
+ protected:
+  bool get_is_row_major(dense_output_matrix_view_t<math_t> matrix);
+  bool get_is_row_major(dense_input_matrix_view_t<math_t> matrix);
+  bool get_is_col_major(dense_output_matrix_view_t<math_t> matrix);
+  bool get_is_col_major(dense_input_matrix_view_t<math_t> matrix);
+
+  /** Calculates the Gram matrix using simple dot product between vector sets.
+   *
+   * out = x1 * x2
+   *
+   * Can be used as a building block for more complex kernel functions.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 dense device matrix view, size [n1*n_cols]
+   * @param [in] x2 dense device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   */
+  void linear(raft::resources const& handle,
+              dense_input_matrix_view_t<math_t> x1,
+              dense_input_matrix_view_t<math_t> x2,
+              dense_output_matrix_view_t<math_t> out);
+
+  /** Calculates the Gram matrix using simple dot product between vector sets.
+   *
+   * out = x1 * x2
+   *
+   * Can be used as a building block for more complex kernel functions.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 csr device matrix view, size [n1*n_cols]
+   * @param [in] x2 dense device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   */
+  void linear(raft::resources const& handle,
+              csr_input_matrix_view_t<math_t> x1,
+              dense_input_matrix_view_t<math_t> x2,
+              dense_output_matrix_view_t<math_t> out);
+
+  /** Calculates the Gram matrix using simple dot product between vector sets.
+   *
+   * out = x1 * x2
+   *
+   * Can be used as a building block for more complex kernel functions.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 csr device matrix view, size [n1*n_cols]
+   * @param [in] x2 csr device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   */
+  void linear(raft::resources const& handle,
+              csr_input_matrix_view_t<math_t> x1,
+              csr_input_matrix_view_t<math_t> x2,
+              dense_output_matrix_view_t<math_t> out);
+};
+
+template <typename math_t>
+class KernelFactory {
+ public:
+  static GramMatrixBase<math_t>* create(KernelParams params);
+  [[deprecated]] static GramMatrixBase<math_t>* create(KernelParams params, cublasHandle_t handle);
+};
+
+/**
+ * Create a kernel matrix using polynomial kernel function.
+ */
+template <typename math_t, typename exp_t>
+class PolynomialKernel : public GramMatrixBase<math_t> {
+  exp_t exponent;
+  math_t gain;
+  math_t offset;
+
+  void applyKernel(
+    math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream);
+
+ public:
+  /**
+   * Constructs a polynomial kernel object.
+   * It evaluates the kernel matrix using the following formula:
+   * K_ij = (gain*<x1_i, x2_k> + offset)^exponent
+   *
+   * @tparam math_t floating point type
+   * @tparam exp_t type of exponent
+   * @param exponent
+   * @param gain
+   * @param offset
+   */
+  PolynomialKernel(exp_t exponent, math_t gain, math_t offset)
+    : GramMatrixBase<math_t>(), exponent(exponent), gain(gain), offset(offset){};
+
+  [[deprecated]] PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t handle)
+    : GramMatrixBase<math_t>(handle), exponent(exponent), gain(gain), offset(offset){};
+
+  /** Evaluate kernel matrix using polynomial kernel.
+   *
+   * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+   * in the x2 set, and < , > denotes dot product.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 dense device matrix view, size [n1*n_cols]
+   * @param [in] x2 dense device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 unused.
+   * @param norm_x2 unused.
+   */
+  void evaluate(raft::resources const& handle,
+                dense_input_matrix_view_t<math_t> x1,
+                dense_input_matrix_view_t<math_t> x2,
+                dense_output_matrix_view_t<math_t> out,
+                math_t* norm_x1,
+                math_t* norm_x2);
+
+  /** Evaluate kernel matrix using polynomial kernel.
+   *
+   * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+   * in the x2 set, and < , > denotes dot product.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 csr device matrix view, size [n1*n_cols]
+   * @param [in] x2 dense device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 unused.
+   * @param norm_x2 unused.
+   */
+  void evaluate(raft::resources const& handle,
+                csr_input_matrix_view_t<math_t> x1,
+                dense_input_matrix_view_t<math_t> x2,
+                dense_output_matrix_view_t<math_t> out,
+                math_t* norm_x1,
+                math_t* norm_x2);
+
+  /** Evaluate kernel matrix using polynomial kernel.
+   *
+   * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+   * in the x2 set, and < , > denotes dot product.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 csr device matrix view, size [n1*n_cols]
+   * @param [in] x2 csr device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 unused.
+   * @param norm_x2 unused.
+   */
+  void evaluate(raft::resources const& handle,
+                csr_input_matrix_view_t<math_t> x1,
+                csr_input_matrix_view_t<math_t> x2,
+                dense_output_matrix_view_t<math_t> out,
+                math_t* norm_x1,
+                math_t* norm_x2);
+
+  /** Evaluate the Gram matrix using the legacy interface.
+   *
+   * @param [in] x1 device array of vectors, size [n1*n_cols]
+   * @param [in] n1 number vectors in x1
+   * @param [in] n_cols number of columns (features) in x1 and x2
+   * @param [in] x2 device array of vectors, size [n2*n_cols]
+   * @param [in] n2 number vectors in x2
+   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+   * @param [in] is_row_major whether the input and output matrices are in row
+   *        major format
+   * @param [in] stream cuda stream
+   * @param ld1 leading dimension of x1 (usually it is n1)
+   * @param ld2 leading dimension of x2 (usually it is n2)
+   * @param ld_out leading dimension of out (usually it is n1)
+   */
+  [[deprecated]] void evaluate(const math_t* x1,
+                               int n1,
+                               int n_cols,
+                               const math_t* x2,
+                               int n2,
+                               math_t* out,
+                               bool is_row_major,
+                               cudaStream_t stream,
+                               int ld1,
+                               int ld2,
+                               int ld_out);
+};
+
+/**
+ * Create a kernel matrix using tanh kernel function.
+ */
+template <typename math_t>
+class TanhKernel : public GramMatrixBase<math_t> {
+  math_t gain, offset;
+
+  void applyKernel(
+    math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream);
+
+ public:
+  /**
+   * Constructs a tanh kernel object.
+   * It evaluates the kernel matrix using the following formula:
+   * K_ij = tanh(gain*<x1_i, x2_k> + offset)
+   *
+   * @tparam math_t floating point type
+   * @param gain
+   * @param offset
+   */
+  TanhKernel(math_t gain, math_t offset) : GramMatrixBase<math_t>(), gain(gain), offset(offset) {}
+
+  [[deprecated]] TanhKernel(math_t gain, math_t offset, cublasHandle_t handle)
+    : GramMatrixBase<math_t>(handle), gain(gain), offset(offset){};
+
+  /** Evaluate kernel matrix using tanh kernel.
+   *
+   * output_[i + k*n1] = (gain*<x1_i, x2_k> + offset)^exponent,
+   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+   * in the x2 set, and < , > denotes dot product.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 dense device matrix view, size [n1*n_cols]
+   * @param [in] x2 dense device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 unused.
+   * @param norm_x2 unused.
+   */
+  void evaluate(raft::resources const& handle,
+                dense_input_matrix_view_t<math_t> x1,
+                dense_input_matrix_view_t<math_t> x2,
+                dense_output_matrix_view_t<math_t> out,
+                math_t* norm_x1,
+                math_t* norm_x2);
+
+  /** Evaluate kernel matrix using tanh kernel.
+   *
+   * output_[i + k*n1] = (gain*<x1_i, x2_k> + offset)^exponent,
+   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+   * in the x2 set, and < , > denotes dot product.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 csr device matrix view, size [n1*n_cols]
+   * @param [in] x2 dense device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 unused.
+   * @param norm_x2 unused.
+   */
+  void evaluate(raft::resources const& handle,
+                csr_input_matrix_view_t<math_t> x1,
+                dense_input_matrix_view_t<math_t> x2,
+                dense_output_matrix_view_t<math_t> out,
+                math_t* norm_x1,
+                math_t* norm_x2);
+
+  /** Evaluate kernel matrix using tanh kernel.
+   *
+   * output_[i + k*n1] = (gain*<x1_i, x2_k> + offset)^exponent,
+   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+   * in the x2 set, and < , > denotes dot product.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 csr device matrix view, size [n1*n_cols]
+   * @param [in] x2 csr device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 unused.
+   * @param norm_x2 unused.
+   */
+  void evaluate(raft::resources const& handle,
+                csr_input_matrix_view_t<math_t> x1,
+                csr_input_matrix_view_t<math_t> x2,
+                dense_output_matrix_view_t<math_t> out,
+                math_t* norm_x1,
+                math_t* norm_x2);
+
+  /** Evaluate the Gram matrix using the legacy interface.
+   *
+   * @param [in] x1 device array of vectors, size [n1*n_cols]
+   * @param [in] n1 number vectors in x1
+   * @param [in] n_cols number of columns (features) in x1 and x2
+   * @param [in] x2 device array of vectors, size [n2*n_cols]
+   * @param [in] n2 number vectors in x2
+   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+   * @param [in] is_row_major whether the input and output matrices are in row
+   *        major format
+   * @param [in] stream cuda stream
+   * @param ld1 leading dimension of x1 (usually it is n1)
+   * @param ld2 leading dimension of x2 (usually it is n2)
+   * @param ld_out leading dimension of out (usually it is n1)
+   */
+  [[deprecated]] void evaluate(const math_t* x1,
+                               int n1,
+                               int n_cols,
+                               const math_t* x2,
+                               int n2,
+                               math_t* out,
+                               bool is_row_major,
+                               cudaStream_t stream,
+                               int ld1,
+                               int ld2,
+                               int ld_out);
+};
+
+/**
+ * Create a kernel matrix using RBF kernel function.
+ */
+template <typename math_t>
+class RBFKernel : public GramMatrixBase<math_t> {
+  math_t gain;
+
+  void applyKernel(math_t* inout,
+                   int ld,
+                   int rows,
+                   int cols,
+                   math_t* norm_x1,
+                   math_t* norm_x2,
+                   bool is_row_major,
+                   cudaStream_t stream);
+
+ public:
+  /**
+   * Constructs a RBF kernel object.
+   * It evaluates the kernel matrix using the following formula:
+   * K_ij = exp(-gain*|x1_i- x2_k|^2)
+   *
+   * @tparam math_t floating point type
+   * @param gain
+   */
+  RBFKernel(math_t gain) : GramMatrixBase<math_t>(), gain(gain){};
+
+  [[deprecated]] RBFKernel(math_t gain, cublasHandle_t handle)
+    : GramMatrixBase<math_t>(handle), gain(gain){};
+
+  void matrixRowNormL2(raft::resources const& handle,
+                       dense_input_matrix_view_t<math_t> matrix,
+                       math_t* target);
+
+  void matrixRowNormL2(raft::resources const& handle,
+                       csr_input_matrix_view_t<math_t> matrix,
+                       math_t* target);
+
+  /** Evaluate kernel matrix using RBF kernel.
+   *
+   * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
+   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+   * in the x2 set, and | | euclidean distance.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 dense device matrix view, size [n1*n_cols]
+   * @param [in] x2 dense device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+   * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+   */
+  void evaluate(raft::resources const& handle,
+                dense_input_matrix_view_t<math_t> x1,
+                dense_input_matrix_view_t<math_t> x2,
+                dense_output_matrix_view_t<math_t> out,
+                math_t* norm_x1,
+                math_t* norm_x2);
+
+  /** Evaluate kernel matrix using RBF kernel.
+   *
+   * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
+   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+   * in the x2 set, and | | euclidean distance.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 csr device matrix view, size [n1*n_cols]
+   * @param [in] x2 dense device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+   * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+   */
+  void evaluate(raft::resources const& handle,
+                csr_input_matrix_view_t<math_t> x1,
+                dense_input_matrix_view_t<math_t> x2,
+                dense_output_matrix_view_t<math_t> out,
+                math_t* norm_x1,
+                math_t* norm_x2);
+
+  /** Evaluate kernel matrix using RBF kernel.
+   *
+   * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
+   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+   * in the x2 set, and | | euclidean distance.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 csr device matrix view, size [n1*n_cols]
+   * @param [in] x2 csr device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+   * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+   */
+  void evaluate(raft::resources const& handle,
+                csr_input_matrix_view_t<math_t> x1,
+                csr_input_matrix_view_t<math_t> x2,
+                dense_output_matrix_view_t<math_t> out,
+                math_t* norm_x1,
+                math_t* norm_x2);
+
+  /** Evaluate the Gram matrix using the legacy interface.
+   *
+   * @param [in] x1 device array of vectors, size [n1*n_cols]
+   * @param [in] n1 number vectors in x1
+   * @param [in] n_cols number of columns (features) in x1 and x2
+   * @param [in] x2 device array of vectors, size [n2*n_cols]
+   * @param [in] n2 number vectors in x2
+   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+   * @param [in] is_row_major whether the input and output matrices are in row
+   *        major format
+   * @param [in] stream cuda stream
+   * @param ld1 leading dimension of x1 (usually it is n1)
+   * @param ld2 leading dimension of x2 (usually it is n2)
+   * @param ld_out leading dimension of out (usually it is n1)
+   */
+  [[deprecated]] void evaluate(const math_t* x1,
+                               int n1,
+                               int n_cols,
+                               const math_t* x2,
+                               int n2,
+                               math_t* out,
+                               bool is_row_major,
+                               cudaStream_t stream,
+                               int ld1,
+                               int ld2,
+                               int ld_out);
+};
+};  // end namespace cuvs::distance::kernels
diff --git a/cpp/include/cuvs/embed/spectral.hpp b/cpp/include/cuvs/embed/spectral.hpp
new file mode 100644
index 000000000..1a8fed96a
--- /dev/null
+++ b/cpp/include/cuvs/embed/spectral.hpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/device_coo_matrix.hpp>
+#include <raft/core/resources.hpp>
+
+namespace cuvs::embed::spectral {
+
+/**
+ * Given a COO formatted (symmetric) knn graph, this function computes the spectral embeddings
+ * (lowest n_components eigenvectors), using Lanczos min cut algorithm. Please note that this
+ * algorithm does not compute a full laplacian eigenmap, as the laplacian eigenmap would embed each
+ * connected component. Laplacian eigenmaps can be built from this algorithm by running it on the
+ * vectors for each connected component.
+
+ * @param[in] handle
+ * @param[in] knn_graph KNN Graph
+ * @param[in] n_components the number of components to project into
+ * @param[out] out output array for embedding (size n*n_comonents)
+ * @param[in] seed
+ */
+void fit(const raft::resources& handle,
+         raft::device_coo_matrix_view<float, int, int, int> knn_graph,
+         int n_components,
+         raft::device_matrix_view<float, int> out,
+         unsigned long long seed = 0L);
+};  // namespace cuvs::embed::spectral
diff --git a/cpp/src/distance/detail/kernels/gram_matrix.cu b/cpp/src/distance/detail/kernels/gram_matrix.cu
new file mode 100644
index 000000000..0e4f3e639
--- /dev/null
+++ b/cpp/src/distance/detail/kernels/gram_matrix.cu
@@ -0,0 +1,481 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../../distance.cuh"
+#include <cuvs/distance/distance.hpp>
+#include <cuvs/distance/grammian.hpp>
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/linalg/gemm.cuh>
+#include <raft/sparse/linalg/spmm.hpp>
+
+namespace cuvs::distance::kernels {
+
+/**
+ * Base class for general Gram matrices
+ * A Gram matrix is the Hermitian matrix of inner probucts G_ik = <x_i, x_k>
+ * Here, the  inner product is evaluated for all elements from vectors sets X1,
+ * and X2.
+ *
+ * To be more precise, on exit the output buffer will store:
+ * - if is_row_major == true: out[j+k*n1] = <x1_j, x2_k>,
+ * - if is_row_major == false: out[j*n2 + k] = <x1_j, x2_k>,
+ * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector
+ * from the x2 set.
+ */
+
+/** Convenience function to evaluate the Gram matrix for two vector sets.
+ *  Vector sets are provided in Matrix format
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+template <typename math_t>
+void GramMatrixBase<math_t>::operator()(raft::resources const& handle,
+                                        dense_input_matrix_view_t<math_t> x1,
+                                        dense_input_matrix_view_t<math_t> x2,
+                                        dense_output_matrix_view_t<math_t> out,
+                                        math_t* norm_x1,
+                                        math_t* norm_x2)
+{
+  evaluate(handle, x1, x2, out, norm_x1, norm_x2);
+}
+
+/** Convenience function to evaluate the Gram matrix for two vector sets.
+ *  Vector sets are provided in Matrix format
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+template <typename math_t>
+void GramMatrixBase<math_t>::operator()(raft::resources const& handle,
+                                        csr_input_matrix_view_t<math_t> x1,
+                                        dense_input_matrix_view_t<math_t> x2,
+                                        dense_output_matrix_view_t<math_t> out,
+                                        math_t* norm_x1,
+                                        math_t* norm_x2)
+{
+  evaluate(handle, x1, x2, out, norm_x1, norm_x2);
+}
+
+/** Convenience function to evaluate the Gram matrix for two vector sets.
+ *  Vector sets are provided in Matrix format
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+template <typename math_t>
+void GramMatrixBase<math_t>::operator()(raft::resources const& handle,
+                                        csr_input_matrix_view_t<math_t> x1,
+                                        csr_input_matrix_view_t<math_t> x2,
+                                        dense_output_matrix_view_t<math_t> out,
+                                        math_t* norm_x1,
+                                        math_t* norm_x2)
+{
+  evaluate(handle, x1, x2, out, norm_x1, norm_x2);
+}
+
+// unfortunately, 'evaluate' cannot be templatized as it needs to be virtual
+
+/** Evaluate the Gram matrix for two vector sets using simple dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t>
+void GramMatrixBase<math_t>::evaluate(raft::resources const& handle,
+                                      dense_input_matrix_view_t<math_t> x1,
+                                      dense_input_matrix_view_t<math_t> x2,
+                                      dense_output_matrix_view_t<math_t> out,
+                                      math_t* norm_x1,
+                                      math_t* norm_x2)
+{
+  linear(handle, x1, x2, out);
+}
+/** Evaluate the Gram matrix for two vector sets using simple dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t>
+void GramMatrixBase<math_t>::evaluate(raft::resources const& handle,
+                                      csr_input_matrix_view_t<math_t> x1,
+                                      dense_input_matrix_view_t<math_t> x2,
+                                      dense_output_matrix_view_t<math_t> out,
+                                      math_t* norm_x1,
+                                      math_t* norm_x2)
+{
+  linear(handle, x1, x2, out);
+}
+/** Evaluate the Gram matrix for two vector sets using simple dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t>
+void GramMatrixBase<math_t>::evaluate(raft::resources const& handle,
+                                      csr_input_matrix_view_t<math_t> x1,
+                                      csr_input_matrix_view_t<math_t> x2,
+                                      dense_output_matrix_view_t<math_t> out,
+                                      math_t* norm_x1,
+                                      math_t* norm_x2)
+{
+  linear(handle, x1, x2, out);
+}
+
+/** Evaluate the Gram matrix for two vector sets using simple dot product.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ *        major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1 (usually it is n1)
+ * @param ld2 leading dimension of x2 (usually it is n2)
+ * @param ld_out leading dimension of out (usually it is n1)
+ */
+template <typename math_t>
+[[deprecated]] void GramMatrixBase<math_t>::evaluate(const math_t* x1,
+                                                     int n1,
+                                                     int n_cols,
+                                                     const math_t* x2,
+                                                     int n2,
+                                                     math_t* out,
+                                                     bool is_row_major,
+                                                     cudaStream_t stream,
+                                                     int ld1,
+                                                     int ld2,
+                                                     int ld_out)
+{
+  linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
+}
+
+/** Convenience function to evaluate the Gram matrix for two vector sets.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ *        major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1
+ * @param ld2 leading dimension of x2
+ * @param ld_out leading dimension of out
+ */
+template <typename math_t>
+[[deprecated]] void GramMatrixBase<math_t>::operator()(const math_t* x1,
+                                                       int n1,
+                                                       int n_cols,
+                                                       const math_t* x2,
+                                                       int n2,
+                                                       math_t* out,
+                                                       bool is_row_major,
+                                                       cudaStream_t stream,
+                                                       int ld1,
+                                                       int ld2,
+                                                       int ld_out)
+{
+  ASSERT(legacy_interface, "Legacy interface can only be used with legacy ctor.");
+  if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; }
+  if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; }
+  if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; }
+  evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
+}
+
+/** Calculates the Gram matrix using simple dot product between vector sets.
+ *
+ * out = x1 * x2
+ *
+ * Can be used as a building block for more complex kernel functions.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ *        major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1
+ * @param ld2 leading dimension of x2
+ * @param ld_out leading dimension of out
+ */
+template <typename math_t>
+[[deprecated]] void GramMatrixBase<math_t>::linear(const math_t* x1,
+                                                   int n1,
+                                                   int n_cols,
+                                                   const math_t* x2,
+                                                   int n2,
+                                                   math_t* out,
+                                                   bool is_row_major,
+                                                   cudaStream_t stream,
+                                                   int ld1,
+                                                   int ld2,
+                                                   int ld_out)
+{
+  math_t alpha = 1.0;
+  math_t beta  = 0.0;
+  if (is_row_major) {
+    // #TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle,
+                                                     CUBLAS_OP_T,
+                                                     CUBLAS_OP_N,
+                                                     n2,
+                                                     n1,
+                                                     n_cols,
+                                                     &alpha,
+                                                     x2,
+                                                     ld2,
+                                                     x1,
+                                                     ld1,
+                                                     &beta,
+                                                     out,
+                                                     ld_out,
+                                                     stream));
+  } else {
+    // #TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle,
+                                                     CUBLAS_OP_N,
+                                                     CUBLAS_OP_T,
+                                                     n1,
+                                                     n2,
+                                                     n_cols,
+                                                     &alpha,
+                                                     x1,
+                                                     ld1,
+                                                     x2,
+                                                     ld2,
+                                                     &beta,
+                                                     out,
+                                                     ld_out,
+                                                     stream));
+  }
+}
+
+template <typename math_t>
+bool GramMatrixBase<math_t>::get_is_row_major(dense_output_matrix_view_t<math_t> matrix)
+{
+  return (matrix.stride(1) == 1);
+}
+template <typename math_t>
+bool GramMatrixBase<math_t>::get_is_row_major(dense_input_matrix_view_t<math_t> matrix)
+{
+  return (matrix.stride(1) == 1);
+}
+
+template <typename math_t>
+bool GramMatrixBase<math_t>::get_is_col_major(dense_output_matrix_view_t<math_t> matrix)
+{
+  return (matrix.stride(0) == 1);
+}
+
+template <typename math_t>
+bool GramMatrixBase<math_t>::get_is_col_major(dense_input_matrix_view_t<math_t> matrix)
+{
+  return (matrix.stride(0) == 1);
+}
+
+/** Calculates the Gram matrix using simple dot product between vector sets.
+ *
+ * out = x1 * x2
+ *
+ * Can be used as a building block for more complex kernel functions.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ */
+template <typename math_t>
+void GramMatrixBase<math_t>::linear(raft::resources const& handle,
+                                    dense_input_matrix_view_t<math_t> x1,
+                                    dense_input_matrix_view_t<math_t> x2,
+                                    dense_output_matrix_view_t<math_t> out)
+{
+  // check is_row_major consistency
+  bool is_row_major = get_is_row_major(x1) && get_is_row_major(x2) && get_is_row_major(out);
+  bool is_col_major = get_is_col_major(x1) && get_is_col_major(x2) && get_is_col_major(out);
+  ASSERT(is_row_major || is_col_major,
+         "GramMatrix leading dimensions for x1, x2 and out do not match");
+
+  // check dimensions
+  int n1     = out.extent(0);
+  int n2     = out.extent(1);
+  int n_cols = x1.extent(1);
+  ASSERT(x1.extent(0) == n1, "GramMatrix input matrix dimensions for x1 and out do not match");
+  ASSERT(x2.extent(0) == n2, "GramMatrix input matrix dimensions for x2 and out do not match");
+  ASSERT(x2.extent(1) == n_cols, "GramMatrix input matrix dimensions for x1 and x2 do not match");
+
+  // extract major stride
+  int ld1    = is_row_major ? x1.stride(0) : x1.stride(1);
+  int ld2    = is_row_major ? x2.stride(0) : x2.stride(1);
+  int ld_out = is_row_major ? out.stride(0) : out.stride(1);
+
+  math_t alpha = 1.0;
+  math_t beta  = 0.0;
+  if (is_row_major) {
+    // #TODO: Use mdspan-based API when stride-capable
+    // https://github.com/rapidsai/raft/issues/875
+    raft::linalg::gemm(handle,
+                       true,
+                       false,
+                       n2,
+                       n1,
+                       n_cols,
+                       &alpha,
+                       x2.data_handle(),
+                       ld2,
+                       x1.data_handle(),
+                       ld1,
+                       &beta,
+                       out.data_handle(),
+                       ld_out,
+                       raft::resource::get_cuda_stream(handle));
+  } else {
+    // #TODO: Use mdspan-based API when stride-capable
+    // https://github.com/rapidsai/raft/issues/875
+    raft::linalg::gemm(handle,
+                       false,
+                       true,
+                       n1,
+                       n2,
+                       n_cols,
+                       &alpha,
+                       x1.data_handle(),
+                       ld1,
+                       x2.data_handle(),
+                       ld2,
+                       &beta,
+                       out.data_handle(),
+                       ld_out,
+                       raft::resource::get_cuda_stream(handle));
+  }
+}
+
+/** Calculates the Gram matrix using simple dot product between vector sets.
+ *
+ * out = x1 * x2
+ *
+ * Can be used as a building block for more complex kernel functions.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ */
+template <typename math_t>
+void GramMatrixBase<math_t>::linear(raft::resources const& handle,
+                                    csr_input_matrix_view_t<math_t> x1,
+                                    dense_input_matrix_view_t<math_t> x2,
+                                    dense_output_matrix_view_t<math_t> out)
+{
+  // check is_row_major consistency
+  bool is_row_major = get_is_row_major(x2) && get_is_row_major(out);
+  bool is_col_major = get_is_col_major(x2) && get_is_col_major(out);
+  ASSERT(is_row_major || is_col_major, "GramMatrix leading dimensions for x2 and out do not match");
+
+  // check dimensions
+  auto x1_structure = x1.structure_view();
+  ASSERT(x1_structure.get_n_rows() == out.extent(0),
+         "GramMatrix input matrix dimensions for x1 and out do not match");
+  ASSERT(x2.extent(0) == out.extent(1),
+         "GramMatrix input matrix dimensions for x2 and out do not match");
+  ASSERT(x2.extent(1) == x1_structure.get_n_cols(),
+         "GramMatrix input matrix dimensions for x1 and x2 do not match");
+
+  math_t alpha = 1.0;
+  math_t beta  = 0.0;
+
+  raft::sparse::linalg::spmm(handle, false, true, &alpha, x1, x2, &beta, out);
+}
+
+/** Calculates the Gram matrix using simple dot product between vector sets.
+ *
+ * out = x1 * x2
+ *
+ * Can be used as a building block for more complex kernel functions.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ */
+template <typename math_t>
+void GramMatrixBase<math_t>::linear(raft::resources const& handle,
+                                    csr_input_matrix_view_t<math_t> x1,
+                                    csr_input_matrix_view_t<math_t> x2,
+                                    dense_output_matrix_view_t<math_t> out)
+{
+  // check layout consistency (w.r.t. strides a matrix might be both row & col major)
+  bool is_row_major_nopad = get_is_row_major(out) && out.stride(0) == out.extent(1);
+  bool is_col_major_nopad = get_is_col_major(out) && out.stride(1) == out.extent(0);
+
+  ASSERT(is_row_major_nopad || is_col_major_nopad,
+         "Sparse linear Kernel distance does not support ld_out parameter");
+
+  // switch a,b based on is_row_major
+  if (is_col_major_nopad) {
+    auto out_row_major = raft::make_device_matrix_view<math_t, int, raft::row_major>(
+      out.data_handle(), out.extent(1), out.extent(0));
+
+    cuvs::distance::pairwise_distance(
+      handle, x2, x1, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0);
+  } else {
+    auto out_row_major = raft::make_device_matrix_view<math_t, int, raft::row_major>(
+      out.data_handle(), out.extent(0), out.extent(1));
+    cuvs::distance::pairwise_distance(
+      handle, x1, x2, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0);
+  }
+}
+
+template class GramMatrixBase<float>;
+template class GramMatrixBase<double>;
+
+};  // namespace cuvs::distance::kernels
diff --git a/cpp/src/distance/detail/kernels/gram_matrix.cuh b/cpp/src/distance/detail/kernels/gram_matrix.cuh
deleted file mode 100644
index d435fb4d1..000000000
--- a/cpp/src/distance/detail/kernels/gram_matrix.cuh
+++ /dev/null
@@ -1,488 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "../../distance.cuh"
-#include <cuvs/distance/distance.hpp>
-#include <raft/core/device_csr_matrix.hpp>
-#include <raft/core/resource/cuda_stream.hpp>
-#include <raft/core/resources.hpp>
-// #include <raft/sparse/detail/cusparse_wrappers.h>
-#include <raft/linalg/detail/cublas_wrappers.hpp>
-#include <raft/linalg/gemm.cuh>
-#include <raft/sparse/distance/distance.cuh>
-#include <raft/sparse/linalg/spmm.hpp>
-
-namespace cuvs::distance::kernels::detail {
-
-template <typename math_t>
-using dense_input_matrix_view_t = raft::device_matrix_view<const math_t, int, layout_stride>;
-template <typename math_t>
-using dense_output_matrix_view_t = raft::device_matrix_view<math_t, int, layout_stride>;
-template <typename math_t>
-using csr_input_matrix_view_t = raft::device_csr_matrix_view<const math_t, int, int, int>;
-
-/**
- * Base class for general Gram matrices
- * A Gram matrix is the Hermitian matrix of inner probucts G_ik = <x_i, x_k>
- * Here, the  inner product is evaluated for all elements from vectors sets X1,
- * and X2.
- *
- * To be more precise, on exit the output buffer will store:
- * - if is_row_major == true: out[j+k*n1] = <x1_j, x2_k>,
- * - if is_row_major == false: out[j*n2 + k] = <x1_j, x2_k>,
- * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector
- * from the x2 set.
- */
-template <typename math_t>
-class GramMatrixBase {
- protected:
-  cublasHandle_t cublas_handle;
-  bool legacy_interface;
-
- public:
-  GramMatrixBase() : legacy_interface(false){};
-  [[deprecated]] GramMatrixBase(cublasHandle_t cublas_handle)
-    : cublas_handle(cublas_handle), legacy_interface(true){};
-
-  virtual ~GramMatrixBase(){};
-
-  /** Convenience function to evaluate the Gram matrix for two vector sets.
-   *  Vector sets are provided in Matrix format
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 dense device matrix view, size [n1*n_cols]
-   * @param [in] x2 dense device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
-   * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
-   */
-  void operator()(raft::resources const& handle,
-                  dense_input_matrix_view_t<math_t> x1,
-                  dense_input_matrix_view_t<math_t> x2,
-                  dense_output_matrix_view_t<math_t> out,
-                  math_t* norm_x1 = nullptr,
-                  math_t* norm_x2 = nullptr)
-  {
-    evaluate(handle, x1, x2, out, norm_x1, norm_x2);
-  }
-
-  /** Convenience function to evaluate the Gram matrix for two vector sets.
-   *  Vector sets are provided in Matrix format
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 csr device matrix view, size [n1*n_cols]
-   * @param [in] x2 dense device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
-   * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
-   */
-  void operator()(raft::resources const& handle,
-                  csr_input_matrix_view_t<math_t> x1,
-                  dense_input_matrix_view_t<math_t> x2,
-                  dense_output_matrix_view_t<math_t> out,
-                  math_t* norm_x1 = nullptr,
-                  math_t* norm_x2 = nullptr)
-  {
-    evaluate(handle, x1, x2, out, norm_x1, norm_x2);
-  }
-
-  /** Convenience function to evaluate the Gram matrix for two vector sets.
-   *  Vector sets are provided in Matrix format
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 csr device matrix view, size [n1*n_cols]
-   * @param [in] x2 csr device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
-   * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
-   */
-  void operator()(raft::resources const& handle,
-                  csr_input_matrix_view_t<math_t> x1,
-                  csr_input_matrix_view_t<math_t> x2,
-                  dense_output_matrix_view_t<math_t> out,
-                  math_t* norm_x1 = nullptr,
-                  math_t* norm_x2 = nullptr)
-  {
-    evaluate(handle, x1, x2, out, norm_x1, norm_x2);
-  }
-
-  // unfortunately, 'evaluate' cannot be templatized as it needs to be virtual
-
-  /** Evaluate the Gram matrix for two vector sets using simple dot product.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 dense device matrix view, size [n1*n_cols]
-   * @param [in] x2 dense device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 unused.
-   * @param norm_x2 unused.
-   */
-  virtual void evaluate(raft::resources const& handle,
-                        dense_input_matrix_view_t<math_t> x1,
-                        dense_input_matrix_view_t<math_t> x2,
-                        dense_output_matrix_view_t<math_t> out,
-                        math_t* norm_x1,
-                        math_t* norm_x2)
-  {
-    linear(handle, x1, x2, out);
-  }
-  /** Evaluate the Gram matrix for two vector sets using simple dot product.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 csr device matrix view, size [n1*n_cols]
-   * @param [in] x2 dense device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 unused.
-   * @param norm_x2 unused.
-   */
-  virtual void evaluate(raft::resources const& handle,
-                        csr_input_matrix_view_t<math_t> x1,
-                        dense_input_matrix_view_t<math_t> x2,
-                        dense_output_matrix_view_t<math_t> out,
-                        math_t* norm_x1,
-                        math_t* norm_x2)
-  {
-    linear(handle, x1, x2, out);
-  }
-  /** Evaluate the Gram matrix for two vector sets using simple dot product.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 csr device matrix view, size [n1*n_cols]
-   * @param [in] x2 csr device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 unused.
-   * @param norm_x2 unused.
-   */
-  virtual void evaluate(raft::resources const& handle,
-                        csr_input_matrix_view_t<math_t> x1,
-                        csr_input_matrix_view_t<math_t> x2,
-                        dense_output_matrix_view_t<math_t> out,
-                        math_t* norm_x1,
-                        math_t* norm_x2)
-  {
-    linear(handle, x1, x2, out);
-  }
-
-  /** Evaluate the Gram matrix for two vector sets using simple dot product.
-   *
-   * @param [in] x1 device array of vectors, size [n1*n_cols]
-   * @param [in] n1 number vectors in x1
-   * @param [in] n_cols number of columns (features) in x1 and x2
-   * @param [in] x2 device array of vectors, size [n2*n_cols]
-   * @param [in] n2 number vectors in x2
-   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
-   * @param [in] is_row_major whether the input and output matrices are in row
-   *        major format
-   * @param [in] stream cuda stream
-   * @param ld1 leading dimension of x1 (usually it is n1)
-   * @param ld2 leading dimension of x2 (usually it is n2)
-   * @param ld_out leading dimension of out (usually it is n1)
-   */
-  [[deprecated]] virtual void evaluate(const math_t* x1,
-                                       int n1,
-                                       int n_cols,
-                                       const math_t* x2,
-                                       int n2,
-                                       math_t* out,
-                                       bool is_row_major,
-                                       cudaStream_t stream,
-                                       int ld1,
-                                       int ld2,
-                                       int ld_out)
-  {
-    linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
-  }
-
-  /** Convenience function to evaluate the Gram matrix for two vector sets.
-   *
-   * @param [in] x1 device array of vectors, size [n1*n_cols]
-   * @param [in] n1 number vectors in x1
-   * @param [in] n_cols number of columns (features) in x1 and x2
-   * @param [in] x2 device array of vectors, size [n2*n_cols]
-   * @param [in] n2 number vectors in x2
-   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
-   * @param [in] is_row_major whether the input and output matrices are in row
-   *        major format
-   * @param [in] stream cuda stream
-   * @param ld1 leading dimension of x1
-   * @param ld2 leading dimension of x2
-   * @param ld_out leading dimension of out
-   */
-  [[deprecated]] void operator()(const math_t* x1,
-                                 int n1,
-                                 int n_cols,
-                                 const math_t* x2,
-                                 int n2,
-                                 math_t* out,
-                                 bool is_row_major,
-                                 cudaStream_t stream,
-                                 int ld1    = 0,
-                                 int ld2    = 0,
-                                 int ld_out = 0)
-  {
-    ASSERT(legacy_interface, "Legacy interface can only be used with legacy ctor.");
-    if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; }
-    if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; }
-    if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; }
-    evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
-  }
-
- protected:
-  /** Calculates the Gram matrix using simple dot product between vector sets.
-   *
-   * out = x1 * x2
-   *
-   * Can be used as a building block for more complex kernel functions.
-   *
-   * @param [in] x1 device array of vectors, size [n1*n_cols]
-   * @param [in] n1 number vectors in x1
-   * @param [in] n_cols number of columns (features) in x1 and x2
-   * @param [in] x2 device array of vectors, size [n2*n_cols]
-   * @param [in] n2 number vectors in x2
-   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
-   * @param [in] is_row_major whether the input and output matrices are in row
-   *        major format
-   * @param [in] stream cuda stream
-   * @param ld1 leading dimension of x1
-   * @param ld2 leading dimension of x2
-   * @param ld_out leading dimension of out
-   */
-  [[deprecated]] void linear(const math_t* x1,
-                             int n1,
-                             int n_cols,
-                             const math_t* x2,
-                             int n2,
-                             math_t* out,
-                             bool is_row_major,
-                             cudaStream_t stream,
-                             int ld1,
-                             int ld2,
-                             int ld_out)
-  {
-    math_t alpha = 1.0;
-    math_t beta  = 0.0;
-    if (is_row_major) {
-      // #TODO: Call from public API when ready
-      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle,
-                                                       CUBLAS_OP_T,
-                                                       CUBLAS_OP_N,
-                                                       n2,
-                                                       n1,
-                                                       n_cols,
-                                                       &alpha,
-                                                       x2,
-                                                       ld2,
-                                                       x1,
-                                                       ld1,
-                                                       &beta,
-                                                       out,
-                                                       ld_out,
-                                                       stream));
-    } else {
-      // #TODO: Call from public API when ready
-      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle,
-                                                       CUBLAS_OP_N,
-                                                       CUBLAS_OP_T,
-                                                       n1,
-                                                       n2,
-                                                       n_cols,
-                                                       &alpha,
-                                                       x1,
-                                                       ld1,
-                                                       x2,
-                                                       ld2,
-                                                       &beta,
-                                                       out,
-                                                       ld_out,
-                                                       stream));
-    }
-  }
-
- protected:
-  bool get_is_row_major(dense_output_matrix_view_t<math_t> matrix)
-  {
-    return (matrix.stride(1) == 1);
-  }
-
-  bool get_is_row_major(dense_input_matrix_view_t<math_t> matrix)
-  {
-    return (matrix.stride(1) == 1);
-  }
-
-  bool get_is_col_major(dense_output_matrix_view_t<math_t> matrix)
-  {
-    return (matrix.stride(0) == 1);
-  }
-
-  bool get_is_col_major(dense_input_matrix_view_t<math_t> matrix)
-  {
-    return (matrix.stride(0) == 1);
-  }
-
-  /** Calculates the Gram matrix using simple dot product between vector sets.
-   *
-   * out = x1 * x2
-   *
-   * Can be used as a building block for more complex kernel functions.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 dense device matrix view, size [n1*n_cols]
-   * @param [in] x2 dense device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   */
-  void linear(raft::resources const& handle,
-              dense_input_matrix_view_t<math_t> x1,
-              dense_input_matrix_view_t<math_t> x2,
-              dense_output_matrix_view_t<math_t> out)
-  {
-    // check is_row_major consistency
-    bool is_row_major = get_is_row_major(x1) && get_is_row_major(x2) && get_is_row_major(out);
-    bool is_col_major = get_is_col_major(x1) && get_is_col_major(x2) && get_is_col_major(out);
-    ASSERT(is_row_major || is_col_major,
-           "GramMatrix leading dimensions for x1, x2 and out do not match");
-
-    // check dimensions
-    int n1     = out.extent(0);
-    int n2     = out.extent(1);
-    int n_cols = x1.extent(1);
-    ASSERT(x1.extent(0) == n1, "GramMatrix input matrix dimensions for x1 and out do not match");
-    ASSERT(x2.extent(0) == n2, "GramMatrix input matrix dimensions for x2 and out do not match");
-    ASSERT(x2.extent(1) == n_cols, "GramMatrix input matrix dimensions for x1 and x2 do not match");
-
-    // extract major stride
-    int ld1    = is_row_major ? x1.stride(0) : x1.stride(1);
-    int ld2    = is_row_major ? x2.stride(0) : x2.stride(1);
-    int ld_out = is_row_major ? out.stride(0) : out.stride(1);
-
-    math_t alpha = 1.0;
-    math_t beta  = 0.0;
-    if (is_row_major) {
-      // #TODO: Use mdspan-based API when stride-capable
-      // https://github.com/rapidsai/raft/issues/875
-      raft::linalg::gemm(handle,
-                         true,
-                         false,
-                         n2,
-                         n1,
-                         n_cols,
-                         &alpha,
-                         x2.data_handle(),
-                         ld2,
-                         x1.data_handle(),
-                         ld1,
-                         &beta,
-                         out.data_handle(),
-                         ld_out,
-                         resource::get_cuda_stream(handle));
-    } else {
-      // #TODO: Use mdspan-based API when stride-capable
-      // https://github.com/rapidsai/raft/issues/875
-      raft::linalg::gemm(handle,
-                         false,
-                         true,
-                         n1,
-                         n2,
-                         n_cols,
-                         &alpha,
-                         x1.data_handle(),
-                         ld1,
-                         x2.data_handle(),
-                         ld2,
-                         &beta,
-                         out.data_handle(),
-                         ld_out,
-                         resource::get_cuda_stream(handle));
-    }
-  }
-
-  /** Calculates the Gram matrix using simple dot product between vector sets.
-   *
-   * out = x1 * x2
-   *
-   * Can be used as a building block for more complex kernel functions.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 csr device matrix view, size [n1*n_cols]
-   * @param [in] x2 dense device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   */
-  void linear(raft::resources const& handle,
-              csr_input_matrix_view_t<math_t> x1,
-              dense_input_matrix_view_t<math_t> x2,
-              dense_output_matrix_view_t<math_t> out)
-  {
-    // check is_row_major consistency
-    bool is_row_major = get_is_row_major(x2) && get_is_row_major(out);
-    bool is_col_major = get_is_col_major(x2) && get_is_col_major(out);
-    ASSERT(is_row_major || is_col_major,
-           "GramMatrix leading dimensions for x2 and out do not match");
-
-    // check dimensions
-    auto x1_structure = x1.structure_view();
-    ASSERT(x1_structure.get_n_rows() == out.extent(0),
-           "GramMatrix input matrix dimensions for x1 and out do not match");
-    ASSERT(x2.extent(0) == out.extent(1),
-           "GramMatrix input matrix dimensions for x2 and out do not match");
-    ASSERT(x2.extent(1) == x1_structure.get_n_cols(),
-           "GramMatrix input matrix dimensions for x1 and x2 do not match");
-
-    math_t alpha = 1.0;
-    math_t beta  = 0.0;
-
-    raft::sparse::linalg::spmm(handle, false, true, &alpha, x1, x2, &beta, out);
-  }
-
-  /** Calculates the Gram matrix using simple dot product between vector sets.
-   *
-   * out = x1 * x2
-   *
-   * Can be used as a building block for more complex kernel functions.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 csr device matrix view, size [n1*n_cols]
-   * @param [in] x2 csr device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   */
-  void linear(raft::resources const& handle,
-              csr_input_matrix_view_t<math_t> x1,
-              csr_input_matrix_view_t<math_t> x2,
-              dense_output_matrix_view_t<math_t> out)
-  {
-    // check layout consistency (w.r.t. strides a matrix might be both row & col major)
-    bool is_row_major_nopad = get_is_row_major(out) && out.stride(0) == out.extent(1);
-    bool is_col_major_nopad = get_is_col_major(out) && out.stride(1) == out.extent(0);
-
-    ASSERT(is_row_major_nopad || is_col_major_nopad,
-           "Sparse linear Kernel distance does not support ld_out parameter");
-
-    // switch a,b based on is_row_major
-    if (is_col_major_nopad) {
-      auto out_row_major = raft::make_device_matrix_view<math_t, int, raft::row_major>(
-        out.data_handle(), out.extent(1), out.extent(0));
-      raft::sparse::distance::pairwise_distance(
-        handle, x2, x1, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0);
-    } else {
-      auto out_row_major = raft::make_device_matrix_view<math_t, int, raft::row_major>(
-        out.data_handle(), out.extent(0), out.extent(1));
-      raft::sparse::distance::pairwise_distance(
-        handle, x1, x2, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0);
-    }
-  }
-};
-
-};  // end namespace cuvs::distance::kernels::detail
diff --git a/cpp/src/distance/detail/kernels/kernel_factory.cu b/cpp/src/distance/detail/kernels/kernel_factory.cu
new file mode 100644
index 000000000..25f9e9b84
--- /dev/null
+++ b/cpp/src/distance/detail/kernels/kernel_factory.cu
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuvs/distance/grammian.hpp>
+
+namespace cuvs::distance::kernels {
+
+template <typename math_t>
+GramMatrixBase<math_t>* KernelFactory<math_t>::create(KernelParams params)
+{
+  GramMatrixBase<math_t>* res;
+  // KernelParams is not templated, we convert the parameters to math_t here:
+  math_t coef0 = params.coef0;
+  math_t gamma = params.gamma;
+  switch (params.kernel) {
+    case LINEAR: res = new GramMatrixBase<math_t>(); break;
+    case POLYNOMIAL: res = new PolynomialKernel<math_t, int>(params.degree, gamma, coef0); break;
+    case TANH: res = new TanhKernel<math_t>(gamma, coef0); break;
+    case RBF: res = new RBFKernel<math_t>(gamma); break;
+    default: throw raft::exception("Kernel not implemented");
+  }
+  return res;
+}
+
+template <typename math_t>
+[[deprecated]] GramMatrixBase<math_t>* KernelFactory<math_t>::create(KernelParams params,
+                                                                     cublasHandle_t handle)
+{
+  GramMatrixBase<math_t>* res;
+  // KernelParams is not templated, we convert the parameters to math_t here:
+  math_t coef0 = params.coef0;
+  math_t gamma = params.gamma;
+  switch (params.kernel) {
+    case LINEAR: res = new GramMatrixBase<math_t>(handle); break;
+    case POLYNOMIAL:
+      res = new PolynomialKernel<math_t, int>(params.degree, gamma, coef0, handle);
+      break;
+    case TANH: res = new TanhKernel<math_t>(gamma, coef0, handle); break;
+    case RBF: res = new RBFKernel<math_t>(gamma, handle); break;
+    default: throw raft::exception("Kernel not implemented");
+  }
+  return res;
+}
+
+template class KernelFactory<float>;
+template class KernelFactory<double>;
+
+};  // end namespace cuvs::distance::kernels
diff --git a/cpp/src/distance/detail/kernels/kernel_factory.cuh b/cpp/src/distance/detail/kernels/kernel_factory.cuh
deleted file mode 100644
index 5c50a95a3..000000000
--- a/cpp/src/distance/detail/kernels/kernel_factory.cuh
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "gram_matrix.cuh"
-#include "kernel_matrices.cuh"
-
-#include <cuvs/distance/distance.hpp>
-#include <raft/util/cudart_utils.hpp>
-
-namespace cuvs::distance::kernels::detail {
-
-template <typename math_t>
-class KernelFactory {
- public:
-  static GramMatrixBase<math_t>* create(KernelParams params)
-  {
-    GramMatrixBase<math_t>* res;
-    // KernelParams is not templated, we convert the parameters to math_t here:
-    math_t coef0 = params.coef0;
-    math_t gamma = params.gamma;
-    switch (params.kernel) {
-      case LINEAR: res = new GramMatrixBase<math_t>(); break;
-      case POLYNOMIAL: res = new PolynomialKernel<math_t, int>(params.degree, gamma, coef0); break;
-      case TANH: res = new TanhKernel<math_t>(gamma, coef0); break;
-      case RBF: res = new RBFKernel<math_t>(gamma); break;
-      default: throw raft::exception("Kernel not implemented");
-    }
-    return res;
-  }
-
-  [[deprecated]] static GramMatrixBase<math_t>* create(KernelParams params, cublasHandle_t handle)
-  {
-    GramMatrixBase<math_t>* res;
-    // KernelParams is not templated, we convert the parameters to math_t here:
-    math_t coef0 = params.coef0;
-    math_t gamma = params.gamma;
-    switch (params.kernel) {
-      case LINEAR: res = new GramMatrixBase<math_t>(handle); break;
-      case POLYNOMIAL:
-        res = new PolynomialKernel<math_t, int>(params.degree, gamma, coef0, handle);
-        break;
-      case TANH: res = new TanhKernel<math_t>(gamma, coef0, handle); break;
-      case RBF: res = new RBFKernel<math_t>(gamma, handle); break;
-      default: throw raft::exception("Kernel not implemented");
-    }
-    return res;
-  }
-};
-
-};  // end namespace cuvs::distance::kernels::detail
diff --git a/cpp/src/distance/detail/kernels/kernel_matrices.cu b/cpp/src/distance/detail/kernels/kernel_matrices.cu
new file mode 100644
index 000000000..526ca106f
--- /dev/null
+++ b/cpp/src/distance/detail/kernels/kernel_matrices.cu
@@ -0,0 +1,726 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../../../distance/distance.cuh"
+#include <cuvs/distance/grammian.hpp>
+
+#include "rbf_fin_op.cuh"
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/linalg/gemm.cuh>
+#include <raft/linalg/norm.cuh>
+#include <raft/sparse/linalg/norm.cuh>
+#include <raft/util/cuda_utils.cuh>
+
+namespace cuvs::distance::kernels {
+
+/** Epiloge function for polynomial kernel without padding.
+ * Calculates output = (gain*in + offset)^exponent
+ * @param inout device vector in column major format, size [len]
+ * @param len array length
+ * @param exponent
+ * @param gain
+ * @param offset
+ */
+template <typename math_t, typename exp_t>
+RAFT_KERNEL polynomial_kernel_nopad(
+  math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset)
+{
+  for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len;
+       tid += blockDim.x * gridDim.x) {
+    inout[tid] = pow(gain * inout[tid] + offset, exponent);
+  }
+}
+
+/** Epiloge function for polynomial kernel with padding.
+ * Calculates output = (gain*input + offset)^exponent
+ * @param inout device vector in column major format, size [ld * cols]
+ * @param ld leading dimension of the inout buffer
+ * @param rows number of rows (rows <= ld)
+ * @param cols number of columns
+ * @param exponent
+ * @param gain
+ * @param offset
+ */
+template <typename math_t, typename exp_t>
+RAFT_KERNEL polynomial_kernel(
+  math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset)
+{
+  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
+       tidy += blockDim.y * gridDim.y)
+    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
+         tidx += blockDim.x * gridDim.x) {
+      inout[tidx + tidy * ld] = pow(gain * inout[tidx + tidy * ld] + offset, exponent);
+    }
+}
+
+/** Epiloge function for tanh kernel without padding.
+ * Calculates output = tanh(gain*input + offset)
+ * @param inout device vector, size [len]
+ * @param len length of the input vector
+ * @param gain
+ * @param offset
+ */
+template <typename math_t>
+RAFT_KERNEL tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset)
+{
+  for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len;
+       tid += blockDim.x * gridDim.x) {
+    inout[tid] = tanh(gain * inout[tid] + offset);
+  }
+}
+
+/** Epiloge function for tanh kernel without padding.
+ * Calculates output = tanh(gain*input + offset)
+ * @param inout device vector in column major format, size [ld * cols]
+ * @param ld leading dimension of the inout buffer
+ * @param rows number of rows (rows <= ld)
+ * @param cols number of columns
+ * @param gain
+ * @param offset
+ */
+template <typename math_t>
+RAFT_KERNEL tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset)
+{
+  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
+       tidy += blockDim.y * gridDim.y)
+    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
+         tidx += blockDim.x * gridDim.x) {
+      inout[tidx + tidy * ld] = tanh(gain * inout[tidx + tidy * ld] + offset);
+    }
+}
+
+/** Epiloge function for rbf kernel using expansion.
+ *
+ * Calculates output_ij = exp(-gain * (norm_x_i + norm_y_j - 2*input_ij));
+ *
+ * Intended usage
+ *   - input is the product of two matrices X and Y input_ij = sum_k X_ik * Y_jk
+ *   - norm_x_i = l2_norm(x_i), where x_i is the i-th row of matrix X
+ *   - norm_y_j = l2_norm(y_j), where y_j is the j-th row of matrix Y
+ *
+ * @param inout device vector in column major format, size [ld * cols]
+ * @param ld leading dimension of the inout buffer
+ * @param rows number of rows (rows <= ld)
+ * @param cols number of columns
+ * @param norm_x l2-norm of X's rows
+ * @param norm_y l2-norm of Y's rows
+ * @param gain
+ */
+template <typename math_t>
+RAFT_KERNEL rbf_kernel_expanded(
+  math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain)
+{
+  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
+       tidy += blockDim.y * gridDim.y) {
+    math_t norm_y_val = norm_y[tidy];
+    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
+         tidx += blockDim.x * gridDim.x) {
+      inout[tidx + tidy * ld] =
+        exp(-1.0 * gain * (norm_x[tidx] + norm_y_val - inout[tidx + tidy * ld] * 2));
+    }
+  }
+}
+
+std::tuple<dim3, dim3> generateLaunchConfig2dElementwiseOp(int n1, int n2)
+{
+  dim3 block_shape       = dim3(32, 4);
+  const int num_blocks_x = raft::ceildiv(n1, 32);
+  const int num_blocks_y = std::min(raft::ceildiv(n2, 32), (1 << 16) - 1);
+  dim3 grid_shape        = dim3(num_blocks_x, num_blocks_y);
+  return std::make_tuple(grid_shape, block_shape);
+}
+
+/**
+ * Create a kernel matrix using polynomial kernel function.
+ */
+template <typename math_t, typename exp_t>
+void PolynomialKernel<math_t, exp_t>::applyKernel(
+  math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream)
+{
+  const int n_minor = is_row_major ? cols : rows;
+  if (ld == n_minor) {
+    polynomial_kernel_nopad<<<raft::ceildiv<size_t>((size_t)rows * cols, 128), 128, 0, stream>>>(
+      inout, rows * cols, exponent, gain, offset);
+  } else {
+    int n1                         = is_row_major ? cols : rows;
+    int n2                         = is_row_major ? rows : cols;
+    auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2);
+    polynomial_kernel<<<grid_shape, block_shape, 0, stream>>>(
+      inout, ld, n1, n2, exponent, gain, offset);
+  }
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+/** Evaluate kernel matrix using polynomial kernel.
+ *
+ * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+ * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t, typename exp_t>
+void PolynomialKernel<math_t, exp_t>::evaluate(raft::resources const& handle,
+                                               dense_input_matrix_view_t<math_t> x1,
+                                               dense_input_matrix_view_t<math_t> x2,
+                                               dense_output_matrix_view_t<math_t> out,
+                                               math_t* norm_x1,
+                                               math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using polynomial kernel.
+ *
+ * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+ * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t, typename exp_t>
+void PolynomialKernel<math_t, exp_t>::evaluate(raft::resources const& handle,
+                                               csr_input_matrix_view_t<math_t> x1,
+                                               dense_input_matrix_view_t<math_t> x2,
+                                               dense_output_matrix_view_t<math_t> out,
+                                               math_t* norm_x1,
+                                               math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using polynomial kernel.
+ *
+ * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+ * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t, typename exp_t>
+void PolynomialKernel<math_t, exp_t>::evaluate(raft::resources const& handle,
+                                               csr_input_matrix_view_t<math_t> x1,
+                                               csr_input_matrix_view_t<math_t> x2,
+                                               dense_output_matrix_view_t<math_t> out,
+                                               math_t* norm_x1,
+                                               math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate the Gram matrix using the legacy interface.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ *        major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1 (usually it is n1)
+ * @param ld2 leading dimension of x2 (usually it is n2)
+ * @param ld_out leading dimension of out (usually it is n1)
+ */
+template <typename math_t, typename exp_t>
+[[deprecated]] void PolynomialKernel<math_t, exp_t>::evaluate(const math_t* x1,
+                                                              int n1,
+                                                              int n_cols,
+                                                              const math_t* x2,
+                                                              int n2,
+                                                              math_t* out,
+                                                              bool is_row_major,
+                                                              cudaStream_t stream,
+                                                              int ld1,
+                                                              int ld2,
+                                                              int ld_out)
+{
+  ASSERT(GramMatrixBase<math_t>::legacy_interface,
+         "Legacy interface can only be used with legacy ctor.");
+  GramMatrixBase<math_t>::linear(
+    x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
+  applyKernel(out, ld_out, n1, n2, is_row_major, stream);
+}
+
+/**
+ * Create a kernel matrix using tanh kernel function.
+ */
+template <typename math_t>
+void TanhKernel<math_t>::applyKernel(
+  math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream)
+{
+  const int n_minor = is_row_major ? cols : rows;
+  if (ld == n_minor) {
+    tanh_kernel_nopad<<<raft::ceildiv<size_t>((size_t)rows * cols, 128), 128, 0, stream>>>(
+      inout, rows * cols, gain, offset);
+  } else {
+    int n1                         = is_row_major ? cols : rows;
+    int n2                         = is_row_major ? rows : cols;
+    auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2);
+    tanh_kernel<<<grid_shape, block_shape, 0, stream>>>(inout, ld, n1, n2, gain, offset);
+  }
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+/** Evaluate kernel matrix using tanh kernel.
+ *
+ * output_[i + k*n1] = (gain*<x1_i, x2_k> + offset)^exponent,
+ * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t>
+void TanhKernel<math_t>::evaluate(raft::resources const& handle,
+                                  dense_input_matrix_view_t<math_t> x1,
+                                  dense_input_matrix_view_t<math_t> x2,
+                                  dense_output_matrix_view_t<math_t> out,
+                                  math_t* norm_x1,
+                                  math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using tanh kernel.
+ *
+ * output_[i + k*n1] = (gain*<x1_i, x2_k> + offset)^exponent,
+ * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t>
+void TanhKernel<math_t>::evaluate(raft::resources const& handle,
+                                  csr_input_matrix_view_t<math_t> x1,
+                                  dense_input_matrix_view_t<math_t> x2,
+                                  dense_output_matrix_view_t<math_t> out,
+                                  math_t* norm_x1,
+                                  math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using tanh kernel.
+ *
+ * output_[i + k*n1] = (gain*<x1_i, x2_k> + offset)^exponent,
+ * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t>
+void TanhKernel<math_t>::evaluate(raft::resources const& handle,
+                                  csr_input_matrix_view_t<math_t> x1,
+                                  csr_input_matrix_view_t<math_t> x2,
+                                  dense_output_matrix_view_t<math_t> out,
+                                  math_t* norm_x1,
+                                  math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate the Gram matrix using the legacy interface.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ *        major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1 (usually it is n1)
+ * @param ld2 leading dimension of x2 (usually it is n2)
+ * @param ld_out leading dimension of out (usually it is n1)
+ */
+template <typename math_t>
+[[deprecated]] void TanhKernel<math_t>::evaluate(const math_t* x1,
+                                                 int n1,
+                                                 int n_cols,
+                                                 const math_t* x2,
+                                                 int n2,
+                                                 math_t* out,
+                                                 bool is_row_major,
+                                                 cudaStream_t stream,
+                                                 int ld1,
+                                                 int ld2,
+                                                 int ld_out)
+{
+  ASSERT(GramMatrixBase<math_t>::legacy_interface,
+         "Legacy interface can only be used with legacy ctor.");
+  GramMatrixBase<math_t>::linear(
+    x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
+  applyKernel(out, ld_out, n1, n2, is_row_major, stream);
+}
+
+/**
+ * Create a kernel matrix using RBF kernel function.
+ */
+template <typename math_t>
+void RBFKernel<math_t>::applyKernel(math_t* inout,
+                                    int ld,
+                                    int rows,
+                                    int cols,
+                                    math_t* norm_x1,
+                                    math_t* norm_x2,
+                                    bool is_row_major,
+                                    cudaStream_t stream)
+{
+  int n1                         = is_row_major ? cols : rows;
+  int n2                         = is_row_major ? rows : cols;
+  math_t* norm_n1                = is_row_major ? norm_x2 : norm_x1;
+  math_t* norm_n2                = is_row_major ? norm_x1 : norm_x2;
+  auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2);
+  rbf_kernel_expanded<<<grid_shape, block_shape, 0, stream>>>(
+    inout, ld, n1, n2, norm_n1, norm_n2, gain);
+}
+
+template <typename math_t>
+void RBFKernel<math_t>::matrixRowNormL2(raft::resources const& handle,
+                                        dense_input_matrix_view_t<math_t> matrix,
+                                        math_t* target)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(matrix);
+  int minor         = is_row_major ? matrix.extent(1) : matrix.extent(0);
+  int ld            = is_row_major ? matrix.stride(0) : matrix.stride(1);
+  ASSERT(ld == minor, "RBF Kernel lazy rowNorm compute does not support ld parameter");
+  raft::linalg::rowNorm(target,
+                        matrix.data_handle(),
+                        matrix.extent(1),
+                        matrix.extent(0),
+                        raft::linalg::NormType::L2Norm,
+                        is_row_major,
+                        raft::resource::get_cuda_stream(handle));
+}
+
+template <typename math_t>
+void RBFKernel<math_t>::matrixRowNormL2(raft::resources const& handle,
+                                        csr_input_matrix_view_t<math_t> matrix,
+                                        math_t* target)
+{
+  auto matrix_structure = matrix.structure_view();
+  raft::sparse::linalg::rowNormCsr(handle,
+                                   matrix_structure.get_indptr().data(),
+                                   matrix.get_elements().data(),
+                                   matrix_structure.get_nnz(),
+                                   matrix_structure.get_n_rows(),
+                                   target,
+                                   raft::linalg::NormType::L2Norm);
+}
+
+/** Evaluate kernel matrix using RBF kernel.
+ *
+ * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+ * in the x2 set, and | | euclidean distance.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+template <typename math_t>
+void RBFKernel<math_t>::evaluate(raft::resources const& handle,
+                                 dense_input_matrix_view_t<math_t> x1,
+                                 dense_input_matrix_view_t<math_t> x2,
+                                 dense_output_matrix_view_t<math_t> out,
+                                 math_t* norm_x1,
+                                 math_t* norm_x2)
+{
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+  // lazy compute norms if not given
+  rmm::device_uvector<math_t> tmp_norm_x1(0, stream);
+  rmm::device_uvector<math_t> tmp_norm_x2(0, stream);
+  if (norm_x1 == nullptr) {
+    tmp_norm_x1.reserve(x1.extent(0), stream);
+    norm_x1 = tmp_norm_x1.data();
+    matrixRowNormL2(handle, x1, norm_x1);
+  }
+  if (norm_x2 == nullptr) {
+    tmp_norm_x2.reserve(x2.extent(0), stream);
+    norm_x2 = tmp_norm_x2.data();
+    matrixRowNormL2(handle, x2, norm_x2);
+  }
+
+  // compute L2expanded
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              norm_x1,
+              norm_x2,
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using RBF kernel.
+ *
+ * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+ * in the x2 set, and | | euclidean distance.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+template <typename math_t>
+void RBFKernel<math_t>::evaluate(raft::resources const& handle,
+                                 csr_input_matrix_view_t<math_t> x1,
+                                 dense_input_matrix_view_t<math_t> x2,
+                                 dense_output_matrix_view_t<math_t> out,
+                                 math_t* norm_x1,
+                                 math_t* norm_x2)
+{
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  // lazy compute norms if not given
+  rmm::device_uvector<math_t> tmp_norm_x1(0, stream);
+  rmm::device_uvector<math_t> tmp_norm_x2(0, stream);
+  if (norm_x1 == nullptr) {
+    tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream);
+    norm_x1 = tmp_norm_x1.data();
+    matrixRowNormL2(handle, x1, norm_x1);
+  }
+  if (norm_x2 == nullptr) {
+    tmp_norm_x2.reserve(x2.extent(0), stream);
+    norm_x2 = tmp_norm_x2.data();
+    matrixRowNormL2(handle, x2, norm_x2);
+  }
+
+  // compute L2expanded
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              norm_x1,
+              norm_x2,
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using RBF kernel.
+ *
+ * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+ * in the x2 set, and | | euclidean distance.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+template <typename math_t>
+void RBFKernel<math_t>::evaluate(raft::resources const& handle,
+                                 csr_input_matrix_view_t<math_t> x1,
+                                 csr_input_matrix_view_t<math_t> x2,
+                                 dense_output_matrix_view_t<math_t> out,
+                                 math_t* norm_x1,
+                                 math_t* norm_x2)
+{
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  // lazy compute norms if not given
+  rmm::device_uvector<math_t> tmp_norm_x1(0, stream);
+  rmm::device_uvector<math_t> tmp_norm_x2(0, stream);
+  if (norm_x1 == nullptr) {
+    tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream);
+    norm_x1 = tmp_norm_x1.data();
+    matrixRowNormL2(handle, x1, norm_x1);
+  }
+  if (norm_x2 == nullptr) {
+    tmp_norm_x2.reserve(x2.structure_view().get_n_rows(), stream);
+    norm_x2 = tmp_norm_x2.data();
+    matrixRowNormL2(handle, x2, norm_x2);
+  }
+
+  // compute L2expanded
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              norm_x1,
+              norm_x2,
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate the Gram matrix using the legacy interface.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ *        major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1 (usually it is n1)
+ * @param ld2 leading dimension of x2 (usually it is n2)
+ * @param ld_out leading dimension of out (usually it is n1)
+ */
+template <typename math_t>
+[[deprecated]] void RBFKernel<math_t>::evaluate(const math_t* x1,
+                                                int n1,
+                                                int n_cols,
+                                                const math_t* x2,
+                                                int n2,
+                                                math_t* out,
+                                                bool is_row_major,
+                                                cudaStream_t stream,
+                                                int ld1,
+                                                int ld2,
+                                                int ld_out)
+{
+  ASSERT(GramMatrixBase<math_t>::legacy_interface,
+         "Legacy interface can only be used with legacy ctor.");
+  int minor1    = is_row_major ? n_cols : n1;
+  int minor2    = is_row_major ? n_cols : n2;
+  int minor_out = is_row_major ? n2 : n1;
+  ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter");
+  ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter");
+  ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter");
+
+  math_t gain   = this->gain;
+  using index_t = int64_t;
+
+  rbf_fin_op fin_op{gain};
+
+  raft::resources handle;
+  raft::resource::set_cuda_stream(handle, stream);
+
+  cuvs::distance::distance<cuvs::distance::DistanceType::L2Unexpanded,
+                           math_t,
+                           math_t,
+                           math_t,
+                           decltype(fin_op),
+                           index_t>(handle,
+                                    const_cast<math_t*>(x1),
+                                    const_cast<math_t*>(x2),
+                                    out,
+                                    n1,
+                                    n2,
+                                    n_cols,
+                                    NULL,
+                                    0,
+                                    fin_op,
+                                    is_row_major);
+}
+
+template class PolynomialKernel<float, int>;
+template class PolynomialKernel<double, int>;
+template class TanhKernel<float>;
+template class TanhKernel<double>;
+template class RBFKernel<float>;
+template class RBFKernel<double>;
+
+};  // end namespace cuvs::distance::kernels
diff --git a/cpp/src/distance/detail/kernels/kernel_matrices.cuh b/cpp/src/distance/detail/kernels/kernel_matrices.cuh
deleted file mode 100644
index bff5bda92..000000000
--- a/cpp/src/distance/detail/kernels/kernel_matrices.cuh
+++ /dev/null
@@ -1,777 +0,0 @@
-/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "gram_matrix.cuh"
-
-#include "../detail/kernels/rbf_fin_op.cuh"
-#include <cuvs/distance/distance.cuh>
-#include <raft/core/resource/cuda_stream.hpp>
-#include <raft/linalg/gemm.cuh>
-#include <raft/sparse/linalg/norm.cuh>
-#include <raft/util/cuda_utils.cuh>
-
-namespace cuvs::distance::kernels::detail {
-
-/** Epiloge function for polynomial kernel without padding.
- * Calculates output = (gain*in + offset)^exponent
- * @param inout device vector in column major format, size [len]
- * @param len array length
- * @param exponent
- * @param gain
- * @param offset
- */
-template <typename math_t, typename exp_t>
-RAFT_KERNEL polynomial_kernel_nopad(
-  math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset)
-{
-  for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len;
-       tid += blockDim.x * gridDim.x) {
-    inout[tid] = pow(gain * inout[tid] + offset, exponent);
-  }
-}
-
-/** Epiloge function for polynomial kernel with padding.
- * Calculates output = (gain*input + offset)^exponent
- * @param inout device vector in column major format, size [ld * cols]
- * @param ld leading dimension of the inout buffer
- * @param rows number of rows (rows <= ld)
- * @param cols number of columns
- * @param exponent
- * @param gain
- * @param offset
- */
-template <typename math_t, typename exp_t>
-RAFT_KERNEL polynomial_kernel(
-  math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset)
-{
-  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
-       tidy += blockDim.y * gridDim.y)
-    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
-         tidx += blockDim.x * gridDim.x) {
-      inout[tidx + tidy * ld] = pow(gain * inout[tidx + tidy * ld] + offset, exponent);
-    }
-}
-
-/** Epiloge function for tanh kernel without padding.
- * Calculates output = tanh(gain*input + offset)
- * @param inout device vector, size [len]
- * @param len length of the input vector
- * @param gain
- * @param offset
- */
-template <typename math_t>
-RAFT_KERNEL tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset)
-{
-  for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len;
-       tid += blockDim.x * gridDim.x) {
-    inout[tid] = tanh(gain * inout[tid] + offset);
-  }
-}
-
-/** Epiloge function for tanh kernel without padding.
- * Calculates output = tanh(gain*input + offset)
- * @param inout device vector in column major format, size [ld * cols]
- * @param ld leading dimension of the inout buffer
- * @param rows number of rows (rows <= ld)
- * @param cols number of columns
- * @param gain
- * @param offset
- */
-template <typename math_t>
-RAFT_KERNEL tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset)
-{
-  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
-       tidy += blockDim.y * gridDim.y)
-    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
-         tidx += blockDim.x * gridDim.x) {
-      inout[tidx + tidy * ld] = tanh(gain * inout[tidx + tidy * ld] + offset);
-    }
-}
-
-/** Epiloge function for rbf kernel using expansion.
- *
- * Calculates output_ij = exp(-gain * (norm_x_i + norm_y_j - 2*input_ij));
- *
- * Intended usage
- *   - input is the product of two matrices X and Y input_ij = sum_k X_ik * Y_jk
- *   - norm_x_i = l2_norm(x_i), where x_i is the i-th row of matrix X
- *   - norm_y_j = l2_norm(y_j), where y_j is the j-th row of matrix Y
- *
- * @param inout device vector in column major format, size [ld * cols]
- * @param ld leading dimension of the inout buffer
- * @param rows number of rows (rows <= ld)
- * @param cols number of columns
- * @param norm_x l2-norm of X's rows
- * @param norm_y l2-norm of Y's rows
- * @param gain
- */
-template <typename math_t>
-RAFT_KERNEL rbf_kernel_expanded(
-  math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain)
-{
-  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
-       tidy += blockDim.y * gridDim.y) {
-    math_t norm_y_val = norm_y[tidy];
-    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
-         tidx += blockDim.x * gridDim.x) {
-      inout[tidx + tidy * ld] =
-        exp(-1.0 * gain * (norm_x[tidx] + norm_y_val - inout[tidx + tidy * ld] * 2));
-    }
-  }
-}
-
-namespace {
-std::tuple<dim3, dim3> generateLaunchConfig2dElementwiseOp(int n1, int n2)
-{
-  dim3 block_shape       = dim3(32, 4);
-  const int num_blocks_x = raft::ceildiv(n1, 32);
-  const int num_blocks_y = std::min(raft::ceildiv(n2, 32), (1 << 16) - 1);
-  dim3 grid_shape        = dim3(num_blocks_x, num_blocks_y);
-  return std::make_tuple(grid_shape, block_shape);
-}
-}  // namespace
-
-/**
- * Create a kernel matrix using polynomial kernel function.
- */
-template <typename math_t, typename exp_t>
-class PolynomialKernel : public GramMatrixBase<math_t> {
-  exp_t exponent;
-  math_t gain;
-  math_t offset;
-
-  void applyKernel(
-    math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream)
-  {
-    const int n_minor = is_row_major ? cols : rows;
-    if (ld == n_minor) {
-      polynomial_kernel_nopad<<<raft::ceildiv<size_t>((size_t)rows * cols, 128), 128, 0, stream>>>(
-        inout, rows * cols, exponent, gain, offset);
-    } else {
-      int n1                         = is_row_major ? cols : rows;
-      int n2                         = is_row_major ? rows : cols;
-      auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2);
-      polynomial_kernel<<<grid_shape, block_shape, 0, stream>>>(
-        inout, ld, n1, n2, exponent, gain, offset);
-    }
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
-  }
-
- public:
-  /**
-   * Constructs a polynomial kernel object.
-   * It evaluates the kernel matrix using the following formula:
-   * K_ij = (gain*<x1_i, x2_k> + offset)^exponent
-   *
-   * @tparam math_t floating point type
-   * @tparam exp_t type of exponent
-   * @param exponent
-   * @param gain
-   * @param offset
-   */
-  PolynomialKernel(exp_t exponent, math_t gain, math_t offset)
-    : GramMatrixBase<math_t>(), exponent(exponent), gain(gain), offset(offset)
-  {
-  }
-
-  [[deprecated]] PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t handle)
-    : GramMatrixBase<math_t>(handle), exponent(exponent), gain(gain), offset(offset)
-  {
-  }
-
-  /** Evaluate kernel matrix using polynomial kernel.
-   *
-   * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
-   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
-   * in the x2 set, and < , > denotes dot product.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 dense device matrix view, size [n1*n_cols]
-   * @param [in] x2 dense device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 unused.
-   * @param norm_x2 unused.
-   */
-  void evaluate(raft::resources const& handle,
-                dense_input_matrix_view_t<math_t> x1,
-                dense_input_matrix_view_t<math_t> x2,
-                dense_output_matrix_view_t<math_t> out,
-                math_t* norm_x1,
-                math_t* norm_x2)
-  {
-    bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
-    int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
-    GramMatrixBase<math_t>::linear(handle, x1, x2, out);
-    applyKernel(out.data_handle(),
-                ld_out,
-                out.extent(0),
-                out.extent(1),
-                is_row_major,
-                resource::get_cuda_stream(handle));
-  }
-
-  /** Evaluate kernel matrix using polynomial kernel.
-   *
-   * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
-   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
-   * in the x2 set, and < , > denotes dot product.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 csr device matrix view, size [n1*n_cols]
-   * @param [in] x2 dense device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 unused.
-   * @param norm_x2 unused.
-   */
-  void evaluate(raft::resources const& handle,
-                csr_input_matrix_view_t<math_t> x1,
-                dense_input_matrix_view_t<math_t> x2,
-                dense_output_matrix_view_t<math_t> out,
-                math_t* norm_x1,
-                math_t* norm_x2)
-  {
-    bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
-    int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
-    GramMatrixBase<math_t>::linear(handle, x1, x2, out);
-    applyKernel(out.data_handle(),
-                ld_out,
-                out.extent(0),
-                out.extent(1),
-                is_row_major,
-                resource::get_cuda_stream(handle));
-  }
-
-  /** Evaluate kernel matrix using polynomial kernel.
-   *
-   * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
-   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
-   * in the x2 set, and < , > denotes dot product.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 csr device matrix view, size [n1*n_cols]
-   * @param [in] x2 csr device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 unused.
-   * @param norm_x2 unused.
-   */
-  void evaluate(raft::resources const& handle,
-                csr_input_matrix_view_t<math_t> x1,
-                csr_input_matrix_view_t<math_t> x2,
-                dense_output_matrix_view_t<math_t> out,
-                math_t* norm_x1,
-                math_t* norm_x2)
-  {
-    bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
-    int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
-    GramMatrixBase<math_t>::linear(handle, x1, x2, out);
-    applyKernel(out.data_handle(),
-                ld_out,
-                out.extent(0),
-                out.extent(1),
-                is_row_major,
-                resource::get_cuda_stream(handle));
-  }
-
-  /** Evaluate the Gram matrix using the legacy interface.
-   *
-   * @param [in] x1 device array of vectors, size [n1*n_cols]
-   * @param [in] n1 number vectors in x1
-   * @param [in] n_cols number of columns (features) in x1 and x2
-   * @param [in] x2 device array of vectors, size [n2*n_cols]
-   * @param [in] n2 number vectors in x2
-   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
-   * @param [in] is_row_major whether the input and output matrices are in row
-   *        major format
-   * @param [in] stream cuda stream
-   * @param ld1 leading dimension of x1 (usually it is n1)
-   * @param ld2 leading dimension of x2 (usually it is n2)
-   * @param ld_out leading dimension of out (usually it is n1)
-   */
-  [[deprecated]] void evaluate(const math_t* x1,
-                               int n1,
-                               int n_cols,
-                               const math_t* x2,
-                               int n2,
-                               math_t* out,
-                               bool is_row_major,
-                               cudaStream_t stream,
-                               int ld1,
-                               int ld2,
-                               int ld_out)
-  {
-    ASSERT(GramMatrixBase<math_t>::legacy_interface,
-           "Legacy interface can only be used with legacy ctor.");
-    GramMatrixBase<math_t>::linear(
-      x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
-    applyKernel(out, ld_out, n1, n2, is_row_major, stream);
-  }
-};
-
-/**
- * Create a kernel matrix using tanh kernel function.
- */
-template <typename math_t>
-class TanhKernel : public GramMatrixBase<math_t> {
-  math_t gain, offset;
-
-  void applyKernel(
-    math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream)
-  {
-    const int n_minor = is_row_major ? cols : rows;
-    if (ld == n_minor) {
-      tanh_kernel_nopad<<<raft::ceildiv<size_t>((size_t)rows * cols, 128), 128, 0, stream>>>(
-        inout, rows * cols, gain, offset);
-    } else {
-      int n1                         = is_row_major ? cols : rows;
-      int n2                         = is_row_major ? rows : cols;
-      auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2);
-      tanh_kernel<<<grid_shape, block_shape, 0, stream>>>(inout, ld, n1, n2, gain, offset);
-    }
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
-  }
-
- public:
-  /**
-   * Constructs a tanh kernel object.
-   * It evaluates the kernel matrix using the following formula:
-   * K_ij = tanh(gain*<x1_i, x2_k> + offset)
-   *
-   * @tparam math_t floating point type
-   * @param gain
-   * @param offset
-   */
-  TanhKernel(math_t gain, math_t offset) : GramMatrixBase<math_t>(), gain(gain), offset(offset) {}
-
-  [[deprecated]] TanhKernel(math_t gain, math_t offset, cublasHandle_t handle)
-    : GramMatrixBase<math_t>(handle), gain(gain), offset(offset)
-  {
-  }
-
-  /** Evaluate kernel matrix using tanh kernel.
-   *
-   * output_[i + k*n1] = (gain*<x1_i, x2_k> + offset)^exponent,
-   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
-   * in the x2 set, and < , > denotes dot product.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 dense device matrix view, size [n1*n_cols]
-   * @param [in] x2 dense device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 unused.
-   * @param norm_x2 unused.
-   */
-  void evaluate(raft::resources const& handle,
-                dense_input_matrix_view_t<math_t> x1,
-                dense_input_matrix_view_t<math_t> x2,
-                dense_output_matrix_view_t<math_t> out,
-                math_t* norm_x1,
-                math_t* norm_x2)
-  {
-    bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
-    int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
-    GramMatrixBase<math_t>::linear(handle, x1, x2, out);
-    applyKernel(out.data_handle(),
-                ld_out,
-                out.extent(0),
-                out.extent(1),
-                is_row_major,
-                resource::get_cuda_stream(handle));
-  }
-
-  /** Evaluate kernel matrix using tanh kernel.
-   *
-   * output_[i + k*n1] = (gain*<x1_i, x2_k> + offset)^exponent,
-   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
-   * in the x2 set, and < , > denotes dot product.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 csr device matrix view, size [n1*n_cols]
-   * @param [in] x2 dense device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 unused.
-   * @param norm_x2 unused.
-   */
-  void evaluate(raft::resources const& handle,
-                csr_input_matrix_view_t<math_t> x1,
-                dense_input_matrix_view_t<math_t> x2,
-                dense_output_matrix_view_t<math_t> out,
-                math_t* norm_x1,
-                math_t* norm_x2)
-  {
-    bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
-    int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
-    GramMatrixBase<math_t>::linear(handle, x1, x2, out);
-    applyKernel(out.data_handle(),
-                ld_out,
-                out.extent(0),
-                out.extent(1),
-                is_row_major,
-                resource::get_cuda_stream(handle));
-  }
-
-  /** Evaluate kernel matrix using tanh kernel.
-   *
-   * output_[i + k*n1] = (gain*<x1_i, x2_k> + offset)^exponent,
-   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
-   * in the x2 set, and < , > denotes dot product.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 csr device matrix view, size [n1*n_cols]
-   * @param [in] x2 csr device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 unused.
-   * @param norm_x2 unused.
-   */
-  void evaluate(raft::resources const& handle,
-                csr_input_matrix_view_t<math_t> x1,
-                csr_input_matrix_view_t<math_t> x2,
-                dense_output_matrix_view_t<math_t> out,
-                math_t* norm_x1,
-                math_t* norm_x2)
-  {
-    bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
-    int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
-    GramMatrixBase<math_t>::linear(handle, x1, x2, out);
-    applyKernel(out.data_handle(),
-                ld_out,
-                out.extent(0),
-                out.extent(1),
-                is_row_major,
-                resource::get_cuda_stream(handle));
-  }
-
-  /** Evaluate the Gram matrix using the legacy interface.
-   *
-   * @param [in] x1 device array of vectors, size [n1*n_cols]
-   * @param [in] n1 number vectors in x1
-   * @param [in] n_cols number of columns (features) in x1 and x2
-   * @param [in] x2 device array of vectors, size [n2*n_cols]
-   * @param [in] n2 number vectors in x2
-   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
-   * @param [in] is_row_major whether the input and output matrices are in row
-   *        major format
-   * @param [in] stream cuda stream
-   * @param ld1 leading dimension of x1 (usually it is n1)
-   * @param ld2 leading dimension of x2 (usually it is n2)
-   * @param ld_out leading dimension of out (usually it is n1)
-   */
-  [[deprecated]] void evaluate(const math_t* x1,
-                               int n1,
-                               int n_cols,
-                               const math_t* x2,
-                               int n2,
-                               math_t* out,
-                               bool is_row_major,
-                               cudaStream_t stream,
-                               int ld1,
-                               int ld2,
-                               int ld_out)
-  {
-    ASSERT(GramMatrixBase<math_t>::legacy_interface,
-           "Legacy interface can only be used with legacy ctor.");
-    GramMatrixBase<math_t>::linear(
-      x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
-    applyKernel(out, ld_out, n1, n2, is_row_major, stream);
-  }
-};
-
-/**
- * Create a kernel matrix using RBF kernel function.
- */
-template <typename math_t>
-class RBFKernel : public GramMatrixBase<math_t> {
-  math_t gain;
-
-  void applyKernel(math_t* inout,
-                   int ld,
-                   int rows,
-                   int cols,
-                   math_t* norm_x1,
-                   math_t* norm_x2,
-                   bool is_row_major,
-                   cudaStream_t stream)
-  {
-    int n1                         = is_row_major ? cols : rows;
-    int n2                         = is_row_major ? rows : cols;
-    math_t* norm_n1                = is_row_major ? norm_x2 : norm_x1;
-    math_t* norm_n2                = is_row_major ? norm_x1 : norm_x2;
-    auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2);
-    rbf_kernel_expanded<<<grid_shape, block_shape, 0, stream>>>(
-      inout, ld, n1, n2, norm_n1, norm_n2, gain);
-  }
-
- public:
-  /**
-   * Constructs a RBF kernel object.
-   * It evaluates the kernel matrix using the following formula:
-   * K_ij = exp(-gain*|x1_i- x2_k|^2)
-   *
-   * @tparam math_t floating point type
-   * @param gain
-   */
-  RBFKernel(math_t gain) : GramMatrixBase<math_t>(), gain(gain) {}
-
-  [[deprecated]] RBFKernel(math_t gain, cublasHandle_t handle)
-    : GramMatrixBase<math_t>(handle), gain(gain)
-  {
-  }
-
-  void matrixRowNormL2(raft::resources const& handle,
-                       dense_input_matrix_view_t<math_t> matrix,
-                       math_t* target)
-  {
-    bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(matrix);
-    int minor         = is_row_major ? matrix.extent(1) : matrix.extent(0);
-    int ld            = is_row_major ? matrix.stride(0) : matrix.stride(1);
-    ASSERT(ld == minor, "RBF Kernel lazy rowNorm compute does not support ld parameter");
-    raft::linalg::rowNorm(target,
-                          matrix.data_handle(),
-                          matrix.extent(1),
-                          matrix.extent(0),
-                          raft::linalg::NormType::L2Norm,
-                          is_row_major,
-                          resource::get_cuda_stream(handle));
-  }
-
-  void matrixRowNormL2(raft::resources const& handle,
-                       csr_input_matrix_view_t<math_t> matrix,
-                       math_t* target)
-  {
-    auto matrix_structure = matrix.structure_view();
-    raft::sparse::linalg::rowNormCsr(handle,
-                                     matrix_structure.get_indptr().data(),
-                                     matrix.get_elements().data(),
-                                     matrix_structure.get_nnz(),
-                                     matrix_structure.get_n_rows(),
-                                     target,
-                                     raft::linalg::NormType::L2Norm);
-  }
-
-  /** Evaluate kernel matrix using RBF kernel.
-   *
-   * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
-   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
-   * in the x2 set, and | | euclidean distance.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 dense device matrix view, size [n1*n_cols]
-   * @param [in] x2 dense device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
-   * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
-   */
-  void evaluate(raft::resources const& handle,
-                dense_input_matrix_view_t<math_t> x1,
-                dense_input_matrix_view_t<math_t> x2,
-                dense_output_matrix_view_t<math_t> out,
-                math_t* norm_x1,
-                math_t* norm_x2)
-  {
-    cudaStream_t stream = resource::get_cuda_stream(handle);
-    // lazy compute norms if not given
-    rmm::device_uvector<math_t> tmp_norm_x1(0, stream);
-    rmm::device_uvector<math_t> tmp_norm_x2(0, stream);
-    if (norm_x1 == nullptr) {
-      tmp_norm_x1.reserve(x1.extent(0), stream);
-      norm_x1 = tmp_norm_x1.data();
-      matrixRowNormL2(handle, x1, norm_x1);
-    }
-    if (norm_x2 == nullptr) {
-      tmp_norm_x2.reserve(x2.extent(0), stream);
-      norm_x2 = tmp_norm_x2.data();
-      matrixRowNormL2(handle, x2, norm_x2);
-    }
-
-    // compute L2expanded
-    bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
-    int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
-    GramMatrixBase<math_t>::linear(handle, x1, x2, out);
-    applyKernel(out.data_handle(),
-                ld_out,
-                out.extent(0),
-                out.extent(1),
-                norm_x1,
-                norm_x2,
-                is_row_major,
-                resource::get_cuda_stream(handle));
-  }
-
-  /** Evaluate kernel matrix using RBF kernel.
-   *
-   * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
-   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
-   * in the x2 set, and | | euclidean distance.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 csr device matrix view, size [n1*n_cols]
-   * @param [in] x2 dense device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
-   * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
-   */
-  void evaluate(raft::resources const& handle,
-                csr_input_matrix_view_t<math_t> x1,
-                dense_input_matrix_view_t<math_t> x2,
-                dense_output_matrix_view_t<math_t> out,
-                math_t* norm_x1,
-                math_t* norm_x2)
-  {
-    cudaStream_t stream = resource::get_cuda_stream(handle);
-
-    // lazy compute norms if not given
-    rmm::device_uvector<math_t> tmp_norm_x1(0, stream);
-    rmm::device_uvector<math_t> tmp_norm_x2(0, stream);
-    if (norm_x1 == nullptr) {
-      tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream);
-      norm_x1 = tmp_norm_x1.data();
-      matrixRowNormL2(handle, x1, norm_x1);
-    }
-    if (norm_x2 == nullptr) {
-      tmp_norm_x2.reserve(x2.extent(0), stream);
-      norm_x2 = tmp_norm_x2.data();
-      matrixRowNormL2(handle, x2, norm_x2);
-    }
-
-    // compute L2expanded
-    bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
-    int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
-    GramMatrixBase<math_t>::linear(handle, x1, x2, out);
-    applyKernel(out.data_handle(),
-                ld_out,
-                out.extent(0),
-                out.extent(1),
-                norm_x1,
-                norm_x2,
-                is_row_major,
-                resource::get_cuda_stream(handle));
-  }
-
-  /** Evaluate kernel matrix using RBF kernel.
-   *
-   * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
-   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
-   * in the x2 set, and | | euclidean distance.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 csr device matrix view, size [n1*n_cols]
-   * @param [in] x2 csr device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
-   * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
-   */
-  void evaluate(raft::resources const& handle,
-                csr_input_matrix_view_t<math_t> x1,
-                csr_input_matrix_view_t<math_t> x2,
-                dense_output_matrix_view_t<math_t> out,
-                math_t* norm_x1,
-                math_t* norm_x2)
-  {
-    cudaStream_t stream = resource::get_cuda_stream(handle);
-
-    // lazy compute norms if not given
-    rmm::device_uvector<math_t> tmp_norm_x1(0, stream);
-    rmm::device_uvector<math_t> tmp_norm_x2(0, stream);
-    if (norm_x1 == nullptr) {
-      tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream);
-      norm_x1 = tmp_norm_x1.data();
-      matrixRowNormL2(handle, x1, norm_x1);
-    }
-    if (norm_x2 == nullptr) {
-      tmp_norm_x2.reserve(x2.structure_view().get_n_rows(), stream);
-      norm_x2 = tmp_norm_x2.data();
-      matrixRowNormL2(handle, x2, norm_x2);
-    }
-
-    // compute L2expanded
-    bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
-    int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
-    GramMatrixBase<math_t>::linear(handle, x1, x2, out);
-    applyKernel(out.data_handle(),
-                ld_out,
-                out.extent(0),
-                out.extent(1),
-                norm_x1,
-                norm_x2,
-                is_row_major,
-                resource::get_cuda_stream(handle));
-  }
-
-  /** Evaluate the Gram matrix using the legacy interface.
-   *
-   * @param [in] x1 device array of vectors, size [n1*n_cols]
-   * @param [in] n1 number vectors in x1
-   * @param [in] n_cols number of columns (features) in x1 and x2
-   * @param [in] x2 device array of vectors, size [n2*n_cols]
-   * @param [in] n2 number vectors in x2
-   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
-   * @param [in] is_row_major whether the input and output matrices are in row
-   *        major format
-   * @param [in] stream cuda stream
-   * @param ld1 leading dimension of x1 (usually it is n1)
-   * @param ld2 leading dimension of x2 (usually it is n2)
-   * @param ld_out leading dimension of out (usually it is n1)
-   */
-  [[deprecated]] void evaluate(const math_t* x1,
-                               int n1,
-                               int n_cols,
-                               const math_t* x2,
-                               int n2,
-                               math_t* out,
-                               bool is_row_major,
-                               cudaStream_t stream,
-                               int ld1,
-                               int ld2,
-                               int ld_out)
-  {
-    ASSERT(GramMatrixBase<math_t>::legacy_interface,
-           "Legacy interface can only be used with legacy ctor.");
-    int minor1    = is_row_major ? n_cols : n1;
-    int minor2    = is_row_major ? n_cols : n2;
-    int minor_out = is_row_major ? n2 : n1;
-    ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter");
-    ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter");
-    ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter");
-
-    math_t gain   = this->gain;
-    using index_t = int64_t;
-
-    rbf_fin_op fin_op{gain};
-
-    raft::resources handle;
-    resource::set_cuda_stream(handle, stream);
-
-    cuvs::distance::distance<cuvs::distance::DistanceType::L2Unexpanded,
-                             math_t,
-                             math_t,
-                             math_t,
-                             decltype(fin_op),
-                             index_t>(handle,
-                                      const_cast<math_t*>(x1),
-                                      const_cast<math_t*>(x2),
-                                      out,
-                                      n1,
-                                      n2,
-                                      n_cols,
-                                      NULL,
-                                      0,
-                                      fin_op,
-                                      is_row_major);
-  }
-};
-
-};  // end namespace cuvs::distance::kernels::detail
diff --git a/cpp/src/distance/detail/kernels/rbf_fin_op.cuh b/cpp/src/distance/detail/kernels/rbf_fin_op.cuh
index 73588baea..53022368d 100644
--- a/cpp/src/distance/detail/kernels/rbf_fin_op.cuh
+++ b/cpp/src/distance/detail/kernels/rbf_fin_op.cuh
@@ -28,7 +28,7 @@
 #include <raft/core/math.hpp>                 // raft::exp
 #include <raft/util/cuda_dev_essentials.cuh>  // HD
 
-namespace cuvs::distance::kernels::detail {
+namespace cuvs::distance::kernels {
 
 /** @brief: Final op for Gram matrix with RBF kernel.
  *
@@ -48,4 +48,4 @@ struct rbf_fin_op {
   }
 };  // struct rbf_fin_op
 
-}  // namespace cuvs::distance::kernels::detail
+}  // namespace cuvs::distance::kernels
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh b/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh
index edfd7cf5f..49497ab3a 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh
@@ -118,9 +118,7 @@ instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default(
 instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default(
   cuvs::distance::detail::ops::russel_rao_distance_op, int);
 instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo(
-  cuvs::distance::detail::ops::l2_unexp_distance_op,
-  int64_t,
-  cuvs::distance::kernels::detail::rbf_fin_op);
+  cuvs::distance::detail::ops::l2_unexp_distance_op, int64_t, cuvs::distance::kernels::rbf_fin_op);
 
 instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default(
   cuvs::distance::detail::ops::l2_exp_distance_op, int64_t);
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu
index 3c8f25109..a2e12b6df 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu
@@ -50,7 +50,7 @@ instantiate_raft_distance_detail_pairwise_matrix_dispatch(
   float,
   float,
   float,
-  cuvs::distance::kernels::detail::rbf_fin_op<float>,
+  cuvs::distance::kernels::rbf_fin_op<float>,
   int64_t);
 
 instantiate_raft_distance_detail_pairwise_matrix_dispatch(
@@ -58,7 +58,7 @@ instantiate_raft_distance_detail_pairwise_matrix_dispatch(
   double,
   double,
   double,
-  cuvs::distance::kernels::detail::rbf_fin_op<double>,
+  cuvs::distance::kernels::rbf_fin_op<double>,
   int64_t);
 
 instantiate_raft_distance_detail_pairwise_matrix_dispatch(
@@ -66,7 +66,7 @@ instantiate_raft_distance_detail_pairwise_matrix_dispatch(
   half,
   float,
   float,
-  cuvs::distance::kernels::detail::rbf_fin_op<half>,
+  cuvs::distance::kernels::rbf_fin_op<half>,
   int64_t);
 
 #undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/distance-ext.cuh b/cpp/src/distance/distance-ext.cuh
index e623f76ba..a692a62a3 100644
--- a/cpp/src/distance/distance-ext.cuh
+++ b/cpp/src/distance/distance-ext.cuh
@@ -273,13 +273,13 @@ instantiate_cuvs_distance_distance_extra(cuvs::distance::DistanceType::L2Unexpan
                                          float,
                                          float,
                                          float,
-                                         cuvs::distance::kernels::detail::rbf_fin_op<float>,
+                                         cuvs::distance::kernels::rbf_fin_op<float>,
                                          int64_t);
 instantiate_cuvs_distance_distance_extra(cuvs::distance::DistanceType::L2Unexpanded,
                                          double,
                                          double,
                                          double,
-                                         cuvs::distance::kernels::detail::rbf_fin_op<double>,
+                                         cuvs::distance::kernels::rbf_fin_op<double>,
                                          int64_t);
 
 #undef instantiate_cuvs_distance_distance_extra
diff --git a/cpp/src/distance/distance.cu b/cpp/src/distance/distance.cu
index c1d39f360..47e72460f 100644
--- a/cpp/src/distance/distance.cu
+++ b/cpp/src/distance/distance.cu
@@ -139,13 +139,13 @@ instantiate_cuvs_distance_distance_extra(cuvs::distance::DistanceType::L2Unexpan
                                          float,
                                          float,
                                          float,
-                                         cuvs::distance::kernels::detail::rbf_fin_op<float>,
+                                         cuvs::distance::kernels::rbf_fin_op<float>,
                                          int64_t);
 instantiate_cuvs_distance_distance_extra(cuvs::distance::DistanceType::L2Unexpanded,
                                          double,
                                          double,
                                          double,
-                                         cuvs::distance::kernels::detail::rbf_fin_op<double>,
+                                         cuvs::distance::kernels::rbf_fin_op<double>,
                                          int64_t);
 
 #undef instantiate_cuvs_distance_distance_extra
diff --git a/cpp/src/embed/spectral.cu b/cpp/src/embed/spectral.cu
new file mode 100644
index 000000000..c3d4e3fc7
--- /dev/null
+++ b/cpp/src/embed/spectral.cu
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../sparse/cluster/detail/spectral.cuh"
+#include <cuvs/embed/spectral.hpp>
+#include <raft/core/device_coo_matrix.hpp>
+#include <raft/core/resources.hpp>
+
+namespace cuvs::embed::spectral {
+
+/**
+ * Given a COO formatted (symmetric) knn graph, this function computes the spectral embeddings
+ * (lowest n_components eigenvectors), using Lanczos min cut algorithm.
+ * @param rows source vertices of knn graph (size nnz)
+ * @param cols destination vertices of knn graph (size nnz)
+ * @param vals edge weights connecting vertices of knn graph (size nnz)
+ * @param nnz size of rows/cols/vals
+ * @param n number of samples in X
+ * @param n_neighbors the number of neighbors to query for knn graph construction
+ * @param n_components the number of components to project the X into
+ * @param out output array for embedding (size n*n_comonents)
+ */
+void fit(const raft::resources& handle,
+         raft::device_coo_matrix_view<float, int, int, int> knn_graph,
+         int n_components,
+         raft::device_matrix_view<float, int> out,
+         unsigned long long seed)
+{
+  cuvs::sparse::cluster::spectral::detail::fit_embedding(
+    handle,
+    knn_graph.structure_view().get_rows().data(),
+    knn_graph.structure_view().get_cols().data(),
+    knn_graph.get_elements().data(),
+    knn_graph.structure_view().get_nnz(),
+    knn_graph.structure_view().get_n_rows(),
+    n_components,
+    out.data_handle(),
+    seed);
+}
+};  // namespace cuvs::embed::spectral
diff --git a/cpp/src/sparse/cluster/cluster_solvers.cuh b/cpp/src/sparse/cluster/cluster_solvers.cuh
new file mode 100644
index 000000000..7b4cf6ab3
--- /dev/null
+++ b/cpp/src/sparse/cluster/cluster_solvers.cuh
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CLUSTER_SOLVERS_H
+#define __CLUSTER_SOLVERS_H
+
+#pragma once
+
+#include <cuvs/cluster/kmeans.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
+#include <raft/spectral/matrix_wrappers.hpp>
+
+#include <utility>  // for std::pair
+
+namespace cuvs {
+namespace spectral {
+
+using namespace raft::spectral::matrix;
+
+// aggregate of control params for Eigen Solver:
+//
+template <typename index_type_t, typename value_type_t, typename size_type_t = index_type_t>
+struct cluster_solver_config_t {
+  size_type_t n_clusters;
+  size_type_t maxIter;
+
+  value_type_t tol;
+
+  unsigned long long seed{123456};
+};
+
+template <typename index_type_t, typename value_type_t, typename size_type_t = index_type_t>
+struct kmeans_solver_t {
+  explicit kmeans_solver_t(
+    cluster_solver_config_t<index_type_t, value_type_t, size_type_t> const& config)
+    : config_(config)
+  {
+  }
+
+  std::pair<value_type_t, index_type_t> solve(raft::resources const& handle,
+                                              size_type_t n_obs_vecs,
+                                              size_type_t dim,
+                                              value_type_t const* __restrict__ obs,
+                                              index_type_t* __restrict__ codes) const
+  {
+    RAFT_EXPECTS(obs != nullptr, "Null obs buffer.");
+    RAFT_EXPECTS(codes != nullptr, "Null codes buffer.");
+    value_type_t residual{};
+    index_type_t iters{};
+    cuvs::cluster::kmeans::params km_params;
+    km_params.n_clusters     = config_.n_clusters;
+    km_params.tol            = config_.tol;
+    km_params.max_iter       = config_.maxIter;
+    km_params.rng_state.seed = config_.seed;
+
+    auto X      = raft::make_device_matrix_view<const value_type_t>(obs, n_obs_vecs, dim);
+    auto labels = raft::make_device_vector_view<index_type_t>(codes, n_obs_vecs);
+    auto centroids =
+      raft::make_device_matrix<value_type_t, index_type_t>(handle, config_.n_clusters, dim);
+    auto weight = raft::make_device_vector<value_type_t, index_type_t>(handle, n_obs_vecs);
+    thrust::fill(raft::resource::get_thrust_policy(handle),
+                 weight.data_handle(),
+                 weight.data_handle() + n_obs_vecs,
+                 1);
+
+    auto sw = std::make_optional((raft::device_vector_view<const value_type_t>)weight.view());
+    cuvs::cluster::kmeans::fit_predict(handle,
+                                       km_params,
+                                       X,
+                                       sw,
+                                       centroids.view(),
+                                       labels,
+                                       raft::make_host_scalar_view(&residual),
+                                       raft::make_host_scalar_view(&iters));
+    return std::make_pair(residual, iters);
+  }
+
+  auto const& get_config(void) const { return config_; }
+
+ private:
+  cluster_solver_config_t<index_type_t, value_type_t, size_type_t> config_;
+};
+
+}  // namespace spectral
+}  // namespace cuvs
+
+#endif
\ No newline at end of file
diff --git a/cpp/src/sparse/cluster/detail/spectral.cuh b/cpp/src/sparse/cluster/detail/spectral.cuh
new file mode 100644
index 000000000..571d92bf5
--- /dev/null
+++ b/cpp/src/sparse/cluster/detail/spectral.cuh
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../cluster_solvers.cuh"
+#include "../eigen_solvers.cuh"
+#include "../partition.cuh"
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/sparse/convert/csr.cuh>
+#include <raft/sparse/coo.hpp>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+namespace cuvs::sparse::cluster::spectral::detail {
+
+template <typename T>
+void fit_embedding(raft::resources const& handle,
+                   int* rows,
+                   int* cols,
+                   T* vals,
+                   int nnz,
+                   int n,
+                   int n_components,
+                   T* out,
+                   unsigned long long seed = 1234567)
+{
+  auto stream = raft::resource::get_cuda_stream(handle);
+  rmm::device_uvector<int> src_offsets(n + 1, stream);
+  rmm::device_uvector<int> dst_cols(nnz, stream);
+  rmm::device_uvector<T> dst_vals(nnz, stream);
+  raft::sparse::convert::coo_to_csr(
+    handle, rows, cols, vals, nnz, n, src_offsets.data(), dst_cols.data(), dst_vals.data());
+
+  rmm::device_uvector<T> eigVals(n_components + 1, stream);
+  rmm::device_uvector<T> eigVecs(n * (n_components + 1), stream);
+  rmm::device_uvector<int> labels(n, stream);
+
+  raft::resource::sync_stream(handle, stream);
+
+  /**
+   * Raft spectral clustering
+   */
+  using index_type = int;
+  using value_type = T;
+
+  index_type* ro = src_offsets.data();
+  index_type* ci = dst_cols.data();
+  value_type* vs = dst_vals.data();
+
+  raft::spectral::matrix::sparse_matrix_t<index_type, value_type> const r_csr_m{
+    handle, ro, ci, vs, n, nnz};
+
+  index_type neigvs       = n_components + 1;
+  index_type maxiter      = 4000;  // default reset value (when set to 0);
+  value_type tol          = 0.01;
+  index_type restart_iter = 15 + neigvs;  // what cugraph is using
+
+  cuvs::spectral::eigen_solver_config_t<index_type, value_type> cfg{
+    neigvs, maxiter, restart_iter, tol};
+
+  cfg.seed = seed;
+
+  cuvs::spectral::lanczos_solver_t<index_type, value_type> eig_solver{cfg};
+
+  // cluster computation here is irrelevant,
+  // hence define a no-op such solver to
+  // feed partition():
+  //
+  struct no_op_cluster_solver_t {
+    using index_type_t = index_type;
+    using size_type_t  = index_type;
+    using value_type_t = value_type;
+
+    std::pair<value_type_t, index_type_t> solve(raft::resources const& handle,
+                                                size_type_t n_obs_vecs,
+                                                size_type_t dim,
+                                                value_type_t const* __restrict__ obs,
+                                                index_type_t* __restrict__ codes) const
+    {
+      return std::make_pair<value_type_t, index_type_t>(0, 0);
+    }
+  };
+
+  cuvs::spectral::partition(handle,
+                            r_csr_m,
+                            eig_solver,
+                            no_op_cluster_solver_t{},
+                            labels.data(),
+                            eigVals.data(),
+                            eigVecs.data());
+
+  raft::copy<T>(out, eigVecs.data() + n, n * n_components, stream);
+
+  RAFT_CUDA_TRY(cudaGetLastError());
+}
+
+};  // namespace cuvs::sparse::cluster::spectral::detail
\ No newline at end of file
diff --git a/cpp/src/sparse/cluster/detail/spectral/modularity_maximization.hpp b/cpp/src/sparse/cluster/detail/spectral/modularity_maximization.hpp
new file mode 100644
index 000000000..a42ad2dc1
--- /dev/null
+++ b/cpp/src/sparse/cluster/detail/spectral/modularity_maximization.hpp
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/resource/cublas_handle.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+
+// TODO: Expose needed wrappers in RAFT's public API so we don't need to call detail APIs in cuVS
+#include "../../cluster_solvers.cuh"
+#include "../../eigen_solvers.cuh"
+#include "spectral_util.cuh"
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/linalg/normalize.cuh>
+#include <raft/spectral/matrix_wrappers.hpp>
+
+#include <cuda.h>
+#include <thrust/fill.h>
+#include <thrust/reduce.h>
+#include <thrust/transform.h>
+
+#include <math.h>
+#include <stdio.h>
+
+#include <tuple>
+
+namespace cuvs {
+namespace spectral {
+namespace detail {
+
+// =========================================================
+// Spectral modularity_maximization
+// =========================================================
+
+/** Compute partition for a weighted undirected graph. This
+ *  partition attempts to minimize the cost function:
+ *    Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition)
+ *
+ *  @param G Weighted graph in CSR format
+ *  @param nClusters Number of partitions.
+ *  @param nEigVecs Number of eigenvectors to compute.
+ *  @param maxIter_lanczos Maximum number of Lanczos iterations.
+ *  @param restartIter_lanczos Maximum size of Lanczos system before
+ *    implicit restart.
+ *  @param tol_lanczos Convergence tolerance for Lanczos method.
+ *  @param maxIter_kmeans Maximum number of k-means iterations.
+ *  @param tol_kmeans Convergence tolerance for k-means algorithm.
+ *  @param clusters (Output, device memory, n entries) Cluster
+ *    assignments.
+ *  @param iters_lanczos On exit, number of Lanczos iterations
+ *    performed.
+ *  @param iters_kmeans On exit, number of k-means iterations
+ *    performed.
+ *  @return error flag.
+ */
+template <typename vertex_t, typename weight_t, typename EigenSolver, typename ClusterSolver>
+std::tuple<vertex_t, weight_t, vertex_t> modularity_maximization(
+  raft::resources const& handle,
+  raft::spectral::matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+  EigenSolver const& eigen_solver,
+  ClusterSolver const& cluster_solver,
+  vertex_t* __restrict__ clusters,
+  weight_t* eigVals,
+  weight_t* eigVecs)
+{
+  RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
+  RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
+  RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
+
+  auto stream   = raft::resource::get_cuda_stream(handle);
+  auto cublas_h = raft::resource::get_cublas_handle(handle);
+
+  std::tuple<vertex_t, weight_t, vertex_t>
+    stats;  // # iters eigen solver, cluster solver residual, # iters cluster solver
+
+  vertex_t n = csr_m.nrows_;
+
+  // Compute eigenvectors of Modularity Matrix
+
+  // Initialize Modularity Matrix
+  raft::spectral::matrix::modularity_matrix_t<vertex_t, weight_t> B{handle, csr_m};
+
+  auto eigen_config = eigen_solver.get_config();
+  auto nEigVecs     = eigen_config.n_eigVecs;
+
+  // Compute eigenvectors corresponding to largest eigenvalues
+  std::get<0>(stats) = eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs);
+
+  // Whiten eigenvector matrix
+  transform_eigen_matrix(handle, n, nEigVecs, eigVecs);
+
+  // notice that at this point the matrix has already been transposed, so we are scaling
+  // columns
+  auto dataset_view = raft::make_device_matrix_view(eigVecs, nEigVecs, n);
+  raft::linalg::row_normalize(
+    handle, raft::make_const_mdspan(dataset_view), dataset_view, raft::linalg::L2Norm);
+
+  // Find partition clustering
+  auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters);
+
+  std::get<1>(stats) = pair_cluster.first;
+  std::get<2>(stats) = pair_cluster.second;
+
+  return stats;
+}
+//===================================================
+// Analysis of graph partition
+// =========================================================
+
+/// Compute modularity
+/** This function determines the modularity based on a graph and cluster assignments
+ *  @param G Weighted graph in CSR format
+ *  @param nClusters Number of clusters.
+ *  @param clusters (Input, device memory, n entries) Cluster assignments.
+ *  @param modularity On exit, modularity
+ */
+template <typename vertex_t, typename weight_t>
+void analyzeModularity(raft::resources const& handle,
+                       raft::spectral::matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+                       vertex_t nClusters,
+                       vertex_t const* __restrict__ clusters,
+                       weight_t& modularity)
+{
+  RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
+
+  vertex_t i;
+  vertex_t n = csr_m.nrows_;
+  weight_t partModularity, clustersize;
+
+  auto cublas_h = raft::resource::get_cublas_handle(handle);
+  auto stream   = raft::resource::get_cuda_stream(handle);
+
+  // Device memory
+  raft::spectral::matrix::vector_t<weight_t> part_i(handle, n);
+  raft::spectral::matrix::vector_t<weight_t> Bx(handle, n);
+
+  // Initialize cuBLAS
+  RAFT_CUBLAS_TRY(
+    raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+
+  // Initialize Modularity
+  raft::spectral::matrix::modularity_matrix_t<vertex_t, weight_t> B{handle, csr_m};
+
+  // Initialize output
+  modularity = 0;
+
+  // Iterate through partitions
+  for (i = 0; i < nClusters; ++i) {
+    if (!construct_indicator(handle, i, n, clustersize, partModularity, clusters, part_i, Bx, B)) {
+      WARNING("empty partition");
+      continue;
+    }
+
+    // Record results
+    modularity += partModularity;
+  }
+
+  modularity = modularity / B.diagonal_.nrm1();
+}
+
+}  // namespace detail
+}  // namespace spectral
+}  // namespace cuvs
diff --git a/cpp/src/sparse/cluster/detail/spectral/partition.hpp b/cpp/src/sparse/cluster/detail/spectral/partition.hpp
new file mode 100644
index 000000000..77e83c17d
--- /dev/null
+++ b/cpp/src/sparse/cluster/detail/spectral/partition.hpp
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/core/resource/cublas_handle.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+
+// TODO: Expose needed wrappers in RAFT's public API so we don't need to call detail APIs in cuVS
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+
+#include "../../cluster_solvers.cuh"
+#include "../../eigen_solvers.cuh"
+#include "spectral_util.cuh"
+#include <raft/spectral/matrix_wrappers.hpp>
+
+#include <cuda.h>
+#include <thrust/fill.h>
+#include <thrust/reduce.h>
+#include <thrust/transform.h>
+
+#include <math.h>
+#include <stdio.h>
+
+#include <tuple>
+
+namespace cuvs {
+namespace spectral {
+namespace detail {
+
+// =========================================================
+// Spectral partitioner
+// =========================================================
+
+/// Compute spectral graph partition
+/** Compute partition for a weighted undirected graph. This
+ *  partition attempts to minimize the cost function:
+ *    Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition)
+ *
+ *  @param G Weighted graph in CSR format
+ *  @param nClusters Number of partitions.
+ *  @param nEigVecs Number of eigenvectors to compute.
+ *  @param maxIter_lanczos Maximum number of Lanczos iterations.
+ *  @param restartIter_lanczos Maximum size of Lanczos system before
+ *    implicit restart.
+ *  @param tol_lanczos Convergence tolerance for Lanczos method.
+ *  @param maxIter_kmeans Maximum number of k-means iterations.
+ *  @param tol_kmeans Convergence tolerance for k-means algorithm.
+ *  @param clusters (Output, device memory, n entries) Partition
+ *    assignments.
+ *  @param iters_lanczos On exit, number of Lanczos iterations
+ *    performed.
+ *  @param iters_kmeans On exit, number of k-means iterations
+ *    performed.
+ *  @return statistics: number of eigensolver iterations, .
+ */
+template <typename vertex_t, typename weight_t, typename EigenSolver, typename ClusterSolver>
+std::tuple<vertex_t, weight_t, vertex_t> partition(
+  raft::resources const& handle,
+  raft::spectral::matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+  EigenSolver const& eigen_solver,
+  ClusterSolver const& cluster_solver,
+  vertex_t* __restrict__ clusters,
+  weight_t* eigVals,
+  weight_t* eigVecs)
+{
+  RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
+  RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
+  RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
+
+  auto stream   = raft::resource::get_cuda_stream(handle);
+  auto cublas_h = raft::resource::get_cublas_handle(handle);
+
+  std::tuple<vertex_t, weight_t, vertex_t>
+    stats;  //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver,
+            // cluster solver residual, # iters cluster solver
+
+  vertex_t n = csr_m.nrows_;
+
+  // -------------------------------------------------------
+  // Spectral partitioner
+  // -------------------------------------------------------
+
+  // Compute eigenvectors of Laplacian
+
+  // Initialize Laplacian
+  /// sparse_matrix_t<vertex_t, weight_t> A{handle, graph};
+  raft::spectral::matrix::laplacian_matrix_t<vertex_t, weight_t> L{handle, csr_m};
+
+  auto eigen_config = eigen_solver.get_config();
+  auto nEigVecs     = eigen_config.n_eigVecs;
+
+  // Compute smallest eigenvalues and eigenvectors
+  std::get<0>(stats) = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs);
+
+  // Whiten eigenvector matrix
+  transform_eigen_matrix(handle, n, nEigVecs, eigVecs);
+
+  // Find partition clustering
+  auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters);
+
+  std::get<1>(stats) = pair_cluster.first;
+  std::get<2>(stats) = pair_cluster.second;
+
+  return stats;
+}
+
+// =========================================================
+// Analysis of graph partition
+// =========================================================
+
+/// Compute cost function for partition
+/** This function determines the edges cut by a partition and a cost
+ *  function:
+ *    Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition)
+ *  Graph is assumed to be weighted and undirected.
+ *
+ *  @param G Weighted graph in CSR format
+ *  @param nClusters Number of partitions.
+ *  @param clusters (Input, device memory, n entries) Partition
+ *    assignments.
+ *  @param edgeCut On exit, weight of edges cut by partition.
+ *  @param cost On exit, partition cost function.
+ *  @return error flag.
+ */
+template <typename vertex_t, typename weight_t>
+void analyzePartition(raft::resources const& handle,
+                      raft::spectral::matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+                      vertex_t nClusters,
+                      const vertex_t* __restrict__ clusters,
+                      weight_t& edgeCut,
+                      weight_t& cost)
+{
+  RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
+
+  vertex_t i;
+  vertex_t n = csr_m.nrows_;
+
+  auto stream   = raft::resource::get_cuda_stream(handle);
+  auto cublas_h = raft::resource::get_cublas_handle(handle);
+
+  weight_t partEdgesCut, clustersize;
+
+  // Device memory
+  raft::spectral::matrix::vector_t<weight_t> part_i(handle, n);
+  raft::spectral::matrix::vector_t<weight_t> Lx(handle, n);
+
+  // Initialize cuBLAS
+  RAFT_CUBLAS_TRY(
+    raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+
+  // Initialize Laplacian
+  /// sparse_matrix_t<vertex_t, weight_t> A{handle, graph};
+  raft::spectral::matrix::laplacian_matrix_t<vertex_t, weight_t> L{handle, csr_m};
+
+  // Initialize output
+  cost    = 0;
+  edgeCut = 0;
+
+  // Iterate through partitions
+  for (i = 0; i < nClusters; ++i) {
+    // Construct indicator vector for ith partition
+    if (!construct_indicator(handle, i, n, clustersize, partEdgesCut, clusters, part_i, Lx, L)) {
+      WARNING("empty partition");
+      continue;
+    }
+
+    // Record results
+    cost += partEdgesCut / clustersize;
+    edgeCut += partEdgesCut / 2;
+  }
+}
+
+}  // namespace detail
+}  // namespace spectral
+}  // namespace cuvs
diff --git a/cpp/src/sparse/cluster/detail/spectral/spectral_util.cuh b/cpp/src/sparse/cluster/detail/spectral/spectral_util.cuh
new file mode 100644
index 000000000..1d2e58e2a
--- /dev/null
+++ b/cpp/src/sparse/cluster/detail/spectral/spectral_util.cuh
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/resource/cublas_handle.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
+#include <raft/core/resources.hpp>
+
+// TODO: Expose needed wrappers in RAFT's public API so we don't need to call detail APIs in cuVS
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/spectral/matrix_wrappers.hpp>
+
+#include <raft/util/cudart_utils.hpp>
+
+#include <thrust/device_ptr.h>
+#include <thrust/fill.h>
+#include <thrust/for_each.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/reduce.h>
+#include <thrust/transform.h>
+#include <thrust/tuple.h>
+
+#include <algorithm>
+
+namespace cuvs {
+namespace spectral {
+
+template <typename vertex_t, typename edge_t, typename weight_t>
+void transform_eigen_matrix(raft::resources const& handle,
+                            edge_t n,
+                            vertex_t nEigVecs,
+                            weight_t* eigVecs)
+{
+  auto stream             = raft::resource::get_cuda_stream(handle);
+  auto cublas_h           = raft::resource::get_cublas_handle(handle);
+  auto thrust_exec_policy = raft::resource::get_thrust_policy(handle);
+
+  const weight_t zero{0.0};
+  const weight_t one{1.0};
+
+  // Whiten eigenvector matrix
+  for (auto i = 0; i < nEigVecs; ++i) {
+    weight_t mean, std;
+
+    mean = thrust::reduce(thrust_exec_policy,
+                          thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
+                          thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)));
+    RAFT_CHECK_CUDA(stream);
+    mean /= n;
+    thrust::transform(thrust_exec_policy,
+                      thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
+                      thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)),
+                      thrust::make_constant_iterator(mean),
+                      thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
+                      thrust::minus<weight_t>());
+    RAFT_CHECK_CUDA(stream);
+
+    // TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(
+      raft::linalg::detail::cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream));
+
+    std /= std::sqrt(static_cast<weight_t>(n));
+
+    thrust::transform(thrust_exec_policy,
+                      thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
+                      thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)),
+                      thrust::make_constant_iterator(std),
+                      thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
+                      thrust::divides<weight_t>());
+    RAFT_CHECK_CUDA(stream);
+  }
+
+  // Transpose eigenvector matrix
+  //   TODO: in-place transpose
+  {
+    raft::spectral::matrix::vector_t<weight_t> work(handle, nEigVecs * n);
+    // TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(
+      raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+
+    // TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgeam(cublas_h,
+                                                     CUBLAS_OP_T,
+                                                     CUBLAS_OP_N,
+                                                     nEigVecs,
+                                                     n,
+                                                     &one,
+                                                     eigVecs,
+                                                     n,
+                                                     &zero,
+                                                     (weight_t*)NULL,
+                                                     nEigVecs,
+                                                     work.raw(),
+                                                     nEigVecs,
+                                                     stream));
+
+    RAFT_CUDA_TRY(cudaMemcpyAsync(
+      eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice, stream));
+  }
+}
+
+namespace {
+/// Functor to generate indicator vectors
+/** For use in Thrust transform
+ */
+template <typename index_type_t, typename value_type_t>
+struct equal_to_i_op {
+  const index_type_t i;
+
+ public:
+  equal_to_i_op(index_type_t _i) : i(_i) {}
+  template <typename Tuple_>
+  __host__ __device__ void operator()(Tuple_ t)
+  {
+    thrust::get<1>(t) = (thrust::get<0>(t) == i) ? (value_type_t)1.0 : (value_type_t)0.0;
+  }
+};
+}  // namespace
+
+// Construct indicator vector for ith partition
+//
+template <typename vertex_t, typename edge_t, typename weight_t>
+bool construct_indicator(raft::resources const& handle,
+                         edge_t index,
+                         edge_t n,
+                         weight_t& clustersize,
+                         weight_t& partStats,
+                         vertex_t const* __restrict__ clusters,
+                         raft::spectral::matrix::vector_t<weight_t>& part_i,
+                         raft::spectral::matrix::vector_t<weight_t>& Bx,
+                         raft::spectral::matrix::laplacian_matrix_t<vertex_t, weight_t> const& B)
+{
+  auto stream             = raft::resource::get_cuda_stream(handle);
+  auto cublas_h           = raft::resource::get_cublas_handle(handle);
+  auto thrust_exec_policy = raft::resource::get_thrust_policy(handle);
+
+  thrust::for_each(
+    thrust_exec_policy,
+    thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters),
+                                                 thrust::device_pointer_cast(part_i.raw()))),
+    thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters + n),
+                                                 thrust::device_pointer_cast(part_i.raw() + n))),
+    equal_to_i_op<vertex_t, weight_t>(index));
+  RAFT_CHECK_CUDA(stream);
+
+  // Compute size of ith partition
+  // TODO: Call from public API when ready
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(
+    cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream));
+
+  clustersize = round(clustersize);
+  if (clustersize < 0.5) { return false; }
+
+  // Compute part stats
+  B.mv(1, part_i.raw(), 0, Bx.raw());
+  // TODO: Call from public API when ready
+  RAFT_CUBLAS_TRY(
+    raft::linalg::detail::cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream));
+
+  return true;
+}
+
+}  // namespace spectral
+}  // namespace cuvs
diff --git a/cpp/src/sparse/cluster/eigen_solvers.cuh b/cpp/src/sparse/cluster/eigen_solvers.cuh
new file mode 100644
index 000000000..1b2501d68
--- /dev/null
+++ b/cpp/src/sparse/cluster/eigen_solvers.cuh
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __EIGEN_SOLVERS_H
+#define __EIGEN_SOLVERS_H
+
+#pragma once
+
+#include <raft/sparse/solver/lanczos.cuh>
+#include <raft/spectral/matrix_wrappers.hpp>
+
+namespace cuvs {
+namespace spectral {
+
+// aggregate of control params for Eigen Solver:
+//
+template <typename index_type_t, typename value_type_t, typename size_type_t = index_type_t>
+struct eigen_solver_config_t {
+  size_type_t n_eigVecs;
+  size_type_t maxIter;
+
+  size_type_t restartIter;
+  value_type_t tol;
+
+  bool reorthogonalize{false};
+  unsigned long long seed{
+    1234567};  // CAVEAT: this default value is now common to all instances of using seed in
+               // Lanczos; was not the case before: there were places where a default seed = 123456
+               // was used; this may trigger slightly different # solver iterations
+};
+
+template <typename index_type_t, typename value_type_t, typename size_type_t = index_type_t>
+struct lanczos_solver_t {
+  explicit lanczos_solver_t(
+    eigen_solver_config_t<index_type_t, value_type_t, size_type_t> const& config)
+    : config_(config)
+  {
+  }
+
+  index_type_t solve_smallest_eigenvectors(
+    raft::resources const& handle,
+    raft::spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
+    value_type_t* __restrict__ eigVals,
+    value_type_t* __restrict__ eigVecs) const
+  {
+    RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
+    RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
+    index_type_t iters{};
+    raft::sparse::solver::computeSmallestEigenvectors(handle,
+                                                      A,
+                                                      config_.n_eigVecs,
+                                                      config_.maxIter,
+                                                      config_.restartIter,
+                                                      config_.tol,
+                                                      config_.reorthogonalize,
+                                                      iters,
+                                                      eigVals,
+                                                      eigVecs,
+                                                      config_.seed);
+    return iters;
+  }
+
+  index_type_t solve_largest_eigenvectors(
+    raft::resources const& handle,
+    raft::spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
+    value_type_t* __restrict__ eigVals,
+    value_type_t* __restrict__ eigVecs) const
+  {
+    RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
+    RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
+    index_type_t iters{};
+    raft::sparse::solver::computeLargestEigenvectors(handle,
+                                                     A,
+                                                     config_.n_eigVecs,
+                                                     config_.maxIter,
+                                                     config_.restartIter,
+                                                     config_.tol,
+                                                     config_.reorthogonalize,
+                                                     iters,
+                                                     eigVals,
+                                                     eigVecs,
+                                                     config_.seed);
+    return iters;
+  }
+
+  auto const& get_config(void) const { return config_; }
+
+ private:
+  eigen_solver_config_t<index_type_t, value_type_t, size_type_t> config_;
+};
+
+}  // namespace spectral
+}  // namespace cuvs
+
+#endif
diff --git a/cpp/src/sparse/cluster/modularity_maximization.cuh b/cpp/src/sparse/cluster/modularity_maximization.cuh
new file mode 100644
index 000000000..71cba6927
--- /dev/null
+++ b/cpp/src/sparse/cluster/modularity_maximization.cuh
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __MODULARITY_MAXIMIZATION_H
+#define __MODULARITY_MAXIMIZATION_H
+
+#pragma once
+
+#include "detail/spectral/modularity_maximization.hpp"
+
+#include <tuple>
+
+namespace cuvs {
+namespace spectral {
+
+// =========================================================
+// Spectral modularity_maximization
+// =========================================================
+
+/** Compute partition for a weighted undirected graph. This
+ *  partition attempts to minimize the cost function:
+ *    Cost = \f$sum_i\f$ (Edges cut by ith partition)/(Vertices in ith partition)
+ *
+ *  @param handle raft handle for managing expensive resources
+ *  @param csr_m Weighted graph in CSR format
+ *  @param eigen_solver Eigensolver implementation
+ *  @param cluster_solver Cluster solver implementation
+ *  @param clusters (Output, device memory, n entries) Partition
+ *    assignments.
+ *  @param eigVals Output eigenvalue array pointer on device
+ *  @param eigVecs Output eigenvector array pointer on device
+ *  @return statistics: number of eigensolver iterations, .
+ */
+template <typename vertex_t, typename weight_t, typename EigenSolver, typename ClusterSolver>
+std::tuple<vertex_t, weight_t, vertex_t> modularity_maximization(
+  raft::resources const& handle,
+  raft::spectral::matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+  EigenSolver const& eigen_solver,
+  ClusterSolver const& cluster_solver,
+  vertex_t* __restrict__ clusters,
+  weight_t* eigVals,
+  weight_t* eigVecs)
+{
+  return cuvs::spectral::detail::
+    modularity_maximization<vertex_t, weight_t, EigenSolver, ClusterSolver>(
+      handle, csr_m, eigen_solver, cluster_solver, clusters, eigVals, eigVecs);
+}
+//===================================================
+// Analysis of graph partition
+// =========================================================
+
+/// Compute modularity
+/** This function determines the modularity based on a graph and cluster assignments
+ *  @param handle raft handle for managing expensive resources
+ *  @param csr_m Weighted graph in CSR format
+ *  @param nClusters Number of clusters.
+ *  @param clusters (Input, device memory, n entries) Cluster assignments.
+ *  @param modularity On exit, modularity
+ */
+template <typename vertex_t, typename weight_t>
+void analyzeModularity(raft::resources const& handle,
+                       raft::spectral::matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+                       vertex_t nClusters,
+                       vertex_t const* __restrict__ clusters,
+                       weight_t& modularity)
+{
+  cuvs::spectral::detail::analyzeModularity<vertex_t, weight_t>(
+    handle, csr_m, nClusters, clusters, modularity);
+}
+
+}  // namespace spectral
+}  // namespace cuvs
+
+#endif
\ No newline at end of file
diff --git a/cpp/src/sparse/cluster/partition.cuh b/cpp/src/sparse/cluster/partition.cuh
new file mode 100644
index 000000000..df78a8a2d
--- /dev/null
+++ b/cpp/src/sparse/cluster/partition.cuh
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PARTITION_H
+#define __PARTITION_H
+
+#pragma once
+
+#include "detail/spectral/partition.hpp"
+
+#include <tuple>
+
+namespace cuvs {
+namespace spectral {
+
+// =========================================================
+// Spectral partitioner
+// =========================================================
+
+/// Compute spectral graph partition
+/** Compute partition for a weighted undirected graph. This
+ *  partition attempts to minimize the cost function:
+ *    Cost = \f$sum_i\f$ (Edges cut by ith partition)/(Vertices in ith partition)
+ *
+ *  @param handle raft handle for managing expensive resources
+ *  @param csr_m Weighted graph in CSR format
+ *  @param eigen_solver Eigensolver implementation
+ *  @param cluster_solver Cluster solver implementation
+ *  @param clusters (Output, device memory, n entries) Partition
+ *    assignments.
+ *  @param eigVals Output eigenvalue array pointer on device
+ *  @param eigVecs Output eigenvector array pointer on device
+ *  @return statistics: number of eigensolver iterations, .
+ */
+template <typename vertex_t, typename weight_t, typename EigenSolver, typename ClusterSolver>
+std::tuple<vertex_t, weight_t, vertex_t> partition(
+  raft::resources const& handle,
+  raft::spectral::matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+  EigenSolver const& eigen_solver,
+  ClusterSolver const& cluster_solver,
+  vertex_t* __restrict__ clusters,
+  weight_t* eigVals,
+  weight_t* eigVecs)
+{
+  return cuvs::spectral::detail::partition<vertex_t, weight_t, EigenSolver, ClusterSolver>(
+    handle, csr_m, eigen_solver, cluster_solver, clusters, eigVals, eigVecs);
+}
+
+// =========================================================
+// Analysis of graph partition
+// =========================================================
+
+/// Compute cost function for partition
+/** This function determines the edges cut by a partition and a cost
+ *  function:
+ *    Cost = \f$sum_i\f$ (Edges cut by ith partition)/(Vertices in ith partition)
+ *  Graph is assumed to be weighted and undirected.
+ *
+ *  @param handle raft handle for managing expensive resources
+ *  @param csr_m Weighted graph in CSR format
+ *  @param nClusters Number of partitions.
+ *  @param clusters (Input, device memory, n entries) Partition
+ *    assignments.
+ *  @param edgeCut On exit, weight of edges cut by partition.
+ *  @param cost On exit, partition cost function.
+ */
+template <typename vertex_t, typename weight_t>
+void analyzePartition(raft::resources const& handle,
+                      raft::spectral::matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+                      vertex_t nClusters,
+                      const vertex_t* __restrict__ clusters,
+                      weight_t& edgeCut,
+                      weight_t& cost)
+{
+  cuvs::spectral::detail::analyzePartition<vertex_t, weight_t>(
+    handle, csr_m, nClusters, clusters, edgeCut, cost);
+}
+
+}  // namespace spectral
+}  // namespace cuvs
+
+#endif
\ No newline at end of file
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 0ecac6ec2..9224e88d8 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -218,6 +218,7 @@ if(BUILD_TESTS)
     distance/dist_l_inf.cu
     distance/dist_lp_unexp.cu
     distance/dist_russell_rao.cu
+    distance/gram.cu
     distance/masked_nn.cu
     distance/sparse_distance.cu
     sparse/neighbors/cross_component_nn.cu
@@ -227,6 +228,11 @@ if(BUILD_TESTS)
     100
   )
 
+  ConfigureTest(
+    NAME SPARSE_TEST PATH sparse/cluster/cluster_solvers.cu sparse/cluster/eigen_solvers.cu
+    sparse/cluster/spectral.cu GPUS 1 PERCENT 100
+  )
+  
   ConfigureTest(
     NAME PREPROCESSING_TEST PATH preprocessing/scalar_quantization.cu GPUS 1 PERCENT 100
   )
diff --git a/cpp/test/distance/gram.cu b/cpp/test/distance/gram.cu
new file mode 100644
index 000000000..89b1525ea
--- /dev/null
+++ b/cpp/test/distance/gram.cu
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.cuh"
+#include "gram_base.cuh"
+
+#include <cuvs/distance/distance.hpp>
+#include <cuvs/distance/grammian.hpp>
+#include <raft/random/rng.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <gtest/gtest.h>
+
+#include <iostream>
+#include <memory>
+
+namespace cuvs::distance::kernels {
+
+struct GramMatrixInputs {
+  int n1;      // feature vectors in matrix 1
+  int n2;      // featuer vectors in matrix 2
+  int n_cols;  // number of elements in a feature vector
+  bool is_row_major;
+  KernelParams kernel;
+  int ld1;
+  int ld2;
+  int ld_out;
+  // We will generate random input using the dimensions given here.
+  // The reference output is calculated by a custom kernel.
+};
+
+std::ostream& operator<<(std::ostream& os, const GramMatrixInputs& p)
+{
+  std::vector<std::string> kernel_names{"linear", "poly", "rbf", "tanh"};
+  os << "/" << p.n1 << "x" << p.n2 << "x" << p.n_cols << "/"
+     << (p.is_row_major ? "RowMajor/" : "ColMajor/") << kernel_names[p.kernel.kernel] << "/ld_"
+     << p.ld1 << "x" << p.ld2 << "x" << p.ld_out;
+  return os;
+}
+
+const std::vector<GramMatrixInputs> inputs = {
+  {42, 137, 2, false, {KernelType::LINEAR}},
+  {42, 137, 2, true, {KernelType::LINEAR}},
+  {42, 137, 2, false, {KernelType::LINEAR}, 64, 179, 181},
+  {42, 137, 2, true, {KernelType::LINEAR}, 64, 179, 181},
+  {137, 42, 2, false, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}},
+  {137, 42, 2, true, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}},
+  {137, 42, 2, false, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144},
+  {137, 42, 2, true, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144},
+  {42, 137, 2, false, {KernelType::TANH, 0, 0.5, 2.4}},
+  {42, 137, 2, true, {KernelType::TANH, 0, 0.5, 2.4}},
+  {42, 137, 2, false, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 49},
+  {42, 137, 2, true, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 143},
+  {3, 4, 2, false, {KernelType::RBF, 0, 0.5}},
+  {42, 137, 2, false, {KernelType::RBF, 0, 0.5}},
+  {42, 137, 2, true, {KernelType::RBF, 0, 0.5}},
+  // Distance kernel does not support LD parameter yet.
+  //{42, 137, 2, false, {KernelType::RBF, 0, 0.5}, 64, 155, 49},
+  // {42, 137, 2, true, {KernelType::RBF, 0, 0.5}, 64, 155, 143},
+};
+
+template <typename math_t>
+class GramMatrixTest : public ::testing::TestWithParam<GramMatrixInputs> {
+ protected:
+  GramMatrixTest()
+    : params(GetParam()),
+      handle(),
+      x1(0, raft::resource::get_cuda_stream(handle)),
+      x2(0, raft::resource::get_cuda_stream(handle)),
+      gram(0, raft::resource::get_cuda_stream(handle)),
+      gram_host(0)
+  {
+    auto stream = raft::resource::get_cuda_stream(handle);
+
+    if (params.ld1 == 0) { params.ld1 = params.is_row_major ? params.n_cols : params.n1; }
+    if (params.ld2 == 0) { params.ld2 = params.is_row_major ? params.n_cols : params.n2; }
+    if (params.ld_out == 0) { params.ld_out = params.is_row_major ? params.n2 : params.n1; }
+    // Derive the size of the output from the offset of the last element.
+    size_t size = get_offset(params.n1 - 1, params.n_cols - 1, params.ld1, params.is_row_major) + 1;
+    x1.resize(size, stream);
+    size = get_offset(params.n2 - 1, params.n_cols - 1, params.ld2, params.is_row_major) + 1;
+    x2.resize(size, stream);
+    size = get_offset(params.n1 - 1, params.n2 - 1, params.ld_out, params.is_row_major) + 1;
+
+    gram.resize(size, stream);
+    RAFT_CUDA_TRY(cudaMemsetAsync(gram.data(), 0, gram.size() * sizeof(math_t), stream));
+    gram_host.resize(gram.size());
+    std::fill(gram_host.begin(), gram_host.end(), 0);
+
+    raft::random::RngState rng(42137ULL);
+    raft::random::uniform(handle, rng, x1.data(), x1.size(), math_t(0), math_t(1));
+    raft::random::uniform(handle, rng, x2.data(), x2.size(), math_t(0), math_t(1));
+  }
+
+  ~GramMatrixTest() override {}
+
+  void runTest()
+  {
+    std::unique_ptr<GramMatrixBase<math_t>> kernel =
+      std::unique_ptr<GramMatrixBase<math_t>>(KernelFactory<math_t>::create(params.kernel));
+
+    auto x1_span =
+      params.is_row_major
+        ? raft::make_device_strided_matrix_view<const math_t, int, raft::layout_c_contiguous>(
+            x1.data(), params.n1, params.n_cols, params.ld1)
+        : raft::make_device_strided_matrix_view<const math_t, int, raft::layout_f_contiguous>(
+            x1.data(), params.n1, params.n_cols, params.ld1);
+    auto x2_span =
+      params.is_row_major
+        ? raft::make_device_strided_matrix_view<const math_t, int, raft::layout_c_contiguous>(
+            x2.data(), params.n2, params.n_cols, params.ld2)
+        : raft::make_device_strided_matrix_view<const math_t, int, raft::layout_f_contiguous>(
+            x2.data(), params.n2, params.n_cols, params.ld2);
+    auto out_span =
+      params.is_row_major
+        ? raft::make_device_strided_matrix_view<math_t, int, raft::layout_c_contiguous>(
+            gram.data(), params.n1, params.n2, params.ld_out)
+        : raft::make_device_strided_matrix_view<math_t, int, raft::layout_f_contiguous>(
+            gram.data(), params.n1, params.n2, params.ld_out);
+
+    (*kernel)(handle, x1_span, x2_span, out_span);
+
+    auto stream = raft::resource::get_cuda_stream(handle);
+    naiveGramMatrixKernel(params.n1,
+                          params.n2,
+                          params.n_cols,
+                          x1,
+                          x2,
+                          gram_host.data(),
+                          params.ld1,
+                          params.ld2,
+                          params.ld_out,
+                          params.is_row_major,
+                          params.kernel,
+                          stream,
+                          handle);
+
+    ASSERT_TRUE(cuvs::devArrMatchHost(
+      gram_host.data(), gram.data(), gram.size(), cuvs::CompareApprox<math_t>(1e-6f), stream));
+  }
+
+  GramMatrixInputs params;
+  raft::resources handle;
+
+  rmm::device_uvector<math_t> x1;
+  rmm::device_uvector<math_t> x2;
+  rmm::device_uvector<math_t> gram;
+
+  std::vector<math_t> gram_host;
+};
+
+typedef GramMatrixTest<float> GramMatrixTestFloat;
+typedef GramMatrixTest<double> GramMatrixTestDouble;
+
+TEST_P(GramMatrixTestFloat, Gram) { runTest(); }
+
+INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloat, ::testing::ValuesIn(inputs));
+};  // namespace cuvs::distance::kernels
\ No newline at end of file
diff --git a/cpp/test/distance/gram_base.cuh b/cpp/test/distance/gram_base.cuh
new file mode 100644
index 000000000..326cdb4f8
--- /dev/null
+++ b/cpp/test/distance/gram_base.cuh
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <cuvs/distance/distance.hpp>
+#include <cuvs/distance/grammian.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <iostream>
+#include <memory>
+
+namespace cuvs {
+namespace distance {
+namespace kernels {
+
+// Get the offset of element [i,k].
+HDI int get_offset(int i, int k, int ld, bool is_row_major)
+{
+  return is_row_major ? i * ld + k : i + k * ld;
+}
+
+// Calculate the Gram matrix on the host.
+template <typename math_t>
+void naiveGramMatrixKernel(int n1,
+                           int n2,
+                           int n_cols,
+                           const rmm::device_uvector<math_t>& x1,
+                           const rmm::device_uvector<math_t>& x2,
+                           math_t* gram_host,
+                           int ld1,
+                           int ld2,
+                           int ld_out,
+                           bool is_row_major,
+                           KernelParams kernel,
+                           cudaStream_t stream,
+                           const raft::resources& handle)
+{
+  std::vector<math_t> x1_host(x1.size());
+  raft::update_host(x1_host.data(), x1.data(), x1.size(), stream);
+  std::vector<math_t> x2_host(x2.size());
+  raft::update_host(x2_host.data(), x2.data(), x2.size(), stream);
+  raft::resource::sync_stream(handle, stream);
+
+  for (int i = 0; i < n1; i++) {
+    for (int j = 0; j < n2; j++) {
+      float d = 0;
+      for (int k = 0; k < n_cols; k++) {
+        if (kernel.kernel == KernelType::RBF) {
+          math_t diff = x1_host[get_offset(i, k, ld1, is_row_major)] -
+                        x2_host[get_offset(j, k, ld2, is_row_major)];
+          d += diff * diff;
+        } else {
+          d += x1_host[get_offset(i, k, ld1, is_row_major)] *
+               x2_host[get_offset(j, k, ld2, is_row_major)];
+        }
+      }
+      int idx  = get_offset(i, j, ld_out, is_row_major);
+      math_t v = 0;
+      switch (kernel.kernel) {
+        case (KernelType::LINEAR): gram_host[idx] = d; break;
+        case (KernelType::POLYNOMIAL):
+          v              = kernel.gamma * d + kernel.coef0;
+          gram_host[idx] = std::pow(v, kernel.degree);
+          break;
+        case (KernelType::TANH): gram_host[idx] = std::tanh(kernel.gamma * d + kernel.coef0); break;
+        case (KernelType::RBF): gram_host[idx] = exp(-kernel.gamma * d); break;
+      }
+    }
+  }
+}
+
+}  // namespace kernels
+}  // namespace distance
+}  // namespace cuvs
\ No newline at end of file
diff --git a/cpp/test/sparse/cluster/cluster_solvers.cu b/cpp/test/sparse/cluster/cluster_solvers.cu
new file mode 100644
index 000000000..c0b6c1a78
--- /dev/null
+++ b/cpp/test/sparse/cluster/cluster_solvers.cu
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../../../src/sparse/cluster/cluster_solvers.cuh"
+#include "../../../src/sparse/cluster/eigen_solvers.cuh"
+#include "../../../src/sparse/cluster/modularity_maximization.cuh"
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/device_id.hpp>
+#include <raft/core/resources.hpp>
+
+#include <gtest/gtest.h>
+
+#include <iostream>
+#include <memory>
+
+namespace cuvs {
+namespace spectral {
+
+TEST(Raft, ClusterSolvers)
+{
+  using namespace raft::spectral::matrix;
+  using index_type = int;
+  using value_type = double;
+
+  raft::resources h;
+
+  index_type maxiter{100};
+  value_type tol{1.0e-10};
+  unsigned long long seed{100110021003};
+
+  auto stream = raft::resource::get_cuda_stream(h);
+
+  index_type n{100};
+  index_type d{10};
+  index_type k{5};
+
+  // nullptr expected to trigger exceptions:
+  //
+  value_type* eigvecs{nullptr};
+  index_type* codes{nullptr};
+
+  cluster_solver_config_t<index_type, value_type> cfg{k, maxiter, tol, seed};
+
+  kmeans_solver_t<index_type, value_type> cluster_solver{cfg};
+
+  EXPECT_ANY_THROW(cluster_solver.solve(h, n, d, eigvecs, codes));
+}
+
+TEST(Raft, ModularitySolvers)
+{
+  using namespace raft::spectral::matrix;
+  using index_type = int;
+  using value_type = double;
+
+  raft::resources h;
+  ASSERT_EQ(0, raft::resource::get_device_id(h));
+
+  index_type neigvs{10};
+  index_type maxiter{100};
+  index_type restart_iter{10};
+  value_type tol{1.0e-10};
+  bool reorthog{true};
+
+  // nullptr expected to trigger exceptions:
+  //
+  index_type* clusters{nullptr};
+  value_type* eigvals{nullptr};
+  value_type* eigvecs{nullptr};
+
+  unsigned long long seed{100110021003};
+
+  eigen_solver_config_t<index_type, value_type> eig_cfg{
+    neigvs, maxiter, restart_iter, tol, reorthog, seed};
+  lanczos_solver_t<index_type, value_type> eig_solver{eig_cfg};
+
+  index_type k{5};
+
+  cluster_solver_config_t<index_type, value_type> clust_cfg{k, maxiter, tol, seed};
+  kmeans_solver_t<index_type, value_type> cluster_solver{clust_cfg};
+
+  auto stream = raft::resource::get_cuda_stream(h);
+  sparse_matrix_t<index_type, value_type> sm{h, nullptr, nullptr, nullptr, 0, 0};
+
+  EXPECT_ANY_THROW(cuvs::spectral::modularity_maximization(
+    h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs));
+
+  value_type modularity{0};
+  EXPECT_ANY_THROW(spectral::analyzeModularity(h, sm, k, clusters, modularity));
+}
+
+}  // namespace spectral
+}  // namespace cuvs
diff --git a/cpp/test/sparse/cluster/eigen_solvers.cu b/cpp/test/sparse/cluster/eigen_solvers.cu
new file mode 100644
index 000000000..8de0b49e7
--- /dev/null
+++ b/cpp/test/sparse/cluster/eigen_solvers.cu
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../../../src/sparse/cluster/eigen_solvers.cuh"
+#include "../../../src/sparse/cluster/partition.cuh"
+#include <raft/core/nvtx.hpp>
+#include <raft/core/resource/device_id.hpp>
+#include <raft/core/resources.hpp>
+
+#include <gtest/gtest.h>
+
+#include <cstddef>
+#include <iostream>
+#include <memory>
+#include <type_traits>
+
+namespace cuvs {
+namespace spectral {
+
+TEST(Raft, EigenSolvers)
+{
+  raft::common::nvtx::range fun_scope("test::EigenSolvers");
+  using namespace raft::spectral::matrix;
+  using index_type = int;
+  using value_type = double;
+
+  raft::resources h;
+  ASSERT_EQ(0, raft::resource::get_device_id(h));
+
+  index_type* ro{nullptr};
+  index_type* ci{nullptr};
+  value_type* vs{nullptr};
+  index_type nnz   = 0;
+  index_type nrows = 0;
+
+  sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, nnz};
+  ASSERT_EQ(nullptr, sm1.row_offsets_);
+
+  index_type neigvs{10};
+  index_type maxiter{100};
+  index_type restart_iter{10};
+  value_type tol{1.0e-10};
+  bool reorthog{true};
+
+  // nullptr expected to trigger exceptions:
+  //
+  value_type* eigvals{nullptr};
+  value_type* eigvecs{nullptr};
+  std::uint64_t seed{100110021003};
+
+  eigen_solver_config_t<index_type, value_type> cfg{
+    neigvs, maxiter, restart_iter, tol, reorthog, seed};
+
+  lanczos_solver_t<index_type, value_type> eig_solver{cfg};
+
+  EXPECT_ANY_THROW(eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs));
+
+  EXPECT_ANY_THROW(eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs));
+}
+
+TEST(Raft, SpectralSolvers)
+{
+  raft::common::nvtx::range fun_scope("test::SpectralSolvers");
+  using namespace raft::spectral::matrix;
+  using index_type = int;
+  using value_type = double;
+
+  raft::resources h;
+  ASSERT_EQ(0, raft::resource::get_device_id(h)
+
+  );
+
+  index_type neigvs{10};
+  index_type maxiter{100};
+  index_type restart_iter{10};
+  value_type tol{1.0e-10};
+  bool reorthog{true};
+
+  // nullptr expected to trigger exceptions:
+  //
+  index_type* clusters{nullptr};
+  value_type* eigvals{nullptr};
+  value_type* eigvecs{nullptr};
+
+  unsigned long long seed{100110021003};
+
+  eigen_solver_config_t<index_type, value_type> eig_cfg{
+    neigvs, maxiter, restart_iter, tol, reorthog, seed};
+  lanczos_solver_t<index_type, value_type> eig_solver{eig_cfg};
+
+  index_type k{5};
+
+  cluster_solver_config_t<index_type, value_type> clust_cfg{k, maxiter, tol, seed};
+  kmeans_solver_t<index_type, value_type> cluster_solver{clust_cfg};
+
+  sparse_matrix_t<index_type, value_type> sm{h, nullptr, nullptr, nullptr, 0, 0};
+  EXPECT_ANY_THROW(
+    spectral::partition(h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs));
+
+  value_type edgeCut{0};
+  value_type cost{0};
+  EXPECT_ANY_THROW(spectral::analyzePartition(h, sm, k, clusters, edgeCut, cost));
+}
+
+}  // namespace spectral
+}  // namespace cuvs
diff --git a/cpp/test/sparse/cluster/spectral.cu b/cpp/test/sparse/cluster/spectral.cu
new file mode 100644
index 000000000..7d0cdef9d
--- /dev/null
+++ b/cpp/test/sparse/cluster/spectral.cu
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../../test_utils.cuh"
+
+#include "../../../src/sparse/cluster/modularity_maximization.cuh"
+#include "../../../src/sparse/cluster/partition.cuh"
+#include <raft/core/handle.hpp>
+
+#include <gtest/gtest.h>
+
+#include <iostream>
+#include <memory>
+
+namespace cuvs {
+namespace cluster {
+
+/**
+ * Warning: There appears to be a CUDA 12.2 bug in cusparse that causes an
+ * alignment issue. We've fixed the bug in our code through a workaround
+ * (see raft/sparse/linalg/spmm.hpp for fix). This test is meant to fail
+ * in the case where the fix is accidentally reverted, so that it doesn't
+ * break any downstream libraries that depend on RAFT
+ */
+TEST(Raft, Spectral)
+{
+  raft::handle_t handle;
+
+  std::vector<int32_t> h_offsets({0, 2, 4, 7, 10, 12, 14});
+  std::vector<int32_t> h_indices({1, 2, 0, 2, 0, 1, 3, 2, 4, 5, 3, 5, 3, 4});
+  std::vector<float> h_values(
+    {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0});
+  std::vector<int32_t> expected_clustering({1, 1, 1, 0, 0, 0});
+
+  int32_t n_clusters{2};
+  int32_t n_eigenvectors{2};
+  int32_t evs_max_it{100};
+  int32_t kmean_max_it{100};
+  int32_t restartIter_lanczos = 15 + n_eigenvectors;
+  float evs_tol{0.001};
+  float kmean_tol{0.001};
+  unsigned long long seed1{1234567};
+  unsigned long long seed2{12345678};
+  bool reorthog{false};
+
+  rmm::device_uvector<int32_t> offsets(h_offsets.size(), handle.get_stream());
+  rmm::device_uvector<int32_t> indices(h_indices.size(), handle.get_stream());
+  rmm::device_uvector<float> values(h_indices.size(), handle.get_stream());
+  rmm::device_uvector<int32_t> clustering(expected_clustering.size(), handle.get_stream());
+  rmm::device_uvector<float> eigenvalues(n_eigenvectors, handle.get_stream());
+  rmm::device_uvector<float> eigenvectors(n_eigenvectors * expected_clustering.size(),
+                                          handle.get_stream());
+
+  rmm::device_uvector<int32_t> exp_dev(expected_clustering.size(), handle.get_stream());
+
+  raft::update_device(
+    exp_dev.data(), expected_clustering.data(), expected_clustering.size(), handle.get_stream());
+
+  raft::update_device(offsets.data(), h_offsets.data(), h_offsets.size(), handle.get_stream());
+  raft::update_device(indices.data(), h_indices.data(), h_indices.size(), handle.get_stream());
+  raft::update_device(values.data(), h_values.data(), h_values.size(), handle.get_stream());
+
+  raft::spectral::matrix::sparse_matrix_t<int32_t, float> const matrix{
+    handle,
+    offsets.data(),
+    indices.data(),
+    values.data(),
+    static_cast<int32_t>(offsets.size() - 1),
+    static_cast<int32_t>(indices.size())};
+
+  cuvs::spectral::eigen_solver_config_t<int32_t, float> eig_cfg{
+    n_eigenvectors, evs_max_it, restartIter_lanczos, evs_tol, reorthog, seed1};
+  cuvs::spectral::lanczos_solver_t<int32_t, float> eig_solver{eig_cfg};
+
+  cuvs::spectral::cluster_solver_config_t<int32_t, float> clust_cfg{
+    n_clusters, kmean_max_it, kmean_tol, seed2};
+  cuvs::spectral::kmeans_solver_t<int32_t, float> cluster_solver{clust_cfg};
+
+  cuvs::spectral::partition(handle,
+                            matrix,
+                            eig_solver,
+                            cluster_solver,
+                            clustering.data(),
+                            eigenvalues.data(),
+                            eigenvectors.data());
+
+  ASSERT_TRUE(devArrMatch(expected_clustering.data(),
+                          exp_dev.data(),
+                          exp_dev.size(),
+                          1,
+                          cuvs::Compare<int32_t>(),
+                          handle.get_stream()));
+}
+
+}  // namespace cluster
+}  // namespace cuvs
\ No newline at end of file
diff --git a/cpp/test/sparse/cluster/spectral_matrix.cu b/cpp/test/sparse/cluster/spectral_matrix.cu
new file mode 100644
index 000000000..37a4202b8
--- /dev/null
+++ b/cpp/test/sparse/cluster/spectral_matrix.cu
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/device_id.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/spectral/matrix_wrappers.hpp>
+
+#include <gtest/gtest.h>
+
+#include <iostream>
+#include <memory>
+
+namespace cuvs {
+namespace spectral {
+namespace matrix {
+namespace {
+template <typename index_type, typename value_type>
+struct csr_view_t {
+  index_type* offsets;
+  index_type* indices;
+  value_type* edge_data;
+  index_type number_of_vertices;
+  index_type number_of_edges;
+};
+}  // namespace
+TEST(Raft, SpectralMatrices)
+{
+  using index_type = int;
+  using value_type = double;
+
+  raft::resources h;
+  ASSERT_EQ(0, raft::resource::get_device_id(h));
+
+  csr_view_t<index_type, value_type> csr_v{nullptr, nullptr, nullptr, 0, 0};
+
+  int const sz = 10;
+  vector_t<index_type> d_v{h, sz};
+
+  index_type* ro{nullptr};
+  index_type* ci{nullptr};
+  value_type* vs{nullptr};
+  index_type nnz   = 0;
+  index_type nrows = 0;
+  sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, nnz};
+  sparse_matrix_t<index_type, value_type> sm2{h, csr_v};
+  ASSERT_EQ(nullptr, sm1.row_offsets_);
+  ASSERT_EQ(nullptr, sm2.row_offsets_);
+
+  auto stream = resource::get_cuda_stream(h);
+
+  auto cnstr_lm1 = [&h, ro, ci, vs, nrows, nnz](void) {
+    laplacian_matrix_t<index_type, value_type> lm1{h, ro, ci, vs, nrows, nnz};
+  };
+  EXPECT_ANY_THROW(cnstr_lm1());  // because of nullptr ptr args
+
+  auto cnstr_lm2 = [&h, &sm2](void) { laplacian_matrix_t<index_type, value_type> lm2{h, sm2}; };
+  EXPECT_ANY_THROW(cnstr_lm2());  // because of nullptr ptr args
+
+  auto cnstr_mm1 = [&h, ro, ci, vs, nrows, nnz](void) {
+    modularity_matrix_t<index_type, value_type> mm1{h, ro, ci, vs, nrows, nnz};
+  };
+  EXPECT_ANY_THROW(cnstr_mm1());  // because of nullptr ptr args
+
+  auto cnstr_mm2 = [&h, &sm2](void) { modularity_matrix_t<index_type, value_type> mm2{h, sm2}; };
+  EXPECT_ANY_THROW(cnstr_mm2());  // because of nullptr ptr args
+}
+
+}  // namespace matrix
+}  // namespace spectral
+}  // namespace cuvs
diff --git a/cpp/test/sparse/gram.cu b/cpp/test/sparse/gram.cu
new file mode 100644
index 000000000..d7af30a1c
--- /dev/null
+++ b/cpp/test/sparse/gram.cu
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/resource/cuda_stream.hpp>
+
+#include "../distance/gram_base.cuh"
+#include "../test_utils.cuh"
+
+#include <cuvs/distance/distance.hpp>
+#include <cuvs/distance/grammian.hpp>
+
+#include <raft/random/rng.cuh>
+#include <raft/sparse/convert/dense.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+#include <raft/util/itertools.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <gtest/gtest.h>
+
+#include <iostream>
+#include <memory>
+
+namespace cuvs::distance::kernels::sparse {
+
+/**
+ * Structure to describe structure of the input matrices:
+ *  - DENSE: dense, dense
+ *  - MIX: CSR, dense
+ *  - CSR: CSR, CSR
+ */
+enum SparseType { DENSE, MIX, CSR };
+
+struct GramMatrixInputs {
+  int n1;      // feature vectors in matrix 1
+  int n2;      // featuer vectors in matrix 2
+  int n_cols;  // number of elements in a feature vector
+  bool is_row_major;
+  SparseType sparse_input;
+  KernelParams kernel;
+  int ld1;
+  int ld2;
+  int ld_out;
+  // We will generate random input using the dimensions given here.
+  // The reference output is calculated by a custom kernel.
+};
+
+std::ostream& operator<<(std::ostream& os, const GramMatrixInputs& p)
+{
+  std::vector<std::string> kernel_names{"linear", "poly", "rbf", "tanh"};
+  os << "/" << p.n1 << "x" << p.n2 << "x" << p.n_cols << "/"
+     << (p.is_row_major ? "RowMajor/" : "ColMajor/")
+     << (p.sparse_input == SparseType::DENSE
+           ? "DenseDense/"
+           : (p.sparse_input == SparseType::MIX ? "CsrDense/" : "CsrCsr/"))
+     << kernel_names[p.kernel.kernel] << "/ld_" << p.ld1 << "x" << p.ld2 << "x" << p.ld_out;
+  return os;
+}
+
+/*struct KernelParams {
+  // Kernel function parameters
+  KernelType kernel;  //!< Type of the kernel function
+  int degree;         //!< Degree of polynomial kernel (ignored by others)
+  double gamma;       //!< multiplier in the
+  double coef0;       //!< additive constant in poly and tanh kernels
+};*/
+
+// const KernelParams linear_kernel_params{.kernel=KernelType::LINEAR};
+
+// {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, {KernelType::TANH, 0, 0.5, 2.4}, {KernelType::RBF, 0, 0.5}
+const std::vector<GramMatrixInputs> inputs = raft::util::itertools::product<GramMatrixInputs>(
+  {42},
+  {137},
+  {2},
+  {true, false},
+  {SparseType::DENSE, SparseType::MIX, SparseType::CSR},
+  {KernelParams{KernelType::LINEAR},
+   KernelParams{KernelType::POLYNOMIAL, 2, 0.5, 2.4},
+   KernelParams{KernelType::TANH, 0, 0.5, 2.4},
+   KernelParams{KernelType::RBF, 0, 0.5}});
+
+// (ld_1, ld_2, ld_out) not supported by RBF and CSR
+const std::vector<GramMatrixInputs> inputs_ld = raft::util::itertools::product<GramMatrixInputs>(
+  {137},
+  {42},
+  {2},
+  {true, false},
+  {SparseType::DENSE, SparseType::MIX},
+  {KernelParams{KernelType::LINEAR},
+   KernelParams{KernelType::POLYNOMIAL, 2, 0.5, 2.4},
+   KernelParams{KernelType::TANH, 0, 0.5, 2.4}},
+  {159},
+  {73},
+  {144});
+
+// (ld_1, ld_2) are supported by CSR
+const std::vector<GramMatrixInputs> inputs_ld_csr =
+  raft::util::itertools::product<GramMatrixInputs>(
+    {42},
+    {137},
+    {2},
+    {true, false},
+    {SparseType::CSR, SparseType::MIX},
+    {KernelParams{KernelType::LINEAR},
+     KernelParams{KernelType::POLYNOMIAL, 2, 0.5, 2.4},
+     KernelParams{KernelType::TANH, 0, 0.5, 2.4}},
+    {64},
+    {155},
+    {0});
+
+template <typename math_t>
+class GramMatrixTest : public ::testing::TestWithParam<GramMatrixInputs> {
+ protected:
+  GramMatrixTest()
+    : params(GetParam()),
+      stream(raft::resource::get_cuda_stream(handle)),
+      x1(0, stream),
+      x2(0, stream),
+      x1_csr_indptr(0, stream),
+      x1_csr_indices(0, stream),
+      x1_csr_data(0, stream),
+      x2_csr_indptr(0, stream),
+      x2_csr_indices(0, stream),
+      x2_csr_data(0, stream),
+      gram(0, stream),
+      gram_host(0)
+  {
+    if (params.ld1 == 0) { params.ld1 = params.is_row_major ? params.n_cols : params.n1; }
+    if (params.ld2 == 0) { params.ld2 = params.is_row_major ? params.n_cols : params.n2; }
+    if (params.ld_out == 0) { params.ld_out = params.is_row_major ? params.n2 : params.n1; }
+    // Derive the size of the output from the offset of the last element.
+    size_t size = get_offset(params.n1 - 1, params.n_cols - 1, params.ld1, params.is_row_major) + 1;
+    x1.resize(size, stream);
+    size = get_offset(params.n2 - 1, params.n_cols - 1, params.ld2, params.is_row_major) + 1;
+    x2.resize(size, stream);
+    size = get_offset(params.n1 - 1, params.n2 - 1, params.ld_out, params.is_row_major) + 1;
+
+    gram.resize(size, stream);
+    RAFT_CUDA_TRY(cudaMemsetAsync(gram.data(), 0, gram.size() * sizeof(math_t), stream));
+    gram_host.resize(gram.size());
+    std::fill(gram_host.begin(), gram_host.end(), 0);
+
+    raft::random::RngState r(42137ULL);
+    raft::random::uniform(handle, r, x1.data(), x1.size(), math_t(0), math_t(1));
+    raft::random::uniform(handle, r, x2.data(), x2.size(), math_t(0), math_t(1));
+
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  }
+
+  ~GramMatrixTest() override {}
+
+  int prepareCsr(math_t* dense, int n_rows, int ld, int* indptr, int* indices, math_t* data)
+  {
+    int nnz           = 0;
+    double eps        = 1e-6;
+    int n_cols        = params.n_cols;
+    bool is_row_major = params.is_row_major;
+    size_t dense_size = get_offset(n_rows - 1, n_cols - 1, ld, is_row_major) + 1;
+
+    std::vector<math_t> dense_host(dense_size);
+    raft::update_host(dense_host.data(), dense, dense_size, stream);
+    raft::resource::sync_stream(handle, stream);
+
+    std::vector<int> indptr_host(n_rows + 1);
+    std::vector<int> indices_host(n_rows * n_cols);
+    std::vector<math_t> data_host(n_rows * n_cols);
+
+    // create csr matrix from dense (with threshold)
+    for (int i = 0; i < n_rows; ++i) {
+      indptr_host[i] = nnz;
+      for (int j = 0; j < n_cols; ++j) {
+        math_t value = dense_host[get_offset(i, j, ld, is_row_major)];
+        if (value > eps) {
+          indices_host[nnz] = j;
+          data_host[nnz]    = value;
+          nnz++;
+        }
+      }
+    }
+    indptr_host[n_rows] = nnz;
+
+    // fill back dense matrix from CSR
+    std::fill(dense_host.data(), dense_host.data() + dense_size, 0);
+    for (int i = 0; i < n_rows; ++i) {
+      for (int idx = indptr_host[i]; idx < indptr_host[i + 1]; ++idx) {
+        dense_host[get_offset(i, indices_host[idx], ld, is_row_major)] = data_host[idx];
+      }
+    }
+
+    raft::update_device(dense, dense_host.data(), dense_size, stream);
+    raft::update_device(indptr, indptr_host.data(), n_rows + 1, stream);
+    raft::update_device(indices, indices_host.data(), nnz, stream);
+    raft::update_device(data, data_host.data(), nnz, stream);
+    raft::resource::sync_stream(handle, stream);
+    return nnz;
+  }
+
+  void runTest()
+  {
+    std::unique_ptr<GramMatrixBase<math_t>> kernel =
+      std::unique_ptr<GramMatrixBase<math_t>>(KernelFactory<math_t>::create(params.kernel));
+
+    auto x1_span =
+      params.is_row_major
+        ? raft::make_device_strided_matrix_view<const math_t, int, raft::layout_c_contiguous>(
+            x1.data(), params.n1, params.n_cols, params.ld1)
+        : raft::make_device_strided_matrix_view<const math_t, int, raft::layout_f_contiguous>(
+            x1.data(), params.n1, params.n_cols, params.ld1);
+    auto x2_span =
+      params.is_row_major
+        ? raft::make_device_strided_matrix_view<const math_t, int, raft::layout_c_contiguous>(
+            x2.data(), params.n2, params.n_cols, params.ld2)
+        : raft::make_device_strided_matrix_view<const math_t, int, raft::layout_f_contiguous>(
+            x2.data(), params.n2, params.n_cols, params.ld2);
+    auto out_span =
+      params.is_row_major
+        ? raft::make_device_strided_matrix_view<math_t, int, raft::layout_c_contiguous>(
+            gram.data(), params.n1, params.n2, params.ld_out)
+        : raft::make_device_strided_matrix_view<math_t, int, raft::layout_f_contiguous>(
+            gram.data(), params.n1, params.n2, params.ld_out);
+
+    if (params.sparse_input == SparseType::DENSE) {
+      (*kernel)(handle, x1_span, x2_span, out_span);
+    } else {
+      x1_csr_indptr.reserve(params.n1 + 1, stream);
+      x1_csr_indices.reserve(params.n1 * params.n_cols, stream);
+      x1_csr_data.reserve(params.n1 * params.n_cols, stream);
+      int x1_nnz = prepareCsr(x1.data(),
+                              params.n1,
+                              params.ld1,
+                              x1_csr_indptr.data(),
+                              x1_csr_indices.data(),
+                              x1_csr_data.data());
+
+      auto x1_csr_structure = raft::make_device_compressed_structure_view<int, int, int>(
+        x1_csr_indptr.data(), x1_csr_indices.data(), params.n1, params.n_cols, x1_nnz);
+      auto x1_csr = raft::device_csr_matrix_view<const math_t, int, int, int>(
+        raft::device_span<const math_t>(x1_csr_data.data(), x1_csr_structure.get_nnz()),
+        x1_csr_structure);
+
+      if (params.sparse_input == SparseType::MIX) {
+        (*kernel)(handle, x1_csr, x2_span, out_span);
+      } else {
+        x2_csr_indptr.reserve(params.n2 + 1, stream);
+        x2_csr_indices.reserve(params.n2 * params.n_cols, stream);
+        x2_csr_data.reserve(params.n2 * params.n_cols, stream);
+        int x2_nnz = prepareCsr(x2.data(),
+                                params.n2,
+                                params.ld2,
+                                x2_csr_indptr.data(),
+                                x2_csr_indices.data(),
+                                x2_csr_data.data());
+
+        auto x2_csr_structure = raft::make_device_compressed_structure_view<int, int, int>(
+          x2_csr_indptr.data(), x2_csr_indices.data(), params.n2, params.n_cols, x2_nnz);
+        auto x2_csr = raft::device_csr_matrix_view<const math_t, int, int, int>(
+          raft::device_span<const math_t>(x2_csr_data.data(), x2_csr_structure.get_nnz()),
+          x2_csr_structure);
+
+        (*kernel)(handle, x1_csr, x2_csr, out_span);
+      }
+    }
+    // Something in gram is executing not on the 'stream' and therefore
+    // a full device sync is required
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+    naiveGramMatrixKernel(params.n1,
+                          params.n2,
+                          params.n_cols,
+                          x1,
+                          x2,
+                          gram_host.data(),
+                          params.ld1,
+                          params.ld2,
+                          params.ld_out,
+                          params.is_row_major,
+                          params.kernel,
+                          stream,
+                          handle);
+    raft::resource::sync_stream(handle, stream);
+
+    ASSERT_TRUE(cuvs::devArrMatchHost(
+      gram_host.data(), gram.data(), gram.size(), cuvs::CompareApprox<math_t>(1e-6f), stream));
+  }
+
+  raft::resources handle;
+  cudaStream_t stream = 0;
+  GramMatrixInputs params;
+
+  rmm::device_uvector<math_t> x1;
+  rmm::device_uvector<math_t> x2;
+
+  rmm::device_uvector<int> x1_csr_indptr;
+  rmm::device_uvector<int> x1_csr_indices;
+  rmm::device_uvector<math_t> x1_csr_data;
+  rmm::device_uvector<int> x2_csr_indptr;
+  rmm::device_uvector<int> x2_csr_indices;
+  rmm::device_uvector<math_t> x2_csr_data;
+
+  rmm::device_uvector<math_t> gram;
+  std::vector<math_t> gram_host;
+};
+
+typedef GramMatrixTest<float> GramMatrixTestFloatStandard;
+typedef GramMatrixTest<float> GramMatrixTestFloatLd;
+typedef GramMatrixTest<float> GramMatrixTestFloatLdCsr;
+
+TEST_P(GramMatrixTestFloatStandard, Gram) { runTest(); }
+TEST_P(GramMatrixTestFloatLd, Gram) { runTest(); }
+TEST_P(GramMatrixTestFloatLdCsr, Gram) { runTest(); }
+
+INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloatStandard, ::testing::ValuesIn(inputs));
+INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloatLd, ::testing::ValuesIn(inputs_ld));
+INSTANTIATE_TEST_SUITE_P(GramMatrixTests,
+                         GramMatrixTestFloatLdCsr,
+                         ::testing::ValuesIn(inputs_ld_csr));
+};  // namespace cuvs::distance::kernels::sparse
\ No newline at end of file

From fa8838a3a00b17c6b8284e094d22382d80f87247 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <danteg@nvidia.com>
Date: Fri, 6 Dec 2024 09:42:21 -0600
Subject: [PATCH 46/47] Modify cuvs-bench to be able to generate ground truth
 in CPU systems (#466)

PR allows calculating ground truth for cuvs-bench on CPU systems. Current version uses a simple NumPy brute force, perhaps we should consider using faiss? cc @cjnolet @divyegala

Authors:
  - Dante Gama Dessavre (https://github.com/dantegd)
  - Corey J. Nolet (https://github.com/cjnolet)
  - Divye Gala (https://github.com/divyegala)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

URL: https://github.com/rapidsai/cuvs/pull/466
---
 .../bench_ann_cuda-118_arch-aarch64.yaml      |   3 +
 .../bench_ann_cuda-118_arch-x86_64.yaml       |   3 +
 .../bench_ann_cuda-125_arch-aarch64.yaml      |   3 +
 .../bench_ann_cuda-125_arch-x86_64.yaml       |   3 +
 conda/recipes/cuvs-bench-cpu/meta.yaml        |   1 +
 conda/recipes/cuvs-bench/meta.yaml            |   3 +-
 dependencies.yaml                             |   3 +
 .../generate_groundtruth/__main__.py          | 204 +++++++++++++++---
 .../cuvs_bench/cuvs_bench/run/data_export.py  |  73 +++----
 python/cuvs_bench/pyproject.toml              |   1 +
 10 files changed, 220 insertions(+), 77 deletions(-)

diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
index 1e602ccf1..59d471bda 100644
--- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -18,6 +18,8 @@ dependencies:
 - cuda-python>=11.7.1,<12.0a0,<=11.8.3
 - cuda-version=11.8
 - cudatoolkit
+- cupy>=12.0.0
+- cuvs==24.12.*,>=0.0.0a0
 - cxx-compiler
 - cython>=3.0.0
 - dlpack>=0.8,<1.0
@@ -32,6 +34,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
+- libcuvs==24.12.*,>=0.0.0a0
 - librmm==24.12.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.19
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index b060e78c2..31a416eb5 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -18,6 +18,8 @@ dependencies:
 - cuda-python>=11.7.1,<12.0a0,<=11.8.3
 - cuda-version=11.8
 - cudatoolkit
+- cupy>=12.0.0
+- cuvs==24.12.*,>=0.0.0a0
 - cxx-compiler
 - cython>=3.0.0
 - dlpack>=0.8,<1.0
@@ -32,6 +34,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
+- libcuvs==24.12.*,>=0.0.0a0
 - librmm==24.12.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.19
diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
index 485122273..3efe9ebde 100644
--- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
@@ -19,6 +19,8 @@ dependencies:
 - cuda-profiler-api
 - cuda-python>=12.0,<13.0a0,<=12.6.0
 - cuda-version=12.5
+- cupy>=12.0.0
+- cuvs==24.12.*,>=0.0.0a0
 - cxx-compiler
 - cython>=3.0.0
 - dlpack>=0.8,<1.0
@@ -29,6 +31,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
+- libcuvs==24.12.*,>=0.0.0a0
 - librmm==24.12.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.19
diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
index d5f48dadb..7fbd77368 100644
--- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
@@ -19,6 +19,8 @@ dependencies:
 - cuda-profiler-api
 - cuda-python>=12.0,<13.0a0,<=12.6.0
 - cuda-version=12.5
+- cupy>=12.0.0
+- cuvs==24.12.*,>=0.0.0a0
 - cxx-compiler
 - cython>=3.0.0
 - dlpack>=0.8,<1.0
@@ -29,6 +31,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
+- libcuvs==24.12.*,>=0.0.0a0
 - librmm==24.12.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.19
diff --git a/conda/recipes/cuvs-bench-cpu/meta.yaml b/conda/recipes/cuvs-bench-cpu/meta.yaml
index 02c11346f..016df56be 100644
--- a/conda/recipes/cuvs-bench-cpu/meta.yaml
+++ b/conda/recipes/cuvs-bench-cpu/meta.yaml
@@ -59,6 +59,7 @@ requirements:
     - glog {{ glog_version }}
     - h5py {{ h5py_version }}
     - matplotlib
+    - numpy >=1.23,<3.0a0
     - pandas
     - pyyaml
     - python
diff --git a/conda/recipes/cuvs-bench/meta.yaml b/conda/recipes/cuvs-bench/meta.yaml
index 3e81edc58..0681a1038 100644
--- a/conda/recipes/cuvs-bench/meta.yaml
+++ b/conda/recipes/cuvs-bench/meta.yaml
@@ -88,10 +88,11 @@ requirements:
     - cudatoolkit
     {% else %}
     - cuda-cudart
+    - cupy>=12.0.0
     - libcublas
     {% endif %}
     - glog {{ glog_version }}
-    - libcuvs {{ version }}
+    - cuvs {{ version }}
     - h5py {{ h5py_version }}
     - matplotlib
     - pandas
diff --git a/dependencies.yaml b/dependencies.yaml
index 80a7d2024..98cac5300 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -39,6 +39,7 @@ files:
       - bench
       - bench_python
       - rapids_build_setuptools
+      - cupy
   test_cpp:
     output: none
     includes:
@@ -475,11 +476,13 @@ dependencies:
           - h5py>=3.8.0
           - benchmark>=1.8.2
           - openblas
+          - libcuvs==24.12.*,>=0.0.0a0
   bench_python:
     common:
       - output_types: [conda, pyproject, requirements]
         packages:
           - click
+          - cuvs==24.12.*,>=0.0.0a0
           - matplotlib
           - pandas
           - pyyaml
diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py
index dbee6cd36..88ec55dfa 100644
--- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py
+++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py
@@ -15,70 +15,206 @@
 # limitations under the License.
 #
 import argparse
+import importlib
 import os
 import sys
+import warnings
 
-import cupy as cp
-import numpy as np
-import rmm
-from pylibraft.common import DeviceResources
-from rmm.allocators.cupy import rmm_cupy_allocator
+from .utils import memmap_bin_file, suffix_from_dtype, write_bin
 
-from cuvs.neighbors.brute_force import build, search
 
-from .utils import memmap_bin_file, suffix_from_dtype, write_bin
+def import_with_fallback(primary_lib, secondary_lib=None, alias=None):
+    """
+    Attempt to import a primary library, with an optional fallback to a
+    secondary library.
+    Optionally assigns the imported module to a global alias.
+
+    Parameters
+    ----------
+    primary_lib : str
+        Name of the primary library to import.
+    secondary_lib : str, optional
+        Name of the secondary library to use as a fallback. If `None`,
+        no fallback is attempted.
+    alias : str, optional
+        Alias to assign the imported module globally.
+
+    Returns
+    -------
+    module or None
+        The imported module if successful; otherwise, `None`.
+
+    Examples
+    --------
+    >>> xp = import_with_fallback('cupy', 'numpy')
+    >>> mod = import_with_fallback('nonexistent_lib')
+    >>> if mod is None:
+    ...     print("Library not found.")
+    """
+    try:
+        module = importlib.import_module(primary_lib)
+    except ImportError:
+        if secondary_lib is not None:
+            try:
+                module = importlib.import_module(secondary_lib)
+            except ImportError:
+                module = None
+        else:
+            module = None
+    if alias and module is not None:
+        globals()[alias] = module
+    return module
+
+
+xp = import_with_fallback("cupy", "numpy")
+rmm = import_with_fallback("rmm")
+gpu_system = False
+
 
+def force_fallback_to_numpy():
+    global xp, gpu_system
+    xp = import_with_fallback("numpy")
+    gpu_system = False
+    warnings.warn(
+        "Consider using a GPU-based system to greatly accelerate "
+        " generating groundtruths using cuVS."
+    )
+
+
+if rmm is not None:
+    gpu_system = True
+    try:
+        from pylibraft.common import DeviceResources
+        from rmm.allocators.cupy import rmm_cupy_allocator
 
-def generate_random_queries(n_queries, n_features, dtype=np.float32):
+        from cuvs.neighbors.brute_force import build, search
+    except ImportError:
+        # RMM is available, cupy is available, but cuVS is not
+        force_fallback_to_numpy()
+else:
+    # No RMM, no cuVS, but cupy is available
+    force_fallback_to_numpy()
+
+
+def generate_random_queries(n_queries, n_features, dtype=xp.float32):
     print("Generating random queries")
-    if np.issubdtype(dtype, np.integer):
-        queries = cp.random.randint(
+    if xp.issubdtype(dtype, xp.integer):
+        queries = xp.random.randint(
             0, 255, size=(n_queries, n_features), dtype=dtype
         )
     else:
-        queries = cp.random.uniform(size=(n_queries, n_features)).astype(dtype)
+        queries = xp.random.uniform(size=(n_queries, n_features)).astype(dtype)
     return queries
 
 
 def choose_random_queries(dataset, n_queries):
     print("Choosing random vector from dataset as query vectors")
-    query_idx = np.random.choice(
+    query_idx = xp.random.choice(
         dataset.shape[0], size=(n_queries,), replace=False
     )
     return dataset[query_idx, :]
 
 
+def cpu_search(dataset, queries, k, metric="squeclidean"):
+    """
+    Find the k nearest neighbors for each query point in the dataset using the
+    specified metric.
+
+    Parameters
+    ----------
+    dataset : numpy.ndarray
+        An array of shape (n_samples, n_features) representing the dataset.
+    queries : numpy.ndarray
+        An array of shape (n_queries, n_features) representing the query
+        points.
+    k : int
+        The number of nearest neighbors to find.
+    metric : str, optional
+        The distance metric to use. Can be 'squeclidean' or 'inner_product'.
+        Default is 'squeclidean'.
+
+    Returns
+    -------
+    distances : numpy.ndarray
+        An array of shape (n_queries, k) containing the distances
+        (for 'squeclidean') or similarities
+        (for 'inner_product') to the k nearest neighbors for each query.
+    indices : numpy.ndarray
+        An array of shape (n_queries, k) containing the indices of the
+        k nearest neighbors in the dataset for each query.
+
+    """
+    if metric == "squeclidean":
+        diff = queries[:, xp.newaxis, :] - dataset[xp.newaxis, :, :]
+        dist_sq = xp.sum(diff**2, axis=2)  # Shape: (n_queries, n_samples)
+
+        indices = xp.argpartition(dist_sq, kth=k - 1, axis=1)[:, :k]
+        distances = xp.take_along_axis(dist_sq, indices, axis=1)
+
+        sorted_idx = xp.argsort(distances, axis=1)
+        distances = xp.take_along_axis(distances, sorted_idx, axis=1)
+        indices = xp.take_along_axis(indices, sorted_idx, axis=1)
+
+    elif metric == "inner_product":
+        similarities = xp.dot(
+            queries, dataset.T
+        )  # Shape: (n_queries, n_samples)
+
+        neg_similarities = -similarities
+        indices = xp.argpartition(neg_similarities, kth=k - 1, axis=1)[:, :k]
+        distances = xp.take_along_axis(similarities, indices, axis=1)
+
+        sorted_idx = xp.argsort(-distances, axis=1)
+
+    else:
+        raise ValueError(
+            "Unsupported metric in cuvs-bench-cpu. "
+            "Use 'squeclidean' or 'inner_product' or use the GPU package"
+            "to use any distance supported by cuVS."
+        )
+
+    distances = xp.take_along_axis(distances, sorted_idx, axis=1)
+    indices = xp.take_along_axis(indices, sorted_idx, axis=1)
+
+    return distances, indices
+
+
 def calc_truth(dataset, queries, k, metric="sqeuclidean"):
-    resources = DeviceResources()
     n_samples = dataset.shape[0]
     n = 500000  # batch size for processing neighbors
     i = 0
     indices = None
     distances = None
-    queries = cp.asarray(queries, dtype=cp.float32)
+    queries = xp.asarray(queries, dtype=xp.float32)
+
+    if gpu_system:
+        resources = DeviceResources()
 
     while i < n_samples:
         print("Step {0}/{1}:".format(i // n, n_samples // n))
         n_batch = n if i + n <= n_samples else n_samples - i
 
-        X = cp.asarray(dataset[i : i + n_batch, :], cp.float32)
+        X = xp.asarray(dataset[i : i + n_batch, :], xp.float32)
 
-        index = build(X, metric=metric, resources=resources)
-        D, Ind = search(index, queries, k, resources=resources)
-        resources.sync()
+        if gpu_system:
+            index = build(X, metric=metric, resources=resources)
+            D, Ind = search(index, queries, k, resources=resources)
+            resources.sync()
+        else:
+            D, Ind = cpu_search(X, queries, metric=metric)
 
-        D, Ind = cp.asarray(D), cp.asarray(Ind)
+        D, Ind = xp.asarray(D), xp.asarray(Ind)
         Ind += i  # shift neighbor index by offset i
 
         if distances is None:
             distances = D
             indices = Ind
         else:
-            distances = cp.concatenate([distances, D], axis=1)
-            indices = cp.concatenate([indices, Ind], axis=1)
-            idx = cp.argsort(distances, axis=1)[:, :k]
-            distances = cp.take_along_axis(distances, idx, axis=1)
-            indices = cp.take_along_axis(indices, idx, axis=1)
+            distances = xp.concatenate([distances, D], axis=1)
+            indices = xp.concatenate([indices, Ind], axis=1)
+            idx = xp.argsort(distances, axis=1)[:, :k]
+            distances = xp.take_along_axis(distances, idx, axis=1)
+            indices = xp.take_along_axis(indices, idx, axis=1)
 
         i += n_batch
 
@@ -86,11 +222,15 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"):
 
 
 def main():
-    pool = rmm.mr.PoolMemoryResource(
-        rmm.mr.CudaMemoryResource(), initial_pool_size=2**30
-    )
-    rmm.mr.set_current_device_resource(pool)
-    cp.cuda.set_allocator(rmm_cupy_allocator)
+    if gpu_system and xp.__name__ == "cupy":
+        pool = rmm.mr.PoolMemoryResource(
+            rmm.mr.CudaMemoryResource(), initial_pool_size=2**30
+        )
+        rmm.mr.set_current_device_resource(pool)
+        xp.cuda.set_allocator(rmm_cupy_allocator)
+    else:
+        # RMM is available, but cupy is not
+        force_fallback_to_numpy()
 
     parser = argparse.ArgumentParser(
         prog="generate_groundtruth",
@@ -197,7 +337,7 @@ def main():
         "Dataset size {:6.1f} GB, shape {}, dtype {}".format(
             dataset.size * dataset.dtype.itemsize / 1e9,
             dataset.shape,
-            np.dtype(dtype),
+            xp.dtype(dtype),
         )
     )
 
@@ -230,11 +370,11 @@ def main():
 
     write_bin(
         os.path.join(args.output, "groundtruth.neighbors.ibin"),
-        indices.astype(np.uint32),
+        indices.astype(xp.uint32),
     )
     write_bin(
         os.path.join(args.output, "groundtruth.distances.fbin"),
-        distances.astype(np.float32),
+        distances.astype(xp.float32),
     )
 
 
diff --git a/python/cuvs_bench/cuvs_bench/run/data_export.py b/python/cuvs_bench/cuvs_bench/run/data_export.py
index 997dab500..1d0ac40a0 100644
--- a/python/cuvs_bench/cuvs_bench/run/data_export.py
+++ b/python/cuvs_bench/cuvs_bench/run/data_export.py
@@ -17,7 +17,6 @@
 import json
 import os
 import traceback
-import warnings
 
 import pandas as pd
 
@@ -170,44 +169,6 @@ def convert_json_to_csv_build(dataset, dataset_path):
             traceback.print_exc()
 
 
-def append_build_data(write, build_file):
-    """
-    Append build data to the search DataFrame.
-
-    Parameters
-    ----------
-    write : pandas.DataFrame
-        The DataFrame containing the search data to which build
-        data will be appended.
-    build_file : str
-        The file path to the build CSV file.
-    """
-    if os.path.exists(build_file):
-        build_df = pd.read_csv(build_file)
-        write_ncols = len(write.columns)
-        # Initialize columns for build data
-        build_columns = [
-            "build time",
-            "build threads",
-            "build cpu_time",
-            "build GPU",
-        ]
-        write = write.assign(**{col: None for col in build_columns})
-        # Append additional columns if available
-        for col_name in build_df.columns[6:]:
-            write[col_name] = None
-        # Match build rows with search rows by index_name
-        for s_index, search_row in write.iterrows():
-            for b_index, build_row in build_df.iterrows():
-                if search_row["index_name"] == build_row["index_name"]:
-                    write.iloc[s_index, write_ncols:] = build_row[2:].values
-                    break
-    else:
-        warnings.warn(
-            f"Build CSV not found for {build_file}, build params not appended."
-        )
-
-
 def convert_json_to_csv_search(dataset, dataset_path):
     """
     Convert search JSON files to CSV format.
@@ -232,7 +193,7 @@ def convert_json_to_csv_search(dataset, dataset_path):
             )
             algo_name = clean_algo_name(algo_name)
             df["name"] = df["name"].str.split("/").str[0]
-            write_data = pd.DataFrame(
+            write = pd.DataFrame(
                 {
                     "algo_name": [algo_name] * len(df),
                     "index_name": df["name"],
@@ -242,11 +203,35 @@ def convert_json_to_csv_search(dataset, dataset_path):
                 }
             )
             # Append build data
-            append_build_data(write_data, build_file)
+            for name in df:
+                if name not in skip_search_cols:
+                    write[name] = df[name]
+            if os.path.exists(build_file):
+                build_df = pd.read_csv(build_file)
+                write_ncols = len(write.columns)
+                write["build time"] = None
+                write["build threads"] = None
+                write["build cpu_time"] = None
+                write["build GPU"] = None
+
+                for col_idx in range(6, len(build_df.columns)):
+                    col_name = build_df.columns[col_idx]
+                    write[col_name] = None
+
+                for s_index, search_row in write.iterrows():
+                    for b_index, build_row in build_df.iterrows():
+                        if search_row["index_name"] == build_row["index_name"]:
+                            write.iloc[s_index, write_ncols] = build_df.iloc[
+                                b_index, 2
+                            ]
+                            write.iloc[
+                                s_index, write_ncols + 1 :
+                            ] = build_df.iloc[b_index, 3:]
+                            break
             # Write search data and compute frontiers
-            write_data.to_csv(file.replace(".json", ",raw.csv"), index=False)
-            write_frontier(file, write_data, "throughput")
-            write_frontier(file, write_data, "latency")
+            write.to_csv(file.replace(".json", ",raw.csv"), index=False)
+            write_frontier(file, write, "throughput")
+            write_frontier(file, write, "latency")
         except Exception as e:
             print(f"Error processing search file {file}: {e}. Skipping...")
             traceback.print_exc()
diff --git a/python/cuvs_bench/pyproject.toml b/python/cuvs_bench/pyproject.toml
index 41ebad116..5b17f7228 100644
--- a/python/cuvs_bench/pyproject.toml
+++ b/python/cuvs_bench/pyproject.toml
@@ -19,6 +19,7 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
     "click",
+    "cuvs==24.12.*,>=0.0.0a0",
     "matplotlib",
     "pandas",
     "pyyaml",

From cf2885c9d0b8a5d839378939a29154a4d165fefe Mon Sep 17 00:00:00 2001
From: Ray Douglass <ray@raydouglass.com>
Date: Wed, 11 Dec 2024 13:11:32 -0500
Subject: [PATCH 47/47] Update Changelog [skip ci]

---
 CHANGELOG.md | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7ce4a14c3..ed9429d55 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,67 @@
+# cuvs 24.12.00 (11 Dec 2024)
+
+## 🚨 Breaking Changes
+
+- HNSW CPU Hierarchy ([#465](https://github.com/rapidsai/cuvs/pull/465)) [@divyegala](https://github.com/divyegala)
+- Use dashes in cuvs-bench package name. ([#417](https://github.com/rapidsai/cuvs/pull/417)) [@bdice](https://github.com/bdice)
+
+## 🐛 Bug Fixes
+
+- Skip IVF-PQ packing test for lists with not enough data ([#512](https://github.com/rapidsai/cuvs/pull/512)) [@achirkin](https://github.com/achirkin)
+- [BUG] Fix CAGRA filter ([#489](https://github.com/rapidsai/cuvs/pull/489)) [@enp1s0](https://github.com/enp1s0)
+- Add `kIsSingleSource` to `PairwiseDistanceEpilogueElementwise` ([#485](https://github.com/rapidsai/cuvs/pull/485)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Fix include errors, header, and unsafe locks in iface.hpp ([#467](https://github.com/rapidsai/cuvs/pull/467)) [@achirkin](https://github.com/achirkin)
+- Fix an OOB error in device-side cuvs::neighbors::refine and CAGRA kern_prune ([#460](https://github.com/rapidsai/cuvs/pull/460)) [@achirkin](https://github.com/achirkin)
+- Put a ceiling on cuda-python ([#445](https://github.com/rapidsai/cuvs/pull/445)) [@bdice](https://github.com/bdice)
+- Enable NVTX in cuvs-cagra-search component ([#439](https://github.com/rapidsai/cuvs/pull/439)) [@achirkin](https://github.com/achirkin)
+- BUG: CAGRA multi-cta illegal access with bad queries ([#438](https://github.com/rapidsai/cuvs/pull/438)) [@achirkin](https://github.com/achirkin)
+- Fix index overflow in edge cases of CAGRA graph optimize ([#435](https://github.com/rapidsai/cuvs/pull/435)) [@achirkin](https://github.com/achirkin)
+- Fix correct call to brute force in generate groundtruth of cuvs-bench ([#427](https://github.com/rapidsai/cuvs/pull/427)) [@dantegd](https://github.com/dantegd)
+- Use Python for sccache hit rate computation. ([#420](https://github.com/rapidsai/cuvs/pull/420)) [@bdice](https://github.com/bdice)
+- Add `click` package to `cuvs-bench` conda recipe ([#408](https://github.com/rapidsai/cuvs/pull/408)) [@divyegala](https://github.com/divyegala)
+- Fix NVTX annotations ([#400](https://github.com/rapidsai/cuvs/pull/400)) [@achirkin](https://github.com/achirkin)
+
+## 📖 Documentation
+
+- [Doc] Fix CAGRA search sample code ([#484](https://github.com/rapidsai/cuvs/pull/484)) [@enp1s0](https://github.com/enp1s0)
+- Fix broken link in README.md references ([#473](https://github.com/rapidsai/cuvs/pull/473)) [@Azurethi](https://github.com/Azurethi)
+- Adding tech stack to docs ([#448](https://github.com/rapidsai/cuvs/pull/448)) [@cjnolet](https://github.com/cjnolet)
+- Fix Question Retrieval notebook ([#352](https://github.com/rapidsai/cuvs/pull/352)) [@lowener](https://github.com/lowener)
+
+## 🚀 New Features
+
+- Add C++ API scalar quantization ([#494](https://github.com/rapidsai/cuvs/pull/494)) [@mfoerste4](https://github.com/mfoerste4)
+- HNSW CPU Hierarchy ([#465](https://github.com/rapidsai/cuvs/pull/465)) [@divyegala](https://github.com/divyegala)
+- Add serialization API to brute-force ([#461](https://github.com/rapidsai/cuvs/pull/461)) [@lowener](https://github.com/lowener)
+- Add Question Retrieval notebook using Milvus ([#451](https://github.com/rapidsai/cuvs/pull/451)) [@lowener](https://github.com/lowener)
+- Migrate feature diff for NN Descent from RAFT to cuVS ([#421](https://github.com/rapidsai/cuvs/pull/421)) [@divyegala](https://github.com/divyegala)
+- Add --no-lap-sync cmd option to ann-bench ([#405](https://github.com/rapidsai/cuvs/pull/405)) [@achirkin](https://github.com/achirkin)
+- Add `InnerProduct` and `CosineExpanded` metric support in NN Descent ([#177](https://github.com/rapidsai/cuvs/pull/177)) [@divyegala](https://github.com/divyegala)
+
+## 🛠️ Improvements
+
+- Update cuvs to match raft&#39;s cutlass changes ([#516](https://github.com/rapidsai/cuvs/pull/516)) [@vyasr](https://github.com/vyasr)
+- add a README for wheels ([#504](https://github.com/rapidsai/cuvs/pull/504)) [@jameslamb](https://github.com/jameslamb)
+- Move check_input_array from pylibraft ([#474](https://github.com/rapidsai/cuvs/pull/474)) [@benfred](https://github.com/benfred)
+- use different wheel-size thresholds based on CUDA version ([#469](https://github.com/rapidsai/cuvs/pull/469)) [@jameslamb](https://github.com/jameslamb)
+- Modify cuvs-bench to be able to generate ground truth in CPU systems ([#466](https://github.com/rapidsai/cuvs/pull/466)) [@dantegd](https://github.com/dantegd)
+- enforce wheel size limits, README formatting in CI ([#464](https://github.com/rapidsai/cuvs/pull/464)) [@jameslamb](https://github.com/jameslamb)
+- Moving spectral embedding and kernel gramm APIs to cuVS ([#463](https://github.com/rapidsai/cuvs/pull/463)) [@cjnolet](https://github.com/cjnolet)
+- Migrate sparse knn and distances code from raft ([#457](https://github.com/rapidsai/cuvs/pull/457)) [@benfred](https://github.com/benfred)
+- Don&#39;t presume pointers location infers usability. ([#441](https://github.com/rapidsai/cuvs/pull/441)) [@robertmaynard](https://github.com/robertmaynard)
+- call `enable_testing` in root CMakeLists.txt ([#437](https://github.com/rapidsai/cuvs/pull/437)) [@robertmaynard](https://github.com/robertmaynard)
+- CAGRA tech debt: distance descriptor and workspace memory ([#436](https://github.com/rapidsai/cuvs/pull/436)) [@achirkin](https://github.com/achirkin)
+- Add ci run_ scripts needed for build infra ([#434](https://github.com/rapidsai/cuvs/pull/434)) [@robertmaynard](https://github.com/robertmaynard)
+- Use environment variables in cache hit rate computation. ([#422](https://github.com/rapidsai/cuvs/pull/422)) [@bdice](https://github.com/bdice)
+- Use dashes in cuvs-bench package name. ([#417](https://github.com/rapidsai/cuvs/pull/417)) [@bdice](https://github.com/bdice)
+- We need to enable the c_api by default ([#416](https://github.com/rapidsai/cuvs/pull/416)) [@robertmaynard](https://github.com/robertmaynard)
+- print sccache stats in builds ([#413](https://github.com/rapidsai/cuvs/pull/413)) [@jameslamb](https://github.com/jameslamb)
+- make conda installs in CI stricter ([#406](https://github.com/rapidsai/cuvs/pull/406)) [@jameslamb](https://github.com/jameslamb)
+- Ivf c example ([#404](https://github.com/rapidsai/cuvs/pull/404)) [@abner-ma](https://github.com/abner-ma)
+- Prune workflows based on changed files ([#392](https://github.com/rapidsai/cuvs/pull/392)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- [WIP] Add pinned memory resource to C API ([#311](https://github.com/rapidsai/cuvs/pull/311)) [@ajit283](https://github.com/ajit283)
+- Dynamic Batching ([#261](https://github.com/rapidsai/cuvs/pull/261)) [@achirkin](https://github.com/achirkin)
+
 # cuvs 24.10.00 (9 Oct 2024)
 
 ## 🐛 Bug Fixes