diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index e18e82df0..78648235f 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -88,7 +88,7 @@ jobs: with: build_type: pull-request enable_check_symbols: true - symbol_exclusions: (void (thrust::|cub::)|raft_cutlass) + symbol_exclusions: (void (thrust::|cub::)) conda-python-build: needs: conda-cpp-build secrets: inherit diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 5f60c0a34..27dc99a11 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -23,7 +23,7 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} enable_check_symbols: true - symbol_exclusions: (void (thrust::|cub::)|raft_cutlass) + symbol_exclusions: (void (thrust::|cub::)) conda-cpp-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 diff --git a/.gitignore b/.gitignore index 97eab287d..da6eb07f6 100644 --- a/.gitignore +++ b/.gitignore @@ -75,6 +75,7 @@ compile_commands.json .clangd/ # serialized ann indexes +brute_force_index cagra_index ivf_flat_index ivf_pq_index diff --git a/README.md b/README.md index c1b74a9e8..23759f598 100755 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ Finally, faster vector search enables interactions between dense vectors and gra Below are some common use-cases for vector search + - ### Semantic search - Generative AI & Retrieval augmented generation (RAG) - Recommender systems @@ -68,6 +69,14 @@ There are several benefits to using cuVS and GPUs for vector search, including In addition to the items above, cuVS takes on the burden of keeping non-trivial accelerated code up to date as new NVIDIA architectures and CUDA versions are released. This provides a deslightful development experimence, guaranteeing that any libraries, databases, or applications built on top of it will always be getting the best performance and scale. +## cuVS Technology Stack + +cuVS is built on top of the RAPIDS RAFT library of high performance machine learning primitives and provides all the necessary routines for vector search and clustering on the GPU. + +![cuVS is built on top of low-level CUDA libraries and provides many important routines that enable vector search and clustering on the GPU](img/tech_stack.png "cuVS Technology Stack") + + + ## Installing cuVS cuVS comes with pre-built packages that can be installed through [conda](https://conda.io/projects/conda/en/latest/user-guide/getting-started.html#managing-python) and [pip](https://pip.pypa.io/en/stable/). Different packages are available for the different languages supported by cuVS: @@ -233,7 +242,7 @@ If you are interested in contributing to the cuVS library, please read our [Cont For the interested reader, many of the accelerated implementations in cuVS are also based on research papers which can provide a lot more background. We also ask you to please cite the corresponding algorithms by referencing them in your own research. 
- [CAGRA: Highly Parallel Graph Construction and Approximate Nearest Neighbor Search](https://arxiv.org/abs/2308.15136) -- [Top-K Algorithms on GPU: A Comprehensive Study and New Methods](https://dl.acm.org/doi/10.1145/3581784.3607062>) +- [Top-K Algorithms on GPU: A Comprehensive Study and New Methods](https://dl.acm.org/doi/10.1145/3581784.3607062) - [Fast K-NN Graph Construction by GPU Based NN-Descent](https://dl.acm.org/doi/abs/10.1145/3459637.3482344?casa_token=O_nan1B1F5cAAAAA:QHWDEhh0wmd6UUTLY9_Gv6c3XI-5DXM9mXVaUXOYeStlpxTPmV3nKvABRfoivZAaQ3n8FWyrkWw>) - [cuSLINK: Single-linkage Agglomerative Clustering on the GPU](https://arxiv.org/abs/2306.16354) - [GPU Semiring Primitives for Sparse Neighborhood Methods](https://arxiv.org/abs/2104.06357) diff --git a/ci/build_wheel_cuvs.sh b/ci/build_wheel_cuvs.sh index e03da9f19..444657cc0 100755 --- a/ci/build_wheel_cuvs.sh +++ b/ci/build_wheel_cuvs.sh @@ -3,6 +3,8 @@ set -euo pipefail +package_dir="python/cuvs" + case "${RAPIDS_CUDA_VERSION}" in 12.*) EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=ON" @@ -15,4 +17,5 @@ esac # Set up skbuild options. Enable sccache in skbuild config options export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUVS_CPP=OFF${EXTRA_CMAKE_ARGS}" -ci/build_wheel.sh cuvs python/cuvs +ci/build_wheel.sh cuvs ${package_dir} +ci/validate_wheel.sh ${package_dir} final_dist diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh new file mode 100755 index 000000000..f2b235765 --- /dev/null +++ b/ci/validate_wheel.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +package_dir=$1 +wheel_dir_relative_path=$2 + +RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" + +# some packages are much larger on CUDA 11 than on CUDA 12 +if [[ "${RAPIDS_CUDA_MAJOR}" == "11" ]]; then + PYDISTCHECK_ARGS=( + --max-allowed-size-compressed '1.4G' + ) +else + PYDISTCHECK_ARGS=( + --max-allowed-size-compressed '950M' + ) +fi + +cd "${package_dir}" + +rapids-logger "validate packages with 'pydistcheck'" + +pydistcheck \ + --inspect \ + "${PYDISTCHECK_ARGS[@]}" \ + "$(echo ${wheel_dir_relative_path}/*.whl)" + +rapids-logger "validate packages with 'twine'" + +twine check \ + --strict \ + "$(echo ${wheel_dir_relative_path}/*.whl)" diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index aa12b4ed6..80bfb0c24 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -15,7 +15,7 @@ dependencies: - cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit - cupy>=12.0.0 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 494ec394d..07937726c 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -15,7 +15,7 @@ dependencies: - cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit - cupy>=12.0.0 diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index f4f03ccee..b7fd6fcfa 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -17,7 +17,7 @@ dependencies: - cuda-nvcc - cuda-nvtx-dev - 
cuda-profiler-api -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 - cupy>=12.0.0 - cxx-compiler diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index a295e93f4..83a457465 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -17,7 +17,7 @@ dependencies: - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 - cupy>=12.0.0 - cxx-compiler diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index a73839457..21cb98180 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -15,7 +15,7 @@ dependencies: - cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit - cxx-compiler diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index a529b27e2..effa88ced 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -15,7 +15,7 @@ dependencies: - cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit - cxx-compiler diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml index 407fb6058..0c5043ac2 100644 --- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml @@ -17,7 +17,7 @@ dependencies: - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 - cxx-compiler - cython>=3.0.0 diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml index 2ce3b5d7e..c63f205b0 100644 --- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml @@ -17,7 +17,7 @@ dependencies: - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 - cxx-compiler - cython>=3.0.0 diff --git a/conda/recipes/cuvs/meta.yaml b/conda/recipes/cuvs/meta.yaml index e7e2daf0c..560c95feb 100644 --- a/conda/recipes/cuvs/meta.yaml +++ b/conda/recipes/cuvs/meta.yaml @@ -26,6 +26,7 @@ build: - {{ compiler('cuda') }} - cuda-cudart-dev {% endif %} + - cuda-python requirements: build: @@ -42,10 +43,10 @@ requirements: - {{ stdlib("c") }} host: {% if cuda_major == "11" %} - - cuda-python >=11.7.1,<12.0a0 + - cuda-python >=11.7.1,<12.0a0,<=11.8.3 - cudatoolkit {% else %} - - cuda-python >=12.0,<13.0a0 + - cuda-python >=12.0,<13.0a0,<=12.6.0 - cuda-cudart-dev {% endif %} - cuda-version ={{ cuda_version }} @@ -60,13 +61,14 @@ requirements: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} - cudatoolkit + - cuda-python >=11.7.1,<12.0a0,<=11.8.3 {% else %} - cuda-cudart + - cuda-python >=12.0,<13.0a0,<=12.6.0 {% endif %} - pylibraft {{ minor_version }} - libcuvs {{ version }} - python x.x - - cuda-python - numpy >=1.23,<3.0a0 tests: 
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c493af488..eb2e7c7a4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -369,7 +369,9 @@ if(BUILD_SHARED_LIBS) src/distance/detail/fused_distance_nn.cu src/distance/distance.cu src/distance/pairwise_distance.cu + src/distance/sparse_distance.cu src/neighbors/brute_force.cu + src/neighbors/brute_force_serialize.cu src/neighbors/cagra_build_float.cu src/neighbors/cagra_build_half.cu src/neighbors/cagra_build_int8.cu @@ -436,6 +438,7 @@ if(BUILD_SHARED_LIBS) src/neighbors/nn_descent.cu src/neighbors/nn_descent_float.cu src/neighbors/nn_descent_half.cu + src/neighbors/nn_descent_index.cpp src/neighbors/nn_descent_int8.cu src/neighbors/nn_descent_uint8.cu src/neighbors/reachability.cu @@ -448,6 +451,7 @@ if(BUILD_SHARED_LIBS) src/neighbors/refine/detail/refine_host_int8_t_float.cpp src/neighbors/refine/detail/refine_host_uint8_t_float.cpp src/neighbors/sample_filter.cu + src/neighbors/sparse_brute_force.cu src/neighbors/vamana_build_float.cu src/neighbors/vamana_build_uint8.cu src/neighbors/vamana_build_int8.cu diff --git a/cpp/include/cuvs/distance/distance.hpp b/cpp/include/cuvs/distance/distance.hpp index def72641e..42c574e58 100644 --- a/cpp/include/cuvs/distance/distance.hpp +++ b/cpp/include/cuvs/distance/distance.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -331,6 +332,86 @@ void pairwise_distance( cuvs::distance::DistanceType metric, float metric_arg = 2.0f); +/** + * @brief Compute sparse pairwise distances between x and y, using the provided + * input configuration and distance function. + * + * @code{.cpp} + * #include + * #include + * #include + * + * int x_n_rows = 100000; + * int y_n_rows = 50000; + * int n_cols = 10000; + * + * raft::device_resources handle; + * auto x = raft::make_device_csr_matrix(handle, x_n_rows, n_cols); + * auto y = raft::make_device_csr_matrix(handle, y_n_rows, n_cols); + * + * ... + * // populate data + * ... + * + * auto out = raft::make_device_matrix(handle, x_nrows, y_nrows); + * auto metric = cuvs::distance::DistanceType::L2Expanded; + * raft::sparse::distance::pairwise_distance(handle, x.view(), y.view(), out, metric); + * @endcode + * + * @param[in] handle raft::resources + * @param[in] x raft::device_csr_matrix_view + * @param[in] y raft::device_csr_matrix_view + * @param[out] dist raft::device_matrix_view dense matrix + * @param[in] metric distance metric to use + * @param[in] metric_arg metric argument (used for Minkowski distance) + */ +void pairwise_distance(raft::resources const& handle, + raft::device_csr_matrix_view x, + raft::device_csr_matrix_view y, + raft::device_matrix_view dist, + cuvs::distance::DistanceType metric, + float metric_arg = 2.0f); + +/** + * @brief Compute sparse pairwise distances between x and y, using the provided + * input configuration and distance function. + * + * @code{.cpp} + * #include + * #include + * #include + * + * int x_n_rows = 100000; + * int y_n_rows = 50000; + * int n_cols = 10000; + * + * raft::device_resources handle; + * auto x = raft::make_device_csr_matrix(handle, x_n_rows, n_cols); + * auto y = raft::make_device_csr_matrix(handle, y_n_rows, n_cols); + * + * ... + * // populate data + * ... 
+ * + * auto out = raft::make_device_matrix(handle, x_nrows, y_nrows); + * auto metric = cuvs::distance::DistanceType::L2Expanded; + * raft::sparse::distance::pairwise_distance(handle, x.view(), y.view(), out, metric); + * @endcode + * + * @param[in] handle raft::resources + * @param[in] x raft::device_csr_matrix_view + * @param[in] y raft::device_csr_matrix_view + * @param[out] dist raft::device_matrix_view dense matrix + * @param[in] metric distance metric to use + * @param[in] metric_arg metric argument (used for Minkowski distance) + */ +void pairwise_distance(raft::resources const& handle, + raft::device_csr_matrix_view x, + raft::device_csr_matrix_view y, + raft::device_matrix_view dist, + cuvs::distance::DistanceType metric, + float metric_arg = 2.0f); + /** @} */ // end group pairwise_distance_runtime }; // namespace cuvs::distance diff --git a/cpp/include/cuvs/neighbors/brute_force.h b/cpp/include/cuvs/neighbors/brute_force.h index c9e172f62..33b92f11b 100644 --- a/cpp/include/cuvs/neighbors/brute_force.h +++ b/cpp/include/cuvs/neighbors/brute_force.h @@ -166,6 +166,66 @@ cuvsError_t cuvsBruteForceSearch(cuvsResources_t res, * @} */ +/** + * @defgroup bruteforce_c_serialize BRUTEFORCE C-API serialize functions + * @{ + */ +/** + * Save the index to file. + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.c} + * #include + * + * // Create cuvsResources_t + * cuvsResources_t res; + * cuvsError_t res_create_status = cuvsResourcesCreate(&res); + * + * // create an index with `cuvsBruteforceBuild` + * cuvsBruteForceSerialize(res, "/path/to/index", index); + * @endcode + * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] filename the file name for saving the index + * @param[in] index BRUTEFORCE index + * + */ +cuvsError_t cuvsBruteForceSerialize(cuvsResources_t res, + const char* filename, + cuvsBruteForceIndex_t index); + +/** + * Load index from file. + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.c} + * #include + * + * // Create cuvsResources_t + * cuvsResources_t res; + * cuvsError_t res_create_status = cuvsResourcesCreate(&res); + * + * // Deserialize an index previously built with `cuvsBruteforceBuild` + * cuvsBruteForceIndex_t index; + * cuvsBruteForceIndexCreate(&index); + * cuvsBruteForceDeserialize(res, "/path/to/index", index); + * @endcode + * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] filename the name of the file that stores the index + * @param[out] index BRUTEFORCE index loaded disk + */ +cuvsError_t cuvsBruteForceDeserialize(cuvsResources_t res, + const char* filename, + cuvsBruteForceIndex_t index); + +/** + * @} + */ #ifdef __cplusplus } #endif diff --git a/cpp/include/cuvs/neighbors/brute_force.hpp b/cpp/include/cuvs/neighbors/brute_force.hpp index 428fa592a..d040e03db 100644 --- a/cpp/include/cuvs/neighbors/brute_force.hpp +++ b/cpp/include/cuvs/neighbors/brute_force.hpp @@ -18,6 +18,7 @@ #include "common.hpp" #include +#include #include #include #include @@ -47,6 +48,14 @@ struct index : cuvs::neighbors::index { index& operator=(index&&) = default; ~index() = default; + /** + * @brief Construct an empty index. + * + * Constructs an empty index. 
This index will either need to be trained with `build` + * or loaded from a saved copy with `deserialize` + */ + index(raft::resources const& handle); + /** Construct a brute force index from dataset * * Constructs a brute force index from a dataset. This lets us precompute norms for @@ -375,4 +384,342 @@ void search(raft::resources const& handle, * @} */ +/** + * @defgroup sparse_bruteforce_cpp_index Sparse Brute Force index + * @{ + */ +/** + * @brief Sparse Brute Force index. + * + * @tparam T Data element type + * @tparam IdxT Index element type + */ +template +struct sparse_index { + public: + sparse_index(const sparse_index&) = delete; + sparse_index(sparse_index&&) = default; + sparse_index& operator=(const sparse_index&) = delete; + sparse_index& operator=(sparse_index&&) = default; + ~sparse_index() = default; + + /** Construct a sparse brute force sparse_index from dataset */ + sparse_index(raft::resources const& res, + raft::device_csr_matrix_view dataset, + cuvs::distance::DistanceType metric, + T metric_arg); + + /** Distance metric used for retrieval */ + cuvs::distance::DistanceType metric() const noexcept { return metric_; } + + /** Metric argument */ + T metric_arg() const noexcept { return metric_arg_; } + + raft::device_csr_matrix_view dataset() const noexcept + { + return dataset_; + } + + private: + raft::device_csr_matrix_view dataset_; + cuvs::distance::DistanceType metric_; + T metric_arg_; +}; +/** + * @} + */ + +/** + * @defgroup sparse_bruteforce_cpp_index_build Sparse Brute Force index build + * @{ + */ + +/* + * @brief Build the Sparse index from the dataset + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // create and fill the index from a CSR dataset + * auto index = brute_force::build(handle, dataset, metric); + * @endcode + * + * @param[in] handle + * @param[in] dataset A sparse CSR matrix in device memory to search against + * @param[in] metric cuvs::distance::DistanceType + * @param[in] metric_arg metric argument + * + * @return the constructed Sparse brute-force index + */ +auto build(raft::resources const& handle, + raft::device_csr_matrix_view dataset, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded, + float metric_arg = 0) -> cuvs::neighbors::brute_force::sparse_index; +/** + * @} + */ + +/** + * @defgroup sparse_bruteforce_cpp_index_search Sparse Brute Force index search + * @{ + */ +struct sparse_search_params { + int batch_size_index = 2 << 14; + int batch_size_query = 2 << 14; +}; + +/* + * @brief Search the sparse bruteforce index for nearest neighbors + * + * @param[in] handle + * @param[in] index Sparse brute-force constructed index + * @param[in] queries a sparse CSR matrix on the device to query + * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset + * [n_queries, k] + * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k] + */ +void search(raft::resources const& handle, + const sparse_search_params& params, + const sparse_index& index, + raft::device_csr_matrix_view dataset, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); +/** + * @} + */ + +/** + * @defgroup bruteforce_cpp_index_serialize Bruteforce index serialize functions + * @{ + */ +/** + * Save the index to file. + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. 
+ * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string filename("/path/to/index"); + * // create an index with `auto index = brute_force::build(...);` + * cuvs::neighbors::brute_force::serialize(handle, filename, index); + * @endcode + * + * @tparam T data element type + * + * @param[in] handle the raft handle + * @param[in] filename the file name for saving the index + * @param[in] index brute force index + * @param[in] include_dataset whether to include the dataset in the serialized + * output + */ +void serialize(raft::resources const& handle, + const std::string& filename, + const cuvs::neighbors::brute_force::index& index, + bool include_dataset = true); +/** + * Save the index to file. + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string filename("/path/to/index"); + * // create an index with `auto index = brute_force::build(...);` + * cuvs::neighbors::brute_force::serialize(handle, filename, index); + * @endcode + * + * @tparam T data element type + * + * @param[in] handle the raft handle + * @param[in] filename the file name for saving the index + * @param[in] index brute force index + * @param[in] include_dataset whether to include the dataset in the serialized + * output + * + */ +void serialize(raft::resources const& handle, + const std::string& filename, + const cuvs::neighbors::brute_force::index& index, + bool include_dataset = true); + +/** + * Write the index to an output stream + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create an output stream + * std::ostream os(std::cout.rdbuf()); + * // create an index with `auto index = cuvs::neighbors::brute_force::build(...);` + * cuvs::neighbors::brute_force::serialize(handle, os, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] os output stream + * @param[in] index brute force index + * @param[in] include_dataset Whether or not to write out the dataset to the file. + */ +void serialize(raft::resources const& handle, + std::ostream& os, + const cuvs::neighbors::brute_force::index& index, + bool include_dataset = true); + +/** + * Write the index to an output stream + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create an output stream + * std::ostream os(std::cout.rdbuf()); + * // create an index with `auto index = cuvs::neighbors::brute_force::build(...);` + * cuvs::neighbors::brute_force::serialize(handle, os, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] os output stream + * @param[in] index brute force index + * @param[in] include_dataset Whether or not to write out the dataset to the file. + */ +void serialize(raft::resources const& handle, + std::ostream& os, + const cuvs::neighbors::brute_force::index& index, + bool include_dataset = true); + +/** + * Load index from file. 
+ * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string filename("/path/to/index"); + * using T = half; // data element type + * brute_force::index index(handle); + * cuvs::neighbors::brute_force::deserialize(handle, filename, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] filename the name of the file that stores the index + * @param[out] index brute force index + * + */ +void deserialize(raft::resources const& handle, + const std::string& filename, + cuvs::neighbors::brute_force::index* index); +/** + * Load index from file. + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string filename("/path/to/index"); + * using T = float; // data element type + * brute_force::index index(handle); + * cuvs::neighbors::brute_force::deserialize(handle, filename, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] filename the name of the file that stores the index + * @param[out] index brute force index + * + */ +void deserialize(raft::resources const& handle, + const std::string& filename, + cuvs::neighbors::brute_force::index* index); +/** + * Load index from input stream + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create an input stream + * std::istream is(std::cin.rdbuf()); + * using T = half; // data element type + * brute_force::index index(handle); + * cuvs::neighbors::brute_force::deserialize(handle, is, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] is input stream + * @param[out] index brute force index + * + */ +void deserialize(raft::resources const& handle, + std::istream& is, + cuvs::neighbors::brute_force::index* index); +/** + * Load index from input stream + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. 
+ * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create an input stream + * std::istream is(std::cin.rdbuf()); + * using T = float; // data element type + * brute_force::index index(handle); + * cuvs::neighbors::brute_force::deserialize(handle, is, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] is input stream + * @param[out] index brute force index + * + */ +void deserialize(raft::resources const& handle, + std::istream& is, + cuvs::neighbors::brute_force::index* index); +/** + * @} + */ + } // namespace cuvs::neighbors::brute_force diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index e48050756..5ceb3010e 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -363,7 +363,7 @@ struct index : cuvs::neighbors::index { * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * In the above example, we have passed a host dataset to build. The returned index will own a * device copy of the dataset and the knn_graph. In contrast, if we pass the dataset as a @@ -530,7 +530,7 @@ struct index : cuvs::neighbors::index { * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -567,7 +567,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -604,7 +604,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -640,7 +640,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -676,7 +676,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -713,7 +713,7 @@ auto build(raft::resources const& res, * 
// search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -750,7 +750,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -787,7 +787,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res diff --git a/cpp/include/cuvs/neighbors/nn_descent.hpp b/cpp/include/cuvs/neighbors/nn_descent.hpp index 347ccf889..9cd8192b5 100644 --- a/cpp/include/cuvs/neighbors/nn_descent.hpp +++ b/cpp/include/cuvs/neighbors/nn_descent.hpp @@ -55,15 +55,16 @@ struct index_params : cuvs::neighbors::index_params { size_t intermediate_graph_degree = 128; // Degree of input graph for pruning. size_t max_iterations = 20; // Number of nn-descent iterations. float termination_threshold = 0.0001; // Termination threshold of nn-descent. + bool return_distances = true; // return distances if true + size_t n_clusters = 1; // defaults to not using any batching /** @brief Construct NN descent parameters for a specific kNN graph degree * * @param graph_degree output graph degree + * @param metric distance metric to use */ - index_params(size_t graph_degree = 64) - : graph_degree(graph_degree), intermediate_graph_degree(1.5 * graph_degree) - { - } + index_params(size_t graph_degree = 64, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded); }; /** @@ -100,14 +101,25 @@ struct index : cuvs::neighbors::index { * @param res raft::resources is an object mangaging resources * @param n_rows number of rows in knn-graph * @param n_cols number of cols in knn-graph + * @param return_distances whether to return distances + * @param metric distance metric to use */ - index(raft::resources const& res, int64_t n_rows, int64_t n_cols) + index(raft::resources const& res, + int64_t n_rows, + int64_t n_cols, + bool return_distances = false, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded) : cuvs::neighbors::index(), res_{res}, - metric_{cuvs::distance::DistanceType::L2Expanded}, + metric_{metric}, graph_{raft::make_host_matrix(n_rows, n_cols)}, - graph_view_{graph_.view()} + graph_view_{graph_.view()}, + return_distances_{return_distances} { + if (return_distances) { + distances_ = raft::make_device_matrix(res_, n_rows, n_cols); + distances_view_ = distances_.value().view(); + } } /** @@ -119,14 +131,22 @@ struct index : cuvs::neighbors::index { * * @param res raft::resources is an object mangaging resources * @param graph_view raft::host_matrix_view for storing knn-graph + * @param distances_view optional raft::device_matrix_view for storing + * distances + * @param metric distance metric to use */ 
index(raft::resources const& res, - raft::host_matrix_view graph_view) + raft::host_matrix_view graph_view, + std::optional> distances_view = + std::nullopt, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded) : cuvs::neighbors::index(), res_{res}, - metric_{cuvs::distance::DistanceType::L2Expanded}, + metric_{metric}, graph_{raft::make_host_matrix(0, 0)}, - graph_view_{graph_view} + graph_view_{graph_view}, + distances_view_{distances_view}, + return_distances_{distances_view.has_value()} { } @@ -155,6 +175,13 @@ struct index : cuvs::neighbors::index { return graph_view_; } + /** neighborhood graph distances [size, graph-degree] */ + [[nodiscard]] inline auto distances() noexcept + -> std::optional> + { + return distances_view_; + } + // Don't allow copying the index for performance reasons (try avoiding copying data) index(const index&) = delete; index(index&&) = default; @@ -166,8 +193,11 @@ struct index : cuvs::neighbors::index { raft::resources const& res_; cuvs::distance::DistanceType metric_; raft::host_matrix graph_; // graph to return for non-int IdxT + std::optional> distances_; raft::host_matrix_view graph_view_; // view of graph for user provided matrix + std::optional> distances_view_; + bool return_distances_; }; /** @} */ @@ -200,12 +230,15 @@ struct index : cuvs::neighbors::index { * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located * in device memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::device_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::device_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in host memory @@ -232,12 +265,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located * in host memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::host_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::host_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in device memory @@ -262,12 +298,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located * in device memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::device_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::device_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in host memory @@ -294,12 +333,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located * in host memory + * @param[in] graph optional 
raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::host_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::host_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in device memory @@ -324,12 +366,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located * in device memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::device_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::device_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in host memory @@ -356,12 +401,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located * in host memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::host_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::host_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in device memory @@ -386,14 +434,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located * in device memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::device_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; - -/** @} */ + raft::device_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in host memory @@ -420,12 +469,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located * in host memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::host_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::host_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Test if we have enough GPU memory to run NN descent algorithm. 
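For orientation, here is a minimal sketch of how the extended `nn_descent` API above might be used, assuming the usual `float` dataset / `uint32_t` index instantiation; the exact template arguments are elided in the hunks above, so treat the types as illustrative rather than canonical, and note that `build_knn_graph_example` is a hypothetical helper, not part of the library:

```cpp
#include <cuvs/neighbors/nn_descent.hpp>

#include <raft/core/device_mdspan.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/core/host_mdarray.hpp>

#include <optional>

void build_knn_graph_example(raft::device_resources const& res,
                             raft::device_matrix_view<const float, int64_t> dataset)
{
  using namespace cuvs::neighbors;

  // New constructor form: graph degree plus an explicit distance metric.
  nn_descent::index_params params(/*graph_degree=*/64,
                                  cuvs::distance::DistanceType::L2Expanded);
  params.return_distances = true;  // also materialize the knn-graph distances
  params.n_clusters       = 1;     // no batching

  // Optionally pre-allocate the output graph on the host and hand it to build(),
  // so the returned index writes into caller-owned memory instead of its own.
  auto graph = raft::make_host_matrix<uint32_t, int64_t>(
    dataset.extent(0), static_cast<int64_t>(params.graph_degree));

  auto idx = nn_descent::build(res, params, dataset, std::make_optional(graph.view()));

  // With return_distances=true the index can also expose the graph distances.
  if (idx.distances().has_value()) {
    auto dists = idx.distances().value();  // [n_rows, graph_degree] in device memory
    (void)dists;
  }
}
```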
diff --git a/cpp/src/distance/detail/pairwise_distance_epilogue_elementwise.h b/cpp/src/distance/detail/pairwise_distance_epilogue_elementwise.h index f9955334d..f4a7feaba 100644 --- a/cpp/src/distance/detail/pairwise_distance_epilogue_elementwise.h +++ b/cpp/src/distance/detail/pairwise_distance_epilogue_elementwise.h @@ -61,6 +61,7 @@ class PairwiseDistanceEpilogueElementwise { using ElementT = ElementT_; static int const kElementsPerAccess = ElementsPerAccess; static int const kCount = kElementsPerAccess; + static bool const kIsSingleSource = true; using DistanceOp = DistanceOp_; using FinalOp = FinalOp_; diff --git a/cpp/src/distance/detail/sparse/bin_distance.cuh b/cpp/src/distance/detail/sparse/bin_distance.cuh new file mode 100644 index 000000000..1a63a8eb9 --- /dev/null +++ b/cpp/src/distance/detail/sparse/bin_distance.cuh @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "common.hpp" +#include "ip_distance.cuh" + +#include +#include +#include +#include + +#include + +#include + +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { +// @TODO: Move this into sparse prims (coo_norm) +template +RAFT_KERNEL compute_binary_row_norm_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ + value_idx i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < nnz) { + // We do conditional here only because it's + // possible there could be some stray zeros in + // the sparse structure and removing them would be + // more expensive. 
+ atomicAdd(&out[coo_rows[i]], data[i] == 1.0); + } +} + +template +RAFT_KERNEL compute_binary_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_norms, + const value_t* __restrict__ R_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func) +{ + std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; + + if (i >= n_rows || j >= n_cols) return; + + value_t q_norm = Q_norms[i]; + value_t r_norm = R_norms[j]; + value_t dot = C[(size_t)i * n_cols + j]; + C[(size_t)i * n_cols + j] = expansion_func(dot, q_norm, r_norm); +} + +template +void compute_binary(value_t* C, + const value_t* Q_norms, + const value_t* R_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func, + cudaStream_t stream) +{ + int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); + compute_binary_warp_kernel<<>>( + C, Q_norms, R_norms, n_rows, n_cols, expansion_func); +} + +template +void compute_bin_distance(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, + cudaStream_t stream, + expansion_f expansion_func) +{ + rmm::device_uvector Q_norms(m, stream); + rmm::device_uvector R_norms(n, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); + RAFT_CUDA_TRY(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); + + compute_binary_row_norm_kernel<<>>( + Q_norms.data(), Q_coo_rows, Q_data, Q_nnz); + compute_binary_row_norm_kernel<<>>( + R_norms.data(), R_coo_rows, R_data, R_nnz); + + compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func, stream); +} + +/** + * Jaccard distance using the expanded form: + * 1 - (sum(x_k * y_k) / ((sum(x_k) + sum(y_k)) - sum(x_k * y_k)) + */ +template +class jaccard_expanded_distances_t : public distances_t { + public: + explicit jaccard_expanded_distances_t(const distances_config_t& config) + : config_(&config), + workspace(0, raft::resource::get_cuda_stream(config.handle)), + ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); + + rmm::device_uvector search_coo_rows( + config_->a_nnz, raft::resource::get_cuda_stream(config_->handle)); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + compute_bin_distance(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + raft::resource::get_cuda_stream(config_->handle), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t q_r_union = q_norm + r_norm; + value_t denom = q_r_union - dot; + + value_t jacc = ((denom != 0) * dot) / ((denom == 0) + denom); + + // flip the similarity when both rows are 0 + bool both_empty = q_r_union == 0; + return 1 - ((!both_empty * jacc) + both_empty); + }); + } + + ~jaccard_expanded_distances_t() = default; + + private: + const distances_config_t* config_; + rmm::device_uvector workspace; + ip_distances_t ip_dists; +}; + +/** + * Dice distance using the expanded form: + * 1 - ((2 * sum(x_k * y_k)) / (sum(x_k) + sum(y_k))) + */ +template +class dice_expanded_distances_t : public distances_t { + public: + 
explicit dice_expanded_distances_t(const distances_config_t& config) + : config_(&config), + workspace(0, raft::resource::get_cuda_stream(config.handle)), + ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); + + rmm::device_uvector search_coo_rows( + config_->a_nnz, raft::resource::get_cuda_stream(config_->handle)); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + compute_bin_distance(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + raft::resource::get_cuda_stream(config_->handle), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t q_r_union = q_norm + r_norm; + value_t dice = (2 * dot) / q_r_union; + bool both_empty = q_r_union == 0; + return 1 - ((!both_empty * dice) + both_empty); + }); + } + + ~dice_expanded_distances_t() = default; + + private: + const distances_config_t* config_; + rmm::device_uvector workspace; + ip_distances_t ip_dists; +}; + +} // END namespace sparse +} // END namespace detail +} // END namespace distance +} // END namespace cuvs diff --git a/cpp/src/distance/detail/sparse/common.hpp b/cpp/src/distance/detail/sparse/common.hpp new file mode 100644 index 000000000..803dabe56 --- /dev/null +++ b/cpp/src/distance/detail/sparse/common.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +struct distances_config_t { + distances_config_t(raft::resources const& handle_) : handle(handle_) {} + + // left side + value_idx a_nrows; + value_idx a_ncols; + value_idx a_nnz; + value_idx* a_indptr; + value_idx* a_indices; + value_t* a_data; + + // right side + value_idx b_nrows; + value_idx b_ncols; + value_idx b_nnz; + value_idx* b_indptr; + value_idx* b_indices; + value_t* b_data; + + raft::resources const& handle; +}; + +template +class distances_t { + public: + virtual void compute(value_t* out) {} + virtual ~distances_t() = default; +}; + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv.cuh b/cpp/src/distance/detail/sparse/coo_spmv.cuh new file mode 100644 index 000000000..181b531f7 --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv.cuh @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "common.hpp" +#include "coo_spmv_strategies/dense_smem_strategy.cuh" +#include "coo_spmv_strategies/hash_strategy.cuh" + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +inline void balanced_coo_pairwise_generalized_spmv( + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + strategy_t strategy, + int chunk_size = 500000) +{ + uint64_t n = (uint64_t)sizeof(value_t) * (uint64_t)config_.a_nrows * (uint64_t)config_.b_nrows; + RAFT_CUDA_TRY(cudaMemsetAsync(out_dists, 0, n, raft::resource::get_cuda_stream(config_.handle))); + + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); +}; + +/** + * Performs generalized sparse-matrix-sparse-matrix multiplication via a + * sparse-matrix-sparse-vector layout `out=A*B` where generalized product() + * and sum() operations can be used in place of the standard sum and product: + * + * out_ij = sum_k(product(A_ik, B_ik)) The sum goes through values of + * k=0..n_cols-1 where B_kj is nonzero. + * + * The product and sum operations shall form a semiring algebra with the + * following properties: + * 1. {+, 0} is a commutative sum reduction monoid with identity element 0 + * 2. {*, 1} is a product monoid with identity element 1 + * 3. Multiplication by 0 annihilates x. e.g. product(x, 0) = 0 + * + * Each vector of A is loaded into shared memory in dense form and the + * non-zeros of B load balanced across the threads of each block. + * @tparam value_idx index type + * @tparam value_t value type + * @tparam threads_per_block block size + * @tparam product_f semiring product() function + * @tparam accum_f semiring sum() function + * @tparam write_f atomic semiring sum() function + * @param[out] out_dists dense array of out distances of size m * n in row-major + * format. 
+ * @param[in] config_ distance config object + * @param[in] coo_rows_b coo row array for B + * @param[in] product_func semiring product() function + * @param[in] accum_func semiring sum() function + * @param[in] write_func atomic semiring sum() function + * @param[in] chunk_size number of nonzeros of B to process for each row of A + * this value was found through profiling and represents a reasonable + * setting for both large and small densities + */ +template +inline void balanced_coo_pairwise_generalized_spmv( + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size = 500000) +{ + uint64_t n = (uint64_t)sizeof(value_t) * (uint64_t)config_.a_nrows * (uint64_t)config_.b_nrows; + RAFT_CUDA_TRY(cudaMemsetAsync(out_dists, 0, n, raft::resource::get_cuda_stream(config_.handle))); + + int max_cols = max_cols_per_block(); + + if (max_cols > config_.a_ncols) { + dense_smem_strategy strategy(config_); + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); + } else { + hash_strategy strategy(config_); + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); + } +}; + +template +inline void balanced_coo_pairwise_generalized_spmv_rev( + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + strategy_t strategy, + int chunk_size = 500000) +{ + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); +}; + +/** + * Used for computing distances where the reduction (e.g. product()) function + * requires an implicit union (product(x, 0) = x) to capture the difference A-B. + * This is necessary in some applications because the standard semiring algebra + * endowed with the default multiplication product monoid will only + * compute the intersection & B-A. + * + * This particular function is meant to accompany the function + * `balanced_coo_pairwise_generalized_spmv` and executes the product operation + * on only those columns that exist in B and not A. + * + * The product and sum operations shall enable the computation of a + * non-annihilating semiring algebra with the following properties: + * 1. {+, 0} is a commutative sum reduction monoid with identity element 0 + * 2. {*, 0} is a product monoid with identity element 0 + * 3. Multiplication by 0 does not annihilate x. e.g. product(x, 0) = x + * + * Manattan distance sum(abs(x_k-y_k)) is a great example of when this type of + * execution pattern is necessary. 
+ * + * @tparam value_idx index type + * @tparam value_t value type + * @tparam threads_per_block block size + * @tparam product_f semiring product() function + * @tparam accum_f semiring sum() function + * @tparam write_f atomic semiring sum() function + * @param[out] out_dists dense array of out distances of size m * n + * @param[in] config_ distance config object + * @param[in] coo_rows_a coo row array for A + * @param[in] product_func semiring product() function + * @param[in] accum_func semiring sum() function + * @param[in] write_func atomic semiring sum() function + * @param[in] chunk_size number of nonzeros of B to process for each row of A + * this value was found through profiling and represents a reasonable + * setting for both large and small densities + */ +template +inline void balanced_coo_pairwise_generalized_spmv_rev( + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size = 500000) +{ + // try dense first + int max_cols = max_cols_per_block(); + + if (max_cols > config_.b_ncols) { + dense_smem_strategy strategy(config_); + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); + } else { + hash_strategy strategy(config_); + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); + } +}; + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh b/cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh new file mode 100644 index 000000000..1f4b19af4 --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { +__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group) +{ + return __ffs(peer_group) - 1; +} + +/** + * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with + * sparse-matrix-sparse-vector multiplication layout (SPMV). + * This is intended to be scheduled n_chunks_b times for each row of a. + * The steps are as follows: + * + * 1. Load row from A into dense vector in shared memory. + * This can be further chunked in the future if necessary to support larger + * column sizes. + * 2. Threads of block all step through chunks of B in parallel. + * When a new row is encountered in row_indices_b, a segmented + * reduction is performed across the warps and then across the + * block and the final value written out to host memory. 
+ * + * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf + * + * @tparam value_idx index type + * @tparam value_t value type + * @tparam tpb threads per block configured on launch + * @tparam rev if this is true, the reduce/accumulate functions are only + * executed when A[col] == 0.0. when executed before/after !rev + * and A & B are reversed, this allows the full symmetric difference + * and intersection to be computed. + * @tparam kv_t data type stored in shared mem cache + * @tparam product_f reduce function type (semiring product() function). + * accepts two arguments of value_t and returns a value_t + * @tparam accum_f accumulation function type (semiring sum() function). + * accepts two arguments of value_t and returns a value_t + * @tparam write_f function to write value out. this should be mathematically + * equivalent to the accumulate function but implemented as + * an atomic operation on global memory. Accepts two arguments + * of value_t* and value_t and updates the value given by the + * pointer. + * @param[in] indptrA column pointer array for A + * @param[in] indicesA column indices array for A + * @param[in] dataA data array for A + * @param[in] rowsB coo row array for B + * @param[in] indicesB column indices array for B + * @param[in] dataB data array for B + * @param[in] m number of rows in A + * @param[in] n number of rows in B + * @param[in] dim number of features + * @param[in] nnz_b number of nonzeros in B + * @param[out] out array of size m*n + * @param[in] n_blocks_per_row number of blocks of B per row of A + * @param[in] chunk_size number of nnz for B to use for each row of A + * @param[in] buffer_size amount of smem to use for each row of A + * @param[in] product_func semiring product() function + * @param[in] accum_func semiring sum() function + * @param[in] write_func atomic semiring sum() function + */ +template +RAFT_KERNEL balanced_coo_generalized_spmv_kernel(strategy_t strategy, + indptr_it indptrA, + value_idx* indicesA, + value_t* dataA, + value_idx nnz_a, + value_idx* rowsB, + value_idx* indicesB, + value_t* dataB, + value_idx m, + value_idx n, + int dim, + value_idx nnz_b, + value_t* out, + int n_blocks_per_row, + int chunk_size, + value_idx b_ncols, + product_f product_func, + accum_f accum_func, + write_f write_func) +{ + typedef cub::WarpReduce warp_reduce; + + value_idx cur_row_a = indptrA.get_row_idx(n_blocks_per_row); + value_idx cur_chunk_offset = blockIdx.x % n_blocks_per_row; + + // chunk starting offset + value_idx ind_offset = cur_chunk_offset * chunk_size * tpb; + // how many total cols will be processed by this block (should be <= chunk_size * n_threads) + value_idx active_chunk_size = min(chunk_size * tpb, nnz_b - ind_offset); + + int tid = threadIdx.x; + int warp_id = tid / raft::warp_size(); + + // compute id relative to current warp + unsigned int lane_id = tid & (raft::warp_size() - 1); + value_idx ind = ind_offset + threadIdx.x; + + extern __shared__ char smem[]; + + typename strategy_t::smem_type A = (typename strategy_t::smem_type)(smem); + typename warp_reduce::TempStorage* temp_storage = (typename warp_reduce::TempStorage*)(A + dim); + + auto inserter = strategy.init_insert(A, dim); + + __syncthreads(); + + value_idx start_offset_a, stop_offset_a; + bool first_a_chunk, last_a_chunk; + indptrA.get_row_offsets( + cur_row_a, start_offset_a, stop_offset_a, n_blocks_per_row, first_a_chunk, last_a_chunk); + + // Convert current row vector in A to dense + for (int i = tid; i <= (stop_offset_a - 
start_offset_a); i += blockDim.x) { + strategy.insert(inserter, indicesA[start_offset_a + i], dataA[start_offset_a + i]); + } + + __syncthreads(); + + auto finder = strategy.init_find(A, dim); + + if (cur_row_a > m || cur_chunk_offset > n_blocks_per_row) return; + if (ind >= nnz_b) return; + + value_idx start_index_a = 0, stop_index_a = b_ncols - 1; + indptrA.get_indices_boundary(indicesA, + cur_row_a, + start_offset_a, + stop_offset_a, + start_index_a, + stop_index_a, + first_a_chunk, + last_a_chunk); + + value_idx cur_row_b = -1; + value_t c = 0.0; + + auto warp_red = warp_reduce(*(temp_storage + warp_id)); + + if (tid < active_chunk_size) { + cur_row_b = rowsB[ind]; + + auto index_b = indicesB[ind]; + auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); + + if (in_bounds) { + value_t a_col = strategy.find(finder, index_b); + if (!rev || a_col == 0.0) { c = product_func(a_col, dataB[ind]); } + } + } + + // loop through chunks in parallel, reducing when a new row is + // encountered by each thread + for (int i = tid; i < active_chunk_size; i += blockDim.x) { + value_idx ind_next = ind + blockDim.x; + value_idx next_row_b = -1; + + if (i + blockDim.x < active_chunk_size) next_row_b = rowsB[ind_next]; + + bool diff_rows = next_row_b != cur_row_b; + + if (__any_sync(0xffffffff, diff_rows)) { + // grab the threads currently participating in loops. + // because any other threads should have returned already. + unsigned int peer_group = __match_any_sync(0xffffffff, cur_row_b); + bool is_leader = get_lowest_peer(peer_group) == lane_id; + value_t v = warp_red.HeadSegmentedReduce(c, is_leader, accum_func); + + // thread with lowest lane id among peers writes out + if (is_leader && v != 0.0) { + // this conditional should be uniform, since rev is constant + size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b : (size_t)cur_row_b * m + cur_row_a; + write_func(out + idx, v); + } + + c = 0.0; + } + + if (next_row_b != -1) { + ind = ind_next; + + auto index_b = indicesB[ind]; + auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); + if (in_bounds) { + value_t a_col = strategy.find(finder, index_b); + + if (!rev || a_col == 0.0) { c = accum_func(c, product_func(a_col, dataB[ind])); } + } + + cur_row_b = next_row_b; + } + } +} + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv_strategies/base_strategy.cuh b/cpp/src/distance/detail/sparse/coo_spmv_strategies/base_strategy.cuh new file mode 100644 index 000000000..457b25eea --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv_strategies/base_strategy.cuh @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "../common.hpp" +#include "../coo_spmv_kernel.cuh" +#include "../utils.cuh" +#include "coo_mask_row_iterators.cuh" + +#include + +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +class coo_spmv_strategy { + public: + coo_spmv_strategy(const distances_config_t& config_) : config(config_) + { + smem = raft::getSharedMemPerBlock(); + } + + template + void _dispatch_base(strategy_t& strategy, + int smem_dim, + indptr_it& a_indptr, + value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size, + int n_blocks, + int n_blocks_per_row) + { + RAFT_CUDA_TRY(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); + + balanced_coo_generalized_spmv_kernel + <<>>(strategy, + a_indptr, + config.a_indices, + config.a_data, + config.a_nnz, + coo_rows_b, + config.b_indices, + config.b_data, + config.a_nrows, + config.b_nrows, + smem_dim, + config.b_nnz, + out_dists, + n_blocks_per_row, + chunk_size, + config.b_ncols, + product_func, + accum_func, + write_func); + } + + template + void _dispatch_base_rev(strategy_t& strategy, + int smem_dim, + indptr_it& b_indptr, + value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size, + int n_blocks, + int n_blocks_per_row) + { + RAFT_CUDA_TRY(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); + + balanced_coo_generalized_spmv_kernel + <<>>(strategy, + b_indptr, + config.b_indices, + config.b_data, + config.b_nnz, + coo_rows_a, + config.a_indices, + config.a_data, + config.b_nrows, + config.a_nrows, + smem_dim, + config.a_nnz, + out_dists, + n_blocks_per_row, + chunk_size, + config.a_ncols, + product_func, + accum_func, + write_func); + } + + protected: + int smem; + const distances_config_t& config; +}; + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv_strategies/coo_mask_row_iterators.cuh b/cpp/src/distance/detail/sparse/coo_spmv_strategies/coo_mask_row_iterators.cuh new file mode 100644 index 000000000..a9040e1d8 --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv_strategies/coo_mask_row_iterators.cuh @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "../common.hpp" +#include "../utils.cuh" + +#include // raft::ceildiv + +#include + +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +class mask_row_it { + public: + mask_row_it(const value_idx* full_indptr_, + const value_idx& n_rows_, + value_idx* mask_row_idx_ = NULL) + : full_indptr(full_indptr_), mask_row_idx(mask_row_idx_), n_rows(n_rows_) + { + } + + __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b) + { + if (mask_row_idx != NULL) { + return mask_row_idx[blockIdx.x / n_blocks_nnz_b]; + } else { + return blockIdx.x / n_blocks_nnz_b; + } + } + + __device__ inline void get_row_offsets(const value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + const value_idx& n_blocks_nnz_b, + bool& first_a_chunk, + bool& last_a_chunk) + { + start_offset = full_indptr[row_idx]; + stop_offset = full_indptr[row_idx + 1] - 1; + } + + __device__ constexpr inline void get_indices_boundary(const value_idx* indices, + value_idx& indices_len, + value_idx& start_offset, + value_idx& stop_offset, + value_idx& start_index, + value_idx& stop_index, + bool& first_a_chunk, + bool& last_a_chunk) + { + // do nothing; + } + + __device__ constexpr inline bool check_indices_bounds(value_idx& start_index_a, + value_idx& stop_index_a, + value_idx& index_b) + { + return true; + } + + const value_idx *full_indptr, &n_rows; + value_idx* mask_row_idx; +}; + +template +RAFT_KERNEL fill_chunk_indices_kernel(value_idx* n_chunks_per_row, + value_idx* chunk_indices, + value_idx n_rows) +{ + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < n_rows) { + auto start = n_chunks_per_row[tid]; + auto end = n_chunks_per_row[tid + 1]; + +#pragma unroll + for (int i = start; i < end; i++) { + chunk_indices[i] = tid; + } + } +} + +template +class chunked_mask_row_it : public mask_row_it { + public: + chunked_mask_row_it(const value_idx* full_indptr_, + const value_idx& n_rows_, + value_idx* mask_row_idx_, + int row_chunk_size_, + const value_idx* n_chunks_per_row_, + const value_idx* chunk_indices_, + const cudaStream_t stream_) + : mask_row_it(full_indptr_, n_rows_, mask_row_idx_), + row_chunk_size(row_chunk_size_), + n_chunks_per_row(n_chunks_per_row_), + chunk_indices(chunk_indices_), + stream(stream_) + { + } + + static void init(const value_idx* indptr, + const value_idx* mask_row_idx, + const value_idx& n_rows, + const int row_chunk_size, + rmm::device_uvector& n_chunks_per_row, + rmm::device_uvector& chunk_indices, + cudaStream_t stream) + { + auto policy = rmm::exec_policy(stream); + + constexpr value_idx first_element = 0; + n_chunks_per_row.set_element_async(0, first_element, stream); + n_chunks_per_row_functor chunk_functor(indptr, row_chunk_size); + thrust::transform( + policy, mask_row_idx, mask_row_idx + n_rows, n_chunks_per_row.begin() + 1, chunk_functor); + + thrust::inclusive_scan( + policy, n_chunks_per_row.begin() + 1, n_chunks_per_row.end(), n_chunks_per_row.begin() + 1); + + raft::update_host(&total_row_blocks, n_chunks_per_row.data() + n_rows, 1, stream); + + fill_chunk_indices(n_rows, n_chunks_per_row, chunk_indices, stream); + } + + __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b) + { + return this->mask_row_idx[chunk_indices[blockIdx.x / n_blocks_nnz_b]]; + } + + __device__ inline void get_row_offsets(const value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + const int& n_blocks_nnz_b, + bool& first_a_chunk, + bool& last_a_chunk) + 
{ + auto chunk_index = blockIdx.x / n_blocks_nnz_b; + auto chunk_val = chunk_indices[chunk_index]; + auto prev_n_chunks = n_chunks_per_row[chunk_val]; + auto relative_chunk = chunk_index - prev_n_chunks; + first_a_chunk = relative_chunk == 0; + + start_offset = this->full_indptr[row_idx] + relative_chunk * row_chunk_size; + stop_offset = start_offset + row_chunk_size; + + auto final_stop_offset = this->full_indptr[row_idx + 1]; + + last_a_chunk = stop_offset >= final_stop_offset; + stop_offset = last_a_chunk ? final_stop_offset - 1 : stop_offset - 1; + } + + __device__ inline void get_indices_boundary(const value_idx* indices, + value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + value_idx& start_index, + value_idx& stop_index, + bool& first_a_chunk, + bool& last_a_chunk) + { + start_index = first_a_chunk ? start_index : indices[start_offset - 1] + 1; + stop_index = last_a_chunk ? stop_index : indices[stop_offset]; + } + + __device__ inline bool check_indices_bounds(value_idx& start_index_a, + value_idx& stop_index_a, + value_idx& index_b) + { + return (index_b >= start_index_a && index_b <= stop_index_a); + } + + inline static value_idx total_row_blocks = 0; + const cudaStream_t stream; + const value_idx *n_chunks_per_row, *chunk_indices; + value_idx row_chunk_size; + + struct n_chunks_per_row_functor { + public: + n_chunks_per_row_functor(const value_idx* indptr_, value_idx row_chunk_size_) + : indptr(indptr_), row_chunk_size(row_chunk_size_) + { + } + + __host__ __device__ value_idx operator()(const value_idx& i) + { + auto degree = indptr[i + 1] - indptr[i]; + return raft::ceildiv(degree, (value_idx)row_chunk_size); + } + + const value_idx* indptr; + value_idx row_chunk_size; + }; + + private: + static void fill_chunk_indices(const value_idx& n_rows, + rmm::device_uvector& n_chunks_per_row, + rmm::device_uvector& chunk_indices, + cudaStream_t stream) + { + auto n_threads = std::min(n_rows, 256); + auto n_blocks = raft::ceildiv(n_rows, (value_idx)n_threads); + + chunk_indices.resize(total_row_blocks, stream); + + fill_chunk_indices_kernel + <<>>(n_chunks_per_row.data(), chunk_indices.data(), n_rows); + } +}; + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv_strategies/dense_smem_strategy.cuh b/cpp/src/distance/detail/sparse/coo_spmv_strategies/dense_smem_strategy.cuh new file mode 100644 index 000000000..baa913a6c --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv_strategies/dense_smem_strategy.cuh @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "base_strategy.cuh" + +#include // raft::ceildiv + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +class dense_smem_strategy : public coo_spmv_strategy { + public: + using smem_type = value_t*; + using insert_type = smem_type; + using find_type = smem_type; + + dense_smem_strategy(const distances_config_t& config_) + : coo_spmv_strategy(config_) + { + } + + inline static int smem_per_block(int n_cols) + { + return (n_cols * sizeof(value_t)) + ((1024 / raft::warp_size()) * sizeof(value_t)); + } + + template + void dispatch(value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * 1024); + auto n_blocks = this->config.a_nrows * n_blocks_per_row; + + mask_row_it a_indptr(this->config.a_indptr, this->config.a_nrows); + + this->_dispatch_base(*this, + this->config.b_ncols, + a_indptr, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + n_blocks, + n_blocks_per_row); + } + + template + void dispatch_rev(value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * 1024); + auto n_blocks = this->config.b_nrows * n_blocks_per_row; + + mask_row_it b_indptr(this->config.b_indptr, this->config.b_nrows); + + this->_dispatch_base_rev(*this, + this->config.a_ncols, + b_indptr, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_blocks, + n_blocks_per_row); + } + + __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size) + { + for (int k = threadIdx.x; k < cache_size; k += blockDim.x) { + cache[k] = 0.0; + } + return cache; + } + + __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value) + { + cache[key] = value; + } + + __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size) + { + return cache; + } + + __device__ inline value_t find(find_type cache, const value_idx& key) { return cache[key]; } +}; + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv_strategies/hash_strategy.cuh b/cpp/src/distance/detail/sparse/coo_spmv_strategies/hash_strategy.cuh new file mode 100644 index 000000000..cf212076b --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv_strategies/hash_strategy.cuh @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "base_strategy.cuh" + +#include +#include + +#include +#include +#include + +// this is needed by cuco as key, value must be bitwise comparable. 
+// compilers don't declare float/double as bitwise comparable +// but that is too strict +// for example, the following is true (or 0): +// float a = 5; +// float b = 5; +// memcmp(&a, &b, sizeof(float)); +CUCO_DECLARE_BITWISE_COMPARABLE(float); +CUCO_DECLARE_BITWISE_COMPARABLE(double); + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +class hash_strategy : public coo_spmv_strategy { + public: + using insert_type = typename cuco::legacy:: + static_map::device_mutable_view; + using smem_type = typename insert_type::slot_type*; + using find_type = + typename cuco::legacy::static_map::device_view; + + hash_strategy(const distances_config_t& config_, + float capacity_threshold_ = 0.5, + int map_size_ = get_map_size()) + : coo_spmv_strategy(config_), + capacity_threshold(capacity_threshold_), + map_size(map_size_) + { + } + + void chunking_needed(const value_idx* indptr, + const value_idx n_rows, + rmm::device_uvector& mask_indptr, + std::tuple& n_rows_divided, + cudaStream_t stream) + { + auto policy = raft::resource::get_thrust_policy(this->config.handle); + + auto less = thrust::copy_if(policy, + thrust::make_counting_iterator(value_idx(0)), + thrust::make_counting_iterator(n_rows), + mask_indptr.data(), + fits_in_hash_table(indptr, 0, capacity_threshold * map_size)); + std::get<0>(n_rows_divided) = less - mask_indptr.data(); + + auto more = thrust::copy_if( + policy, + thrust::make_counting_iterator(value_idx(0)), + thrust::make_counting_iterator(n_rows), + less, + fits_in_hash_table( + indptr, capacity_threshold * map_size, std::numeric_limits::max())); + std::get<1>(n_rows_divided) = more - less; + } + + template + void dispatch(value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * tpb); + rmm::device_uvector mask_indptr( + this->config.a_nrows, raft::resource::get_cuda_stream(this->config.handle)); + std::tuple n_rows_divided; + + chunking_needed(this->config.a_indptr, + this->config.a_nrows, + mask_indptr, + n_rows_divided, + raft::resource::get_cuda_stream(this->config.handle)); + + auto less_rows = std::get<0>(n_rows_divided); + if (less_rows > 0) { + mask_row_it less(this->config.a_indptr, less_rows, mask_indptr.data()); + + auto n_less_blocks = less_rows * n_blocks_per_row; + this->_dispatch_base(*this, + map_size, + less, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + n_less_blocks, + n_blocks_per_row); + } + + auto more_rows = std::get<1>(n_rows_divided); + if (more_rows > 0) { + rmm::device_uvector n_chunks_per_row( + more_rows + 1, raft::resource::get_cuda_stream(this->config.handle)); + rmm::device_uvector chunk_indices( + 0, raft::resource::get_cuda_stream(this->config.handle)); + chunked_mask_row_it::init(this->config.a_indptr, + mask_indptr.data() + less_rows, + more_rows, + capacity_threshold * map_size, + n_chunks_per_row, + chunk_indices, + raft::resource::get_cuda_stream(this->config.handle)); + + chunked_mask_row_it more(this->config.a_indptr, + more_rows, + mask_indptr.data() + less_rows, + capacity_threshold * map_size, + n_chunks_per_row.data(), + chunk_indices.data(), + raft::resource::get_cuda_stream(this->config.handle)); + + auto n_more_blocks = more.total_row_blocks * n_blocks_per_row; + this->_dispatch_base(*this, + map_size, + more, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + 
n_more_blocks, + n_blocks_per_row); + } + } + + template + void dispatch_rev(value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * tpb); + rmm::device_uvector mask_indptr( + this->config.b_nrows, raft::resource::get_cuda_stream(this->config.handle)); + std::tuple n_rows_divided; + + chunking_needed(this->config.b_indptr, + this->config.b_nrows, + mask_indptr, + n_rows_divided, + raft::resource::get_cuda_stream(this->config.handle)); + + auto less_rows = std::get<0>(n_rows_divided); + if (less_rows > 0) { + mask_row_it less(this->config.b_indptr, less_rows, mask_indptr.data()); + + auto n_less_blocks = less_rows * n_blocks_per_row; + this->_dispatch_base_rev(*this, + map_size, + less, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_less_blocks, + n_blocks_per_row); + } + + auto more_rows = std::get<1>(n_rows_divided); + if (more_rows > 0) { + rmm::device_uvector n_chunks_per_row( + more_rows + 1, raft::resource::get_cuda_stream(this->config.handle)); + rmm::device_uvector chunk_indices( + 0, raft::resource::get_cuda_stream(this->config.handle)); + chunked_mask_row_it::init(this->config.b_indptr, + mask_indptr.data() + less_rows, + more_rows, + capacity_threshold * map_size, + n_chunks_per_row, + chunk_indices, + raft::resource::get_cuda_stream(this->config.handle)); + + chunked_mask_row_it more(this->config.b_indptr, + more_rows, + mask_indptr.data() + less_rows, + capacity_threshold * map_size, + n_chunks_per_row.data(), + chunk_indices.data(), + raft::resource::get_cuda_stream(this->config.handle)); + + auto n_more_blocks = more.total_row_blocks * n_blocks_per_row; + this->_dispatch_base_rev(*this, + map_size, + more, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_more_blocks, + n_blocks_per_row); + } + } + + __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size) + { + return insert_type::make_from_uninitialized_slots(cooperative_groups::this_thread_block(), + cache, + cache_size, + cuco::empty_key{value_idx{-1}}, + cuco::empty_value{value_t{0}}); + } + + __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value) + { + auto success = cache.insert(cuco::pair(key, value)); + } + + __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size) + { + return find_type( + cache, cache_size, cuco::empty_key{value_idx{-1}}, cuco::empty_value{value_t{0}}); + } + + __device__ inline value_t find(find_type cache, const value_idx& key) + { + auto a_pair = cache.find(key); + + value_t a_col = 0.0; + if (a_pair != cache.end()) { a_col = a_pair->second; } + return a_col; + } + + struct fits_in_hash_table { + public: + fits_in_hash_table(const value_idx* indptr_, value_idx degree_l_, value_idx degree_r_) + : indptr(indptr_), degree_l(degree_l_), degree_r(degree_r_) + { + } + + __host__ __device__ bool operator()(const value_idx& i) + { + auto degree = indptr[i + 1] - indptr[i]; + + return degree >= degree_l && degree < degree_r; + } + + private: + const value_idx* indptr; + const value_idx degree_l, degree_r; + }; + + inline static int get_map_size() + { + return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) / + sizeof(typename insert_type::slot_type); + } + + private: + float capacity_threshold; + int map_size; +}; + +} // namespace sparse +} // 
namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/ip_distance.cuh b/cpp/src/distance/detail/sparse/ip_distance.cuh new file mode 100644 index 000000000..3a11d4e99 --- /dev/null +++ b/cpp/src/distance/detail/sparse/ip_distance.cuh @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "common.hpp" +#include "coo_spmv.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +class ip_distances_t : public distances_t { + public: + /** + * Computes simple sparse inner product distances as sum(x_k * y_k) + * @param[in] config specifies inputs, outputs, and sizes + */ + ip_distances_t(const distances_config_t& config) + : config_(&config), coo_rows_b(config.b_nnz, raft::resource::get_cuda_stream(config.handle)) + { + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows_b.data(), + config_->b_nnz, + raft::resource::get_cuda_stream(config_->handle)); + } + + /** + * Performs pairwise distance computation and computes output distances + * @param out_distances dense output matrix (size a_nrows * b_nrows) + */ + void compute(value_t* out_distances) + { + /** + * Compute pairwise distances and return dense matrix in row-major format + */ + balanced_coo_pairwise_generalized_spmv(out_distances, + *config_, + coo_rows_b.data(), + raft::mul_op(), + raft::add_op(), + raft::atomic_add_op()); + } + + value_idx* b_rows_coo() { return coo_rows_b.data(); } + + value_t* b_data_coo() { return config_->b_data; } + + private: + const distances_config_t* config_; + rmm::device_uvector coo_rows_b; +}; + +} // END namespace sparse +} // END namespace detail +} // END namespace distance +} // END namespace cuvs diff --git a/cpp/src/distance/detail/sparse/l2_distance.cuh b/cpp/src/distance/detail/sparse/l2_distance.cuh new file mode 100644 index 000000000..40e7070fc --- /dev/null +++ b/cpp/src/distance/detail/sparse/l2_distance.cuh @@ -0,0 +1,502 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#pragma once + +#include "common.hpp" +#include "ip_distance.cuh" +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +// @TODO: Move this into sparse prims (coo_norm) +template +RAFT_KERNEL compute_row_norm_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ + value_idx i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i] * data[i]); } +} + +template +RAFT_KERNEL compute_row_sum_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ + value_idx i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i]); } +} + +template +RAFT_KERNEL compute_euclidean_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_sq_norms, + const value_t* __restrict__ R_sq_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func) +{ + std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; + + if (i >= n_rows || j >= n_cols) return; + + value_t dot = C[(size_t)i * n_cols + j]; + + // e.g. Euclidean expansion func = -2.0 * dot + q_norm + r_norm + value_t val = expansion_func(dot, Q_sq_norms[i], R_sq_norms[j]); + + // correct for small instabilities + C[(size_t)i * n_cols + j] = val * (fabs(val) >= 0.0001); +} + +template +RAFT_KERNEL compute_correlation_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_sq_norms, + const value_t* __restrict__ R_sq_norms, + const value_t* __restrict__ Q_norms, + const value_t* __restrict__ R_norms, + value_idx n_rows, + value_idx n_cols, + value_idx n) +{ + std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; + + if (i >= n_rows || j >= n_cols) return; + + value_t dot = C[(size_t)i * n_cols + j]; + value_t Q_l1 = Q_norms[i]; + value_t R_l1 = R_norms[j]; + + value_t Q_l2 = Q_sq_norms[i]; + value_t R_l2 = R_sq_norms[j]; + + value_t numer = n * dot - (Q_l1 * R_l1); + value_t Q_denom = n * Q_l2 - (Q_l1 * Q_l1); + value_t R_denom = n * R_l2 - (R_l1 * R_l1); + + value_t val = 1 - (numer / raft::sqrt(Q_denom * R_denom)); + + // correct for small instabilities + C[(size_t)i * n_cols + j] = val * (fabs(val) >= 0.0001); +} + +template +void compute_euclidean(value_t* C, + const value_t* Q_sq_norms, + const value_t* R_sq_norms, + value_idx n_rows, + value_idx n_cols, + cudaStream_t stream, + expansion_f expansion_func) +{ + int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); + compute_euclidean_warp_kernel<<>>( + C, Q_sq_norms, R_sq_norms, n_rows, n_cols, expansion_func); +} + +template +void compute_l2(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, + cudaStream_t stream, + expansion_f expansion_func) +{ + rmm::device_uvector Q_sq_norms(m, stream); + rmm::device_uvector R_sq_norms(n, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); + RAFT_CUDA_TRY(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); + + compute_row_norm_kernel<<>>( + Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz); + compute_row_norm_kernel<<>>( 
+ R_sq_norms.data(), R_coo_rows, R_data, R_nnz); + + compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream, expansion_func); +} + +template +void compute_correlation(value_t* C, + const value_t* Q_sq_norms, + const value_t* R_sq_norms, + const value_t* Q_norms, + const value_t* R_norms, + value_idx n_rows, + value_idx n_cols, + value_idx n, + cudaStream_t stream) +{ + int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); + compute_correlation_warp_kernel<<>>( + C, Q_sq_norms, R_sq_norms, Q_norms, R_norms, n_rows, n_cols, n); +} + +template +void compute_corr(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, + value_idx n_cols, + cudaStream_t stream) +{ + // sum_sq for std dev + rmm::device_uvector Q_sq_norms(m, stream); + rmm::device_uvector R_sq_norms(n, stream); + + // sum for mean + rmm::device_uvector Q_norms(m, stream); + rmm::device_uvector R_norms(n, stream); + + RAFT_CUDA_TRY(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); + RAFT_CUDA_TRY(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); + + RAFT_CUDA_TRY(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); + RAFT_CUDA_TRY(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); + + compute_row_norm_kernel<<>>( + Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz); + compute_row_norm_kernel<<>>( + R_sq_norms.data(), R_coo_rows, R_data, R_nnz); + + compute_row_sum_kernel<<>>( + Q_norms.data(), Q_coo_rows, Q_data, Q_nnz); + compute_row_sum_kernel<<>>( + R_norms.data(), R_coo_rows, R_data, R_nnz); + + compute_correlation(out, + Q_sq_norms.data(), + R_sq_norms.data(), + Q_norms.data(), + R_norms.data(), + m, + n, + n_cols, + stream); +} + +/** + * L2 distance using the expanded form: sum(x_k^2) + sum(y_k^2) - 2 * sum(x_k * y_k) + * The expanded form is more efficient for sparse data. + */ +template +class l2_expanded_distances_t : public distances_t { + public: + explicit l2_expanded_distances_t(const distances_config_t& config) + : config_(&config), ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); + + rmm::device_uvector search_coo_rows( + config_->a_nnz, raft::resource::get_cuda_stream(config_->handle)); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + compute_l2(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + raft::resource::get_cuda_stream(config_->handle), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + return -2 * dot + q_norm + r_norm; + }); + } + + ~l2_expanded_distances_t() = default; + + protected: + const distances_config_t* config_; + ip_distances_t ip_dists; +}; + +/** + * L2 sqrt distance performing the sqrt operation after the distance computation. + * The expanded form is more efficient for sparse data.
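For reference, the standard identity behind this expanded form, which is why the expansion_func passed to compute_l2 above reduces to -2 * dot + q_norm + r_norm:

  sum((x_k - y_k)^2) = sum(x_k^2) - 2 * sum(x_k * y_k) + sum(y_k^2)
                     = q_norm - 2 * dot + r_norm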
+ */ +template +class l2_sqrt_expanded_distances_t : public l2_expanded_distances_t { + public: + explicit l2_sqrt_expanded_distances_t(const distances_config_t& config) + : l2_expanded_distances_t(config) + { + } + + void compute(value_t* out_dists) override + { + l2_expanded_distances_t::compute(out_dists); + // Sqrt Post-processing + raft::linalg::unaryOp( + out_dists, + out_dists, + this->config_->a_nrows * this->config_->b_nrows, + [] __device__(value_t input) { + int neg = input < 0 ? -1 : 1; + return raft::sqrt(abs(input) * neg); + }, + raft::resource::get_cuda_stream(this->config_->handle)); + } + + ~l2_sqrt_expanded_distances_t() = default; +}; + +template +class correlation_expanded_distances_t : public distances_t { + public: + explicit correlation_expanded_distances_t(const distances_config_t& config) + : config_(&config), ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); + + rmm::device_uvector search_coo_rows( + config_->a_nnz, raft::resource::get_cuda_stream(config_->handle)); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + compute_corr(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->b_ncols, + raft::resource::get_cuda_stream(config_->handle)); + } + + ~correlation_expanded_distances_t() = default; + + protected: + const distances_config_t* config_; + ip_distances_t ip_dists; +}; + +/** + * Cosine distance using the expanded form: 1 - ( sum(x_k * y_k) / (sqrt(sum(x_k^2)) * + * sqrt(sum(y_k^2)))). The expanded form is more efficient for sparse data. + */ +template +class cosine_expanded_distances_t : public distances_t { + public: + explicit cosine_expanded_distances_t(const distances_config_t& config) + : config_(&config), + workspace(0, raft::resource::get_cuda_stream(config.handle)), + ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); + + rmm::device_uvector search_coo_rows( + config_->a_nnz, raft::resource::get_cuda_stream(config_->handle)); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + compute_l2(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + raft::resource::get_cuda_stream(config_->handle), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t norms = raft::sqrt(q_norm) * raft::sqrt(r_norm); + // deal with potential for 0 in denominator by forcing 0/1 instead + value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms); + + // flip the similarity when both rows are 0 + bool both_empty = (q_norm == 0) && (r_norm == 0); + return 1 - ((!both_empty * cos) + both_empty); + }); + } + + ~cosine_expanded_distances_t() = default; + + private: + const distances_config_t* config_; + rmm::device_uvector workspace; + ip_distances_t ip_dists; +}; + +/** + * Hellinger distance using the expanded form: sqrt(1 - sum(sqrt(x_k) * sqrt(y_k))) + * The expanded form is more efficient for sparse data.
+ * + * This distance computation modifies A and B by computing a sqrt + * and then performing a `pow(x, 2)` to convert it back. Because of this, + * it is possible that the values in A and B might differ slightly + * after this is invoked. + */ +template +class hellinger_expanded_distances_t : public distances_t { + public: + explicit hellinger_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, raft::resource::get_cuda_stream(config.handle)) + { + } + + void compute(value_t* out_dists) + { + rmm::device_uvector coo_rows(std::max(config_->b_nnz, config_->a_nnz), + raft::resource::get_cuda_stream(config_->handle)); + + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + balanced_coo_pairwise_generalized_spmv( + out_dists, + *config_, + coo_rows.data(), + [] __device__(value_t a, value_t b) { return raft::sqrt(a) * raft::sqrt(b); }, + raft::add_op(), + raft::atomic_add_op()); + + raft::linalg::unaryOp( + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, + [=] __device__(value_t input) { + // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative + bool rectifier = (1 - input) > 0; + return raft::sqrt(rectifier * (1 - input)); + }, + raft::resource::get_cuda_stream(config_->handle)); + } + + ~hellinger_expanded_distances_t() = default; + + private: + const distances_config_t* config_; + rmm::device_uvector workspace; +}; + +template +class russelrao_expanded_distances_t : public distances_t { + public: + explicit russelrao_expanded_distances_t(const distances_config_t& config) + : config_(&config), + workspace(0, raft::resource::get_cuda_stream(config.handle)), + ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_t n_cols = config_->a_ncols; + value_t n_cols_inv = 1.0 / n_cols; + raft::linalg::unaryOp( + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, + [=] __device__(value_t input) { return (n_cols - input) * n_cols_inv; }, + raft::resource::get_cuda_stream(config_->handle)); + + auto exec_policy = rmm::exec_policy(raft::resource::get_cuda_stream(config_->handle)); + auto diags = thrust::counting_iterator(0); + value_idx b_nrows = config_->b_nrows; + thrust::for_each(exec_policy, diags, diags + config_->a_nrows, [=] __device__(value_idx input) { + out_dists[input * b_nrows + input] = 0.0; + }); + } + + ~russelrao_expanded_distances_t() = default; + + private: + const distances_config_t* config_; + rmm::device_uvector workspace; + ip_distances_t ip_dists; +}; + +} // END namespace sparse +} // END namespace detail +} // END namespace distance +} // END namespace cuvs diff --git a/cpp/src/distance/detail/sparse/lp_distance.cuh b/cpp/src/distance/detail/sparse/lp_distance.cuh new file mode 100644 index 000000000..18e7b04e4 --- /dev/null +++ b/cpp/src/distance/detail/sparse/lp_distance.cuh @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "common.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +void unexpanded_lp_distances(value_t* out_dists, + const distances_config_t* config_, + product_f product_func, + accum_f accum_func, + write_f write_func) +{ + rmm::device_uvector coo_rows(std::max(config_->b_nnz, config_->a_nnz), + raft::resource::get_cuda_stream(config_->handle)); + + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + balanced_coo_pairwise_generalized_spmv( + out_dists, *config_, coo_rows.data(), product_func, accum_func, write_func); + + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + balanced_coo_pairwise_generalized_spmv_rev( + out_dists, *config_, coo_rows.data(), product_func, accum_func, write_func); +} + +/** + * Computes L1 distances for sparse input. This does not have + * an equivalent expanded form, so it is only executed in + * an unexpanded form. + * @tparam value_idx + * @tparam value_t + */ +template +class l1_unexpanded_distances_t : public distances_t { + public: + l1_unexpanded_distances_t(const distances_config_t& config) : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, config_, raft::absdiff_op(), raft::add_op(), raft::atomic_add_op()); + } + + private: + const distances_config_t* config_; +}; + +template +class l2_unexpanded_distances_t : public distances_t { + public: + l2_unexpanded_distances_t(const distances_config_t& config) : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, config_, raft::sqdiff_op(), raft::add_op(), raft::atomic_add_op()); + } + + protected: + const distances_config_t* config_; +}; + +template +class l2_sqrt_unexpanded_distances_t : public l2_unexpanded_distances_t { + public: + l2_sqrt_unexpanded_distances_t(const distances_config_t& config) + : l2_unexpanded_distances_t(config) + { + } + + void compute(value_t* out_dists) + { + l2_unexpanded_distances_t::compute(out_dists); + + uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows; + // Sqrt Post-processing + raft::linalg::unaryOp( + out_dists, + out_dists, + n, + [] __device__(value_t input) { + int neg = input < 0 ? 
-1 : 1; + return raft::sqrt(abs(input) * neg); + }, + raft::resource::get_cuda_stream(this->config_->handle)); + } +}; + +template +class linf_unexpanded_distances_t : public distances_t { + public: + explicit linf_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, config_, raft::absdiff_op(), raft::max_op(), raft::atomic_max_op()); + } + + private: + const distances_config_t* config_; +}; + +template +class canberra_unexpanded_distances_t : public distances_t { + public: + explicit canberra_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, + config_, + [] __device__(value_t a, value_t b) { + value_t d = fabs(a) + fabs(b); + + // deal with potential for 0 in denominator by + // forcing 1/0 instead + return ((d != 0) * fabs(a - b)) / (d + (d == 0)); + }, + raft::add_op(), + raft::atomic_add_op()); + } + + private: + const distances_config_t* config_; +}; + +template +class lp_unexpanded_distances_t : public distances_t { + public: + explicit lp_unexpanded_distances_t(const distances_config_t& config, + value_t p_) + : config_(&config), p(p_) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, + config_, + raft::compose_op(raft::pow_const_op(p), raft::sub_op()), + raft::add_op(), + raft::atomic_add_op()); + + uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows; + value_t one_over_p = value_t{1} / p; + raft::linalg::unaryOp(out_dists, + out_dists, + n, + raft::pow_const_op(one_over_p), + raft::resource::get_cuda_stream(config_->handle)); + } + + private: + const distances_config_t* config_; + value_t p; +}; + +template +class hamming_unexpanded_distances_t : public distances_t { + public: + explicit hamming_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, config_, raft::notequal_op(), raft::add_op(), raft::atomic_add_op()); + + uint64_t n = (uint64_t)config_->a_nrows * (uint64_t)config_->b_nrows; + value_t n_cols = 1.0 / config_->a_ncols; + raft::linalg::unaryOp(out_dists, + out_dists, + n, + raft::mul_const_op(n_cols), + raft::resource::get_cuda_stream(config_->handle)); + } + + private: + const distances_config_t* config_; +}; + +template +class jensen_shannon_unexpanded_distances_t : public distances_t { + public: + explicit jensen_shannon_unexpanded_distances_t( + const distances_config_t& config) + : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, + config_, + [] __device__(value_t a, value_t b) { + value_t m = 0.5f * (a + b); + bool a_zero = a == 0; + bool b_zero = b == 0; + + value_t x = (!a_zero * m) / (a_zero + a); + value_t y = (!b_zero * m) / (b_zero + b); + + bool x_zero = x == 0; + bool y_zero = y == 0; + + return (-a * (!x_zero * log(x + x_zero))) + (-b * (!y_zero * log(y + y_zero))); + }, + raft::add_op(), + raft::atomic_add_op()); + + uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows; + raft::linalg::unaryOp( + out_dists, + out_dists, + n, + [=] __device__(value_t input) { return raft::sqrt(0.5 * input); }, + raft::resource::get_cuda_stream(config_->handle)); + } + + private: + const distances_config_t* config_; +}; + +template +class kl_divergence_unexpanded_distances_t : public 
distances_t { + public: + explicit kl_divergence_unexpanded_distances_t( + const distances_config_t& config) + : config_(&config) + { + } + + void compute(value_t* out_dists) + { + rmm::device_uvector coo_rows(std::max(config_->b_nnz, config_->a_nnz), + raft::resource::get_cuda_stream(config_->handle)); + + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + balanced_coo_pairwise_generalized_spmv( + out_dists, + *config_, + coo_rows.data(), + [] __device__(value_t a, value_t b) { return a * log(a / b); }, + raft::add_op(), + raft::atomic_add_op()); + + uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows; + raft::linalg::unaryOp(out_dists, + out_dists, + n, + raft::mul_const_op(0.5), + raft::resource::get_cuda_stream(config_->handle)); + } + + private: + const distances_config_t* config_; +}; + +} // END namespace sparse +} // END namespace detail +} // END namespace distance +} // END namespace cuvs diff --git a/cpp/src/distance/detail/sparse/utils.cuh b/cpp/src/distance/detail/sparse/utils.cuh new file mode 100644 index 000000000..dc7ae6df6 --- /dev/null +++ b/cpp/src/distance/detail/sparse/utils.cuh @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +/** + * Computes the maximum number of columns that can be stored + * in shared memory in dense form with the given block size + * and precision. 
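As an illustrative calculation (assumed numbers; the actual budget is queried at runtime via raft::getSharedMemPerBlock()): with 48 KiB of shared memory per block, tpb = 1024, a warp size of 32 and value_t = float, the cub reduction scratch occupies (1024 / 32) * 4 = 128 bytes, leaving (49152 - 128) / 4 = 12256 columns that can be cached densely.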
+ * @return the maximum number of columns that can be stored in smem + */ +template +inline int max_cols_per_block() +{ + // max cols = (total smem available - cub reduction smem) + return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) / + sizeof(value_t); +} + +template +RAFT_KERNEL faster_dot_on_csr_kernel(dot_t* __restrict__ dot, + const value_idx* __restrict__ indptr, + const value_idx* __restrict__ cols, + const value_t* __restrict__ A, + const value_t* __restrict__ B, + const value_idx nnz, + const value_idx n_rows, + const value_idx dim) +{ + auto vec_id = threadIdx.x; + auto lane_id = threadIdx.x & 0x1f; + + extern __shared__ char smem[]; + value_t* s_A = (value_t*)smem; + value_idx cur_row = -1; + + for (int row = blockIdx.x; row < n_rows; row += gridDim.x) { + for (int dot_id = blockIdx.y + indptr[row]; dot_id < indptr[row + 1]; dot_id += gridDim.y) { + if (dot_id >= nnz) { return; } + const value_idx col = cols[dot_id] * dim; + const value_t* __restrict__ B_col = B + col; + + if (threadIdx.x == 0) { dot[dot_id] = 0.0; } + __syncthreads(); + + if (cur_row != row) { + for (value_idx k = vec_id; k < dim; k += blockDim.x) { + s_A[k] = A[row * dim + k]; + } + cur_row = row; + } + + dot_t l_dot_ = 0.0; + for (value_idx k = vec_id; k < dim; k += blockDim.x) { + asm("prefetch.global.L2 [%0];" ::"l"(B_col + k + blockDim.x)); + if constexpr ((std::is_same_v && std::is_same_v)) { + l_dot_ += __half2float(s_A[k]) * __half2float(__ldcg(B_col + k)); + } else { + l_dot_ += s_A[k] * __ldcg(B_col + k); + } + } + + typedef cub::WarpReduce WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage; + dot_t warp_sum = WarpReduce(temp_storage).Sum(l_dot_); + + if (lane_id == 0) { atomicAdd_block(dot + dot_id, warp_sum); } + } + } +} + +template +void faster_dot_on_csr(raft::resources const& handle, + dot_t* dot, + const value_idx nnz, + const value_idx* indptr, + const value_idx* cols, + const value_t* A, + const value_t* B, + const value_idx n_rows, + const value_idx dim) +{ + if (nnz == 0 || n_rows == 0) return; + + auto stream = raft::resource::get_cuda_stream(handle); + + constexpr value_idx MAX_ROW_PER_ITER = 500; + int dev_id, sm_count, blocks_per_sm; + + const int smem_size = dim * sizeof(value_t); + cudaGetDevice(&dev_id); + cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id); + + if (dim < 128) { + constexpr int tpb = 64; + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); + auto block_x = std::min(n_rows, MAX_ROW_PER_ITER); + auto block_y = + (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x; + dim3 blocks(block_x, block_y, 1); + + faster_dot_on_csr_kernel + <<>>(dot, indptr, cols, A, B, nnz, n_rows, dim); + + } else if (dim < 256) { + constexpr int tpb = 128; + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); + auto block_x = std::min(n_rows, MAX_ROW_PER_ITER); + auto block_y = + (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x; + dim3 blocks(block_x, block_y, 1); + + faster_dot_on_csr_kernel + <<>>(dot, indptr, cols, A, B, nnz, n_rows, dim); + } else if (dim < 512) { + constexpr int tpb = 256; + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); + auto block_x = std::min(n_rows, MAX_ROW_PER_ITER); + auto block_y = + (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / 
block_x; + dim3 blocks(block_x, block_y, 1); + + faster_dot_on_csr_kernel + <<>>(dot, indptr, cols, A, B, nnz, n_rows, dim); + } else { + constexpr int tpb = 512; + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); + auto block_x = std::min(n_rows, MAX_ROW_PER_ITER); + auto block_y = + (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x; + dim3 blocks(block_x, block_y, 1); + + faster_dot_on_csr_kernel + <<>>(dot, indptr, cols, A, B, nnz, n_rows, dim); + } + + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/sparse_distance.cu b/cpp/src/distance/sparse_distance.cu new file mode 100644 index 000000000..338c4e908 --- /dev/null +++ b/cpp/src/distance/sparse_distance.cu @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "sparse_distance.cuh" + +namespace cuvs { +namespace distance { + +template +void pairwise_distance( + raft::resources const& handle, + raft::device_csr_matrix_view x, + raft::device_csr_matrix_view y, + raft::device_matrix_view dist, + cuvs::distance::DistanceType metric, + float metric_arg = 2.0f) +{ + auto x_structure = x.structure_view(); + auto y_structure = y.structure_view(); + + RAFT_EXPECTS(x_structure.get_n_cols() == y_structure.get_n_cols(), + "Number of columns must be equal"); + + RAFT_EXPECTS(dist.extent(0) == x_structure.get_n_rows(), + "Number of rows in output must be equal to " + "number of rows in X"); + RAFT_EXPECTS(dist.extent(1) == y_structure.get_n_rows(), + "Number of columns in output must be equal to " + "number of rows in Y"); + + detail::sparse::distances_config_t input_config(handle); + input_config.a_nrows = x_structure.get_n_rows(); + input_config.a_ncols = x_structure.get_n_cols(); + input_config.a_nnz = x_structure.get_nnz(); + input_config.a_indptr = const_cast(x_structure.get_indptr().data()); + input_config.a_indices = const_cast(x_structure.get_indices().data()); + input_config.a_data = const_cast(x.get_elements().data()); + + input_config.b_nrows = y_structure.get_n_rows(); + input_config.b_ncols = y_structure.get_n_cols(); + input_config.b_nnz = y_structure.get_nnz(); + input_config.b_indptr = const_cast(y_structure.get_indptr().data()); + input_config.b_indices = const_cast(y_structure.get_indices().data()); + input_config.b_data = const_cast(y.get_elements().data()); + + pairwiseDistance(dist.data_handle(), input_config, metric, metric_arg); +} + +void pairwise_distance(raft::resources const& handle, + raft::device_csr_matrix_view x, + raft::device_csr_matrix_view y, + raft::device_matrix_view dist, + cuvs::distance::DistanceType metric, + float metric_arg) +{ + pairwise_distance(handle, x, y, dist, metric, metric_arg); +} + +void pairwise_distance(raft::resources const& handle, + raft::device_csr_matrix_view x, + 
raft::device_csr_matrix_view y, + raft::device_matrix_view dist, + cuvs::distance::DistanceType metric, + float metric_arg) +{ + pairwise_distance(handle, x, y, dist, metric, metric_arg); +} +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/sparse_distance.cuh b/cpp/src/distance/sparse_distance.cuh new file mode 100644 index 000000000..0d6dc0e6f --- /dev/null +++ b/cpp/src/distance/sparse_distance.cuh @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/sparse/bin_distance.cuh" +#include "detail/sparse/common.hpp" +#include "detail/sparse/ip_distance.cuh" +#include "detail/sparse/l2_distance.cuh" +#include "detail/sparse/lp_distance.cuh" + +#include + +#include + +#include + +namespace cuvs { +namespace distance { +/** + * Compute pairwise distances between A and B, using the provided + * input configuration and distance function. + * + * @tparam value_idx index type + * @tparam value_t value type + * @param[out] out dense output array (size A.nrows * B.nrows) + * @param[in] input_config input argument configuration + * @param[in] metric distance metric to use + * @param[in] metric_arg metric argument (used for Minkowski distance) + */ +template +void pairwiseDistance(value_t* out, + detail::sparse::distances_config_t input_config, + cuvs::distance::DistanceType metric, + float metric_arg) +{ + switch (metric) { + case cuvs::distance::DistanceType::L2Expanded: + detail::sparse::l2_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::L2SqrtExpanded: + detail::sparse::l2_sqrt_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::InnerProduct: + detail::sparse::ip_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::L2Unexpanded: + detail::sparse::l2_unexpanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::L2SqrtUnexpanded: + detail::sparse::l2_sqrt_unexpanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::L1: + detail::sparse::l1_unexpanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::LpUnexpanded: + detail::sparse::lp_unexpanded_distances_t(input_config, metric_arg) + .compute(out); + break; + case cuvs::distance::DistanceType::Linf: + detail::sparse::linf_unexpanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::Canberra: + detail::sparse::canberra_unexpanded_distances_t(input_config) + .compute(out); + break; + case cuvs::distance::DistanceType::JaccardExpanded: + detail::sparse::jaccard_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::CosineExpanded: + detail::sparse::cosine_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::HellingerExpanded: + 
detail::sparse::hellinger_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::DiceExpanded: + detail::sparse::dice_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::CorrelationExpanded: + detail::sparse::correlation_expanded_distances_t(input_config) + .compute(out); + break; + case cuvs::distance::DistanceType::RusselRaoExpanded: + detail::sparse::russelrao_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::HammingUnexpanded: + detail::sparse::hamming_unexpanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::JensenShannon: + detail::sparse::jensen_shannon_unexpanded_distances_t(input_config) + .compute(out); + break; + case cuvs::distance::DistanceType::KLDivergence: + detail::sparse::kl_divergence_unexpanded_distances_t(input_config) + .compute(out); + break; + + default: THROW("Unsupported distance: %d", metric); + } +} +}; // namespace distance +}; // namespace cuvs diff --git a/cpp/src/neighbors/brute_force.cu b/cpp/src/neighbors/brute_force.cu index b0f87e9ac..d534676e3 100644 --- a/cpp/src/neighbors/brute_force.cu +++ b/cpp/src/neighbors/brute_force.cu @@ -21,6 +21,21 @@ #include namespace cuvs::neighbors::brute_force { + +template +index::index(raft::resources const& res) + // this constructor is just for a temporary index, for use in the deserialization + // api. all the parameters here will get replaced with loaded values - that aren't + // necessarily known ahead of time before deserialization. + // TODO: do we even need a handle here - could just construct one? + : cuvs::neighbors::index(), + metric_(cuvs::distance::DistanceType::L2Expanded), + dataset_(raft::make_device_matrix(res, 0, 0)), + norms_(std::nullopt), + metric_arg_(0) +{ +} + template index::index(raft::resources const& res, raft::host_matrix_view dataset, diff --git a/cpp/src/neighbors/brute_force_c.cpp b/cpp/src/neighbors/brute_force_c.cpp index eda79aa31..f1a8c995d 100644 --- a/cpp/src/neighbors/brute_force_c.cpp +++ b/cpp/src/neighbors/brute_force_c.cpp @@ -17,10 +17,12 @@ #include #include +#include #include #include #include +#include #include #include @@ -91,6 +93,22 @@ void _search(cuvsResources_t res, } } +template +void _serialize(cuvsResources_t res, const char* filename, cuvsBruteForceIndex index) +{ + auto res_ptr = reinterpret_cast(res); + auto index_ptr = reinterpret_cast*>(index.addr); + cuvs::neighbors::brute_force::serialize(*res_ptr, std::string(filename), *index_ptr); +} + +template +void* _deserialize(cuvsResources_t res, const char* filename) +{ + auto res_ptr = reinterpret_cast(res); + auto index = new cuvs::neighbors::brute_force::index(*res_ptr); + cuvs::neighbors::brute_force::deserialize(*res_ptr, std::string(filename), index); + return index; +} } // namespace extern "C" cuvsError_t cuvsBruteForceIndexCreate(cuvsBruteForceIndex_t* index) @@ -129,7 +147,7 @@ extern "C" cuvsError_t cuvsBruteForceBuild(cuvsResources_t res, if (dataset.dtype.code == kDLFloat && dataset.dtype.bits == 32) { index->addr = reinterpret_cast(_build(res, dataset_tensor, metric, metric_arg)); - index->dtype.code = kDLFloat; + index->dtype = dataset.dtype; } else { RAFT_FAIL("Unsupported dataset DLtensor dtype: %d and bits: %d", dataset.dtype.code, @@ -174,3 +192,38 @@ extern "C" cuvsError_t cuvsBruteForceSearch(cuvsResources_t res, } }); } + +extern "C" cuvsError_t cuvsBruteForceDeserialize(cuvsResources_t res, + const char* filename, + 
cuvsBruteForceIndex_t index) +{ + return cuvs::core::translate_exceptions([=] { + // read the numpy dtype from the beginning of the file + std::ifstream is(filename, std::ios::in | std::ios::binary); + if (!is) { RAFT_FAIL("Cannot open file %s", filename); } + char dtype_string[4]; + is.read(dtype_string, 4); + auto dtype = raft::detail::numpy_serializer::parse_descr(std::string(dtype_string, 4)); + + index->dtype.bits = dtype.itemsize * 8; + if (dtype.kind == 'f' && dtype.itemsize == 4) { + index->dtype.code = kDLFloat; + index->addr = reinterpret_cast(_deserialize(res, filename)); + } else { + RAFT_FAIL("Unsupported index dtype: %d and bits: %d", index->dtype.code, index->dtype.bits); + } + }); +} + +extern "C" cuvsError_t cuvsBruteForceSerialize(cuvsResources_t res, + const char* filename, + cuvsBruteForceIndex_t index) +{ + return cuvs::core::translate_exceptions([=] { + if (index->dtype.code == kDLFloat && index->dtype.bits == 32) { + _serialize(res, filename, *index); + } else { + RAFT_FAIL("Unsupported index dtype: %d and bits: %d", index->dtype.code, index->dtype.bits); + } + }); +} \ No newline at end of file diff --git a/cpp/src/neighbors/brute_force_serialize.cu b/cpp/src/neighbors/brute_force_serialize.cu new file mode 100644 index 000000000..1b5b5111e --- /dev/null +++ b/cpp/src/neighbors/brute_force_serialize.cu @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include + +namespace cuvs::neighbors::brute_force { + +int constexpr serialization_version = 0; + +template +void serialize(raft::resources const& handle, + std::ostream& os, + const index& index, + bool include_dataset = true) +{ + RAFT_LOG_DEBUG( + "Saving brute force index, size %zu, dim %u", static_cast(index.size()), index.dim()); + + auto dtype_string = raft::detail::numpy_serializer::get_numpy_dtype().to_string(); + dtype_string.resize(4); + os << dtype_string; + + raft::serialize_scalar(handle, os, serialization_version); + raft::serialize_scalar(handle, os, index.size()); + raft::serialize_scalar(handle, os, index.dim()); + raft::serialize_scalar(handle, os, index.metric()); + raft::serialize_scalar(handle, os, index.metric_arg()); + raft::serialize_scalar(handle, os, include_dataset); + if (include_dataset) { raft::serialize_mdspan(handle, os, index.dataset()); } + auto has_norms = index.has_norms(); + raft::serialize_scalar(handle, os, has_norms); + if (has_norms) { raft::serialize_mdspan(handle, os, index.norms()); } + raft::resource::sync_stream(handle); +} + +void serialize(raft::resources const& handle, + const std::string& filename, + const index& index, + bool include_dataset) +{ + auto os = std::ofstream{filename, std::ios::out | std::ios::binary}; + RAFT_EXPECTS(os, "Cannot open file %s", filename.c_str()); + serialize(handle, os, index, include_dataset); +} + +void serialize(raft::resources const& handle, + const std::string& filename, + const index& index, + bool include_dataset) +{ + auto os = std::ofstream{filename, std::ios::out | std::ios::binary}; + RAFT_EXPECTS(os, "Cannot open file %s", filename.c_str()); + serialize(handle, os, index, include_dataset); +} + +void serialize(raft::resources const& handle, + std::ostream& os, + const index& index, + bool include_dataset) +{ + serialize(handle, os, index, include_dataset); +} + +void serialize(raft::resources const& handle, + std::ostream& os, + const index& index, + bool include_dataset) +{ + serialize(handle, os, index, include_dataset); +} + +template +auto deserialize(raft::resources const& handle, std::istream& is) +{ + auto dtype_string = std::array{}; + is.read(dtype_string.data(), 4); + + auto ver = raft::deserialize_scalar(handle, is); + if (ver != serialization_version) { + RAFT_FAIL("serialization version mismatch, expected %d, got %d ", serialization_version, ver); + } + std::int64_t rows = raft::deserialize_scalar(handle, is); + std::int64_t dim = raft::deserialize_scalar(handle, is); + auto metric = raft::deserialize_scalar(handle, is); + auto metric_arg = raft::deserialize_scalar(handle, is); + + auto dataset_storage = raft::make_host_matrix(std::int64_t{}, std::int64_t{}); + auto include_dataset = raft::deserialize_scalar(handle, is); + if (include_dataset) { + dataset_storage = raft::make_host_matrix(rows, dim); + raft::deserialize_mdspan(handle, is, dataset_storage.view()); + } + + auto has_norms = raft::deserialize_scalar(handle, is); + auto norms_storage = has_norms ? std::optional{raft::make_host_vector(rows)} + : std::optional>{}; + // TODO(wphicks): Use mdbuffer here when available + auto norms_storage_dev = + has_norms ? 
std::optional{raft::make_device_vector(handle, rows)} + : std::optional>{}; + if (has_norms) { + raft::deserialize_mdspan(handle, is, norms_storage->view()); + raft::copy(handle, norms_storage_dev->view(), norms_storage->view()); + } + + auto result = index(handle, + raft::make_const_mdspan(dataset_storage.view()), + std::move(norms_storage_dev), + metric, + metric_arg); + raft::resource::sync_stream(handle); + + return result; +} + +void deserialize(raft::resources const& handle, + const std::string& filename, + cuvs::neighbors::brute_force::index* index) +{ + auto is = std::ifstream{filename, std::ios::in | std::ios::binary}; + RAFT_EXPECTS(is, "Cannot open file %s", filename.c_str()); + + *index = deserialize(handle, is); +} + +void deserialize(raft::resources const& handle, + const std::string& filename, + cuvs::neighbors::brute_force::index* index) +{ + auto is = std::ifstream{filename, std::ios::in | std::ios::binary}; + RAFT_EXPECTS(is, "Cannot open file %s", filename.c_str()); + + *index = deserialize(handle, is); +} + +void deserialize(raft::resources const& handle, + std::istream& is, + cuvs::neighbors::brute_force::index* index) +{ + *index = deserialize(handle, is); +} + +void deserialize(raft::resources const& handle, + std::istream& is, + cuvs::neighbors::brute_force::index* index) +{ + *index = deserialize(handle, is); +} + +} // namespace cuvs::neighbors::brute_force diff --git a/cpp/src/neighbors/cagra_c.cpp b/cpp/src/neighbors/cagra_c.cpp index 6985ff094..326a89665 100644 --- a/cpp/src/neighbors/cagra_c.cpp +++ b/cpp/src/neighbors/cagra_c.cpp @@ -29,6 +29,8 @@ #include #include +#include + namespace { template diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 9e4d453e3..b7fec724b 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -33,8 +33,7 @@ #include #include -// TODO: Fixme- this needs to be migrated -#include "../../nn_descent.cuh" +#include // TODO: This shouldn't be calling spatial/knn APIs #include "../ann_utils.cuh" @@ -356,8 +355,8 @@ void build_knn_graph( raft::host_matrix_view knn_graph, cuvs::neighbors::nn_descent::index_params build_params) { - auto nn_descent_idx = cuvs::neighbors::nn_descent::index(res, knn_graph); - cuvs::neighbors::nn_descent::build(res, build_params, dataset, nn_descent_idx); + std::optional> graph_view = knn_graph; + auto nn_descent_idx = cuvs::neighbors::nn_descent::build(res, build_params, dataset, graph_view); using internal_IdxT = typename std::make_unsigned::type; using g_accessor = typename decltype(nn_descent_idx.graph())::accessor_type; @@ -437,11 +436,11 @@ index build( auto knn_build_params = params.graph_build_params; if (std::holds_alternative(params.graph_build_params)) { // Heuristic to decide default build algo and its params. 
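The hunk just below drops the L2Expanded-only restriction from this heuristic and forwards the chosen metric into the NN-descent parameters. As a point of reference, here is a minimal sketch (not part of the patch; parameter values are illustrative, and a raft::resources res plus a device dataset view are assumed to be in scope) of pinning the graph build algorithm explicitly instead of relying on the heuristic:

// Illustrative only: request a metric-aware NN-descent graph build up front
// rather than letting the heuristic choose between NN-descent and IVF-PQ.
cuvs::neighbors::cagra::index_params params;
params.metric                    = cuvs::distance::DistanceType::InnerProduct;
params.intermediate_graph_degree = 128;
params.graph_degree              = 64;
params.graph_build_params        = cuvs::neighbors::cagra::graph_build_params::nn_descent_params(
  params.intermediate_graph_degree, params.metric);
auto index = cuvs::neighbors::cagra::build(res, params, dataset);
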
- if (params.metric == cuvs::distance::DistanceType::L2Expanded && - cuvs::neighbors::nn_descent::has_enough_device_memory( + if (cuvs::neighbors::nn_descent::has_enough_device_memory( res, dataset.extents(), sizeof(IdxT))) { RAFT_LOG_DEBUG("NN descent solver"); - knn_build_params = cagra::graph_build_params::nn_descent_params(intermediate_degree); + knn_build_params = + cagra::graph_build_params::nn_descent_params(intermediate_degree, params.metric); } else { RAFT_LOG_DEBUG("Selecting IVF-PQ solver"); knn_build_params = cagra::graph_build_params::ivf_pq_params(dataset.extents(), params.metric); @@ -454,9 +453,6 @@ index build( std::get(knn_build_params); build_knn_graph(res, dataset, knn_graph->view(), ivf_pq_params); } else { - RAFT_EXPECTS( - params.metric == cuvs::distance::DistanceType::L2Expanded, - "L2Expanded is the only distance metrics supported for CAGRA build with nn_descent"); auto nn_descent_params = std::get(knn_build_params); @@ -467,10 +463,12 @@ index build( "nn-descent graph_degree.", nn_descent_params.graph_degree, intermediate_degree); - nn_descent_params = cagra::graph_build_params::nn_descent_params(intermediate_degree); + nn_descent_params = + cagra::graph_build_params::nn_descent_params(intermediate_degree, params.metric); } // Use nn-descent to build CAGRA knn graph + nn_descent_params.return_distances = false; build_knn_graph(res, dataset, knn_graph->view(), nn_descent_params); } diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index 95c158675..5778d85a6 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -151,7 +151,7 @@ void search_main(raft::resources const& res, if (auto* strided_dset = dynamic_cast*>(&index.data()); strided_dset != nullptr) { // Search using a plain (strided) row-major dataset - auto& desc = dataset_descriptor_init_with_cache( + auto desc = dataset_descriptor_init_with_cache( res, params, *strided_dset, index.metric()); search_main_core( res, params, desc, graph_internal, queries, neighbors, distances, sample_filter); @@ -161,7 +161,7 @@ void search_main(raft::resources const& res, RAFT_FAIL("FP32 VPQ dataset support is coming soon"); } else if (auto* vpq_dset = dynamic_cast*>(&index.data()); vpq_dset != nullptr) { - auto& desc = dataset_descriptor_init_with_cache( + auto desc = dataset_descriptor_init_with_cache( res, params, *vpq_dset, index.metric()); search_main_core( res, params, desc, graph_internal, queries, neighbors, distances, sample_filter); diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index 297eb1f55..7eb798459 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -31,8 +31,10 @@ #include #include +#include #include #include +#include #include #include @@ -232,52 +234,77 @@ struct alignas(device::LOAD_128BIT_T) dataset_descriptor_base_t { */ template struct dataset_descriptor_host { - using dev_descriptor_t = dataset_descriptor_base_t; - using dd_ptr_t = std::shared_ptr; - using init_f = - std::tuple, size_t>; + using dev_descriptor_t = dataset_descriptor_base_t; uint32_t smem_ws_size_in_bytes = 0; uint32_t team_size = 0; + struct state { + using ready_t = std::tuple; + using init_f = + std::tuple, size_t>; + + std::mutex mutex; + std::atomic ready; // Not sure if std::holds_alternative is thread-safe + std::variant value; + + template + state(InitF init, 
size_t size) : ready{false}, value{std::make_tuple(init, size)} + { + } + + ~state() noexcept + { + if (std::holds_alternative(value)) { + auto& [ptr, stream] = std::get(value); + RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(ptr, stream)); + } + } + + void eval(rmm::cuda_stream_view stream) + { + std::lock_guard lock(mutex); + if (std::holds_alternative(value)) { + auto& [fun, size] = std::get(value); + dev_descriptor_t* ptr = nullptr; + RAFT_CUDA_TRY(cudaMallocAsync(&ptr, size, stream)); + fun(ptr, stream); + value = std::make_tuple(ptr, stream); + ready.store(true, std::memory_order_release); + } + } + + auto get(rmm::cuda_stream_view stream) -> dev_descriptor_t* + { + if (!ready.load(std::memory_order_acquire)) { eval(stream); } + return std::get<0>(std::get(value)); + } + }; + template dataset_descriptor_host(const DescriptorImpl& dd_host, InitF init) - : value_{std::make_tuple(init, sizeof(DescriptorImpl))}, + : value_{std::make_shared(init, sizeof(DescriptorImpl))}, smem_ws_size_in_bytes{dd_host.smem_ws_size_in_bytes()}, team_size{dd_host.team_size()} { } + dataset_descriptor_host() = default; + /** * Return the device pointer, possibly evaluating it in the given thread. */ [[nodiscard]] auto dev_ptr(rmm::cuda_stream_view stream) const -> const dev_descriptor_t* { - if (std::holds_alternative(value_)) { value_ = eval(std::get(value_), stream); } - return std::get(value_).get(); + return value_->get(stream); } + [[nodiscard]] auto dev_ptr(rmm::cuda_stream_view stream) -> dev_descriptor_t* { - if (std::holds_alternative(value_)) { value_ = eval(std::get(value_), stream); } - return std::get(value_).get(); + return value_->get(stream); } private: - mutable std::variant value_; - - static auto eval(init_f init, rmm::cuda_stream_view stream) -> dd_ptr_t - { - using raft::RAFT_NAME; - auto& [fun, size] = init; - dd_ptr_t dev_ptr{ - [stream, s = size]() { - dev_descriptor_t* p; - RAFT_CUDA_TRY(cudaMallocAsync(&p, s, stream)); - return p; - }(), - [stream](dev_descriptor_t* p) { RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(p, stream)); }}; - fun(dev_ptr.get(), stream); - return dev_ptr; - } + mutable std::shared_ptr value_; }; /** diff --git a/cpp/src/neighbors/detail/cagra/factory.cuh b/cpp/src/neighbors/detail/cagra/factory.cuh index abc907da5..e6e7ff64f 100644 --- a/cpp/src/neighbors/detail/cagra/factory.cuh +++ b/cpp/src/neighbors/detail/cagra/factory.cuh @@ -135,11 +135,9 @@ template struct store { /** Number of descriptors to cache. 
*/ static constexpr size_t kDefaultSize = 100; - raft::cache::lru, - std::shared_ptr>> - value{kDefaultSize}; + raft::cache:: + lru, dataset_descriptor_host> + value{kDefaultSize}; }; } // namespace descriptor_cache @@ -159,20 +157,18 @@ auto dataset_descriptor_init_with_cache(const raft::resources& res, const cagra::search_params& params, const DatasetT& dataset, cuvs::distance::DistanceType metric) - -> const dataset_descriptor_host& + -> dataset_descriptor_host { - using desc_t = dataset_descriptor_host; - auto key = descriptor_cache::make_key(params, dataset, metric); + auto key = descriptor_cache::make_key(params, dataset, metric); auto& cache = raft::resource::get_custom_resource>(res) ->value; - std::shared_ptr desc{nullptr}; + dataset_descriptor_host desc; if (!cache.get(key, &desc)) { - desc = std::make_shared( - std::move(dataset_descriptor_init(params, dataset, metric))); + desc = dataset_descriptor_init(params, dataset, metric); cache.set(key, desc); } - return *desc; + return desc; } }; // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 4253cb781..daeac82b9 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -156,6 +156,7 @@ __global__ void kern_prune(const IdxT* const knn_graph, // [graph_chunk_size, g // count number of detours (A->D->B) for (uint32_t kAD = 0; kAD < graph_degree - 1; kAD++) { const uint64_t iD = knn_graph[kAD + (graph_degree * iA)]; + if (iD >= graph_size) { continue; } for (uint32_t kDB = threadIdx.x; kDB < graph_degree; kDB += blockDim.x) { const uint64_t iB_candidate = knn_graph[kDB + ((uint64_t)graph_degree * iD)]; for (uint32_t kAB = kAD + 1; kAB < graph_degree; kAB++) { diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh index 0003f2495..ecfd856f1 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh @@ -93,10 +93,10 @@ struct search : public search_plan_impl intermediate_indices; - rmm::device_uvector intermediate_distances; + lightweight_uvector intermediate_indices; + lightweight_uvector intermediate_distances; size_t topk_workspace_size; - rmm::device_uvector topk_workspace; + lightweight_uvector topk_workspace; search(raft::resources const& res, search_params params, @@ -105,9 +105,9 @@ struct search : public search_plan_impl<<<1, 1, 0, cuda_stream>>>(host_ptr, dev_ptr); } +template +auto get_value(const T* const dev_ptr, cudaStream_t stream) -> T +{ + T value; + RAFT_CUDA_TRY(cudaMemcpyAsync(&value, dev_ptr, sizeof(value), cudaMemcpyDefault, stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + return value; +} + // MAX_DATASET_DIM : must equal to or greater than dataset_dim template RAFT_KERNEL random_pickup_kernel( @@ -609,18 +618,18 @@ struct search : search_plan_impl { using base_type::num_seeds; size_t result_buffer_allocation_size; - rmm::device_uvector result_indices; // results_indices_buffer - rmm::device_uvector result_distances; // result_distances_buffer - rmm::device_uvector parent_node_list; - rmm::device_uvector topk_hint; - rmm::device_scalar terminate_flag; // dev_terminate_flag, host_terminate_flag.; - rmm::device_uvector topk_workspace; + lightweight_uvector result_indices; // results_indices_buffer + lightweight_uvector result_distances; // result_distances_buffer + lightweight_uvector parent_node_list; + 
lightweight_uvector topk_hint; + lightweight_uvector terminate_flag; // dev_terminate_flag, host_terminate_flag.; + lightweight_uvector topk_workspace; // temporary storage for _find_topk - rmm::device_uvector input_keys_storage; - rmm::device_uvector output_keys_storage; - rmm::device_uvector input_values_storage; - rmm::device_uvector output_values_storage; + lightweight_uvector input_keys_storage; + lightweight_uvector output_keys_storage; + lightweight_uvector input_values_storage; + lightweight_uvector output_values_storage; search(raft::resources const& res, search_params params, @@ -629,16 +638,16 @@ struct search : search_plan_impl { int64_t graph_degree, uint32_t topk) : base_type(res, params, dataset_desc, dim, graph_degree, topk), - result_indices(0, raft::resource::get_cuda_stream(res)), - result_distances(0, raft::resource::get_cuda_stream(res)), - parent_node_list(0, raft::resource::get_cuda_stream(res)), - topk_hint(0, raft::resource::get_cuda_stream(res)), - topk_workspace(0, raft::resource::get_cuda_stream(res)), - terminate_flag(raft::resource::get_cuda_stream(res)), - input_keys_storage(0, raft::resource::get_cuda_stream(res)), - output_keys_storage(0, raft::resource::get_cuda_stream(res)), - input_values_storage(0, raft::resource::get_cuda_stream(res)), - output_values_storage(0, raft::resource::get_cuda_stream(res)) + result_indices(res), + result_distances(res), + parent_node_list(res), + topk_hint(res), + topk_workspace(res), + terminate_flag(res), + input_keys_storage(res), + output_keys_storage(res), + input_values_storage(res), + output_values_storage(res) { set_params(res); } @@ -662,7 +671,7 @@ struct search : search_plan_impl { itopk_size, max_queries, result_buffer_size, utils::get_cuda_data_type()); RAFT_LOG_DEBUG("# topk_workspace_size: %lu", topk_workspace_size); topk_workspace.resize(topk_workspace_size, raft::resource::get_cuda_stream(res)); - + terminate_flag.resize(1, raft::resource::get_cuda_stream(res)); hashmap.resize(hashmap_size, raft::resource::get_cuda_stream(res)); } @@ -847,7 +856,7 @@ struct search : search_plan_impl { stream); // termination (2) - if (iter + 1 >= min_iterations && terminate_flag.value(stream)) { + if (iter + 1 >= min_iterations && get_value(terminate_flag.data(), stream)) { iter++; break; } diff --git a/cpp/src/neighbors/detail/cagra/search_plan.cuh b/cpp/src/neighbors/detail/cagra/search_plan.cuh index f23b96631..99254aa50 100644 --- a/cpp/src/neighbors/detail/cagra/search_plan.cuh +++ b/cpp/src/neighbors/detail/cagra/search_plan.cuh @@ -151,7 +151,7 @@ struct search_plan_impl : public search_plan_impl_base { lightweight_uvector hashmap; lightweight_uvector num_executed_iterations; // device or managed? 
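In the member change just below, dataset_desc goes from a const reference to a by-value member: after the compute_distance.hpp rework above, dataset_descriptor_host is a small copyable handle whose device-side descriptor sits behind a shared_ptr and is allocated lazily on first use, so each search plan can safely own its own copy. A simplified standalone sketch of that idiom (hypothetical names, not the real cuVS types):

#include <atomic>
#include <cstddef>
#include <memory>
#include <mutex>
#include <cuda_runtime.h>

// Copyable host handle over a lazily allocated device blob; all copies share
// the same device allocation, so holding the handle by value is cheap.
struct lazy_device_blob {
  struct state {
    std::mutex mutex;
    std::atomic<bool> ready{false};
    void* dev_ptr{nullptr};
    std::size_t size{0};
    explicit state(std::size_t s) : size(s) {}
    ~state() { if (dev_ptr != nullptr) { cudaFree(dev_ptr); } }
    void* get(cudaStream_t stream) {
      if (!ready.load(std::memory_order_acquire)) {
        std::lock_guard<std::mutex> lock(mutex);
        if (!ready.load(std::memory_order_relaxed)) {
          cudaMallocAsync(&dev_ptr, size, stream);  // first caller pays for the allocation
          ready.store(true, std::memory_order_release);
        }
      }
      return dev_ptr;
    }
  };
  std::shared_ptr<state> value;
  explicit lazy_device_blob(std::size_t size) : value{std::make_shared<state>(size)} {}
  void* dev_ptr(cudaStream_t stream) const { return value->get(stream); }
};
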
lightweight_uvector dev_seed; - const dataset_descriptor_host& dataset_desc; + dataset_descriptor_host dataset_desc; search_plan_impl(raft::resources const& res, search_params params, diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh index 8c5767c50..c62a52540 100644 --- a/cpp/src/neighbors/detail/nn_descent.cuh +++ b/cpp/src/neighbors/detail/nn_descent.cuh @@ -16,42 +16,42 @@ #pragma once -#include - #include "ann_utils.cuh" #include "cagra/device_common.hpp" + +#include +#include + #include +#include #include #include +#include +#include +#include #include #include - +#include +#include #include // raft::util::arch::SM_* #include #include #include #include -#include +#include + #include -#include -#include -#include -#include -#include #include #include #include +#include #include #include namespace cuvs::neighbors::nn_descent::detail { -static const std::string RAFT_NAME = "raft"; -using pinned_memory_resource = thrust::universal_host_pinned_memory_resource; -template -using pinned_memory_allocator = thrust::mr::stateless_resource_allocator; using DistData_t = float; constexpr int DEGREE_ON_DEVICE{32}; @@ -216,6 +216,8 @@ struct BuildConfig { // If internal_node_degree == 0, the value of node_degree will be assigned to it size_t max_iterations{50}; float termination_threshold{0.0001}; + size_t output_graph_degree{32}; + cuvs::distance::DistanceType metric{cuvs::distance::DistanceType::L2Expanded}; }; template @@ -300,6 +302,7 @@ class BloomFilter { template struct GnndGraph { + raft::resources const& res; static constexpr int segment_size = 32; InternalID_t* h_graph; @@ -310,16 +313,17 @@ struct GnndGraph { raft::host_matrix h_dists; - thrust::host_vector> h_graph_new; - thrust::host_vector> h_list_sizes_new; + raft::pinned_matrix h_graph_new; + raft::pinned_vector h_list_sizes_new; - thrust::host_vector> h_graph_old; - thrust::host_vector> h_list_sizes_old; + raft::pinned_matrix h_graph_old; + raft::pinned_vector h_list_sizes_old; BloomFilter bloom_filter; GnndGraph(const GnndGraph&) = delete; GnndGraph& operator=(const GnndGraph&) = delete; - GnndGraph(const size_t nrow, + GnndGraph(raft::resources const& res, + const size_t nrow, const size_t node_degree, const size_t internal_node_degree, const size_t num_samples); @@ -344,9 +348,14 @@ class GNND { GNND(const GNND&) = delete; GNND& operator=(const GNND&) = delete; - void build(Data_t* data, const Index_t nrow, Index_t* output_graph); + void build(Data_t* data, + const Index_t nrow, + Index_t* output_graph, + bool return_distances, + DistData_t* output_distances); ~GNND() = default; using ID_t = InternalID_t; + void reset(raft::resources const& res); private: void add_reverse_edges(Index_t* graph_ptr, @@ -371,15 +380,14 @@ class GNND { raft::device_matrix graph_buffer_; raft::device_matrix dists_buffer_; - // TODO: Investigate using RMM/RAFT types https://github.com/rapidsai/raft/issues/1827 - thrust::host_vector> graph_host_buffer_; - thrust::host_vector> dists_host_buffer_; + raft::pinned_matrix graph_host_buffer_; + raft::pinned_matrix dists_host_buffer_; raft::device_vector d_locks_; - thrust::host_vector> h_rev_graph_new_; - thrust::host_vector> h_graph_old_; - thrust::host_vector> h_rev_graph_old_; + raft::pinned_matrix h_rev_graph_new_; + raft::pinned_matrix h_graph_old_; + raft::pinned_matrix h_rev_graph_old_; // int2.x is the number of forward edges, int2.y is the number of reverse edges raft::device_vector d_list_sizes_new_; @@ -448,11 +456,13 @@ __device__ __forceinline__ void 
load_vec(Data_t* vec_buffer, // TODO: Replace with RAFT utilities https://github.com/rapidsai/raft/issues/1827 /** Calculate L2 norm, and cast data to __half */ template -RAFT_KERNEL preprocess_data_kernel(const Data_t* input_data, - __half* output_data, - int dim, - DistData_t* l2_norms, - size_t list_offset = 0) +RAFT_KERNEL preprocess_data_kernel( + const Data_t* input_data, + __half* output_data, + int dim, + DistData_t* l2_norms, + size_t list_offset = 0, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded) { extern __shared__ char buffer[]; __shared__ float l2_norm; @@ -462,26 +472,32 @@ RAFT_KERNEL preprocess_data_kernel(const Data_t* input_data, load_vec(s_vec, input_data + blockIdx.x * dim, dim, dim, threadIdx.x % raft::warp_size()); if (threadIdx.x == 0) { l2_norm = 0; } __syncthreads(); - int lane_id = threadIdx.x % raft::warp_size(); - for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) { - int idx = step * raft::warp_size() + lane_id; - float part_dist = 0; - if (idx < dim) { - part_dist = s_vec[idx]; - part_dist = part_dist * part_dist; - } - __syncwarp(); - for (int offset = raft::warp_size() >> 1; offset >= 1; offset >>= 1) { - part_dist += __shfl_down_sync(raft::warp_full_mask(), part_dist, offset); + + if (metric == cuvs::distance::DistanceType::L2Expanded || + metric == cuvs::distance::DistanceType::CosineExpanded) { + int lane_id = threadIdx.x % raft::warp_size(); + for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) { + int idx = step * raft::warp_size() + lane_id; + float part_dist = 0; + if (idx < dim) { + part_dist = s_vec[idx]; + part_dist = part_dist * part_dist; + } + __syncwarp(); + for (int offset = raft::warp_size() >> 1; offset >= 1; offset >>= 1) { + part_dist += __shfl_down_sync(raft::warp_full_mask(), part_dist, offset); + } + if (lane_id == 0) { l2_norm += part_dist; } + __syncwarp(); } - if (lane_id == 0) { l2_norm += part_dist; } - __syncwarp(); } for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) { int idx = step * raft::warp_size() + threadIdx.x; if (idx < dim) { - if (l2_norms == nullptr) { + if (metric == cuvs::distance::DistanceType::InnerProduct) { + output_data[list_id * dim + idx] = input_data[(size_t)blockIdx.x * dim + idx]; + } else if (metric == cuvs::distance::DistanceType::CosineExpanded) { output_data[list_id * dim + idx] = (float)input_data[(size_t)blockIdx.x * dim + idx] / sqrt(l2_norm); } else { @@ -709,7 +725,8 @@ __launch_bounds__(BLOCK_SIZE, 4) DistData_t* dists, int graph_width, int* locks, - DistData_t* l2_norms) + DistData_t* l2_norms, + cuvs::distance::DistanceType metric) { #if (__CUDA_ARCH__ >= 700) using namespace nvcuda; @@ -821,8 +838,10 @@ __launch_bounds__(BLOCK_SIZE, 4) for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) { if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_new_size && i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) { - if (l2_norms == nullptr) { + if (metric == cuvs::distance::DistanceType::InnerProduct) { s_distances[i] = -s_distances[i]; + } else if (metric == cuvs::distance::DistanceType::CosineExpanded) { + s_distances[i] = 1.0 - s_distances[i]; } else { s_distances[i] = l2_norms[new_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] + l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] - @@ -900,8 +919,10 @@ __launch_bounds__(BLOCK_SIZE, 4) for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) { if (i % SKEWED_MAX_NUM_BI_SAMPLES < 
list_old_size && i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) { - if (l2_norms == nullptr) { + if (metric == cuvs::distance::DistanceType::InnerProduct) { s_distances[i] = -s_distances[i]; + } else if (metric == cuvs::distance::DistanceType::CosineExpanded) { + s_distances[i] = 1.0 - s_distances[i]; } else { s_distances[i] = l2_norms[old_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] + l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] - @@ -971,19 +992,21 @@ int insert_to_ordered_list(InternalID_t* list, } // namespace template -GnndGraph::GnndGraph(const size_t nrow, +GnndGraph::GnndGraph(raft::resources const& res, + const size_t nrow, const size_t node_degree, const size_t internal_node_degree, const size_t num_samples) - : nrow(nrow), + : res(res), + nrow(nrow), node_degree(node_degree), num_samples(num_samples), bloom_filter(nrow, internal_node_degree / segment_size, 3), h_dists{raft::make_host_matrix(nrow, node_degree)}, - h_graph_new(nrow * num_samples), - h_list_sizes_new(nrow), - h_graph_old(nrow * num_samples), - h_list_sizes_old{nrow} + h_graph_new{raft::make_pinned_matrix(res, nrow, num_samples)}, + h_list_sizes_new{raft::make_pinned_vector(res, nrow)}, + h_graph_old{raft::make_pinned_matrix(res, nrow, num_samples)}, + h_list_sizes_old{raft::make_pinned_vector(res, nrow)} { // node_degree must be a multiple of segment_size; assert(node_degree % segment_size == 0); @@ -1001,9 +1024,9 @@ void GnndGraph::sample_graph_new(InternalID_t* new_neighbors, { #pragma omp parallel for for (size_t i = 0; i < nrow; i++) { - auto list_new = h_graph_new.data() + i * num_samples; - h_list_sizes_new[i].x = 0; - h_list_sizes_new[i].y = 0; + auto list_new = h_graph_new.data_handle() + i * num_samples; + h_list_sizes_new.data_handle()[i].x = 0; + h_list_sizes_new.data_handle()[i].y = 0; for (size_t j = 0; j < width; j++) { auto new_neighb_id = new_neighbors[i * width + j].id(); @@ -1011,8 +1034,8 @@ void GnndGraph::sample_graph_new(InternalID_t* new_neighbors, if (bloom_filter.check(i, new_neighb_id)) { continue; } bloom_filter.add(i, new_neighb_id); new_neighbors[i * width + j].mark_old(); - list_new[h_list_sizes_new[i].x++] = new_neighb_id; - if (h_list_sizes_new[i].x == num_samples) break; + list_new[h_list_sizes_new.data_handle()[i].x++] = new_neighb_id; + if (h_list_sizes_new.data_handle()[i].x == num_samples) break; } } } @@ -1051,31 +1074,37 @@ void GnndGraph::sample_graph(bool sample_new) { #pragma omp parallel for for (size_t i = 0; i < nrow; i++) { - h_list_sizes_old[i].x = 0; - h_list_sizes_old[i].y = 0; - h_list_sizes_new[i].x = 0; - h_list_sizes_new[i].y = 0; + h_list_sizes_old.data_handle()[i].x = 0; + h_list_sizes_old.data_handle()[i].y = 0; + h_list_sizes_new.data_handle()[i].x = 0; + h_list_sizes_new.data_handle()[i].y = 0; auto list = h_graph + i * node_degree; - auto list_old = h_graph_old.data() + i * num_samples; - auto list_new = h_graph_new.data() + i * num_samples; + auto list_old = h_graph_old.data_handle() + i * num_samples; + auto list_new = h_graph_new.data_handle() + i * num_samples; for (int j = 0; j < segment_size; j++) { for (int k = 0; k < num_segments; k++) { auto neighbor = list[k * segment_size + j]; if ((size_t)neighbor.id() >= nrow) continue; if (!neighbor.is_new()) { - if (h_list_sizes_old[i].x < num_samples) { - list_old[h_list_sizes_old[i].x++] = neighbor.id(); + if (h_list_sizes_old.data_handle()[i].x < num_samples) { + list_old[h_list_sizes_old.data_handle()[i].x++] = neighbor.id(); } } else if (sample_new) { - if (h_list_sizes_new[i].x < num_samples) { 
+ if (h_list_sizes_new.data_handle()[i].x < num_samples) { list[k * segment_size + j].mark_old(); - list_new[h_list_sizes_new[i].x++] = neighbor.id(); + list_new[h_list_sizes_new.data_handle()[i].x++] = neighbor.id(); } } - if (h_list_sizes_old[i].x == num_samples && h_list_sizes_new[i].x == num_samples) { break; } + if (h_list_sizes_old.data_handle()[i].x == num_samples && + h_list_sizes_new.data_handle()[i].x == num_samples) { + break; + } + } + if (h_list_sizes_old.data_handle()[i].x == num_samples && + h_list_sizes_new.data_handle()[i].x == num_samples) { + break; } - if (h_list_sizes_old[i].x == num_samples && h_list_sizes_new[i].x == num_samples) { break; } } } } @@ -1137,7 +1166,8 @@ template GNND::GNND(raft::resources const& res, const BuildConfig& build_config) : res(res), build_config_(build_config), - graph_(build_config.max_dataset_size, + graph_(res, + build_config.max_dataset_size, align32::roundUp(build_config.node_degree), align32::roundUp(build_config.internal_node_degree ? build_config.internal_node_degree : build_config.node_degree), @@ -1146,33 +1176,48 @@ GNND::GNND(raft::resources const& res, const BuildConfig& build ndim_(build_config.dataset_dim), d_data_{raft::make_device_matrix<__half, size_t, raft::row_major>( res, nrow_, build_config.dataset_dim)}, - l2_norms_{raft::make_device_vector(res, nrow_)}, + l2_norms_{raft::make_device_vector(res, 0)}, graph_buffer_{ raft::make_device_matrix(res, nrow_, DEGREE_ON_DEVICE)}, dists_buffer_{ raft::make_device_matrix(res, nrow_, DEGREE_ON_DEVICE)}, - graph_host_buffer_(nrow_ * DEGREE_ON_DEVICE), - dists_host_buffer_(nrow_ * DEGREE_ON_DEVICE), + graph_host_buffer_{ + raft::make_pinned_matrix(res, nrow_, DEGREE_ON_DEVICE)}, + dists_host_buffer_{ + raft::make_pinned_matrix(res, nrow_, DEGREE_ON_DEVICE)}, d_locks_{raft::make_device_vector(res, nrow_)}, - h_rev_graph_new_(nrow_ * NUM_SAMPLES), - h_graph_old_(nrow_ * NUM_SAMPLES), - h_rev_graph_old_(nrow_ * NUM_SAMPLES), + h_rev_graph_new_{ + raft::make_pinned_matrix(res, nrow_, NUM_SAMPLES)}, + h_graph_old_( + raft::make_pinned_matrix(res, nrow_, NUM_SAMPLES)), + h_rev_graph_old_{ + raft::make_pinned_matrix(res, nrow_, NUM_SAMPLES)}, d_list_sizes_new_{raft::make_device_vector(res, nrow_)}, d_list_sizes_old_{raft::make_device_vector(res, nrow_)} { static_assert(NUM_SAMPLES <= 32); - thrust::fill(thrust::device, - dists_buffer_.data_handle(), - dists_buffer_.data_handle() + dists_buffer_.size(), - std::numeric_limits::max()); - thrust::fill(thrust::device, - reinterpret_cast(graph_buffer_.data_handle()), - reinterpret_cast(graph_buffer_.data_handle()) + graph_buffer_.size(), - std::numeric_limits::max()); - thrust::fill(thrust::device, d_locks_.data_handle(), d_locks_.data_handle() + d_locks_.size(), 0); + raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); + auto graph_buffer_view = raft::make_device_matrix_view( + reinterpret_cast(graph_buffer_.data_handle()), nrow_, DEGREE_ON_DEVICE); + raft::matrix::fill(res, graph_buffer_view, std::numeric_limits::max()); + raft::matrix::fill(res, d_locks_.view(), 0); + + if (build_config.metric == cuvs::distance::DistanceType::L2Expanded) { + l2_norms_ = raft::make_device_vector(res, nrow_); + } }; +template +void GNND::reset(raft::resources const& res) +{ + raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); + auto graph_buffer_view = raft::make_device_matrix_view( + reinterpret_cast(graph_buffer_.data_handle()), nrow_, DEGREE_ON_DEVICE); + raft::matrix::fill(res, graph_buffer_view, 
std::numeric_limits::max()); + raft::matrix::fill(res, d_locks_.view(), 0); +} + template void GNND::add_reverse_edges(Index_t* graph_ptr, Index_t* h_rev_graph_ptr, @@ -1189,34 +1234,36 @@ void GNND::add_reverse_edges(Index_t* graph_ptr, template void GNND::local_join(cudaStream_t stream) { - thrust::fill(thrust::device.on(stream), - dists_buffer_.data_handle(), - dists_buffer_.data_handle() + dists_buffer_.size(), - std::numeric_limits::max()); - local_join_kernel<<>>( - thrust::raw_pointer_cast(graph_.h_graph_new.data()), - thrust::raw_pointer_cast(h_rev_graph_new_.data()), - d_list_sizes_new_.data_handle(), - thrust::raw_pointer_cast(h_graph_old_.data()), - thrust::raw_pointer_cast(h_rev_graph_old_.data()), - d_list_sizes_old_.data_handle(), - NUM_SAMPLES, - d_data_.data_handle(), - ndim_, - graph_buffer_.data_handle(), - dists_buffer_.data_handle(), - DEGREE_ON_DEVICE, - d_locks_.data_handle(), - l2_norms_.data_handle()); + raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); + local_join_kernel<<>>(graph_.h_graph_new.data_handle(), + h_rev_graph_new_.data_handle(), + d_list_sizes_new_.data_handle(), + h_graph_old_.data_handle(), + h_rev_graph_old_.data_handle(), + d_list_sizes_old_.data_handle(), + NUM_SAMPLES, + d_data_.data_handle(), + ndim_, + graph_buffer_.data_handle(), + dists_buffer_.data_handle(), + DEGREE_ON_DEVICE, + d_locks_.data_handle(), + l2_norms_.data_handle(), + build_config_.metric); } template -void GNND::build(Data_t* data, const Index_t nrow, Index_t* output_graph) +void GNND::build(Data_t* data, + const Index_t nrow, + Index_t* output_graph, + bool return_distances, + DistData_t* output_distances) { using input_t = typename std::remove_const::type; cudaStream_t stream = raft::resource::get_cuda_stream(res); nrow_ = nrow; + graph_.nrow = nrow; graph_.h_graph = (InternalID_t*)output_graph; cudaPointerAttributes data_ptr_attr; @@ -1226,24 +1273,19 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out cuvs::spatial::knn::detail::utils::batch_load_iterator vec_batches{ data, static_cast(nrow_), build_config_.dataset_dim, batch_size, stream}; for (auto const& batch : vec_batches) { - preprocess_data_kernel<<(raft::warp_size())) * - raft::warp_size(), - stream>>>(batch.data(), - d_data_.data_handle(), - build_config_.dataset_dim, - l2_norms_.data_handle(), - batch.offset()); + preprocess_data_kernel<<< + batch.size(), + raft::warp_size(), + sizeof(Data_t) * ceildiv(build_config_.dataset_dim, static_cast(raft::warp_size())) * + raft::warp_size(), + stream>>>(batch.data(), + d_data_.data_handle(), + build_config_.dataset_dim, + l2_norms_.data_handle(), + batch.offset(), + build_config_.metric); } - thrust::fill(thrust::device.on(stream), - (Index_t*)graph_buffer_.data_handle(), - (Index_t*)graph_buffer_.data_handle() + graph_buffer_.size(), - std::numeric_limits::max()); - graph_.clear(); graph_.init_random_graph(); graph_.sample_graph(true); @@ -1251,8 +1293,8 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out auto update_and_sample = [&](bool update_graph) { if (update_graph) { update_counter_ = 0; - graph_.update_graph(thrust::raw_pointer_cast(graph_host_buffer_.data()), - thrust::raw_pointer_cast(dists_host_buffer_.data()), + graph_.update_graph(graph_host_buffer_.data_handle(), + dists_host_buffer_.data_handle(), DEGREE_ON_DEVICE, update_counter_); if (update_counter_ < build_config_.termination_threshold * nrow_ * @@ -1265,15 +1307,15 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out for (size_t it 
= 0; it < build_config_.max_iterations; it++) { raft::copy(d_list_sizes_new_.data_handle(), - thrust::raw_pointer_cast(graph_.h_list_sizes_new.data()), + graph_.h_list_sizes_new.data_handle(), nrow_, raft::resource::get_cuda_stream(res)); - raft::copy(thrust::raw_pointer_cast(h_graph_old_.data()), - thrust::raw_pointer_cast(graph_.h_graph_old.data()), + raft::copy(h_graph_old_.data_handle(), + graph_.h_graph_old.data_handle(), nrow_ * NUM_SAMPLES, raft::resource::get_cuda_stream(res)); raft::copy(d_list_sizes_old_.data_handle(), - thrust::raw_pointer_cast(graph_.h_list_sizes_old.data()), + graph_.h_list_sizes_old.data_handle(), nrow_, raft::resource::get_cuda_stream(res)); raft::resource::sync_stream(res); @@ -1286,13 +1328,13 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out // contains some information for local_join. static_assert(DEGREE_ON_DEVICE * sizeof(*(dists_buffer_.data_handle())) >= NUM_SAMPLES * sizeof(*(graph_buffer_.data_handle()))); - add_reverse_edges(thrust::raw_pointer_cast(graph_.h_graph_new.data()), - thrust::raw_pointer_cast(h_rev_graph_new_.data()), + add_reverse_edges(graph_.h_graph_new.data_handle(), + h_rev_graph_new_.data_handle(), (Index_t*)dists_buffer_.data_handle(), d_list_sizes_new_.data_handle(), stream); - add_reverse_edges(thrust::raw_pointer_cast(h_graph_old_.data()), - thrust::raw_pointer_cast(h_rev_graph_old_.data()), + add_reverse_edges(h_graph_old_.data_handle(), + h_rev_graph_old_.data_handle(), (Index_t*)dists_buffer_.data_handle(), d_list_sizes_old_.data_handle(), stream); @@ -1316,21 +1358,21 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out update_and_sample_thread.join(); if (update_counter_ == -1) { break; } - raft::copy(thrust::raw_pointer_cast(graph_host_buffer_.data()), + raft::copy(graph_host_buffer_.data_handle(), graph_buffer_.data_handle(), nrow_ * DEGREE_ON_DEVICE, raft::resource::get_cuda_stream(res)); raft::resource::sync_stream(res); - raft::copy(thrust::raw_pointer_cast(dists_host_buffer_.data()), + raft::copy(dists_host_buffer_.data_handle(), dists_buffer_.data_handle(), nrow_ * DEGREE_ON_DEVICE, raft::resource::get_cuda_stream(res)); - graph_.sample_graph_new(thrust::raw_pointer_cast(graph_host_buffer_.data()), DEGREE_ON_DEVICE); + graph_.sample_graph_new(graph_host_buffer_.data_handle(), DEGREE_ON_DEVICE); } - graph_.update_graph(thrust::raw_pointer_cast(graph_host_buffer_.data()), - thrust::raw_pointer_cast(dists_host_buffer_.data()), + graph_.update_graph(graph_host_buffer_.data_handle(), + dists_host_buffer_.data_handle(), DEGREE_ON_DEVICE, update_counter_); raft::resource::sync_stream(res); @@ -1338,6 +1380,27 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out // Reuse graph_.h_dists as the buffer for shrink the lists in graph static_assert(sizeof(decltype(*(graph_.h_dists.data_handle()))) >= sizeof(Index_t)); + + if (return_distances) { + auto graph_d_dists = raft::make_device_matrix( + res, nrow_, build_config_.node_degree); + raft::copy(graph_d_dists.data_handle(), + graph_.h_dists.data_handle(), + nrow_ * build_config_.node_degree, + raft::resource::get_cuda_stream(res)); + + auto output_dist_view = raft::make_device_matrix_view( + output_distances, nrow_, build_config_.output_graph_degree); + + raft::matrix::slice_coordinates coords{static_cast(0), + static_cast(0), + static_cast(nrow_), + static_cast(build_config_.output_graph_degree)}; + raft::matrix::slice( + res, raft::make_const_mdspan(graph_d_dists.view()), output_dist_view, coords); + 
raft::resource::sync_stream(res); + } + Index_t* graph_shrink_buffer = (Index_t*)graph_.h_dists.data_handle(); #pragma omp parallel for @@ -1376,6 +1439,11 @@ void build(raft::resources const& res, RAFT_EXPECTS(dataset.extent(0) < std::numeric_limits::max() - 1, "The dataset size for GNND should be less than %d", std::numeric_limits::max() - 1); + auto allowed_metrics = params.metric == cuvs::distance::DistanceType::L2Expanded || + params.metric == cuvs::distance::DistanceType::CosineExpanded || + params.metric == cuvs::distance::DistanceType::InnerProduct; + RAFT_EXPECTS(allowed_metrics && idx.metric() == params.metric, + "The metric for NN Descent should be L2Expanded, CosineExpanded or InnerProduct"); size_t intermediate_degree = params.intermediate_graph_degree; size_t graph_degree = params.graph_degree; @@ -1410,10 +1478,25 @@ void build(raft::resources const& res, .node_degree = extended_graph_degree, .internal_node_degree = extended_intermediate_degree, .max_iterations = params.max_iterations, - .termination_threshold = params.termination_threshold}; + .termination_threshold = params.termination_threshold, + .output_graph_degree = params.graph_degree, + .metric = params.metric}; GNND nnd(res, build_config); - nnd.build(dataset.data_handle(), dataset.extent(0), int_graph.data_handle()); + + if (idx.distances().has_value() || !params.return_distances) { + nnd.build(dataset.data_handle(), + dataset.extent(0), + int_graph.data_handle(), + params.return_distances, + idx.distances() + .value_or(raft::make_device_matrix(res, 0, 0).view()) + .data_handle()); + } else { + RAFT_EXPECTS(!params.return_distances, + "Distance view not allocated. Using return_distances set to true requires " + "distance view to be allocated."); + } #pragma omp parallel for for (size_t i = 0; i < static_cast(dataset.extent(0)); i++) { @@ -1445,11 +1528,15 @@ index build( graph_degree = intermediate_degree; } - index idx{res, dataset.extent(0), static_cast(graph_degree)}; + index idx{res, + dataset.extent(0), + static_cast(graph_degree), + params.return_distances, + params.metric}; build(res, params, dataset, idx); return idx; } -} // namespace cuvs::neighbors::nn_descent::detail +} // namespace cuvs::neighbors::nn_descent::detail diff --git a/cpp/src/neighbors/detail/nn_descent_batch.cuh b/cpp/src/neighbors/detail/nn_descent_batch.cuh new file mode 100644 index 000000000..842dbe788 --- /dev/null +++ b/cpp/src/neighbors/detail/nn_descent_batch.cuh @@ -0,0 +1,736 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#undef RAFT_EXPLICIT_INSTANTIATE_ONLY + +#include "nn_descent.cuh" +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace cuvs::neighbors::nn_descent::detail::experimental { + +// +// Run balanced kmeans on a subsample of the dataset to get centroids +// +template , memory_type::host>> +void get_balanced_kmeans_centroids( + raft::resources const& res, + cuvs::distance::DistanceType metric, + mdspan, row_major, Accessor> dataset, + raft::device_matrix_view centroids) +{ + size_t num_rows = static_cast(dataset.extent(0)); + size_t num_cols = static_cast(dataset.extent(1)); + size_t n_clusters = centroids.extent(0); + size_t num_subsamples = + std::min(static_cast(num_rows / n_clusters), static_cast(num_rows * 0.1)); + + auto d_subsample_dataset = + raft::make_device_matrix(res, num_subsamples, num_cols); + raft::matrix::sample_rows( + res, raft::random::RngState{0}, dataset, d_subsample_dataset.view()); + + cuvs::cluster::kmeans::balanced_params kmeans_params; + kmeans_params.metric = metric; + + auto d_subsample_dataset_const_view = + raft::make_device_matrix_view( + d_subsample_dataset.data_handle(), num_subsamples, num_cols); + auto centroids_view = raft::make_device_matrix_view( + centroids.data_handle(), n_clusters, num_cols); + cuvs::cluster::kmeans::fit(res, kmeans_params, d_subsample_dataset_const_view, centroids_view); +} + +// +// Get the top k closest centroid indices for each data point +// Loads the data in batches onto device if data is on host for memory efficiency +// +template +void get_global_nearest_k( + raft::resources const& res, + size_t k, + size_t num_rows, + size_t n_clusters, + const T* dataset, + raft::host_matrix_view global_nearest_cluster, + raft::device_matrix_view centroids, + cuvs::distance::DistanceType metric) +{ + size_t num_cols = centroids.extent(1); + auto centroids_view = raft::make_device_matrix_view( + centroids.data_handle(), n_clusters, num_cols); + + cudaPointerAttributes attr; + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, dataset)); + float* ptr = reinterpret_cast(attr.devicePointer); + + size_t num_batches = n_clusters; + size_t batch_size = (num_rows + n_clusters) / n_clusters; + if (ptr == nullptr) { // data on host + + auto d_dataset_batch = + raft::make_device_matrix(res, batch_size, num_cols); + + auto nearest_clusters_idx = + raft::make_device_matrix(res, batch_size, k); + auto nearest_clusters_idxt = + raft::make_device_matrix(res, batch_size, k); + auto nearest_clusters_dist = + raft::make_device_matrix(res, batch_size, k); + + for (size_t i = 0; i < num_batches; i++) { + size_t batch_size_ = batch_size; + + if (i == num_batches - 1) { batch_size_ = num_rows - batch_size * i; } + raft::copy(d_dataset_batch.data_handle(), + dataset + i * batch_size * num_cols, + batch_size_ * num_cols, + resource::get_cuda_stream(res)); + + std::optional> norms_view; + cuvs::neighbors::brute_force::index brute_force_index( + res, centroids_view, norms_view, metric); + cuvs::neighbors::brute_force::search(res, + brute_force_index, + raft::make_const_mdspan(d_dataset_batch.view()), + nearest_clusters_idx.view(), + nearest_clusters_dist.view()); + + thrust::copy(raft::resource::get_thrust_policy(res), + nearest_clusters_idx.data_handle(), + nearest_clusters_idx.data_handle() + nearest_clusters_idx.size(), + nearest_clusters_idxt.data_handle()); + 
raft::copy(global_nearest_cluster.data_handle() + i * batch_size * k, + nearest_clusters_idxt.data_handle(), + batch_size_ * k, + resource::get_cuda_stream(res)); + } + } else { // data on device + auto nearest_clusters_idx = + raft::make_device_matrix(res, num_rows, k); + auto nearest_clusters_dist = + raft::make_device_matrix(res, num_rows, k); + + std::optional> norms_view; + cuvs::neighbors::brute_force::index brute_force_index( + res, centroids_view, norms_view, metric); + auto dataset_view = + raft::make_device_matrix_view(dataset, num_rows, num_cols); + cuvs::neighbors::brute_force::search(res, + brute_force_index, + dataset_view, + nearest_clusters_idx.view(), + nearest_clusters_dist.view()); + + auto nearest_clusters_idxt = + raft::make_device_matrix(res, batch_size, k); + for (size_t i = 0; i < num_batches; i++) { + size_t batch_size_ = batch_size; + + if (i == num_batches - 1) { batch_size_ = num_rows - batch_size * i; } + thrust::copy(raft::resource::get_thrust_policy(res), + nearest_clusters_idx.data_handle() + i * batch_size_ * k, + nearest_clusters_idx.data_handle() + (i + 1) * batch_size_ * k, + nearest_clusters_idxt.data_handle()); + raft::copy(global_nearest_cluster.data_handle() + i * batch_size_ * k, + nearest_clusters_idxt.data_handle(), + batch_size_ * k, + resource::get_cuda_stream(res)); + } + } +} + +// +// global_nearest_cluster [num_rows X k=2] : top 2 closest clusters for each data point +// inverted_indices [num_rows x k vector] : sparse vector for data indices for each cluster +// cluster_size [n_cluster] : cluster size for each cluster +// offset [n_cluster] : offset in inverted_indices for each cluster +// Loads the data in batches onto device if data is on host for memory efficiency +// +template +void get_inverted_indices(raft::resources const& res, + size_t n_clusters, + size_t& max_cluster_size, + size_t& min_cluster_size, + raft::host_matrix_view global_nearest_cluster, + raft::host_vector_view inverted_indices, + raft::host_vector_view cluster_size, + raft::host_vector_view offset) +{ + // build sparse inverted indices and get number of data points for each cluster + size_t num_rows = global_nearest_cluster.extent(0); + size_t k = global_nearest_cluster.extent(1); + + auto local_offset = raft::make_host_vector(n_clusters); + + max_cluster_size = 0; + min_cluster_size = std::numeric_limits::max(); + + std::fill(cluster_size.data_handle(), cluster_size.data_handle() + n_clusters, 0); + std::fill(local_offset.data_handle(), local_offset.data_handle() + n_clusters, 0); + + // TODO: this part isn't really a bottleneck but maybe worth trying omp parallel + // for with atomic add + for (size_t i = 0; i < num_rows; i++) { + for (size_t j = 0; j < k; j++) { + IdxT cluster_id = global_nearest_cluster(i, j); + cluster_size(cluster_id) += 1; + } + } + + offset(0) = 0; + for (size_t i = 1; i < n_clusters; i++) { + offset(i) = offset(i - 1) + cluster_size(i - 1); + } + for (size_t i = 0; i < num_rows; i++) { + for (size_t j = 0; j < k; j++) { + IdxT cluster_id = global_nearest_cluster(i, j); + inverted_indices(offset(cluster_id) + local_offset(cluster_id)) = i; + local_offset(cluster_id) += 1; + } + } + + max_cluster_size = static_cast( + *std::max_element(cluster_size.data_handle(), cluster_size.data_handle() + n_clusters)); + min_cluster_size = static_cast( + *std::min_element(cluster_size.data_handle(), cluster_size.data_handle() + n_clusters)); +} + +template +struct KeyValuePair { + KeyType key; + ValueType value; +}; + +template +struct CustomKeyComparator { + 
__device__ bool operator()(const KeyValuePair& a, + const KeyValuePair& b) const + { + if (a.key == b.key) { return a.value < b.value; } + return a.key < b.key; + } +}; + +template +RAFT_KERNEL merge_subgraphs(IdxT* cluster_data_indices, + size_t graph_degree, + size_t num_cluster_in_batch, + float* global_distances, + float* batch_distances, + IdxT* global_indices, + IdxT* batch_indices) +{ + size_t batch_row = blockIdx.x; + typedef cub::BlockMergeSort, BLOCK_SIZE, ITEMS_PER_THREAD> + BlockMergeSortType; + __shared__ typename cub::BlockMergeSort, BLOCK_SIZE, ITEMS_PER_THREAD>:: + TempStorage tmpSmem; + + extern __shared__ char sharedMem[]; + float* blockKeys = reinterpret_cast(sharedMem); + IdxT* blockValues = reinterpret_cast(&sharedMem[graph_degree * 2 * sizeof(float)]); + int16_t* uniqueMask = + reinterpret_cast(&sharedMem[graph_degree * 2 * (sizeof(float) + sizeof(IdxT))]); + + if (batch_row < num_cluster_in_batch) { + // load batch or global depending on threadIdx + size_t global_row = cluster_data_indices[batch_row]; + + KeyValuePair threadKeyValuePair[ITEMS_PER_THREAD]; + + size_t halfway = BLOCK_SIZE / 2; + size_t do_global = threadIdx.x < halfway; + + float* distances; + IdxT* indices; + + if (do_global) { + distances = global_distances; + indices = global_indices; + } else { + distances = batch_distances; + indices = batch_indices; + } + + size_t idxBase = (threadIdx.x * do_global + (threadIdx.x - halfway) * (1lu - do_global)) * + static_cast(ITEMS_PER_THREAD); + size_t arrIdxBase = (global_row * do_global + batch_row * (1lu - do_global)) * graph_degree; + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + size_t colId = idxBase + i; + if (colId < graph_degree) { + threadKeyValuePair[i].key = distances[arrIdxBase + colId]; + threadKeyValuePair[i].value = indices[arrIdxBase + colId]; + } else { + threadKeyValuePair[i].key = std::numeric_limits::max(); + threadKeyValuePair[i].value = std::numeric_limits::max(); + } + } + + __syncthreads(); + + BlockMergeSortType(tmpSmem).Sort(threadKeyValuePair, CustomKeyComparator{}); + + // load sorted result into shared memory to get unique values + idxBase = threadIdx.x * ITEMS_PER_THREAD; + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + size_t colId = idxBase + i; + if (colId < 2 * graph_degree) { + blockKeys[colId] = threadKeyValuePair[i].key; + blockValues[colId] = threadKeyValuePair[i].value; + } + } + + __syncthreads(); + + // get unique mask + if (threadIdx.x == 0) { uniqueMask[0] = 1; } + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + size_t colId = idxBase + i; + if (colId > 0 && colId < 2 * graph_degree) { + uniqueMask[colId] = static_cast(blockValues[colId] != blockValues[colId - 1]); + } + } + + __syncthreads(); + + // prefix sum + if (threadIdx.x == 0) { + for (int i = 1; i < 2 * graph_degree; i++) { + uniqueMask[i] += uniqueMask[i - 1]; + } + } + + __syncthreads(); + // load unique values to global memory + if (threadIdx.x == 0) { + global_distances[global_row * graph_degree] = blockKeys[0]; + global_indices[global_row * graph_degree] = blockValues[0]; + } + + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + size_t colId = idxBase + i; + if (colId > 0 && colId < 2 * graph_degree) { + bool is_unique = uniqueMask[colId] != uniqueMask[colId - 1]; + int16_t global_colId = uniqueMask[colId] - 1; + if (is_unique && static_cast(global_colId) < graph_degree) { + global_distances[global_row * graph_degree + global_colId] = blockKeys[colId]; + global_indices[global_row * graph_degree + global_colId] = blockValues[colId]; + } + } + } + } +} + +// 
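+// merge_subgraphs (above) keeps, for each data point, the graph_degree closest
+// unique neighbors out of the union of its current global list and the list
+// produced for the cluster batch: one block per row merge-sorts the combined
+// 2 * graph_degree candidates, marks duplicates with a prefix-summed unique
+// mask, and writes the head of the de-duplicated result back to the global graph.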
+// builds knn graph using NN Descent and merge with global graph +// +template , memory_type::host>> +void build_and_merge(raft::resources const& res, + const index_params& params, + size_t num_data_in_cluster, + size_t graph_degree, + size_t int_graph_node_degree, + T* cluster_data, + IdxT* cluster_data_indices, + int* int_graph, + IdxT* inverted_indices, + IdxT* global_indices_d, + float* global_distances_d, + IdxT* batch_indices_h, + IdxT* batch_indices_d, + float* batch_distances_d, + GNND& nnd) +{ + nnd.build(cluster_data, num_data_in_cluster, int_graph, true, batch_distances_d); + + // remap indices +#pragma omp parallel for + for (size_t i = 0; i < num_data_in_cluster; i++) { + for (size_t j = 0; j < graph_degree; j++) { + size_t local_idx = int_graph[i * int_graph_node_degree + j]; + batch_indices_h[i * graph_degree + j] = inverted_indices[local_idx]; + } + } + + raft::copy(batch_indices_d, + batch_indices_h, + num_data_in_cluster * graph_degree, + raft::resource::get_cuda_stream(res)); + + size_t num_elems = graph_degree * 2; + size_t sharedMemSize = num_elems * (sizeof(float) + sizeof(IdxT) + sizeof(int16_t)); + + if (num_elems <= 128) { + merge_subgraphs + <<>>( + cluster_data_indices, + graph_degree, + num_data_in_cluster, + global_distances_d, + batch_distances_d, + global_indices_d, + batch_indices_d); + } else if (num_elems <= 512) { + merge_subgraphs + <<>>( + cluster_data_indices, + graph_degree, + num_data_in_cluster, + global_distances_d, + batch_distances_d, + global_indices_d, + batch_indices_d); + } else if (num_elems <= 1024) { + merge_subgraphs + <<>>( + cluster_data_indices, + graph_degree, + num_data_in_cluster, + global_distances_d, + batch_distances_d, + global_indices_d, + batch_indices_d); + } else if (num_elems <= 2048) { + merge_subgraphs + <<>>( + cluster_data_indices, + graph_degree, + num_data_in_cluster, + global_distances_d, + batch_distances_d, + global_indices_d, + batch_indices_d); + } else { + // this is as far as we can get due to the shared mem usage of cub::BlockMergeSort + RAFT_FAIL("The degree of knn is too large (%lu). It must be smaller than 1024", graph_degree); + } + raft::resource::sync_stream(res); +} + +// +// For each cluster, gather the data samples that belong to that cluster, and +// call build_and_merge +// +template +void cluster_nnd(raft::resources const& res, + const index_params& params, + size_t graph_degree, + size_t extended_graph_degree, + size_t max_cluster_size, + raft::host_matrix_view dataset, + IdxT* offsets, + IdxT* cluster_size, + IdxT* cluster_data_indices, + int* int_graph, + IdxT* inverted_indices, + IdxT* global_indices_h, + float* global_distances_h, + IdxT* batch_indices_h, + IdxT* batch_indices_d, + float* batch_distances_d, + const BuildConfig& build_config) +{ + size_t num_rows = dataset.extent(0); + size_t num_cols = dataset.extent(1); + + GNND nnd(res, build_config); + + auto cluster_data_matrix = + raft::make_host_matrix(max_cluster_size, num_cols); + + for (size_t cluster_id = 0; cluster_id < params.n_clusters; cluster_id++) { + RAFT_LOG_DEBUG( + "# Data on host. 
Running clusters: %lu / %lu", cluster_id + 1, params.n_clusters); + size_t num_data_in_cluster = cluster_size[cluster_id]; + size_t offset = offsets[cluster_id]; + +#pragma omp parallel for + for (size_t i = 0; i < num_data_in_cluster; i++) { + for (size_t j = 0; j < num_cols; j++) { + size_t global_row = (inverted_indices + offset)[i]; + cluster_data_matrix(i, j) = dataset(global_row, j); + } + } + + build_and_merge(res, + params, + num_data_in_cluster, + graph_degree, + extended_graph_degree, + cluster_data_matrix.data_handle(), + cluster_data_indices + offset, + int_graph, + inverted_indices + offset, + global_indices_h, + global_distances_h, + batch_indices_h, + batch_indices_d, + batch_distances_d, + nnd); + nnd.reset(res); + } +} + +template +void cluster_nnd(raft::resources const& res, + const index_params& params, + size_t graph_degree, + size_t extended_graph_degree, + size_t max_cluster_size, + raft::device_matrix_view dataset, + IdxT* offsets, + IdxT* cluster_size, + IdxT* cluster_data_indices, + int* int_graph, + IdxT* inverted_indices, + IdxT* global_indices_h, + float* global_distances_h, + IdxT* batch_indices_h, + IdxT* batch_indices_d, + float* batch_distances_d, + const BuildConfig& build_config) +{ + size_t num_rows = dataset.extent(0); + size_t num_cols = dataset.extent(1); + + GNND nnd(res, build_config); + + auto cluster_data_matrix = + raft::make_device_matrix(res, max_cluster_size, num_cols); + + for (size_t cluster_id = 0; cluster_id < params.n_clusters; cluster_id++) { + RAFT_LOG_DEBUG( + "# Data on device. Running clusters: %lu / %lu", cluster_id + 1, params.n_clusters); + size_t num_data_in_cluster = cluster_size[cluster_id]; + size_t offset = offsets[cluster_id]; + + auto cluster_data_view = raft::make_device_matrix_view( + cluster_data_matrix.data_handle(), num_data_in_cluster, num_cols); + auto cluster_data_indices_view = raft::make_device_vector_view( + cluster_data_indices + offset, num_data_in_cluster); + + auto dataset_IdxT = + raft::make_device_matrix_view(dataset.data_handle(), num_rows, num_cols); + raft::matrix::gather(res, dataset_IdxT, cluster_data_indices_view, cluster_data_view); + + build_and_merge(res, + params, + num_data_in_cluster, + graph_degree, + extended_graph_degree, + cluster_data_view.data_handle(), + cluster_data_indices + offset, + int_graph, + inverted_indices + offset, + global_indices_h, + global_distances_h, + batch_indices_h, + batch_indices_d, + batch_distances_d, + nnd); + nnd.reset(res); + } +} + +template , memory_type::host>> +void batch_build(raft::resources const& res, + const index_params& params, + mdspan, row_major, Accessor> dataset, + index& global_idx) +{ + size_t graph_degree = params.graph_degree; + size_t intermediate_degree = params.intermediate_graph_degree; + + size_t num_rows = static_cast(dataset.extent(0)); + size_t num_cols = static_cast(dataset.extent(1)); + + auto centroids = + raft::make_device_matrix(res, params.n_clusters, num_cols); + get_balanced_kmeans_centroids(res, params.metric, dataset, centroids.view()); + + size_t k = 2; + auto global_nearest_cluster = raft::make_host_matrix(num_rows, k); + get_global_nearest_k(res, + k, + num_rows, + params.n_clusters, + dataset.data_handle(), + global_nearest_cluster.view(), + centroids.view(), + params.metric); + + auto inverted_indices = raft::make_host_vector(num_rows * k); + auto cluster_size = raft::make_host_vector(params.n_clusters); + auto offset = raft::make_host_vector(params.n_clusters); + + size_t max_cluster_size, min_cluster_size; + 
get_inverted_indices(res, + params.n_clusters, + max_cluster_size, + min_cluster_size, + global_nearest_cluster.view(), + inverted_indices.view(), + cluster_size.view(), + offset.view()); + + if (intermediate_degree >= min_cluster_size) { + RAFT_LOG_WARN( + "Intermediate graph degree cannot be larger than minimum cluster size, reducing it to %lu", + dataset.extent(0)); + intermediate_degree = min_cluster_size - 1; + } + if (intermediate_degree < graph_degree) { + RAFT_LOG_WARN( + "Graph degree (%lu) cannot be larger than intermediate graph degree (%lu), reducing " + "graph_degree.", + graph_degree, + intermediate_degree); + graph_degree = intermediate_degree; + } + + size_t extended_graph_degree = + align32::roundUp(static_cast(graph_degree * (graph_degree <= 32 ? 1.0 : 1.3))); + size_t extended_intermediate_degree = align32::roundUp( + static_cast(intermediate_degree * (intermediate_degree <= 32 ? 1.0 : 1.3))); + + auto int_graph = raft::make_host_matrix( + max_cluster_size, static_cast(extended_graph_degree)); + + BuildConfig build_config{.max_dataset_size = max_cluster_size, + .dataset_dim = num_cols, + .node_degree = extended_graph_degree, + .internal_node_degree = extended_intermediate_degree, + .max_iterations = params.max_iterations, + .termination_threshold = params.termination_threshold, + .output_graph_degree = graph_degree}; + + auto global_indices_h = raft::make_managed_matrix(res, num_rows, graph_degree); + auto global_distances_h = raft::make_managed_matrix(res, num_rows, graph_degree); + + std::fill(global_indices_h.data_handle(), + global_indices_h.data_handle() + num_rows * graph_degree, + std::numeric_limits::max()); + std::fill(global_distances_h.data_handle(), + global_distances_h.data_handle() + num_rows * graph_degree, + std::numeric_limits::max()); + + auto batch_indices_h = + raft::make_host_matrix(max_cluster_size, graph_degree); + auto batch_indices_d = + raft::make_device_matrix(res, max_cluster_size, graph_degree); + auto batch_distances_d = + raft::make_device_matrix(res, max_cluster_size, graph_degree); + + auto cluster_data_indices = raft::make_device_vector(res, num_rows * k); + raft::copy(cluster_data_indices.data_handle(), + inverted_indices.data_handle(), + num_rows * k, + resource::get_cuda_stream(res)); + + cluster_nnd(res, + params, + graph_degree, + extended_graph_degree, + max_cluster_size, + dataset, + offset.data_handle(), + cluster_size.data_handle(), + cluster_data_indices.data_handle(), + int_graph.data_handle(), + inverted_indices.data_handle(), + global_indices_h.data_handle(), + global_distances_h.data_handle(), + batch_indices_h.data_handle(), + batch_indices_d.data_handle(), + batch_distances_d.data_handle(), + build_config); + + raft::copy(global_idx.graph().data_handle(), + global_indices_h.data_handle(), + num_rows * graph_degree, + raft::resource::get_cuda_stream(res)); + if (params.return_distances && global_idx.distances().has_value()) { + raft::copy(global_idx.distances().value().data_handle(), + global_distances_h.data_handle(), + num_rows * graph_degree, + raft::resource::get_cuda_stream(res)); + } +} + +template , memory_type::host>> +index batch_build(raft::resources const& res, + const index_params& params, + mdspan, row_major, Accessor> dataset) +{ + size_t intermediate_degree = params.intermediate_graph_degree; + size_t graph_degree = params.graph_degree; + + if (intermediate_degree < graph_degree) { + RAFT_LOG_WARN( + "Graph degree (%lu) cannot be larger than intermediate graph degree (%lu), reducing " + "graph_degree.", + 
graph_degree, + intermediate_degree); + graph_degree = intermediate_degree; + } + + index idx{ + res, dataset.extent(0), static_cast(graph_degree), params.return_distances}; + + batch_build(res, params, dataset, idx); + + return idx; +} + +} // namespace cuvs::neighbors::nn_descent::detail::experimental diff --git a/cpp/src/neighbors/detail/sparse_knn.cuh b/cpp/src/neighbors/detail/sparse_knn.cuh new file mode 100644 index 000000000..9c8e971b9 --- /dev/null +++ b/cpp/src/neighbors/detail/sparse_knn.cuh @@ -0,0 +1,437 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include "../../distance/sparse_distance.cuh" +#include "knn_merge_parts.cuh" +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cuvs::neighbors::detail { + +template +struct csr_batcher_t { + csr_batcher_t(value_idx batch_size, + value_idx n_rows, + const value_idx* csr_indptr, + const value_idx* csr_indices, + const value_t* csr_data) + : batch_start_(0), + batch_stop_(0), + batch_rows_(0), + total_rows_(n_rows), + batch_size_(batch_size), + csr_indptr_(csr_indptr), + csr_indices_(csr_indices), + csr_data_(csr_data), + batch_csr_start_offset_(0), + batch_csr_stop_offset_(0) + { + } + + void set_batch(int batch_num) + { + batch_start_ = batch_num * batch_size_; + batch_stop_ = batch_start_ + batch_size_ - 1; // zero-based indexing + + if (batch_stop_ >= total_rows_) batch_stop_ = total_rows_ - 1; // zero-based indexing + + batch_rows_ = (batch_stop_ - batch_start_) + 1; + } + + value_idx get_batch_csr_indptr_nnz(value_idx* batch_indptr, cudaStream_t stream) + { + raft::sparse::op::csr_row_slice_indptr(batch_start_, + batch_stop_, + csr_indptr_, + batch_indptr, + &batch_csr_start_offset_, + &batch_csr_stop_offset_, + stream); + + return batch_csr_stop_offset_ - batch_csr_start_offset_; + } + + void get_batch_csr_indices_data(value_idx* csr_indices, value_t* csr_data, cudaStream_t stream) + { + raft::sparse::op::csr_row_slice_populate(batch_csr_start_offset_, + batch_csr_stop_offset_, + csr_indices_, + csr_data_, + csr_indices, + csr_data, + stream); + } + + value_idx batch_rows() const { return batch_rows_; } + + value_idx batch_start() const { return batch_start_; } + + value_idx batch_stop() const { return batch_stop_; } + + private: + value_idx batch_size_; + value_idx batch_start_; + value_idx batch_stop_; + value_idx batch_rows_; + + value_idx total_rows_; + + const value_idx* csr_indptr_; + const value_idx* csr_indices_; + const value_t* csr_data_; + + value_idx batch_csr_start_offset_; + value_idx batch_csr_stop_offset_; +}; + +template +class sparse_knn_t { + public: + sparse_knn_t(const value_idx* idxIndptr_, + const value_idx* idxIndices_, + const value_t* idxData_, + size_t idxNNZ_, + int n_idx_rows_, + int n_idx_cols_, + const value_idx* queryIndptr_, + const value_idx* queryIndices_, + const value_t* queryData_, + size_t queryNNZ_, + int 
n_query_rows_, + int n_query_cols_, + value_idx* output_indices_, + value_t* output_dists_, + int k_, + raft::resources const& handle_, + size_t batch_size_index_ = 2 << 14, // approx 1M + size_t batch_size_query_ = 2 << 14, + cuvs::distance::DistanceType metric_ = cuvs::distance::DistanceType::L2Expanded, + float metricArg_ = 0) + : idxIndptr(idxIndptr_), + idxIndices(idxIndices_), + idxData(idxData_), + idxNNZ(idxNNZ_), + n_idx_rows(n_idx_rows_), + n_idx_cols(n_idx_cols_), + queryIndptr(queryIndptr_), + queryIndices(queryIndices_), + queryData(queryData_), + queryNNZ(queryNNZ_), + n_query_rows(n_query_rows_), + n_query_cols(n_query_cols_), + output_indices(output_indices_), + output_dists(output_dists_), + k(k_), + handle(handle_), + batch_size_index(batch_size_index_), + batch_size_query(batch_size_query_), + metric(metric_), + metricArg(metricArg_) + { + } + + void run() + { + using namespace raft::sparse; + + int n_batches_query = raft::ceildiv((size_t)n_query_rows, batch_size_query); + csr_batcher_t query_batcher( + batch_size_query, n_query_rows, queryIndptr, queryIndices, queryData); + + size_t rows_processed = 0; + + for (int i = 0; i < n_batches_query; i++) { + /** + * Compute index batch info + */ + query_batcher.set_batch(i); + + /** + * Slice CSR to rows in batch + */ + + rmm::device_uvector query_batch_indptr(query_batcher.batch_rows() + 1, + raft::resource::get_cuda_stream(handle)); + + value_idx n_query_batch_nnz = query_batcher.get_batch_csr_indptr_nnz( + query_batch_indptr.data(), raft::resource::get_cuda_stream(handle)); + + rmm::device_uvector query_batch_indices(n_query_batch_nnz, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector query_batch_data(n_query_batch_nnz, + raft::resource::get_cuda_stream(handle)); + + query_batcher.get_batch_csr_indices_data(query_batch_indices.data(), + query_batch_data.data(), + raft::resource::get_cuda_stream(handle)); + + // A 3-partition temporary merge space to scale the batching. 
2 parts for subsequent + // batches and 1 space for the results of the merge, which get copied back to the top + rmm::device_uvector merge_buffer_indices(0, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector merge_buffer_dists(0, raft::resource::get_cuda_stream(handle)); + + value_t* dists_merge_buffer_ptr; + value_idx* indices_merge_buffer_ptr; + + int n_batches_idx = raft::ceildiv((size_t)n_idx_rows, batch_size_index); + csr_batcher_t idx_batcher( + batch_size_index, n_idx_rows, idxIndptr, idxIndices, idxData); + + for (int j = 0; j < n_batches_idx; j++) { + idx_batcher.set_batch(j); + + merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3, + raft::resource::get_cuda_stream(handle)); + merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3, + raft::resource::get_cuda_stream(handle)); + + /** + * Slice CSR to rows in batch + */ + rmm::device_uvector idx_batch_indptr(idx_batcher.batch_rows() + 1, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector idx_batch_indices(0, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector idx_batch_data(0, raft::resource::get_cuda_stream(handle)); + + value_idx idx_batch_nnz = idx_batcher.get_batch_csr_indptr_nnz( + idx_batch_indptr.data(), raft::resource::get_cuda_stream(handle)); + + idx_batch_indices.resize(idx_batch_nnz, raft::resource::get_cuda_stream(handle)); + idx_batch_data.resize(idx_batch_nnz, raft::resource::get_cuda_stream(handle)); + + idx_batcher.get_batch_csr_indices_data( + idx_batch_indices.data(), idx_batch_data.data(), raft::resource::get_cuda_stream(handle)); + + /** + * Compute distances + */ + uint64_t dense_size = + (uint64_t)idx_batcher.batch_rows() * (uint64_t)query_batcher.batch_rows(); + rmm::device_uvector batch_dists(dense_size, + raft::resource::get_cuda_stream(handle)); + + RAFT_CUDA_TRY(cudaMemset(batch_dists.data(), 0, batch_dists.size() * sizeof(value_t))); + + compute_distances(idx_batcher, + query_batcher, + idx_batch_nnz, + n_query_batch_nnz, + idx_batch_indptr.data(), + idx_batch_indices.data(), + idx_batch_data.data(), + query_batch_indptr.data(), + query_batch_indices.data(), + query_batch_data.data(), + batch_dists.data()); + + // Build batch indices array + rmm::device_uvector batch_indices(batch_dists.size(), + raft::resource::get_cuda_stream(handle)); + + // populate batch indices array + value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows(); + + iota_fill( + batch_indices.data(), batch_rows, batch_cols, raft::resource::get_cuda_stream(handle)); + + /** + * Perform k-selection on batch & merge with other k-selections + */ + size_t merge_buffer_offset = batch_rows * k; + dists_merge_buffer_ptr = merge_buffer_dists.data() + merge_buffer_offset; + indices_merge_buffer_ptr = merge_buffer_indices.data() + merge_buffer_offset; + + perform_k_selection(idx_batcher, + query_batcher, + batch_dists.data(), + batch_indices.data(), + dists_merge_buffer_ptr, + indices_merge_buffer_ptr); + + value_t* dists_merge_buffer_tmp_ptr = dists_merge_buffer_ptr; + value_idx* indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr; + + // Merge results of difference batches if necessary + if (idx_batcher.batch_start() > 0) { + size_t merge_buffer_tmp_out = batch_rows * k * 2; + dists_merge_buffer_tmp_ptr = merge_buffer_dists.data() + merge_buffer_tmp_out; + indices_merge_buffer_tmp_ptr = merge_buffer_indices.data() + merge_buffer_tmp_out; + + merge_batches(idx_batcher, + query_batcher, + merge_buffer_dists.data(), + merge_buffer_indices.data(), + 
dists_merge_buffer_tmp_ptr, + indices_merge_buffer_tmp_ptr); + } + + // copy merged output back into merge buffer partition for next iteration + raft::copy_async(merge_buffer_indices.data(), + indices_merge_buffer_tmp_ptr, + batch_rows * k, + raft::resource::get_cuda_stream(handle)); + raft::copy_async(merge_buffer_dists.data(), + dists_merge_buffer_tmp_ptr, + batch_rows * k, + raft::resource::get_cuda_stream(handle)); + } + + // Copy final merged batch to output array + raft::copy_async(output_indices + (rows_processed * k), + merge_buffer_indices.data(), + query_batcher.batch_rows() * k, + raft::resource::get_cuda_stream(handle)); + raft::copy_async(output_dists + (rows_processed * k), + merge_buffer_dists.data(), + query_batcher.batch_rows() * k, + raft::resource::get_cuda_stream(handle)); + + rows_processed += query_batcher.batch_rows(); + } + } + + private: + void merge_batches(csr_batcher_t& idx_batcher, + csr_batcher_t& query_batcher, + value_t* merge_buffer_dists, + value_idx* merge_buffer_indices, + value_t* out_dists, + value_idx* out_indices) + { + // build translation buffer to shift resulting indices by the batch + std::vector id_ranges; + id_ranges.push_back(0); + id_ranges.push_back(idx_batcher.batch_start()); + + rmm::device_uvector trans(id_ranges.size(), raft::resource::get_cuda_stream(handle)); + raft::update_device( + trans.data(), id_ranges.data(), id_ranges.size(), raft::resource::get_cuda_stream(handle)); + + // combine merge buffers only if there's more than 1 partition to combine + cuvs::neighbors::detail::knn_merge_parts(merge_buffer_dists, + merge_buffer_indices, + out_dists, + out_indices, + query_batcher.batch_rows(), + 2, + k, + raft::resource::get_cuda_stream(handle), + trans.data()); + } + + void perform_k_selection(csr_batcher_t idx_batcher, + csr_batcher_t query_batcher, + value_t* batch_dists, + value_idx* batch_indices, + value_t* out_dists, + value_idx* out_indices) + { + // populate batch indices array + value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows(); + + // build translation buffer to shift resulting indices by the batch + std::vector id_ranges; + id_ranges.push_back(0); + id_ranges.push_back(idx_batcher.batch_start()); + + // in the case where the number of idx rows in the batch is < k, we + // want to adjust k. 
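+    // The current index batch's top-k lands in a merge-buffer partition; results
+    // from earlier index batches are folded in afterwards by merge_batches /
+    // knn_merge_parts, which also shifts the batch-local column ids by
+    // idx_batcher.batch_start() to turn them into global index row ids.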
+ value_idx n_neighbors = std::min(static_cast(k), batch_cols); + + bool ascending = cuvs::distance::is_min_close(metric); + + // kernel to slice first (min) k cols and copy into batched merge buffer + cuvs::selection::select_k( + handle, + raft::make_device_matrix_view(batch_dists, batch_rows, batch_cols), + raft::make_device_matrix_view( + batch_indices, batch_rows, batch_cols), + raft::make_device_matrix_view(out_dists, batch_rows, n_neighbors), + raft::make_device_matrix_view(out_indices, batch_rows, n_neighbors), + ascending, + true); + } + + void compute_distances(csr_batcher_t& idx_batcher, + csr_batcher_t& query_batcher, + size_t idx_batch_nnz, + size_t query_batch_nnz, + value_idx* idx_batch_indptr, + value_idx* idx_batch_indices, + value_t* idx_batch_data, + value_idx* query_batch_indptr, + value_idx* query_batch_indices, + value_t* query_batch_data, + value_t* batch_dists) + { + /** + * Compute distances + */ + cuvs::distance::detail::sparse::distances_config_t dist_config(handle); + dist_config.b_nrows = idx_batcher.batch_rows(); + dist_config.b_ncols = n_idx_cols; + dist_config.b_nnz = idx_batch_nnz; + + dist_config.b_indptr = idx_batch_indptr; + dist_config.b_indices = idx_batch_indices; + dist_config.b_data = idx_batch_data; + + dist_config.a_nrows = query_batcher.batch_rows(); + dist_config.a_ncols = n_query_cols; + dist_config.a_nnz = query_batch_nnz; + + dist_config.a_indptr = query_batch_indptr; + dist_config.a_indices = query_batch_indices; + dist_config.a_data = query_batch_data; + + cuvs::distance::pairwiseDistance(batch_dists, dist_config, metric, metricArg); + } + + const value_idx *idxIndptr, *idxIndices, *queryIndptr, *queryIndices; + value_idx* output_indices; + const value_t *idxData, *queryData; + value_t* output_dists; + + size_t idxNNZ, queryNNZ, batch_size_index, batch_size_query; + + cuvs::distance::DistanceType metric; + + float metricArg; + + int n_idx_rows, n_idx_cols, n_query_rows, n_query_cols, k; + + raft::resources const& handle; +}; + +}; // namespace cuvs::neighbors::detail diff --git a/cpp/src/neighbors/iface/iface.hpp b/cpp/src/neighbors/iface/iface.hpp index a329db429..9b3da75a4 100644 --- a/cpp/src/neighbors/iface/iface.hpp +++ b/cpp/src/neighbors/iface/iface.hpp @@ -1,4 +1,20 @@ -#include +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once #include #include @@ -6,6 +22,9 @@ #include #include +#include +#include + namespace cuvs::neighbors { using namespace raft; @@ -16,7 +35,7 @@ void build(const raft::device_resources& handle, const cuvs::neighbors::index_params* index_params, raft::mdspan, row_major, Accessor> index_dataset) { - interface.mutex_->lock(); + std::lock_guard(*interface.mutex_); if constexpr (std::is_same>::value) { auto idx = cuvs::neighbors::ivf_flat::build( @@ -32,8 +51,6 @@ void build(const raft::device_resources& handle, interface.index_.emplace(std::move(idx)); } resource::sync_stream(handle); - - interface.mutex_->unlock(); } template @@ -44,7 +61,7 @@ void extend( std::optional, layout_c_contiguous, Accessor2>> new_indices) { - interface.mutex_->lock(); + std::lock_guard(*interface.mutex_); if constexpr (std::is_same>::value) { auto idx = @@ -58,8 +75,6 @@ void extend( RAFT_FAIL("CAGRA does not implement the extend method"); } resource::sync_stream(handle); - - interface.mutex_->unlock(); } template @@ -70,7 +85,7 @@ void search(const raft::device_resources& handle, raft::device_matrix_view neighbors, raft::device_matrix_view distances) { - // interface.mutex_->lock(); + // std::lock_guard(*interface.mutex_); if constexpr (std::is_same>::value) { cuvs::neighbors::ivf_flat::search( handle, @@ -94,9 +109,7 @@ void search(const raft::device_resources& handle, neighbors, distances); } - resource::sync_stream(handle); - - // interface.mutex_->unlock(); + // resource::sync_stream(handle); } // for MG ANN only @@ -108,7 +121,7 @@ void search(const raft::device_resources& handle, raft::device_matrix_view d_neighbors, raft::device_matrix_view d_distances) { - // interface.mutex_->lock(); + // std::lock_guard(*interface.mutex_); int64_t n_rows = h_queries.extent(0); int64_t n_dims = h_queries.extent(1); @@ -120,8 +133,6 @@ void search(const raft::device_resources& handle, auto d_query_view = raft::make_const_mdspan(d_queries.view()); search(handle, interface, search_params, d_query_view, d_neighbors, d_distances); - - // interface.mutex_->unlock(); } template @@ -129,7 +140,7 @@ void serialize(const raft::device_resources& handle, const cuvs::neighbors::iface& interface, std::ostream& os) { - interface.mutex_->lock(); + std::lock_guard(*interface.mutex_); if constexpr (std::is_same>::value) { ivf_flat::serialize(handle, os, interface.index_.value()); @@ -138,8 +149,6 @@ void serialize(const raft::device_resources& handle, } else if constexpr (std::is_same>::value) { cagra::serialize(handle, os, interface.index_.value(), true); } - - interface.mutex_->unlock(); } template @@ -147,7 +156,7 @@ void deserialize(const raft::device_resources& handle, cuvs::neighbors::iface& interface, std::istream& is) { - interface.mutex_->lock(); + std::lock_guard(*interface.mutex_); if constexpr (std::is_same>::value) { ivf_flat::index idx(handle); @@ -162,8 +171,6 @@ void deserialize(const raft::device_resources& handle, cagra::deserialize(handle, is, &idx); interface.index_.emplace(std::move(idx)); } - - interface.mutex_->unlock(); } template @@ -171,7 +178,7 @@ void deserialize(const raft::device_resources& handle, cuvs::neighbors::iface& interface, const std::string& filename) { - interface.mutex_->lock(); + std::lock_guard(*interface.mutex_); std::ifstream is(filename, std::ios::in | std::ios::binary); if (!is) { RAFT_FAIL("Cannot open file %s", filename.c_str()); } @@ -191,8 +198,6 @@ void deserialize(const raft::device_resources& handle, } is.close(); - - interface.mutex_->unlock(); } -}; // namespace 
cuvs::neighbors \ No newline at end of file +}; // namespace cuvs::neighbors diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh index fb110d810..d6ffc1218 100644 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh @@ -132,6 +132,10 @@ RAFT_KERNEL build_index_kernel(const LabelT* labels, { const IdxT i = IdxT(blockDim.x) * IdxT(blockIdx.x) + threadIdx.x; if (i >= n_rows) { return; } + auto source_ix = source_ixs == nullptr ? i + batch_offset : source_ixs[i]; + // In the context of refinement, some indices may be invalid (the generating NN algorithm does + // not return enough valid items). Do not add the item to the index in this case. + if (source_ix == ivf::kInvalidRecord || source_ix == raft::upper_bound()) { return; } auto list_id = labels[i]; auto inlist_id = atomicAdd(list_sizes_ptr + list_id, 1); @@ -139,7 +143,7 @@ RAFT_KERNEL build_index_kernel(const LabelT* labels, auto* list_data = list_data_ptrs[list_id]; // Record the source vector id in the index - list_index[inlist_id] = source_ixs == nullptr ? i + batch_offset : source_ixs[i]; + list_index[inlist_id] = source_ix; // The data is written in interleaved groups of `index::kGroupSize` vectors using interleaved_group = raft::Pow2; @@ -151,7 +155,7 @@ RAFT_KERNEL build_index_kernel(const LabelT* labels, // Point to the source vector if constexpr (gather_src) { - source_vecs += source_ixs[i] * dim; + source_vecs += source_ix * dim; } else { source_vecs += i * dim; } diff --git a/cpp/src/neighbors/ivf_flat_c.cpp b/cpp/src/neighbors/ivf_flat_c.cpp old mode 100755 new mode 100644 index c14c1edc0..2acc6b678 --- a/cpp/src/neighbors/ivf_flat_c.cpp +++ b/cpp/src/neighbors/ivf_flat_c.cpp @@ -29,6 +29,8 @@ #include #include +#include + namespace { template diff --git a/cpp/src/neighbors/mg/mg.cuh b/cpp/src/neighbors/mg/mg.cuh index d3f635bc4..e9cdc30f6 100644 --- a/cpp/src/neighbors/mg/mg.cuh +++ b/cpp/src/neighbors/mg/mg.cuh @@ -25,6 +25,8 @@ #include #include +#include + namespace cuvs::neighbors { using namespace raft; diff --git a/cpp/src/neighbors/nn_descent.cuh b/cpp/src/neighbors/nn_descent.cuh index 582da72c1..ed91dac91 100644 --- a/cpp/src/neighbors/nn_descent.cuh +++ b/cpp/src/neighbors/nn_descent.cuh @@ -17,9 +17,14 @@ #pragma once #include "detail/nn_descent.cuh" +#include "detail/nn_descent_batch.cuh" + +#include +#include #include #include +#include #include namespace cuvs::neighbors::nn_descent { @@ -61,7 +66,15 @@ auto build(raft::resources const& res, index_params const& params, raft::device_matrix_view dataset) -> index { - return detail::build(res, params, dataset); + if (params.n_clusters > 1) { + if constexpr (std::is_same_v) { + return detail::experimental::batch_build(res, params, dataset); + } else { + RAFT_FAIL("Batched nn-descent is only supported for float precision"); + } + } else { + return detail::build(res, params, dataset); + } } /** @@ -100,7 +113,15 @@ void build(raft::resources const& res, raft::device_matrix_view dataset, index& idx) { - detail::build(res, params, dataset, idx); + if (params.n_clusters > 1) { + if constexpr (std::is_same_v) { + detail::experimental::batch_build(res, params, dataset, idx); + } else { + RAFT_FAIL("Batched nn-descent is only supported for float precision"); + } + } else { + detail::build(res, params, dataset, idx); + } } /** @@ -135,7 +156,15 @@ auto build(raft::resources const& res, index_params const& params, raft::host_matrix_view dataset) -> index { - return 
detail::build(res, params, dataset); + if (params.n_clusters > 1) { + if constexpr (std::is_same_v) { + return detail::experimental::batch_build(res, params, dataset); + } else { + RAFT_FAIL("Batched nn-descent is only supported for float precision"); + } + } else { + return detail::build(res, params, dataset); + } } /** @@ -174,7 +203,15 @@ void build(raft::resources const& res, raft::host_matrix_view dataset, index& idx) { - detail::build(res, params, dataset, idx); + if (params.n_clusters > 1) { + if constexpr (std::is_same_v) { + detail::experimental::batch_build(res, params, dataset, idx); + } else { + RAFT_FAIL("Batched nn-descent is only supported for float precision"); + } + } else { + detail::build(res, params, dataset, idx); + } } /** @} */ // end group nn-descent diff --git a/cpp/src/neighbors/nn_descent_float.cu b/cpp/src/neighbors/nn_descent_float.cu index c6d356671..fa85db127 100644 --- a/cpp/src/neighbors/nn_descent_float.cu +++ b/cpp/src/neighbors/nn_descent_float.cu @@ -19,21 +19,38 @@ namespace cuvs::neighbors::nn_descent { -#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ - }; \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::host_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ +#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::device_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + }; \ + } \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::host_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ }; CUVS_INST_NN_DESCENT_BUILD(float, uint32_t); diff --git a/cpp/src/neighbors/nn_descent_half.cu b/cpp/src/neighbors/nn_descent_half.cu index 587993031..2ee45d435 100644 --- a/cpp/src/neighbors/nn_descent_half.cu +++ b/cpp/src/neighbors/nn_descent_half.cu @@ -19,21 +19,39 @@ namespace cuvs::neighbors::nn_descent { -#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ - }; \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - 
raft::host_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ +#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::device_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ + }; \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::host_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ }; CUVS_INST_NN_DESCENT_BUILD(half, uint32_t); diff --git a/cpp/src/neighbors/nn_descent_index.cpp b/cpp/src/neighbors/nn_descent_index.cpp new file mode 100644 index 000000000..25d5b6af8 --- /dev/null +++ b/cpp/src/neighbors/nn_descent_index.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +namespace cuvs::neighbors::nn_descent { + +index_params::index_params(size_t graph_degree, cuvs::distance::DistanceType metric) +{ + this->graph_degree = graph_degree; + this->intermediate_graph_degree = 1.5 * graph_degree; + this->metric = metric; +} +} // namespace cuvs::neighbors::nn_descent \ No newline at end of file diff --git a/cpp/src/neighbors/nn_descent_int8.cu b/cpp/src/neighbors/nn_descent_int8.cu index 813a01746..e150f511b 100644 --- a/cpp/src/neighbors/nn_descent_int8.cu +++ b/cpp/src/neighbors/nn_descent_int8.cu @@ -19,21 +19,39 @@ namespace cuvs::neighbors::nn_descent { -#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ - }; \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::host_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ +#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::device_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ + }; \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::host_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ }; CUVS_INST_NN_DESCENT_BUILD(int8_t, uint32_t); diff --git a/cpp/src/neighbors/nn_descent_uint8.cu b/cpp/src/neighbors/nn_descent_uint8.cu index 9d73dd90f..d8657777b 100644 --- a/cpp/src/neighbors/nn_descent_uint8.cu +++ b/cpp/src/neighbors/nn_descent_uint8.cu @@ -19,21 +19,39 @@ namespace cuvs::neighbors::nn_descent { -#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ - }; \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::host_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ +#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::device_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if 
(!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ + }; \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::host_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ }; CUVS_INST_NN_DESCENT_BUILD(uint8_t, uint32_t); diff --git a/cpp/src/neighbors/sparse_brute_force.cu b/cpp/src/neighbors/sparse_brute_force.cu new file mode 100644 index 000000000..e277961ec --- /dev/null +++ b/cpp/src/neighbors/sparse_brute_force.cu @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "detail/sparse_knn.cuh" + +namespace cuvs::neighbors::brute_force { +template +sparse_index::sparse_index(raft::resources const& res, + raft::device_csr_matrix_view dataset, + cuvs::distance::DistanceType metric, + T metric_arg) + : dataset_(dataset), metric_(metric), metric_arg_(metric_arg) +{ +} + +auto build(raft::resources const& handle, + raft::device_csr_matrix_view dataset, + cuvs::distance::DistanceType metric, + float metric_arg) -> cuvs::neighbors::brute_force::sparse_index +{ + return sparse_index(handle, dataset, metric, metric_arg); +} + +void search(raft::resources const& handle, + const sparse_search_params& params, + const sparse_index& index, + raft::device_csr_matrix_view query, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) +{ + auto idx_structure = index.dataset().structure_view(); + auto query_structure = query.structure_view(); + int k = neighbors.extent(1); + + detail::sparse_knn_t(idx_structure.get_indptr().data(), + idx_structure.get_indices().data(), + index.dataset().get_elements().data(), + idx_structure.get_nnz(), + idx_structure.get_n_rows(), + idx_structure.get_n_cols(), + query_structure.get_indptr().data(), + query_structure.get_indices().data(), + query.get_elements().data(), + query_structure.get_nnz(), + query_structure.get_n_rows(), + query_structure.get_n_cols(), + neighbors.data_handle(), + distances.data_handle(), + k, + handle, + params.batch_size_index, + params.batch_size_query, + index.metric(), + index.metric_arg()) + .run(); +} +} // namespace cuvs::neighbors::brute_force diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 1ed8466b3..286d721d7 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -94,7 +94,7 @@ 
endfunction() if(BUILD_TESTS) ConfigureTest( NAME NEIGHBORS_TEST PATH neighbors/brute_force.cu neighbors/brute_force_prefiltered.cu - neighbors/refine.cu GPUS 1 PERCENT 100 + neighbors/sparse_brute_force.cu neighbors/refine.cu GPUS 1 PERCENT 100 ) ConfigureTest( @@ -137,6 +137,7 @@ if(BUILD_TESTS) NAME NEIGHBORS_ANN_CAGRA_TEST PATH + neighbors/ann_cagra/bug_extreme_inputs_oob.cu neighbors/ann_cagra/bug_multi_cta_crash.cu neighbors/ann_cagra/test_float_uint32_t.cu neighbors/ann_cagra/test_half_uint32_t.cu @@ -205,6 +206,7 @@ if(BUILD_TESTS) distance/dist_lp_unexp.cu distance/dist_russell_rao.cu distance/masked_nn.cu + distance/sparse_distance.cu sparse/neighbors/cross_component_nn.cu GPUS 1 diff --git a/cpp/test/distance/sparse_distance.cu b/cpp/test/distance/sparse_distance.cu new file mode 100644 index 000000000..f95487414 --- /dev/null +++ b/cpp/test/distance/sparse_distance.cu @@ -0,0 +1,850 @@ +/* + * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.cuh" + +#include +#include +#include +#include + +#include + +#include +#include + +namespace cuvs { +namespace distance { + +using namespace raft; +using namespace raft::sparse; + +template +struct SparseDistanceInputs { + value_idx n_cols; + + std::vector indptr_h; + std::vector indices_h; + std::vector data_h; + + std::vector out_dists_ref_h; + + cuvs::distance::DistanceType metric; + + float metric_arg = 0.0; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const SparseDistanceInputs& dims) +{ + return os; +} + +template +class SparseDistanceTest + : public ::testing::TestWithParam> { + public: + SparseDistanceTest() + : params(::testing::TestWithParam>::GetParam()), + indptr(0, resource::get_cuda_stream(handle)), + indices(0, resource::get_cuda_stream(handle)), + data(0, resource::get_cuda_stream(handle)), + out_dists(0, resource::get_cuda_stream(handle)), + out_dists_ref(0, resource::get_cuda_stream(handle)) + { + } + + void SetUp() override + { + make_data(); + + int out_size = static_cast(params.indptr_h.size() - 1) * + static_cast(params.indptr_h.size() - 1); + + out_dists.resize(out_size, resource::get_cuda_stream(handle)); + + auto out = raft::make_device_matrix_view( + out_dists.data(), + static_cast(params.indptr_h.size() - 1), + static_cast(params.indptr_h.size() - 1)); + + auto x_structure = raft::make_device_compressed_structure_view( + indptr.data(), + indices.data(), + static_cast(params.indptr_h.size() - 1), + params.n_cols, + static_cast(params.indices_h.size())); + auto x = raft::make_device_csr_matrix_view(data.data(), x_structure); + + cuvs::distance::pairwise_distance(handle, x, x, out, params.metric, params.metric_arg); + + RAFT_CUDA_TRY(cudaStreamSynchronize(resource::get_cuda_stream(handle))); + } + + void compare() + { + ASSERT_TRUE(devArrMatch(out_dists_ref.data(), + out_dists.data(), + params.out_dists_ref_h.size(), + CompareApprox(1e-3))); + } + + protected: + void make_data() + { + std::vector indptr_h = 
params.indptr_h; + std::vector indices_h = params.indices_h; + std::vector data_h = params.data_h; + + auto stream = resource::get_cuda_stream(handle); + indptr.resize(indptr_h.size(), stream); + indices.resize(indices_h.size(), stream); + data.resize(data_h.size(), stream); + + update_device(indptr.data(), indptr_h.data(), indptr_h.size(), stream); + update_device(indices.data(), indices_h.data(), indices_h.size(), stream); + update_device(data.data(), data_h.data(), data_h.size(), stream); + + std::vector out_dists_ref_h = params.out_dists_ref_h; + + out_dists_ref.resize((indptr_h.size() - 1) * (indptr_h.size() - 1), stream); + + update_device(out_dists_ref.data(), + out_dists_ref_h.data(), + out_dists_ref_h.size(), + resource::get_cuda_stream(handle)); + } + + raft::resources handle; + + // input data + rmm::device_uvector indptr, indices; + rmm::device_uvector data; + + // output data + rmm::device_uvector out_dists, out_dists_ref; + + SparseDistanceInputs params; +}; + +const std::vector> inputs_i32_f = { + {5, + {0, 0, 1, 2}, + + {1, 2}, + {0.5, 0.5}, + {0, 1, 1, 1, 0, 1, 1, 1, 0}, + cuvs::distance::DistanceType::CosineExpanded, + 0.0}, + {5, + {0, 0, 1, 2}, + + {1, 2}, + {1.0, 1.0}, + {0, 1, 1, 1, 0, 1, 1, 1, 0}, + cuvs::distance::DistanceType::JaccardExpanded, + 0.0}, + {2, + {0, 2, 4, 6, 8}, + {0, 1, 0, 1, 0, 1, 0, 1}, // indices + {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f}, + { + // dense output + 0.0, + 4.0, + 3026.0, + 226.0, + 4.0, + 0.0, + 2930.0, + 234.0, + 3026.0, + 2930.0, + 0.0, + 1832.0, + 226.0, + 234.0, + 1832.0, + 0.0, + }, + cuvs::distance::DistanceType::L2Expanded, + 0.0}, + {2, + {0, 2, 4, 6, 8}, + {0, 1, 0, 1, 0, 1, 0, 1}, + {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}, + {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0}, + cuvs::distance::DistanceType::InnerProduct, + 0.0}, + {2, + {0, 2, 4, 6, 8}, + {0, 1, 0, 1, 0, 1, 0, 1}, // indices + {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f}, + { + // dense output + 0.0, + 4.0, + 3026.0, + 226.0, + 4.0, + 0.0, + 2930.0, + 234.0, + 3026.0, + 2930.0, + 0.0, + 1832.0, + 226.0, + 234.0, + 1832.0, + 0.0, + }, + cuvs::distance::DistanceType::L2Unexpanded, + 0.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0., 0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219, 0.58146987, 0.44940102, + 1., 0.76978799, 0.39419924, 0., 0.97577154, 0.48904013, 0.48300801, 0.45087445, + 0.73323749, 0.21050481, 0.54847744, 0.78021386, 0.54823225, 0.97577154, 0., 0.51413997, + 0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819, 1., 0.79593037, 0.48904013, + 0.51413997, 0., 0.28605559, 0.35772784, 1., 0.60889396, 0.43324829, 0.84923694, + 0.45658883, 0.48300801, 0.31195441, 0.28605559, 0., 0.58623212, 0.6745457, 0.60287165, + 0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784, 0.58623212, 0., + 0.77917274, 0.48390993, 0.24558392, 0.99166225, 0.58146987, 0.73323749, 0.67534399, 1., + 
0.6745457, 0.77917274, 0., 0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481, + 0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0., 0.51360432, 0.68185144, + 1., 0.54847744, 0.8321819, 0.43324829, 0.67676228, 0.24558392, 0.76064776, 0.51360432, + 0., 1., 0.76978799, 0.78021386, 1., 0.84923694, 0.73155632, 0.99166225, + 0.61547536, 0.68185144, 1., 0.}, + cuvs::distance::DistanceType::CosineExpanded, + 0.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, + {0.0, + 0.42857142857142855, + 0.7142857142857143, + 0.75, + 0.2857142857142857, + 0.75, + 0.7142857142857143, + 0.5, + 1.0, + 0.6666666666666666, + 0.42857142857142855, + 0.0, + 0.75, + 0.625, + 0.375, + 0.42857142857142855, + 0.75, + 0.375, + 0.75, + 0.7142857142857143, + 0.7142857142857143, + 0.75, + 0.0, + 0.7142857142857143, + 0.42857142857142855, + 0.7142857142857143, + 0.6666666666666666, + 0.625, + 0.6666666666666666, + 1.0, + 0.75, + 0.625, + 0.7142857142857143, + 0.0, + 0.5, + 0.5714285714285714, + 1.0, + 0.8, + 0.5, + 0.6666666666666666, + 0.2857142857142857, + 0.375, + 0.42857142857142855, + 0.5, + 0.0, + 0.6666666666666666, + 0.7777777777777778, + 0.4444444444444444, + 0.7777777777777778, + 0.75, + 0.75, + 0.42857142857142855, + 0.7142857142857143, + 0.5714285714285714, + 0.6666666666666666, + 0.0, + 0.7142857142857143, + 0.5, + 0.5, + 0.8571428571428571, + 0.7142857142857143, + 0.75, + 0.6666666666666666, + 1.0, + 0.7777777777777778, + 0.7142857142857143, + 0.0, + 0.42857142857142855, + 0.8571428571428571, + 0.8333333333333334, + 0.5, + 0.375, + 0.625, + 0.8, + 0.4444444444444444, + 0.5, + 0.42857142857142855, + 0.0, + 0.7777777777777778, + 0.75, + 1.0, + 0.75, + 0.6666666666666666, + 0.5, + 0.7777777777777778, + 0.5, + 0.8571428571428571, + 0.7777777777777778, + 0.0, + 1.0, + 0.6666666666666666, + 0.7142857142857143, + 1.0, + 0.6666666666666666, + 0.75, + 0.8571428571428571, + 0.8333333333333334, + 0.75, + 1.0, + 0.0}, + cuvs::distance::DistanceType::JaccardExpanded, + 0.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 3.3954660629919076, + 5.6469232737388815, + 6.373112846266441, + 4.0212880272531715, + 6.916281504639404, + 5.741508386786526, + 5.411470999663036, + 9.0, + 4.977014354725805, + 3.3954660629919076, + 0.0, + 7.56256082439209, + 5.540261147481582, + 4.832322929216881, + 4.62003193872216, + 6.498056792320361, + 4.309846252268695, + 6.317531174829905, + 6.016362684141827, + 5.6469232737388815, + 7.56256082439209, + 0.0, + 5.974878731322299, + 4.898357301336036, + 6.442097410320605, + 5.227077347287883, + 7.134101195584642, + 
5.457753923371659, + 7.0, + 6.373112846266441, + 5.540261147481582, + 5.974878731322299, + 0.0, + 5.5507273748583, + 4.897749658726415, + 9.0, + 8.398776718824767, + 3.908281400328807, + 4.83431066343688, + 4.0212880272531715, + 4.832322929216881, + 4.898357301336036, + 5.5507273748583, + 0.0, + 6.632989819428174, + 7.438852294822894, + 5.6631570310967465, + 7.579428202635459, + 6.760811985364303, + 6.916281504639404, + 4.62003193872216, + 6.442097410320605, + 4.897749658726415, + 6.632989819428174, + 0.0, + 5.249404187382862, + 6.072559523278559, + 4.07661278488929, + 6.19678948003145, + 5.741508386786526, + 6.498056792320361, + 5.227077347287883, + 9.0, + 7.438852294822894, + 5.249404187382862, + 0.0, + 3.854811639654704, + 6.652724827169063, + 5.298236851430971, + 5.411470999663036, + 4.309846252268695, + 7.134101195584642, + 8.398776718824767, + 5.6631570310967465, + 6.072559523278559, + 3.854811639654704, + 0.0, + 7.529184598969917, + 6.903282911791188, + 9.0, + 6.317531174829905, + 5.457753923371659, + 3.908281400328807, + 7.579428202635459, + 4.07661278488929, + 6.652724827169063, + 7.529184598969917, + 0.0, + 7.0, + 4.977014354725805, + 6.016362684141827, + 7.0, + 4.83431066343688, + 6.760811985364303, + 6.19678948003145, + 5.298236851430971, + 6.903282911791188, + 7.0, + 0.0}, + cuvs::distance::DistanceType::Canberra, + 0.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 1.31462855332296, + 1.3690307816129905, + 1.698603990921237, + 1.3460470789553531, + 1.6636670712582544, + 1.2651744044972217, + 1.1938329352055201, + 1.8811409082590185, + 1.3653115050624267, + 1.31462855332296, + 0.0, + 1.9447722703291133, + 1.42818777206562, + 1.4685491458946494, + 1.3071999866010466, + 1.4988622861692171, + 0.9698559287406783, + 1.4972023224597841, + 1.5243383567266802, + 1.3690307816129905, + 1.9447722703291133, + 0.0, + 1.2748400840107568, + 1.0599569946448246, + 1.546591282841402, + 1.147526531928459, + 1.447002179128145, + 1.5982242387673176, + 1.3112533607072414, + 1.698603990921237, + 1.42818777206562, + 1.2748400840107568, + 0.0, + 1.038121552545461, + 1.011788365364402, + 1.3907391109256988, + 1.3128200942311496, + 1.19595706584447, + 1.3233328139624725, + 1.3460470789553531, + 1.4685491458946494, + 1.0599569946448246, + 1.038121552545461, + 0.0, + 1.3642741698145529, + 1.3493868683808095, + 1.394942694628328, + 1.572881849642552, + 1.380122665319464, + 1.6636670712582544, + 1.3071999866010466, + 1.546591282841402, + 1.011788365364402, + 1.3642741698145529, + 0.0, + 1.018961640373018, + 1.0114394258945634, + 0.8338711034820684, + 1.1247823842299223, + 1.2651744044972217, + 1.4988622861692171, + 1.147526531928459, + 1.3907391109256988, + 1.3493868683808095, + 1.018961640373018, + 0.0, + 0.7701238110357329, + 1.245486437864406, + 0.5551259549534626, + 1.1938329352055201, + 0.9698559287406783, + 1.447002179128145, + 1.3128200942311496, + 1.394942694628328, + 1.0114394258945634, + 0.7701238110357329, + 
0.0, + 1.1886800117391216, + 1.0083692448135637, + 1.8811409082590185, + 1.4972023224597841, + 1.5982242387673176, + 1.19595706584447, + 1.572881849642552, + 0.8338711034820684, + 1.245486437864406, + 1.1886800117391216, + 0.0, + 1.3661374102525012, + 1.3653115050624267, + 1.5243383567266802, + 1.3112533607072414, + 1.3233328139624725, + 1.380122665319464, + 1.1247823842299223, + 0.5551259549534626, + 1.0083692448135637, + 1.3661374102525012, + 0.0}, + cuvs::distance::DistanceType::LpUnexpanded, + 2.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 0.9251771844789913, + 0.9036452083899731, + 0.9251771844789913, + 0.8706483735804971, + 0.9251771844789913, + 0.717493881903289, + 0.6920214832303888, + 0.9251771844789913, + 0.9251771844789913, + 0.9251771844789913, + 0.0, + 0.9036452083899731, + 0.8655339692155823, + 0.8706483735804971, + 0.8655339692155823, + 0.8655339692155823, + 0.6329837991017668, + 0.8655339692155823, + 0.8655339692155823, + 0.9036452083899731, + 0.9036452083899731, + 0.0, + 0.7988276152181608, + 0.7028075145996631, + 0.9036452083899731, + 0.9036452083899731, + 0.9036452083899731, + 0.8429599432532096, + 0.9036452083899731, + 0.9251771844789913, + 0.8655339692155823, + 0.7988276152181608, + 0.0, + 0.48376552205293305, + 0.8206394616536681, + 0.8206394616536681, + 0.8206394616536681, + 0.8429599432532096, + 0.8206394616536681, + 0.8706483735804971, + 0.8706483735804971, + 0.7028075145996631, + 0.48376552205293305, + 0.0, + 0.8706483735804971, + 0.8706483735804971, + 0.8706483735804971, + 0.8429599432532096, + 0.8706483735804971, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.0, + 0.8853924473642432, + 0.535821510936138, + 0.6497196601457607, + 0.8853924473642432, + 0.717493881903289, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.0, + 0.5279604218147174, + 0.6658348373853169, + 0.33799874888632914, + 0.6920214832303888, + 0.6329837991017668, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.535821510936138, + 0.5279604218147174, + 0.0, + 0.662579808115858, + 0.5079750812968089, + 0.9251771844789913, + 0.8655339692155823, + 0.8429599432532096, + 0.8429599432532096, + 0.8429599432532096, + 0.6497196601457607, + 0.6658348373853169, + 0.662579808115858, + 0.0, + 0.8429599432532096, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.33799874888632914, + 0.5079750812968089, + 0.8429599432532096, + 0.0}, + cuvs::distance::DistanceType::Linf, + 0.0}, + + {15, + {0, 5, 8, 9, 15, 20, 26, 31, 34, 38, 45}, + {0, 1, 5, 6, 9, 1, 4, 14, 7, 3, 4, 7, 9, 11, 14, 0, 3, 7, 8, 12, 0, 2, 5, + 7, 8, 14, 4, 9, 10, 11, 13, 4, 10, 14, 5, 6, 8, 9, 0, 2, 3, 4, 6, 10, 11}, + {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507, 0.73789274, 0.08450219, + 1., 
0.20184723, 0.18036963, 0.12581403, 0.13867603, 0.24040536, 0.11288773, 0.00290246, + 0.09120187, 0.31190555, 0.43245423, 0.16153588, 0.3233026, 0.05279589, 0.1387149, 0.05962761, + 0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881, 0.15605804, 0.3867739, + 0.24908977, 0.36413632, 0.37643732, 0.28910679, 0.0198409, 0.31461499, 0.24412279, 0.08327667, + 0.04444576, 0.05047969, 0.26190054, 0.2077349, 0.10803964}, + {1.05367121e-08, 8.35309089e-01, 1.00000000e+00, 9.24116813e-01, + 9.90039274e-01, 7.97613546e-01, 8.91271059e-01, 1.00000000e+00, + 6.64669302e-01, 8.59439512e-01, 8.35309089e-01, 1.05367121e-08, + 1.00000000e+00, 7.33151506e-01, 1.00000000e+00, 9.86880955e-01, + 9.19154851e-01, 5.38849774e-01, 1.00000000e+00, 8.98332369e-01, + 1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 8.03303970e-01, + 6.64465915e-01, 8.69374690e-01, 1.00000000e+00, 1.00000000e+00, + 1.00000000e+00, 1.00000000e+00, 9.24116813e-01, 7.33151506e-01, + 8.03303970e-01, 0.00000000e+00, 8.16225843e-01, 9.39818306e-01, + 7.27700415e-01, 7.30155528e-01, 8.89451011e-01, 8.05419635e-01, + 9.90039274e-01, 1.00000000e+00, 6.64465915e-01, 8.16225843e-01, + 0.00000000e+00, 6.38804490e-01, 1.00000000e+00, 1.00000000e+00, + 9.52559809e-01, 9.53789212e-01, 7.97613546e-01, 9.86880955e-01, + 8.69374690e-01, 9.39818306e-01, 6.38804490e-01, 0.0, + 1.00000000e+00, 9.72569112e-01, 8.24907516e-01, 8.07933016e-01, + 8.91271059e-01, 9.19154851e-01, 1.00000000e+00, 7.27700415e-01, + 1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 7.63596268e-01, + 8.40131263e-01, 7.40428532e-01, 1.00000000e+00, 5.38849774e-01, + 1.00000000e+00, 7.30155528e-01, 1.00000000e+00, 9.72569112e-01, + 7.63596268e-01, 0.00000000e+00, 1.00000000e+00, 7.95485011e-01, + 6.64669302e-01, 1.00000000e+00, 1.00000000e+00, 8.89451011e-01, + 9.52559809e-01, 8.24907516e-01, 8.40131263e-01, 1.00000000e+00, + 0.00000000e+00, 8.51370877e-01, 8.59439512e-01, 8.98332369e-01, + 1.00000000e+00, 8.05419635e-01, 9.53789212e-01, 8.07933016e-01, + 7.40428532e-01, 7.95485011e-01, 8.51370877e-01, 1.49011612e-08}, + // Dataset is L1 normalized into pdfs + cuvs::distance::DistanceType::HellingerExpanded, + 0.0}, + + {4, + {0, 1, 1, 2, 4}, + {3, 2, 0, 1}, // indices + {0.99296, 0.42180, 0.11687, 0.305869}, + { + // dense output + 0.0, + 0.99296, + 1.41476, + 1.415707, + 0.99296, + 0.0, + 0.42180, + 0.42274, + 1.41476, + 0.42180, + 0.0, + 0.84454, + 1.41570, + 0.42274, + 0.84454, + 0.0, + }, + cuvs::distance::DistanceType::L1, + 0.0}, + {5, + {0, 3, 8, 12, 16, 20, 25, 30, 35, 40, 45}, + {0, 3, 4, 0, 1, 2, 3, 4, 1, 2, 3, 4, 0, 2, 3, 4, 0, 1, 3, 4, 0, 1, 2, + 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4}, + {0.70862347, 0.8232774, 0.12108795, 0.84527547, 0.94937088, 0.03258545, 0.99584118, 0.76835667, + 0.34426657, 0.2357925, 0.01274851, 0.11422017, 0.3437756, 0.31967718, 0.5956055, 0.31610373, + 0.04147273, 0.03724415, 0.21515727, 0.04751052, 0.50283183, 0.99957274, 0.01395933, 0.96032529, + 0.88438711, 0.46095378, 0.27432481, 0.54294211, 0.54280225, 0.59503329, 0.61364678, 0.22837736, + 0.56609561, 0.29809423, 0.76736686, 0.56460608, 0.98165371, 0.02140123, 0.19881268, 0.26057815, + 0.31648823, 0.89874295, 0.27366735, 0.5119944, 0.11416134}, + {// dense output + 0., 0.48769777, 1.88014197, 0.26127048, 0.26657011, 0.7874794, 0.76962708, 1.122858, + 1.1232498, 1.08166081, 0.48769777, 0., 1.31332116, 0.98318907, 0.42661815, 0.09279052, + 1.35187836, 1.38429055, 0.40658897, 0.56136388, 1.88014197, 1.31332116, 0., 1.82943642, + 1.54826077, 1.05918884, 
1.59360067, 1.34698954, 0.60215168, 0.46993848, 0.26127048, 0.98318907, + 1.82943642, 0., 0.29945563, 1.08494093, 0.22934281, 0.82801925, 1.74288748, 1.50610116, + 0.26657011, 0.42661815, 1.54826077, 0.29945563, 0., 0.45060069, 0.77814948, 1.45245711, + 1.18328348, 0.82486987, 0.7874794, 0.09279052, 1.05918884, 1.08494093, 0.45060069, 0., + 1.29899154, 1.40683824, 0.48505269, 0.53862363, 0.76962708, 1.35187836, 1.59360067, 0.22934281, + 0.77814948, 1.29899154, 0., 0.33202426, 1.92108999, 1.88812175, 1.122858, 1.38429055, + 1.34698954, 0.82801925, 1.45245711, 1.40683824, 0.33202426, 0., 1.47318624, 1.92660889, + 1.1232498, 0.40658897, 0.60215168, 1.74288748, 1.18328348, 0.48505269, 1.92108999, 1.47318624, + 0., 0.24992619, 1.08166081, 0.56136388, 0.46993848, 1.50610116, 0.82486987, 0.53862363, + 1.88812175, 1.92660889, 0.24992619, 0.}, + cuvs::distance::DistanceType::CorrelationExpanded, + 0.0}, + {5, + {0, 1, 2, 4, 4, 5, 6, 7, 9, 9, 10}, + {1, 4, 0, 4, 1, 3, 0, 1, 3, 0}, + {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, + {// dense output + 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 0.8, 1., 1., 1., 1., 1., 1., 1., + 1., 0.8, 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., + 0.8, 1., 1., 1., 0., 1., 1., 0.8, 1., 1., 1., 1., 1., 1., 1., 0., 1., 0.8, 1., 1., + 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 0.8, 0.8, 1., 1., 1., 0.8, 0.8, 1., 0., 1., 1., + 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.8, 1., 1., 1., 0.8, 1., 1., 0.}, + cuvs::distance::DistanceType::RusselRaoExpanded, + 0.0}, + {5, + {0, 1, 1, 3, 3, 4, 4, 6, 9, 10, 10}, + {0, 3, 4, 4, 2, 3, 0, 2, 3, 2}, + {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, + {// dense output + 0., 0.2, 0.6, 0.2, 0.4, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, + 0.6, 0.2, 0., 0.6, 0.4, 0., 0.4, 0.2, 0.4, 0.4, 0.6, 0.6, 0.4, 0.2, 0., 0.4, 0., + 0.2, 0., 0.4, 0.6, 0.2, 0., 0.4, 0.2, 0.2, 0.2, 0., 0.2, 0.6, 0.8, 0.4, 0.2, 0.2, + 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0., 0.6, 0.4, 0.4, 0.4, 0.6, 0.4, 0., 0.2, + 0.2, 0.4, 0.4, 0.6, 0.6, 0.6, 0.8, 0.6, 0.2, 0., 0.4, 0.6, 0.4, 0.2, 0.6, 0.2, 0.4, + 0.2, 0.2, 0.4, 0., 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0.}, + cuvs::distance::DistanceType::HammingUnexpanded, + 0.0}, + {3, + {0, 1, 2}, + {0, 1}, + {1.0, 1.0}, + {0.0, 0.83255, 0.83255, 0.0}, + cuvs::distance::DistanceType::JensenShannon, + 0.0}, + {2, + {0, 1, 3}, + {0, 0, 1}, + {1.0, 0.5, 0.5}, + {0, 0.4645014, 0.4645014, 0}, + cuvs::distance::DistanceType::JensenShannon, + 0.0}, + {3, + {0, 1, 2}, + {0, 0}, + {1.0, 1.0}, + {0.0, 0.0, 0.0, 0.0}, + cuvs::distance::DistanceType::JensenShannon, + 0.0}, + + {3, + {0, 1, 2}, + {0, 1}, + {1.0, 1.0}, + {0.0, 1.0, 1.0, 0.0}, + cuvs::distance::DistanceType::DiceExpanded, + 0.0}, + {3, + {0, 1, 3}, + {0, 0, 1}, + {1.0, 1.0, 1.0}, + {0, 0.333333, 0.333333, 0}, + cuvs::distance::DistanceType::DiceExpanded, + 0.0}, + +}; + +typedef SparseDistanceTest SparseDistanceTestF; +TEST_P(SparseDistanceTestF, Result) { compare(); } +INSTANTIATE_TEST_CASE_P(SparseDistanceTests, + SparseDistanceTestF, + ::testing::ValuesIn(inputs_i32_f)); + +} // end namespace distance +} // end namespace cuvs diff --git a/cpp/test/neighbors/ann_brute_force.cuh b/cpp/test/neighbors/ann_brute_force.cuh index c2afa4e8b..03d6e820c 100644 --- a/cpp/test/neighbors/ann_brute_force.cuh +++ b/cpp/test/neighbors/ann_brute_force.cuh @@ -114,12 +114,28 @@ class AnnBruteForceTest : public ::testing::TestWithParam(handle_); + brute_force::deserialize(handle_, std::string{"brute_force_index"}, &index_loaded); + 
brute_force::search(handle_, - idx, + index_loaded, search_queries_view, indices_out_view, dists_out_view, cuvs::neighbors::filtering::none_sample_filter{}); + raft::resource::sync_stream(handle_); + + ASSERT_TRUE(cuvs::neighbors::devArrMatchKnnPair(indices_naive_dev.data(), + indices_bruteforce_dev.data(), + distances_naive_dev.data(), + distances_bruteforce_dev.data(), + ps.num_queries, + ps.k, + 0.001f, + stream_, + true)); } } diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh index 37d42dd1d..660246c67 100644 --- a/cpp/test/neighbors/ann_cagra.cuh +++ b/cpp/test/neighbors/ann_cagra.cuh @@ -361,8 +361,8 @@ class AnnCagraTest : public ::testing::TestWithParam { // not used for knn_graph building. switch (ps.build_algo) { case graph_build_algo::IVF_PQ: - index_params.graph_build_params = - graph_build_params::ivf_pq_params(raft::matrix_extent(ps.n_rows, ps.dim)); + index_params.graph_build_params = graph_build_params::ivf_pq_params( + raft::matrix_extent(ps.n_rows, ps.dim), index_params.metric); if (ps.ivf_pq_search_refine_ratio) { std::get( index_params.graph_build_params) @@ -370,8 +370,8 @@ class AnnCagraTest : public ::testing::TestWithParam { } break; case graph_build_algo::NN_DESCENT: { - index_params.graph_build_params = - graph_build_params::nn_descent_params(index_params.intermediate_graph_degree); + index_params.graph_build_params = graph_build_params::nn_descent_params( + index_params.intermediate_graph_degree, index_params.metric); break; } case graph_build_algo::AUTO: @@ -389,7 +389,7 @@ class AnnCagraTest : public ::testing::TestWithParam { (const DataT*)database.data(), ps.n_rows, ps.dim); { - cagra::index index(handle_); + cagra::index index(handle_, index_params.metric); if (ps.host_dataset) { auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); raft::copy(database_host.data_handle(), database.data(), database.size(), stream_); diff --git a/cpp/test/neighbors/ann_cagra/bug_extreme_inputs_oob.cu b/cpp/test/neighbors/ann_cagra/bug_extreme_inputs_oob.cu new file mode 100644 index 000000000..e21a54e9e --- /dev/null +++ b/cpp/test/neighbors/ann_cagra/bug_extreme_inputs_oob.cu @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include + +#include + +namespace cuvs::neighbors::cagra { + +class cagra_extreme_inputs_oob_test : public ::testing::Test { + public: + using data_type = float; + + protected: + void run() + { + cagra::index_params ix_ps; + graph_build_params::ivf_pq_params gb_params{}; + gb_params.refinement_rate = 2; + ix_ps.graph_build_params = gb_params; + ix_ps.graph_degree = 64; + ix_ps.intermediate_graph_degree = 128; + + [[maybe_unused]] auto ix = cagra::build(res, ix_ps, raft::make_const_mdspan(dataset->view())); + raft::resource::sync_stream(res); + } + + void SetUp() override + { + dataset.emplace(raft::make_device_matrix(res, n_samples, n_dim)); + raft::random::RngState r(1234ULL); + raft::random::normal( + res, r, dataset->data_handle(), n_samples * n_dim, data_type(0), data_type(1e20)); + raft::resource::sync_stream(res); + } + + void TearDown() override + { + dataset.reset(); + raft::resource::sync_stream(res); + } + + private: + raft::resources res; + std::optional> dataset = std::nullopt; + + constexpr static int64_t n_samples = 100000; + constexpr static int64_t n_dim = 200; + constexpr static cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded; +}; + +TEST_F(cagra_extreme_inputs_oob_test, cagra_extreme_inputs_oob_test) { this->run(); } + +} // namespace cuvs::neighbors::cagra diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh index 8cc46b2f7..23d84ca98 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cuh +++ b/cpp/test/neighbors/ann_ivf_flat.cuh @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/cpp/test/neighbors/ann_nn_descent.cuh b/cpp/test/neighbors/ann_nn_descent.cuh index bce0f9899..09861a219 100644 --- a/cpp/test/neighbors/ann_nn_descent.cuh +++ b/cpp/test/neighbors/ann_nn_descent.cuh @@ -18,11 +18,16 @@ #include "../test_utils.cuh" #include "ann_utils.cuh" +#include #include + #include +#include #include +#include #include "naive_knn.cuh" +#include #include @@ -42,6 +47,15 @@ struct AnnNNDescentInputs { double min_recall; }; +struct AnnNNDescentBatchInputs { + std::pair recall_cluster; + int n_rows; + int dim; + int graph_degree; + cuvs::distance::DistanceType metric; + bool host_dataset; +}; + inline ::std::ostream& operator<<(::std::ostream& os, const AnnNNDescentInputs& p) { os << "dataset shape=" << p.n_rows << "x" << p.dim << ", graph_degree=" << p.graph_degree @@ -50,6 +64,14 @@ inline ::std::ostream& operator<<(::std::ostream& os, const AnnNNDescentInputs& return os; } +inline ::std::ostream& operator<<(::std::ostream& os, const AnnNNDescentBatchInputs& p) +{ + os << "dataset shape=" << p.n_rows << "x" << p.dim << ", graph_degree=" << p.graph_degree + << ", metric=" << static_cast(p.metric) << (p.host_dataset ? 
", host" : ", device") + << ", clusters=" << p.recall_cluster.second << std::endl; + return os; +} + template class AnnNNDescentTest : public ::testing::TestWithParam { public: @@ -65,7 +87,9 @@ class AnnNNDescentTest : public ::testing::TestWithParam { { size_t queries_size = ps.n_rows * ps.graph_degree; std::vector indices_NNDescent(queries_size); + std::vector distances_NNDescent(queries_size); std::vector indices_naive(queries_size); + std::vector distances_naive(queries_size); { rmm::device_uvector distances_naive_dev(queries_size, stream_); @@ -81,16 +105,17 @@ class AnnNNDescentTest : public ::testing::TestWithParam { ps.graph_degree, ps.metric); raft::update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_); + raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_); raft::resource::sync_stream(handle_); } - { { - cuvs::neighbors::nn_descent::index_params index_params; + nn_descent::index_params index_params; index_params.metric = ps.metric; index_params.graph_degree = ps.graph_degree; index_params.intermediate_graph_degree = 2 * ps.graph_degree; index_params.max_iterations = 100; + index_params.return_distances = true; auto database_view = raft::make_device_matrix_view( (const DataT*)database.data(), ps.n_rows, ps.dim); @@ -99,24 +124,171 @@ class AnnNNDescentTest : public ::testing::TestWithParam { if (ps.host_dataset) { auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); raft::copy(database_host.data_handle(), database.data(), database.size(), stream_); + raft::resource::sync_stream(handle_); auto database_host_view = raft::make_host_matrix_view( (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim); - auto index = - cuvs::neighbors::nn_descent::build(handle_, index_params, database_host_view); - raft::update_host( + auto index = nn_descent::build(handle_, index_params, database_host_view); + raft::copy( indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + index.distances().value().data_handle(), + queries_size, + stream_); + } + } else { - auto index = cuvs::neighbors::nn_descent::build(handle_, index_params, database_view); - raft::update_host( + auto index = nn_descent::build(handle_, index_params, database_view); + raft::copy( indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + index.distances().value().data_handle(), + queries_size, + stream_); + } }; } raft::resource::sync_stream(handle_); } + if (ps.metric == cuvs::distance::DistanceType::InnerProduct) { + std::transform( + distances_naive.begin(), distances_naive.end(), distances_naive.begin(), [](auto x) { + return -x; + }); + } + double min_recall = ps.min_recall; - EXPECT_TRUE(eval_recall( - indices_naive, indices_NNDescent, ps.n_rows, ps.graph_degree, 0.001, min_recall)); + EXPECT_TRUE(eval_neighbours(indices_naive, + indices_NNDescent, + distances_naive, + distances_NNDescent, + ps.n_rows, + ps.graph_degree, + 0.001, + min_recall)); + } + } + + void SetUp() override + { + database.resize(((size_t)ps.n_rows) * ps.dim, stream_); + raft::random::RngState r(1234ULL); + if constexpr (std::is_same{}) { + raft::random::normal(handle_, r, database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0)); + } else if constexpr (std::is_same{}) { + raft::random::uniformInt( + handle_, r, database.data(), ps.n_rows * ps.dim, DataT(-5), 
DataT(5)); + } else { + raft::random::uniformInt(handle_, r, database.data(), ps.n_rows * ps.dim, DataT(0), DataT(5)); + } + raft::resource::sync_stream(handle_); + } + + void TearDown() override + { + raft::resource::sync_stream(handle_); + database.resize(0, stream_); + } + + private: + raft::resources handle_; + rmm::cuda_stream_view stream_; + AnnNNDescentInputs ps; + rmm::device_uvector database; +}; + +template +class AnnNNDescentBatchTest : public ::testing::TestWithParam { + public: + AnnNNDescentBatchTest() + : stream_(raft::resource::get_cuda_stream(handle_)), + ps(::testing::TestWithParam::GetParam()), + database(0, stream_) + { + } + + void testNNDescentBatch() + { + size_t queries_size = ps.n_rows * ps.graph_degree; + std::vector indices_NNDescent(queries_size); + std::vector distances_NNDescent(queries_size); + std::vector indices_naive(queries_size); + std::vector distances_naive(queries_size); + + { + rmm::device_uvector distances_naive_dev(queries_size, stream_); + rmm::device_uvector indices_naive_dev(queries_size, stream_); + naive_knn(handle_, + distances_naive_dev.data(), + indices_naive_dev.data(), + database.data(), + database.data(), + ps.n_rows, + ps.n_rows, + ps.dim, + ps.graph_degree, + ps.metric); + raft::update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_); + raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_); + raft::resource::sync_stream(handle_); + } + + { + { + nn_descent::index_params index_params; + index_params.metric = ps.metric; + index_params.graph_degree = ps.graph_degree; + index_params.intermediate_graph_degree = 2 * ps.graph_degree; + index_params.max_iterations = 10; + index_params.return_distances = true; + index_params.n_clusters = ps.recall_cluster.second; + + auto database_view = raft::make_device_matrix_view( + (const DataT*)database.data(), ps.n_rows, ps.dim); + + { + if (ps.host_dataset) { + auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); + raft::copy(database_host.data_handle(), database.data(), database.size(), stream_); + auto database_host_view = raft::make_host_matrix_view( + (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim); + auto index = nn_descent::build(handle_, index_params, database_host_view); + raft::copy( + indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + index.distances().value().data_handle(), + queries_size, + stream_); + } + + } else { + auto index = nn_descent::build(handle_, index_params, database_view); + raft::copy( + indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + index.distances().value().data_handle(), + queries_size, + stream_); + } + }; + } + raft::resource::sync_stream(handle_); + } + double min_recall = ps.recall_cluster.first; + EXPECT_TRUE(eval_neighbours(indices_naive, + indices_NNDescent, + distances_naive, + distances_NNDescent, + ps.n_rows, + ps.graph_degree, + 0.01, + min_recall, + true, + static_cast(ps.graph_degree * 0.1))); } } @@ -142,16 +314,29 @@ class AnnNNDescentTest : public ::testing::TestWithParam { private: raft::resources handle_; rmm::cuda_stream_view stream_; - AnnNNDescentInputs ps; + AnnNNDescentBatchInputs ps; rmm::device_uvector database; }; -const std::vector inputs = raft::util::itertools::product( - {1000, 2000}, // n_rows - {3, 5, 7, 8, 17, 64, 128, 137, 192, 
256, 512, 619, 1024}, // dim - {32, 64}, // graph_degree - {cuvs::distance::DistanceType::L2Expanded}, - {false, true}, - {0.90}); +const std::vector inputs = + raft::util::itertools::product({2000, 4000}, // n_rows + {4, 16, 64, 256, 1024}, // dim + {32, 64}, // graph_degree + {cuvs::distance::DistanceType::L2Expanded, + cuvs::distance::DistanceType::InnerProduct, + cuvs::distance::DistanceType::CosineExpanded}, + {false, true}, + {0.90}); + +// TODO: Investigate why this test is failing. Reference issue: +// https://github.com/rapidsai/raft/issues/2450 +const std::vector inputsBatch = + raft::util::itertools::product( + {std::make_pair(0.9, 3lu), std::make_pair(0.9, 2lu)}, // min_recall, n_clusters + {4000, 5000}, // n_rows + {192, 512}, // dim + {32, 64}, // graph_degree + {cuvs::distance::DistanceType::L2Expanded}, + {false, true}); -} // namespace cuvs::neighbors::nn_descent +} // namespace cuvs::neighbors::nn_descent diff --git a/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu b/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu index 64c0e0291..7a24f96a1 100644 --- a/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu +++ b/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu @@ -23,6 +23,12 @@ namespace cuvs::neighbors::nn_descent { typedef AnnNNDescentTest AnnNNDescentTestF_U32; TEST_P(AnnNNDescentTestF_U32, AnnNNDescent) { this->testNNDescent(); } +// typedef AnnNNDescentBatchTest AnnNNDescentBatchTestF_U32; +// TEST_P(AnnNNDescentBatchTestF_U32, AnnNNDescentBatch) { this->testNNDescentBatch(); } + INSTANTIATE_TEST_CASE_P(AnnNNDescentTest, AnnNNDescentTestF_U32, ::testing::ValuesIn(inputs)); +// INSTANTIATE_TEST_CASE_P(AnnNNDescentBatchTest, +// AnnNNDescentBatchTestF_U32, +// ::testing::ValuesIn(inputsBatch)); } // namespace cuvs::neighbors::nn_descent diff --git a/cpp/test/neighbors/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh index b08e1d725..94bccade2 100644 --- a/cpp/test/neighbors/ann_utils.cuh +++ b/cpp/test/neighbors/ann_utils.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include // raft::make_device_matrix #include @@ -165,9 +166,14 @@ auto calc_recall(const std::vector& expected_idx, /** check uniqueness of indices */ template -auto check_unique_indices(const std::vector& actual_idx, size_t rows, size_t cols) +auto check_unique_indices(const std::vector& actual_idx, + size_t rows, + size_t cols, + size_t max_duplicates = 0) { size_t max_count; + size_t dup_count = 0lu; + + std::set unique_indices; for (size_t i = 0; i < rows; ++i) { unique_indices.clear(); @@ -180,8 +186,11 @@ auto check_unique_indices(const std::vector& actual_idx, size_t rows, size_t } else if (unique_indices.find(act_idx) == unique_indices.end()) { unique_indices.insert(act_idx); } else { - return testing::AssertionFailure() - << "Duplicated index " << act_idx << " at k " << k << " for query " << i << "! "; + dup_count++; + if (dup_count > max_duplicates) { + return testing::AssertionFailure() + << "Duplicated index " << act_idx << " at k " << k << " for query " << i << "! 
"; + } } } } @@ -264,7 +273,8 @@ auto eval_neighbours(const std::vector& expected_idx, size_t cols, double eps, double min_recall, - bool test_unique = true) -> testing::AssertionResult + bool test_unique = true, + size_t max_duplicates = 0) -> testing::AssertionResult { auto [actual_recall, match_count, total_count] = calc_recall(expected_idx, actual_idx, expected_dist, actual_dist, rows, cols, eps); @@ -284,7 +294,7 @@ auto eval_neighbours(const std::vector& expected_idx, << min_recall << "); eps = " << eps << ". "; } if (test_unique) - return check_unique_indices(actual_idx, rows, cols); + return check_unique_indices(actual_idx, rows, cols, max_duplicates); else return testing::AssertionSuccess(); } diff --git a/cpp/test/neighbors/sparse_brute_force.cu b/cpp/test/neighbors/sparse_brute_force.cu new file mode 100644 index 000000000..cb68989d4 --- /dev/null +++ b/cpp/test/neighbors/sparse_brute_force.cu @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.cuh" + +#include +#include +#include + +#include +#include + +namespace cuvs { +namespace neighbors { + +using namespace raft; +using namespace raft::sparse; + +template +struct SparseKNNInputs { + value_idx n_cols; + + std::vector indptr_h; + std::vector indices_h; + std::vector data_h; + + std::vector out_dists_ref_h; + std::vector out_indices_ref_h; + + int k; + + int batch_size_index = 2; + int batch_size_query = 2; + + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2SqrtExpanded; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const SparseKNNInputs& dims) +{ + return os; +} + +template +class SparseKNNTest : public ::testing::TestWithParam> { + public: + SparseKNNTest() + : params(::testing::TestWithParam>::GetParam()), + indptr(0, resource::get_cuda_stream(handle)), + indices(0, resource::get_cuda_stream(handle)), + data(0, resource::get_cuda_stream(handle)), + out_indices(0, resource::get_cuda_stream(handle)), + out_dists(0, resource::get_cuda_stream(handle)), + out_indices_ref(0, resource::get_cuda_stream(handle)), + out_dists_ref(0, resource::get_cuda_stream(handle)) + { + } + + protected: + void SetUp() override + { + n_rows = params.indptr_h.size() - 1; + nnz = params.indices_h.size(); + k = params.k; + + make_data(); + + auto index_structure = + raft::make_device_compressed_structure_view( + indptr.data(), indices.data(), n_rows, params.n_cols, nnz); + auto index_csr = raft::make_device_csr_matrix_view(data.data(), index_structure); + + auto index = cuvs::neighbors::brute_force::build(handle, index_csr, params.metric); + + cuvs::neighbors::brute_force::sparse_search_params search_params; + search_params.batch_size_index = params.batch_size_index; + search_params.batch_size_query = params.batch_size_query; + + cuvs::neighbors::brute_force::search( + handle, + search_params, + index, + index_csr, + raft::make_device_matrix_view(out_indices.data(), n_rows, k), + 
raft::make_device_matrix_view(out_dists.data(), n_rows, k)); + + RAFT_CUDA_TRY(cudaStreamSynchronize(resource::get_cuda_stream(handle))); + } + + void compare() + { + ASSERT_TRUE(devArrMatch( + out_dists_ref.data(), out_dists.data(), n_rows * k, CompareApprox(1e-4))); + ASSERT_TRUE( + devArrMatch(out_indices_ref.data(), out_indices.data(), n_rows * k, Compare())); + } + + protected: + void make_data() + { + std::vector indptr_h = params.indptr_h; + std::vector indices_h = params.indices_h; + std::vector data_h = params.data_h; + + auto stream = resource::get_cuda_stream(handle); + indptr.resize(indptr_h.size(), stream); + indices.resize(indices_h.size(), stream); + data.resize(data_h.size(), stream); + + update_device(indptr.data(), indptr_h.data(), indptr_h.size(), stream); + update_device(indices.data(), indices_h.data(), indices_h.size(), stream); + update_device(data.data(), data_h.data(), data_h.size(), stream); + + std::vector out_dists_ref_h = params.out_dists_ref_h; + std::vector out_indices_ref_h = params.out_indices_ref_h; + + out_indices_ref.resize(out_indices_ref_h.size(), stream); + out_dists_ref.resize(out_dists_ref_h.size(), stream); + + update_device( + out_indices_ref.data(), out_indices_ref_h.data(), out_indices_ref_h.size(), stream); + update_device(out_dists_ref.data(), out_dists_ref_h.data(), out_dists_ref_h.size(), stream); + + out_dists.resize(n_rows * k, stream); + out_indices.resize(n_rows * k, stream); + } + + raft::resources handle; + + int n_rows, nnz, k; + + // input data + rmm::device_uvector indptr, indices; + rmm::device_uvector data; + + // output data + rmm::device_uvector out_indices; + rmm::device_uvector out_dists; + + rmm::device_uvector out_indices_ref; + rmm::device_uvector out_dists_ref; + + SparseKNNInputs params; +}; + +const std::vector> inputs_i32_f = { + {9, // ncols + {0, 2, 4, 6, 8}, // indptr + {0, 4, 0, 3, 0, 2, 0, 8}, // indices + {0.0f, 1.0f, 5.0f, 6.0f, 5.0f, 6.0f, 0.0f, 1.0f}, // data + {0, 1.41421, 0, 7.87401, 0, 7.87401, 0, 1.41421}, // dists + {0, 3, 1, 0, 2, 0, 3, 0}, // inds + 2, + 2, + 2, + cuvs::distance::DistanceType::L2SqrtExpanded}}; +typedef SparseKNNTest SparseKNNTestF; +TEST_P(SparseKNNTestF, Result) { compare(); } +INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF, ::testing::ValuesIn(inputs_i32_f)); + +}; // end namespace neighbors +}; // end namespace cuvs diff --git a/dependencies.yaml b/dependencies.yaml index 48f023ccc..f60ed59d3 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -213,11 +213,11 @@ dependencies: - matrix: cuda: "12.*" packages: - - &cuda_python12 cuda-python>=12.0,<13.0a0 + - &cuda_python12 cuda-python>=12.0,<13.0a0,<=12.6.0 - matrix: cuda: "11.*" packages: - - &cuda_python11 cuda-python>=11.7.1,<12.0a0 + - &cuda_python11 cuda-python>=11.7.1,<12.0a0,<=11.8.3 - matrix: packages: - &cuda_python cuda-python diff --git a/docs/source/c_api/neighbors_bruteforce_c.rst b/docs/source/c_api/neighbors_bruteforce_c.rst index af0356eee..a12175209 100644 --- a/docs/source/c_api/neighbors_bruteforce_c.rst +++ b/docs/source/c_api/neighbors_bruteforce_c.rst @@ -32,3 +32,11 @@ Index search :project: cuvs :members: :content-only: + +Index serialize +--------------- + +.. 
doxygengroup:: bruteforce_c_index_serialize + :project: cuvs + :members: + :content-only: diff --git a/docs/source/c_api/neighbors_hnsw_c.rst b/docs/source/c_api/neighbors_hnsw_c.rst index 4d83cd3e3..988e5b6f3 100644 --- a/docs/source/c_api/neighbors_hnsw_c.rst +++ b/docs/source/c_api/neighbors_hnsw_c.rst @@ -29,13 +29,13 @@ Index Index search ------------ -.. doxygengroup:: cagra_c_index_search +.. doxygengroup:: hnsw_c_index_search :project: cuvs :members: :content-only: Index serialize ------------- +--------------- .. doxygengroup:: hnsw_c_index_serialize :project: cuvs diff --git a/docs/source/c_api/neighbors_ivf_flat_c.rst b/docs/source/c_api/neighbors_ivf_flat_c.rst index 9e1ccc0d1..1254d70ef 100644 --- a/docs/source/c_api/neighbors_ivf_flat_c.rst +++ b/docs/source/c_api/neighbors_ivf_flat_c.rst @@ -48,3 +48,11 @@ Index search :project: cuvs :members: :content-only: + +Index serialize +--------------- + +.. doxygengroup:: ivf_flat_c_index_serialize + :project: cuvs + :members: + :content-only: diff --git a/docs/source/c_api/neighbors_ivf_pq_c.rst b/docs/source/c_api/neighbors_ivf_pq_c.rst index 070719609..260057b8c 100644 --- a/docs/source/c_api/neighbors_ivf_pq_c.rst +++ b/docs/source/c_api/neighbors_ivf_pq_c.rst @@ -48,3 +48,11 @@ Index search :project: cuvs :members: :content-only: + +Index serialize +--------------- + +.. doxygengroup:: ivf_pq_c_index_serialize + :project: cuvs + :members: + :content-only: diff --git a/docs/source/cpp_api/neighbors_bruteforce.rst b/docs/source/cpp_api/neighbors_bruteforce.rst index 3adcb01c5..f75e26b3c 100644 --- a/docs/source/cpp_api/neighbors_bruteforce.rst +++ b/docs/source/cpp_api/neighbors_bruteforce.rst @@ -34,3 +34,11 @@ Index search :project: cuvs :members: :content-only: + +Index serialize +--------------- + +.. doxygengroup:: bruteforce_cpp_index_serialize + :project: cuvs + :members: + :content-only: diff --git a/docs/source/index.rst b/docs/source/index.rst index 647061ae5..286836c18 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,19 +1,8 @@ cuVS: Vector Search and Clustering on the GPU ============================================= - Welcome to cuVS, the premier library for GPU-accelerated vector search and clustering! cuVS provides several core building blocks for constructing new algorithms, as well as end-to-end vector search and clustering algorithms for use either standalone or through a growing list of :doc:`integrations `. -There are several benefits to using cuVS and GPUs for vector search, including - -#. Fast index build -#. Latency critical and high throughput search -#. Parameter tuning -#. Cost savings -#. Interoperability (build on GPU, deploy on CPU) -#. Multiple language support -#. Building blocks for composing new or accelerating existing algorithms - Useful Resources ################ @@ -26,6 +15,67 @@ Useful Resources - `Issue tracker `_: Report issues or request features. + +What is cuVS? +############# + +cuVS contains state-of-the-art implementations of several algorithms for running approximate and exact nearest neighbors and clustering on the GPU. It can be used directly or through the various databases and other libraries that have integrated it. The primary goal of cuVS is to simplify the use of GPUs for vector similarity search and clustering. 
+ +Vector search is an information retrieval method that has been growing in popularity over the past few years, partly because of the rising importance of multimedia embeddings created from unstructured data and the need to perform semantic search on the embeddings to find items which are semantically similar to each other. + +Vector search is also used in *data mining and machine learning* tasks and comprises an important step in many *clustering* and *visualization* algorithms like `UMAP `_, `t-SNE `_, K-means, and `HDBSCAN `_. + +Finally, faster vector search enables interactions between dense vectors and graphs. Converting a pile of dense vectors into nearest neighbors graphs unlocks the entire world of graph analysis algorithms, such as those found in `GraphBLAS `_ and `cuGraph `_. + +Below are some common use-cases for vector search + +Semantic search +~~~~~~~~~~~~~~~ +- Generative AI & Retrieval augmented generation (RAG) +- Recommender systems +- Computer vision +- Image search +- Text search +- Audio search +- Molecular search +- Model training + + +Data mining +~~~~~~~~~~~ +- Clustering algorithms +- Visualization algorithms +- Sampling algorithms +- Class balancing +- Ensemble methods +- k-NN graph construction + +Why cuVS? +######### + +There are several benefits to using cuVS and GPUs for vector search, including + +1. Fast index build +2. Latency critical and high throughput search +3. Parameter tuning +4. Cost savings +5. Interoperability (build on GPU, deploy on CPU) +6. Multiple language support +7. Building blocks for composing new or accelerating existing algorithms + +In addition to the items above, cuVS shoulders the responsibility of keeping non-trivial accelerated code up to date as new NVIDIA architectures and CUDA versions are released. This provides a delightful development experience, guaranteeing that any libraries, databases, or applications built on top of it will always be receiving the best performance and scale. + +cuVS Technology Stack +##################### + +cuVS is built on top of the RAPIDS RAFT library of high performance machine learning primitives and provides all the necessary routines for vector search and clustering on the GPU. + +.. image:: ../../img/tech_stack.png + :width: 600 + :alt: cuVS is built on top of low-level CUDA libraries and provides many important routines that enable vector search and clustering on the GPU + + + + Contents ######## diff --git a/docs/source/python_api/neighbors_brute_force.rst b/docs/source/python_api/neighbors_brute_force.rst index 5fdc3658f..d756a6c80 100644 --- a/docs/source/python_api/neighbors_brute_force.rst +++ b/docs/source/python_api/neighbors_brute_force.rst @@ -20,3 +20,13 @@ Index search ############ .. autofunction:: cuvs.neighbors.brute_force.search + +Index save +########## + +.. autofunction:: cuvs.neighbors.brute_force.save + +Index load +########## + +.. autofunction:: cuvs.neighbors.brute_force.load diff --git a/docs/source/python_api/neighbors_cagra.rst b/docs/source/python_api/neighbors_cagra.rst index 09b2e2694..e7155efb8 100644 --- a/docs/source/python_api/neighbors_cagra.rst +++ b/docs/source/python_api/neighbors_cagra.rst @@ -34,3 +34,13 @@ Index search ############ .. autofunction:: cuvs.neighbors.cagra.search + +Index save +########## + +.. autofunction:: cuvs.neighbors.cagra.save + +Index load +########## + +..
autofunction:: cuvs.neighbors.cagra.load diff --git a/docs/source/python_api/neighbors_hnsw.rst b/docs/source/python_api/neighbors_hnsw.rst index 9922805b3..64fe5493b 100644 --- a/docs/source/python_api/neighbors_hnsw.rst +++ b/docs/source/python_api/neighbors_hnsw.rst @@ -28,3 +28,13 @@ Index search ############ .. autofunction:: cuvs.neighbors.hnsw.search + +Index save +########## + +.. autofunction:: cuvs.neighbors.hnsw.save + +Index load +########## + +.. autofunction:: cuvs.neighbors.hnsw.load diff --git a/docs/source/python_api/neighbors_ivf_flat.rst b/docs/source/python_api/neighbors_ivf_flat.rst index 5514e5e43..f2c21e68a 100644 --- a/docs/source/python_api/neighbors_ivf_flat.rst +++ b/docs/source/python_api/neighbors_ivf_flat.rst @@ -32,3 +32,13 @@ Index search ############ .. autofunction:: cuvs.neighbors.ivf_flat.search + +Index save +########## + +.. autofunction:: cuvs.neighbors.ivf_flat.save + +Index load +########## + +.. autofunction:: cuvs.neighbors.ivf_flat.load diff --git a/docs/source/python_api/neighbors_ivf_pq.rst b/docs/source/python_api/neighbors_ivf_pq.rst index e3625ba67..57668fbc3 100644 --- a/docs/source/python_api/neighbors_ivf_pq.rst +++ b/docs/source/python_api/neighbors_ivf_pq.rst @@ -32,3 +32,13 @@ Index search ############ .. autofunction:: cuvs.neighbors.ivf_pq.search + +Index save +########## + +.. autofunction:: cuvs.neighbors.ivf_pq.save + +Index load +########## + +.. autofunction:: cuvs.neighbors.ivf_pq.load diff --git a/examples/cpp/src/common.cuh b/examples/cpp/src/common.cuh index 1c93dec0e..8e109a764 100644 --- a/examples/cpp/src/common.cuh +++ b/examples/cpp/src/common.cuh @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + #include #include #include @@ -28,6 +30,8 @@ #include #include +#include + // Fill dataset and queries with synthetic data. void generate_dataset(raft::device_resources const &dev_resources, raft::device_matrix_view dataset, diff --git a/img/tech_stack.png b/img/tech_stack.png new file mode 100644 index 000000000..2b3eeedba Binary files /dev/null and b/img/tech_stack.png differ diff --git a/python/cuvs/cuvs/neighbors/brute_force/__init__.py b/python/cuvs/cuvs/neighbors/brute_force/__init__.py index b88c4b464..6aa0e4bb2 100644 --- a/python/cuvs/cuvs/neighbors/brute_force/__init__.py +++ b/python/cuvs/cuvs/neighbors/brute_force/__init__.py @@ -13,6 +13,6 @@ # limitations under the License. 
-from .brute_force import Index, build, search +from .brute_force import Index, build, load, save, search -__all__ = ["Index", "build", "search"] +__all__ = ["Index", "build", "search", "save", "load"] diff --git a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pxd b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pxd index 183827916..f1fc14ba7 100644 --- a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pxd +++ b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pxd @@ -47,3 +47,11 @@ cdef extern from "cuvs/neighbors/brute_force.h" nogil: DLManagedTensor* neighbors, DLManagedTensor* distances, cuvsFilter filter) except + + + cuvsError_t cuvsBruteForceSerialize(cuvsResources_t res, + const char * filename, + cuvsBruteForceIndex_t index) except + + + cuvsError_t cuvsBruteForceDeserialize(cuvsResources_t res, + const char * filename, + cuvsBruteForceIndex_t index) except + diff --git a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx index 559302ccc..9d43bfb29 100644 --- a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx +++ b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx @@ -24,6 +24,7 @@ from cuvs.common.resources import auto_sync_resources from cython.operator cimport dereference as deref from libc.stdint cimport uint32_t from libcpp cimport bool +from libcpp.string cimport string from cuvs.common cimport cydlpack from cuvs.distance_type cimport cuvsDistanceType @@ -31,9 +32,9 @@ from cuvs.distance_type cimport cuvsDistanceType from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array from cuvs.distance import DISTANCE_TYPES +from cuvs.neighbors.common import _check_input_array from cuvs.common.c_api cimport cuvsResources_t @@ -256,3 +257,88 @@ def search(Index index, )) return (distances, neighbors) + + +@auto_sync_resources +def save(filename, Index index, bool include_dataset=True, resources=None): + """ + Saves the index to a file. + + The serialization format can be subject to changes, therefore loading + an index saved with a previous version of cuvs is not guaranteed + to work. + + Parameters + ---------- + filename : string + Name of the file. + index : Index + Trained Brute Force index. + {resources_docstring} + + Examples + -------- + >>> import cupy as cp + >>> from cuvs.neighbors import brute_force + >>> n_samples = 50000 + >>> n_features = 50 + >>> dataset = cp.random.random_sample((n_samples, n_features), + ... dtype=cp.float32) + >>> # Build index + >>> index = brute_force.build(dataset) + >>> # Serialize and deserialize the brute_force index built + >>> brute_force.save("my_index.bin", index) + >>> index_loaded = brute_force.load("my_index.bin") + """ + cdef string c_filename = filename.encode('utf-8') + cdef cuvsResources_t res = resources.get_c_obj() + check_cuvs(cuvsBruteForceSerialize(res, + c_filename.c_str(), + index.index)) + + +@auto_sync_resources +def load(filename, resources=None): + """ + Loads index from file. + + The serialization format can be subject to changes, therefore loading + an index saved with a previous version of cuvs is not guaranteed + to work. + + + Parameters + ---------- + filename : string + Name of the file. 
+ {resources_docstring} + + Returns + ------- + index : Index + + Examples + -------- + >>> import cupy as cp + >>> from cuvs.neighbors import brute_force + >>> n_samples = 50000 + >>> n_features = 50 + >>> dataset = cp.random.random_sample((n_samples, n_features), + ... dtype=cp.float32) + >>> # Build index + >>> index = brute_force.build(dataset) + >>> # Serialize and deserialize the brute_force index built + >>> brute_force.save("my_index.bin", index) + >>> index_loaded = brute_force.load("my_index.bin") + """ + cdef Index idx = Index() + cdef cuvsResources_t res = resources.get_c_obj() + cdef string c_filename = filename.encode('utf-8') + + check_cuvs(cuvsBruteForceDeserialize( + res, + c_filename.c_str(), + idx.index + )) + idx.trained = True + return idx diff --git a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx index 95209dbeb..752aef741 100644 --- a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx +++ b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx @@ -32,7 +32,8 @@ from cuvs.common cimport cydlpack from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array + +from cuvs.neighbors.common import _check_input_array from libc.stdint cimport ( int8_t, diff --git a/python/cuvs/cuvs/neighbors/common.py b/python/cuvs/cuvs/neighbors/common.py new file mode 100644 index 000000000..c14b9f8c9 --- /dev/null +++ b/python/cuvs/cuvs/neighbors/common.py @@ -0,0 +1,36 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def _check_input_array(cai, exp_dt, exp_rows=None, exp_cols=None): + if cai.dtype not in exp_dt: + raise TypeError("dtype %s not supported" % cai.dtype) + + if not cai.c_contiguous: + raise ValueError("Row major input is expected") + + if exp_cols is not None and cai.shape[1] != exp_cols: + raise ValueError( + "Incorrect number of columns, expected {} got {}".format( + exp_cols, cai.shape[1] + ) + ) + + if exp_rows is not None and cai.shape[0] != exp_rows: + raise ValueError( + "Incorrect number of rows, expected {} , got {}".format( + exp_rows, cai.shape[0] + ) + ) diff --git a/python/cuvs/cuvs/neighbors/filters/filters.pyx b/python/cuvs/cuvs/neighbors/filters/filters.pyx index 3a81cb786..9bc2a905c 100644 --- a/python/cuvs/cuvs/neighbors/filters/filters.pyx +++ b/python/cuvs/cuvs/neighbors/filters/filters.pyx @@ -20,11 +20,11 @@ import numpy as np from libc.stdint cimport uintptr_t from cuvs.common cimport cydlpack +from cuvs.neighbors.common import _check_input_array from .filters cimport BITMAP, NO_FILTER, cuvsFilter from pylibraft.common.cai_wrapper import wrap_array -from pylibraft.neighbors.common import _check_input_array cdef class Prefilter: diff --git a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx index 018fcfef9..bcfaf167e 100644 --- a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx +++ b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx @@ -21,6 +21,7 @@ from libcpp.string cimport string from cuvs.common.exceptions import check_cuvs from cuvs.common.resources import auto_sync_resources +from cuvs.neighbors.common import _check_input_array from cuvs.common cimport cydlpack @@ -36,7 +37,6 @@ import uuid from pylibraft.common import auto_convert_output from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array cdef class SearchParams: diff --git a/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx b/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx index 25b9b2aee..7a169e1a0 100644 --- a/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx +++ b/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx @@ -31,9 +31,9 @@ from cuvs.distance_type cimport cuvsDistanceType from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array from cuvs.distance import DISTANCE_TYPES +from cuvs.neighbors.common import _check_input_array from libc.stdint cimport ( int8_t, diff --git a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx index 3add1df75..531302ee6 100644 --- a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx +++ b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx @@ -31,9 +31,9 @@ from cuvs.distance_type cimport cuvsDistanceType from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array from cuvs.distance import DISTANCE_TYPES +from cuvs.neighbors.common import _check_input_array from libc.stdint cimport ( int8_t, diff --git a/python/cuvs/cuvs/neighbors/refine.pyx b/python/cuvs/cuvs/neighbors/refine.pyx index 0eccc4108..b7aa35dca 100644 --- a/python/cuvs/cuvs/neighbors/refine.pyx +++ b/python/cuvs/cuvs/neighbors/refine.pyx @@ -31,13 +31,13 @@ from 
cuvs.distance_type cimport cuvsDistanceType from pylibraft.common import auto_convert_output, device_ndarray from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array from cuvs.distance import DISTANCE_TYPES from cuvs.common.c_api cimport cuvsResources_t from cuvs.common.exceptions import check_cuvs +from cuvs.neighbors.common import _check_input_array @auto_sync_resources diff --git a/python/cuvs/cuvs/test/test_cagra.py b/python/cuvs/cuvs/test/test_cagra.py index 92b88f013..56e132c23 100644 --- a/python/cuvs/cuvs/test/test_cagra.py +++ b/python/cuvs/cuvs/test/test_cagra.py @@ -122,8 +122,9 @@ def run_cagra_build_search_test( @pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8]) @pytest.mark.parametrize("array_type", ["device", "host"]) @pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"]) +@pytest.mark.parametrize("metric", ["euclidean"]) def test_cagra_dataset_dtype_host_device( - dtype, array_type, inplace, build_algo + dtype, array_type, inplace, build_algo, metric ): # Note that inner_product tests use normalized input which we cannot # represent in int8, therefore we test only sqeuclidean metric here. @@ -132,6 +133,7 @@ def test_cagra_dataset_dtype_host_device( inplace=inplace, array_type=array_type, build_algo=build_algo, + metric=metric, ) diff --git a/python/cuvs/cuvs/test/test_hnsw.py b/python/cuvs/cuvs/test/test_hnsw.py index 0ae97266b..20a35401e 100644 --- a/python/cuvs/cuvs/test/test_hnsw.py +++ b/python/cuvs/cuvs/test/test_hnsw.py @@ -23,7 +23,7 @@ def run_hnsw_build_search_test( - n_rows=1000, + n_rows=10000, n_cols=10, n_queries=100, k=10, @@ -41,8 +41,6 @@ def run_hnsw_build_search_test( pytest.skip( "inner_product metric is not supported for int8/uint8 data" ) - if build_algo == "nn_descent": - pytest.skip("inner_product metric is not supported for nn_descent") build_params = cagra.IndexParams( metric=metric, @@ -83,7 +81,7 @@ def run_hnsw_build_search_test( @pytest.mark.parametrize("k", [10, 20]) @pytest.mark.parametrize("ef", [30, 40]) @pytest.mark.parametrize("num_threads", [2, 4]) -@pytest.mark.parametrize("metric", ["sqeuclidean"]) +@pytest.mark.parametrize("metric", ["sqeuclidean", "inner_product"]) @pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"]) def test_hnsw(dtype, k, ef, num_threads, metric, build_algo): # Note that inner_product tests use normalized input which we cannot diff --git a/python/cuvs/cuvs/test/test_serialization.py b/python/cuvs/cuvs/test/test_serialization.py index 4ffccf121..1f4a54e87 100644 --- a/python/cuvs/cuvs/test/test_serialization.py +++ b/python/cuvs/cuvs/test/test_serialization.py @@ -17,7 +17,7 @@ import pytest from pylibraft.common import device_ndarray -from cuvs.neighbors import cagra, ivf_flat, ivf_pq +from cuvs.neighbors import brute_force, cagra, ivf_flat, ivf_pq from cuvs.test.ann_utils import generate_data @@ -35,6 +35,10 @@ def test_save_load_ivf_pq(): run_save_load(ivf_pq, np.float32) +def test_save_load_brute_force(): + run_save_load(brute_force, np.float32) + + def run_save_load(ann_module, dtype): n_rows = 10000 n_cols = 50 @@ -43,8 +47,11 @@ def run_save_load(ann_module, dtype): dataset = generate_data((n_rows, n_cols), dtype) dataset_device = device_ndarray(dataset) - build_params = ann_module.IndexParams() - index = ann_module.build(build_params, dataset_device) + if ann_module == brute_force: + index = ann_module.build(dataset_device) + else: + build_params = 
ann_module.IndexParams() + index = ann_module.build(build_params, dataset_device) assert index.trained filename = "my_index.bin" @@ -54,20 +61,29 @@ def run_save_load(ann_module, dtype): queries = generate_data((n_queries, n_cols), dtype) queries_device = device_ndarray(queries) - search_params = ann_module.SearchParams() k = 10 - - distance_dev, neighbors_dev = ann_module.search( - search_params, index, queries_device, k - ) + if ann_module == brute_force: + distance_dev, neighbors_dev = ann_module.search( + index, queries_device, k + ) + else: + search_params = ann_module.SearchParams() + distance_dev, neighbors_dev = ann_module.search( + search_params, index, queries_device, k + ) neighbors = neighbors_dev.copy_to_host() dist = distance_dev.copy_to_host() del index - distance_dev, neighbors_dev = ann_module.search( - search_params, loaded_index, queries_device, k - ) + if ann_module == brute_force: + distance_dev, neighbors_dev = ann_module.search( + loaded_index, queries_device, k + ) + else: + distance_dev, neighbors_dev = ann_module.search( + search_params, loaded_index, queries_device, k + ) neighbors2 = neighbors_dev.copy_to_host() dist2 = distance_dev.copy_to_host() diff --git a/python/cuvs/pyproject.toml b/python/cuvs/pyproject.toml index bf62f5adf..92e4993c7 100644 --- a/python/cuvs/pyproject.toml +++ b/python/cuvs/pyproject.toml @@ -133,7 +133,14 @@ build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" +[tool.pydistcheck] +select = [ + # NOTE: size threshold is managed via CLI args in CI scripts + "distro-too-large-compressed", +] + [tool.pytest.ini_options] filterwarnings = [ "error", + "ignore:.*cuda..* module is deprecated.*:DeprecationWarning" ]
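
For reference, a minimal usage sketch of the brute-force save/load round trip exercised by the Python changes above, mirroring the docstring examples in brute_force.pyx (the file name, dataset shape, and k are illustrative values only):

    import cupy as cp
    from cuvs.neighbors import brute_force

    n_samples, n_features = 50000, 50
    dataset = cp.random.random_sample((n_samples, n_features), dtype=cp.float32)

    # Build a brute-force index and serialize it to disk.
    index = brute_force.build(dataset)
    brute_force.save("my_index.bin", index)

    # Deserialize the index and search it. Unlike the other ANN modules,
    # brute_force.search takes the index, queries, and k directly (no SearchParams).
    index_loaded = brute_force.load("my_index.bin")
    queries = cp.random.random_sample((100, n_features), dtype=cp.float32)
    distances, neighbors = brute_force.search(index_loaded, queries, 10)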