diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index c8bd28d4bb..bd37b8ac01 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -15,6 +15,7 @@ jobs:
       - checks
       - conda-cpp-build
       - conda-cpp-tests
+      - conda-cpp-checks
       - conda-python-build
       - conda-python-tests
       - docs-build
@@ -43,6 +44,14 @@ jobs:
     uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12
     with:
       build_type: pull-request
+  conda-cpp-checks:
+    needs: conda-cpp-build
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-23.12
+    with:
+      build_type: pull-request
+      enable_check_symbols: true
+      symbol_exclusions: (void (thrust::|cub::)|_ZN\d+raft_cutlass)
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 1c2395cb68..0c46d83cf9 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -14,6 +14,16 @@ on:
         type: string
 
 jobs:
+  conda-cpp-checks:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-23.12
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      enable_check_symbols: true
+      symbol_exclusions: (void (thrust::|cub::)|_ZN\d+raft_cutlass)
   conda-cpp-tests:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index 3dc7740a1e..5d06e46303 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -39,6 +39,8 @@ fi
 if [[ ${package_name} == "raft-dask" ]]; then
     sed -r -i "s/pylibraft==(.*)\"/pylibraft${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
     sed -r -i "s/ucx-py==(.*)\"/ucx-py${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
+    sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file}
+    sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file}
 else
     sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file}
 fi
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 471987eb40..0be1f1ad82 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -57,6 +57,7 @@ DEPENDENCIES=(
   dask-cuda
   pylibraft
   rmm
+  rapids-dask-dependency
   # ucx-py is handled separately below
 )
 for FILE in dependencies.yaml conda/environments/*.yaml; do
@@ -75,11 +76,6 @@ done
 
 sed_runner "/^ucx_py_version:$/ {n;s/.*/  - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/raft-dask/conda_build_config.yaml
 
-# Wheel builds install dask-cuda from source, update its branch
-for FILE in .github/workflows/*.yaml; do
-  sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE};
-done
-
 for FILE in .github/workflows/*.yaml; do
   sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
 done
diff --git a/ci/test_wheel_raft_dask.sh b/ci/test_wheel_raft_dask.sh
index a20e950313..b70563b7a1 100755
--- a/ci/test_wheel_raft_dask.sh
+++ b/ci/test_wheel_raft_dask.sh
@@ -11,9 +11,6 @@ RAPIDS_PY_WHEEL_NAME="raft_dask_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels
 RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibraft-dep
 python -m pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl
 
-# Always install latest dask for testing
-python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.12
-
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install $(echo ./dist/raft_dask*.whl)[test]
 
diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index 762d436028..c28f1961e6 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -20,10 +20,7 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0
-- dask-core>=2023.9.2
 - dask-cuda==23.12.*
-- dask>=2023.9.2
-- distributed>=2023.9.2
 - doxygen>=1.8.20
 - gcc_linux-aarch64=11.*
 - gmock>=1.13.0
@@ -49,6 +46,7 @@ dependencies:
 - pydata-sphinx-theme
 - pytest
 - pytest-cov
+- rapids-dask-dependency==23.12.*
 - recommonmark
 - rmm==23.12.*
 - scikit-build>=0.13.1
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 48cd34c6ca..9b7c110bc3 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -20,10 +20,7 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0
-- dask-core>=2023.9.2
 - dask-cuda==23.12.*
-- dask>=2023.9.2
-- distributed>=2023.9.2
 - doxygen>=1.8.20
 - gcc_linux-64=11.*
 - gmock>=1.13.0
@@ -49,6 +46,7 @@ dependencies:
 - pydata-sphinx-theme
 - pytest
 - pytest-cov
+- rapids-dask-dependency==23.12.*
 - recommonmark
 - rmm==23.12.*
 - scikit-build>=0.13.1
diff --git a/conda/environments/all_cuda-120_arch-aarch64.yaml b/conda/environments/all_cuda-120_arch-aarch64.yaml
index 9108fb6215..8d614d3c2c 100644
--- a/conda/environments/all_cuda-120_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-120_arch-aarch64.yaml
@@ -21,10 +21,7 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0
-- dask-core>=2023.9.2
 - dask-cuda==23.12.*
-- dask>=2023.9.2
-- distributed>=2023.9.2
 - doxygen>=1.8.20
 - gcc_linux-aarch64=11.*
 - gmock>=1.13.0
@@ -45,6 +42,7 @@ dependencies:
 - pydata-sphinx-theme
 - pytest
 - pytest-cov
+- rapids-dask-dependency==23.12.*
 - recommonmark
 - rmm==23.12.*
 - scikit-build>=0.13.1
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index 8f1fbf6744..f9d65cee39 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -21,10 +21,7 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0
-- dask-core>=2023.9.2
 - dask-cuda==23.12.*
-- dask>=2023.9.2
-- distributed>=2023.9.2
 - doxygen>=1.8.20
 - gcc_linux-64=11.*
 - gmock>=1.13.0
@@ -45,6 +42,7 @@ dependencies:
 - pydata-sphinx-theme
 - pytest
 - pytest-cov
+- rapids-dask-dependency==23.12.*
 - recommonmark
 - rmm==23.12.*
 - scikit-build>=0.13.1
diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml
index db4fc860df..eae5a6affe 100644
--- a/conda/recipes/raft-dask/meta.yaml
+++ b/conda/recipes/raft-dask/meta.yaml
@@ -60,10 +60,8 @@ requirements:
     - cudatoolkit
     {% endif %}
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
-    - dask >=2023.9.2
-    - dask-core >=2023.9.2
     - dask-cuda ={{ minor_version }}
-    - distributed >=2023.9.2
+    - rapids-dask-dependency ={{ minor_version }}
     - joblib >=0.11
     - nccl >=2.9.9
     - pylibraft {{ version }}
diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt
index d6a5fddb98..eb44e58cb5 100644
--- a/cpp/bench/ann/CMakeLists.txt
+++ b/cpp/bench/ann/CMakeLists.txt
@@ -30,6 +30,7 @@ option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ "Include faiss' cpu ivf pq algorithm
 option(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" ON)
 option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" ON)
 option(RAFT_ANN_BENCH_USE_RAFT_CAGRA "Include raft's CAGRA in benchmark" ON)
+option(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB "Include raft's CAGRA in benchmark" ON)
 option(RAFT_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON)
 option(RAFT_ANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" ON)
 option(RAFT_ANN_BENCH_SINGLE_EXE
@@ -54,6 +55,7 @@ if(BUILD_CPU_ONLY)
   set(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT OFF)
   set(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ OFF)
   set(RAFT_ANN_BENCH_USE_RAFT_CAGRA OFF)
+  set(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB OFF)
   set(RAFT_ANN_BENCH_USE_GGNN OFF)
 else()
   # Disable faiss benchmarks on CUDA 12 since faiss is not yet CUDA 12-enabled.
@@ -88,6 +90,7 @@ if(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ
    OR RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE
    OR RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT
    OR RAFT_ANN_BENCH_USE_RAFT_CAGRA
+  OR RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB
 )
   set(RAFT_ANN_BENCH_USE_RAFT ON)
 endif()
@@ -95,7 +98,7 @@ endif()
 # ##################################################################################################
 # * Fetch requirements -------------------------------------------------------------
 
-if(RAFT_ANN_BENCH_USE_HNSWLIB)
+if(RAFT_ANN_BENCH_USE_HNSWLIB OR RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB)
   include(cmake/thirdparty/get_hnswlib.cmake)
 endif()
 
@@ -250,6 +253,20 @@ if(RAFT_ANN_BENCH_USE_RAFT_CAGRA)
   )
 endif()
 
+if(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB)
+  ConfigureAnnBench(
+    NAME
+    RAFT_CAGRA_HNSWLIB
+    PATH
+    bench/ann/src/raft/raft_cagra_hnswlib.cu
+    INCLUDES
+    ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib-src/hnswlib
+    LINKS
+    raft::compiled
+    CXXFLAGS "${HNSW_CXX_FLAGS}"
+  )
+endif()
+
 set(RAFT_FAISS_TARGETS faiss::faiss)
 if(TARGET faiss::faiss_avx2)
   set(RAFT_FAISS_TARGETS faiss::faiss_avx2)
diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index 342eebe6e3..1cbd54cb7b 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -33,6 +33,7 @@
 #include <memory>
 #include <mutex>
 #include <numeric>
+#include <sstream>
 #include <string>
 #include <unistd.h>
 #include <vector>
@@ -175,7 +176,6 @@ void bench_search(::benchmark::State& state,
                   std::shared_ptr<const Dataset<T>> dataset,
                   Objective metric_objective)
 {
-  std::ptrdiff_t batch_offset   = 0;
   std::size_t queries_processed = 0;
 
   const auto& sp_json = index.search_params[search_param_ix];
@@ -189,6 +189,20 @@ void bench_search(::benchmark::State& state,
   // Round down the query data to a multiple of the batch size to loop over full batches of data
   const std::size_t query_set_size = (dataset->query_set_size() / n_queries) * n_queries;
 
+  if (dataset->query_set_size() < n_queries) {
+    std::stringstream msg;
+    msg << "Not enough queries in benchmark set. Expected " << n_queries << ", actual "
+        << dataset->query_set_size();
+    return state.SkipWithError(msg.str());
+  }
+
+  // Each thread start from a different offset, so that the queries that they process do not
+  // overlap.
+  std::ptrdiff_t batch_offset   = (state.thread_index() * n_queries) % query_set_size;
+  std::ptrdiff_t queries_stride = state.threads() * n_queries;
+  // Output is saved into a contiguous buffer (separate buffers for each thread).
+  std::ptrdiff_t out_offset = 0;
+
   const T* query_set = nullptr;
 
   if (!file_exists(index.file)) {
@@ -278,7 +292,6 @@ void bench_search(::benchmark::State& state,
   {
     nvtx_case nvtx{state.name()};
 
-    // TODO: Have the odd threads load the queries backwards just to rule out caching.
     ANN<T>* algo = dynamic_cast<ANN<T>*>(current_algo.get());
     for (auto _ : state) {
       [[maybe_unused]] auto ntx_lap = nvtx.lap();
@@ -289,15 +302,16 @@ void bench_search(::benchmark::State& state,
         algo->search(query_set + batch_offset * dataset->dim(),
                      n_queries,
                      k,
-                     neighbors->data + batch_offset * k,
-                     distances->data + batch_offset * k,
+                     neighbors->data + out_offset * k,
+                     distances->data + out_offset * k,
                      gpu_timer.stream());
       } catch (const std::exception& e) {
         state.SkipWithError(std::string(e.what()));
       }
 
       // advance to the next batch
-      batch_offset = (batch_offset + n_queries) % query_set_size;
+      batch_offset = (batch_offset + queries_stride) % query_set_size;
+      out_offset   = (out_offset + n_queries) % query_set_size;
 
       queries_processed += n_queries;
     }
@@ -323,31 +337,41 @@ void bench_search(::benchmark::State& state,
   // last thread to finish processing notifies all
   if (processed_threads-- == 0) { cond_var.notify_all(); }
 
-  // Use the last thread as a sanity check that all the threads are working.
-  if (state.thread_index() == state.threads() - 1) {
-    // evaluate recall
-    if (dataset->max_k() >= k) {
-      const std::int32_t* gt          = dataset->gt_set();
-      const std::uint32_t max_k       = dataset->max_k();
-      buf<std::size_t> neighbors_host = neighbors->move(MemoryType::Host);
-      std::size_t rows                = std::min(queries_processed, query_set_size);
-      std::size_t match_count         = 0;
-      std::size_t total_count         = rows * static_cast<size_t>(k);
-      for (std::size_t i = 0; i < rows; i++) {
-        for (std::uint32_t j = 0; j < k; j++) {
-          auto act_idx = std::int32_t(neighbors_host.data[i * k + j]);
-          for (std::uint32_t l = 0; l < k; l++) {
-            auto exp_idx = gt[i * max_k + l];
-            if (act_idx == exp_idx) {
-              match_count++;
-              break;
+  // Each thread calculates recall on their partition of queries.
+  // evaluate recall
+  if (dataset->max_k() >= k) {
+    const std::int32_t* gt          = dataset->gt_set();
+    const std::uint32_t max_k       = dataset->max_k();
+    buf<std::size_t> neighbors_host = neighbors->move(MemoryType::Host);
+    std::size_t rows                = std::min(queries_processed, query_set_size);
+    std::size_t match_count         = 0;
+    std::size_t total_count         = rows * static_cast<size_t>(k);
+
+    // We go through the groundtruth with same stride as the benchmark loop.
+    size_t out_offset   = 0;
+    size_t batch_offset = (state.thread_index() * n_queries) % query_set_size;
+    while (out_offset < rows) {
+      for (std::size_t i = 0; i < n_queries; i++) {
+        size_t i_orig_idx = batch_offset + i;
+        size_t i_out_idx  = out_offset + i;
+        if (i_out_idx < rows) {
+          for (std::uint32_t j = 0; j < k; j++) {
+            auto act_idx = std::int32_t(neighbors_host.data[i_out_idx * k + j]);
+            for (std::uint32_t l = 0; l < k; l++) {
+              auto exp_idx = gt[i_orig_idx * max_k + l];
+              if (act_idx == exp_idx) {
+                match_count++;
+                break;
+              }
             }
           }
         }
       }
-      double actual_recall = static_cast<double>(match_count) / static_cast<double>(total_count);
-      state.counters.insert({{"Recall", actual_recall}});
+      out_offset += n_queries;
+      batch_offset = (batch_offset + queries_stride) % query_set_size;
     }
+    double actual_recall = static_cast<double>(match_count) / static_cast<double>(total_count);
+    state.counters.insert({"Recall", {actual_recall, benchmark::Counter::kAvgThreads}});
   }
 }
 
diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
index 364da81f77..921d72decc 100644
--- a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
+++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
@@ -91,6 +91,8 @@ class HnswLib : public ANN<T> {
     return property;
   }
 
+  void set_base_layer_only() { appr_alg_->base_layer_only = true; }
+
  private:
   void get_search_knn_results_(const T* query, int k, size_t* indices, float* distances) const;
 
diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h
new file mode 100644
index 0000000000..479a90e3b5
--- /dev/null
+++ b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define JSON_DIAGNOSTICS 1
+#include <nlohmann/json.hpp>
+
+#undef WARP_SIZE
+#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN
+#include "raft_wrapper.h"
+#endif
+#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT
+#include "raft_ivf_flat_wrapper.h"
+extern template class raft::bench::ann::RaftIvfFlatGpu<float, int64_t>;
+extern template class raft::bench::ann::RaftIvfFlatGpu<uint8_t, int64_t>;
+extern template class raft::bench::ann::RaftIvfFlatGpu<int8_t, int64_t>;
+#endif
+#if defined(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ) || defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA) || \
+  defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB)
+#include "raft_ivf_pq_wrapper.h"
+#endif
+#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ
+extern template class raft::bench::ann::RaftIvfPQ<float, int64_t>;
+extern template class raft::bench::ann::RaftIvfPQ<uint8_t, int64_t>;
+extern template class raft::bench::ann::RaftIvfPQ<int8_t, int64_t>;
+#endif
+#if defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA) || defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB)
+#include "raft_cagra_wrapper.h"
+#endif
+#ifdef RAFT_ANN_BENCH_USE_RAFT_CAGRA
+extern template class raft::bench::ann::RaftCagra<float, uint32_t>;
+extern template class raft::bench::ann::RaftCagra<uint8_t, uint32_t>;
+extern template class raft::bench::ann::RaftCagra<int8_t, uint32_t>;
+#endif
+
+#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT
+template <typename T, typename IdxT>
+void parse_build_param(const nlohmann::json& conf,
+                       typename raft::bench::ann::RaftIvfFlatGpu<T, IdxT>::BuildParam& param)
+{
+  param.n_lists = conf.at("nlist");
+  if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); }
+  if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); }
+}
+
+template <typename T, typename IdxT>
+void parse_search_param(const nlohmann::json& conf,
+                        typename raft::bench::ann::RaftIvfFlatGpu<T, IdxT>::SearchParam& param)
+{
+  param.ivf_flat_params.n_probes = conf.at("nprobe");
+}
+#endif
+
+#if defined(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ) || defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA) || \
+  defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB)
+template <typename T, typename IdxT>
+void parse_build_param(const nlohmann::json& conf,
+                       typename raft::bench::ann::RaftIvfPQ<T, IdxT>::BuildParam& param)
+{
+  if (conf.contains("nlist")) { param.n_lists = conf.at("nlist"); }
+  if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); }
+  if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); }
+  if (conf.contains("pq_bits")) { param.pq_bits = conf.at("pq_bits"); }
+  if (conf.contains("pq_dim")) { param.pq_dim = conf.at("pq_dim"); }
+  if (conf.contains("codebook_kind")) {
+    std::string kind = conf.at("codebook_kind");
+    if (kind == "cluster") {
+      param.codebook_kind = raft::neighbors::ivf_pq::codebook_gen::PER_CLUSTER;
+    } else if (kind == "subspace") {
+      param.codebook_kind = raft::neighbors::ivf_pq::codebook_gen::PER_SUBSPACE;
+    } else {
+      throw std::runtime_error("codebook_kind: '" + kind +
+                               "', should be either 'cluster' or 'subspace'");
+    }
+  }
+}
+
+template <typename T, typename IdxT>
+void parse_search_param(const nlohmann::json& conf,
+                        typename raft::bench::ann::RaftIvfPQ<T, IdxT>::SearchParam& param)
+{
+  if (conf.contains("nprobe")) { param.pq_param.n_probes = conf.at("nprobe"); }
+  if (conf.contains("internalDistanceDtype")) {
+    std::string type = conf.at("internalDistanceDtype");
+    if (type == "float") {
+      param.pq_param.internal_distance_dtype = CUDA_R_32F;
+    } else if (type == "half") {
+      param.pq_param.internal_distance_dtype = CUDA_R_16F;
+    } else {
+      throw std::runtime_error("internalDistanceDtype: '" + type +
+                               "', should be either 'float' or 'half'");
+    }
+  } else {
+    // set half as default type
+    param.pq_param.internal_distance_dtype = CUDA_R_16F;
+  }
+
+  if (conf.contains("smemLutDtype")) {
+    std::string type = conf.at("smemLutDtype");
+    if (type == "float") {
+      param.pq_param.lut_dtype = CUDA_R_32F;
+    } else if (type == "half") {
+      param.pq_param.lut_dtype = CUDA_R_16F;
+    } else if (type == "fp8") {
+      param.pq_param.lut_dtype = CUDA_R_8U;
+    } else {
+      throw std::runtime_error("smemLutDtype: '" + type +
+                               "', should be either 'float', 'half' or 'fp8'");
+    }
+  } else {
+    // set half as default
+    param.pq_param.lut_dtype = CUDA_R_16F;
+  }
+  if (conf.contains("refine_ratio")) {
+    param.refine_ratio = conf.at("refine_ratio");
+    if (param.refine_ratio < 1.0f) { throw std::runtime_error("refine_ratio should be >= 1.0"); }
+  }
+}
+#endif
+
+#if defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA) || defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB)
+template <typename T, typename IdxT>
+void parse_build_param(const nlohmann::json& conf,
+                       raft::neighbors::experimental::nn_descent::index_params& param)
+{
+  if (conf.contains("graph_degree")) { param.graph_degree = conf.at("graph_degree"); }
+  if (conf.contains("intermediate_graph_degree")) {
+    param.intermediate_graph_degree = conf.at("intermediate_graph_degree");
+  }
+  // we allow niter shorthand for max_iterations
+  if (conf.contains("niter")) { param.max_iterations = conf.at("niter"); }
+  if (conf.contains("max_iterations")) { param.max_iterations = conf.at("max_iterations"); }
+  if (conf.contains("termination_threshold")) {
+    param.termination_threshold = conf.at("termination_threshold");
+  }
+}
+
+nlohmann::json collect_conf_with_prefix(const nlohmann::json& conf,
+                                        const std::string& prefix,
+                                        bool remove_prefix = true)
+{
+  nlohmann::json out;
+  for (auto& i : conf.items()) {
+    if (i.key().compare(0, prefix.size(), prefix) == 0) {
+      auto new_key = remove_prefix ? i.key().substr(prefix.size()) : i.key();
+      out[new_key] = i.value();
+    }
+  }
+  return out;
+}
+
+template <typename T, typename IdxT>
+void parse_build_param(const nlohmann::json& conf,
+                       typename raft::bench::ann::RaftCagra<T, IdxT>::BuildParam& param)
+{
+  if (conf.contains("graph_degree")) {
+    param.cagra_params.graph_degree              = conf.at("graph_degree");
+    param.cagra_params.intermediate_graph_degree = param.cagra_params.graph_degree * 2;
+  }
+  if (conf.contains("intermediate_graph_degree")) {
+    param.cagra_params.intermediate_graph_degree = conf.at("intermediate_graph_degree");
+  }
+  if (conf.contains("graph_build_algo")) {
+    if (conf.at("graph_build_algo") == "IVF_PQ") {
+      param.cagra_params.build_algo = raft::neighbors::cagra::graph_build_algo::IVF_PQ;
+    } else if (conf.at("graph_build_algo") == "NN_DESCENT") {
+      param.cagra_params.build_algo = raft::neighbors::cagra::graph_build_algo::NN_DESCENT;
+    }
+  }
+  nlohmann::json ivf_pq_build_conf = collect_conf_with_prefix(conf, "ivf_pq_build_");
+  if (!ivf_pq_build_conf.empty()) {
+    raft::neighbors::ivf_pq::index_params bparam;
+    parse_build_param<T, IdxT>(ivf_pq_build_conf, bparam);
+    param.ivf_pq_build_params = bparam;
+  }
+  nlohmann::json ivf_pq_search_conf = collect_conf_with_prefix(conf, "ivf_pq_search_");
+  if (!ivf_pq_search_conf.empty()) {
+    typename raft::bench::ann::RaftIvfPQ<T, IdxT>::SearchParam sparam;
+    parse_search_param<T, IdxT>(ivf_pq_search_conf, sparam);
+    param.ivf_pq_search_params = sparam.pq_param;
+    param.ivf_pq_refine_rate   = sparam.refine_ratio;
+  }
+  nlohmann::json nn_descent_conf = collect_conf_with_prefix(conf, "nn_descent_");
+  if (!nn_descent_conf.empty()) {
+    raft::neighbors::experimental::nn_descent::index_params nn_param;
+    nn_param.intermediate_graph_degree = 1.5 * param.cagra_params.intermediate_graph_degree;
+    parse_build_param<T, IdxT>(nn_descent_conf, nn_param);
+    if (nn_param.graph_degree != param.cagra_params.intermediate_graph_degree) {
+      nn_param.graph_degree = param.cagra_params.intermediate_graph_degree;
+    }
+    param.nn_descent_params = nn_param;
+  }
+}
+
+template <typename T, typename IdxT>
+void parse_search_param(const nlohmann::json& conf,
+                        typename raft::bench::ann::RaftCagra<T, IdxT>::SearchParam& param)
+{
+  if (conf.contains("itopk")) { param.p.itopk_size = conf.at("itopk"); }
+  if (conf.contains("search_width")) { param.p.search_width = conf.at("search_width"); }
+  if (conf.contains("max_iterations")) { param.p.max_iterations = conf.at("max_iterations"); }
+  if (conf.contains("algo")) {
+    if (conf.at("algo") == "single_cta") {
+      param.p.algo = raft::neighbors::experimental::cagra::search_algo::SINGLE_CTA;
+    } else if (conf.at("algo") == "multi_cta") {
+      param.p.algo = raft::neighbors::experimental::cagra::search_algo::MULTI_CTA;
+    } else if (conf.at("algo") == "multi_kernel") {
+      param.p.algo = raft::neighbors::experimental::cagra::search_algo::MULTI_KERNEL;
+    } else if (conf.at("algo") == "auto") {
+      param.p.algo = raft::neighbors::experimental::cagra::search_algo::AUTO;
+    } else {
+      std::string tmp = conf.at("algo");
+      THROW("Invalid value for algo: %s", tmp.c_str());
+    }
+  }
+}
+#endif
diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu
index 6888340b4d..f8c65a2d6e 100644
--- a/cpp/bench/ann/src/raft/raft_benchmark.cu
+++ b/cpp/bench/ann/src/raft/raft_benchmark.cu
@@ -16,172 +16,23 @@
 
 #include "../common/ann_types.hpp"
 
+#include "raft_ann_bench_param_parser.h"
+
 #include <algorithm>
 #include <cmath>
 #include <memory>
+#include <raft/core/logger.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 #include <stdexcept>
 #include <string>
 #include <type_traits>
 #include <utility>
 
-#undef WARP_SIZE
-#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN
-#include "raft_wrapper.h"
-#endif
-#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT
-#include "raft_ivf_flat_wrapper.h"
-extern template class raft::bench::ann::RaftIvfFlatGpu<float, int64_t>;
-extern template class raft::bench::ann::RaftIvfFlatGpu<uint8_t, int64_t>;
-extern template class raft::bench::ann::RaftIvfFlatGpu<int8_t, int64_t>;
-#endif
-#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ
-#include "raft_ivf_pq_wrapper.h"
-extern template class raft::bench::ann::RaftIvfPQ<float, int64_t>;
-extern template class raft::bench::ann::RaftIvfPQ<uint8_t, int64_t>;
-extern template class raft::bench::ann::RaftIvfPQ<int8_t, int64_t>;
-#endif
-#ifdef RAFT_ANN_BENCH_USE_RAFT_CAGRA
-#include "raft_cagra_wrapper.h"
-extern template class raft::bench::ann::RaftCagra<float, uint32_t>;
-extern template class raft::bench::ann::RaftCagra<uint8_t, uint32_t>;
-extern template class raft::bench::ann::RaftCagra<int8_t, uint32_t>;
-#endif
 #define JSON_DIAGNOSTICS 1
 #include <nlohmann/json.hpp>
 
 namespace raft::bench::ann {
 
-#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT
-template <typename T, typename IdxT>
-void parse_build_param(const nlohmann::json& conf,
-                       typename raft::bench::ann::RaftIvfFlatGpu<T, IdxT>::BuildParam& param)
-{
-  param.n_lists = conf.at("nlist");
-  if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); }
-  if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); }
-}
-
-template <typename T, typename IdxT>
-void parse_search_param(const nlohmann::json& conf,
-                        typename raft::bench::ann::RaftIvfFlatGpu<T, IdxT>::SearchParam& param)
-{
-  param.ivf_flat_params.n_probes = conf.at("nprobe");
-}
-#endif
-
-#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ
-template <typename T, typename IdxT>
-void parse_build_param(const nlohmann::json& conf,
-                       typename raft::bench::ann::RaftIvfPQ<T, IdxT>::BuildParam& param)
-{
-  param.n_lists = conf.at("nlist");
-  if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); }
-  if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); }
-  if (conf.contains("pq_bits")) { param.pq_bits = conf.at("pq_bits"); }
-  if (conf.contains("pq_dim")) { param.pq_dim = conf.at("pq_dim"); }
-  if (conf.contains("codebook_kind")) {
-    std::string kind = conf.at("codebook_kind");
-    if (kind == "cluster") {
-      param.codebook_kind = raft::neighbors::ivf_pq::codebook_gen::PER_CLUSTER;
-    } else if (kind == "subspace") {
-      param.codebook_kind = raft::neighbors::ivf_pq::codebook_gen::PER_SUBSPACE;
-    } else {
-      throw std::runtime_error("codebook_kind: '" + kind +
-                               "', should be either 'cluster' or 'subspace'");
-    }
-  }
-}
-
-template <typename T, typename IdxT>
-void parse_search_param(const nlohmann::json& conf,
-                        typename raft::bench::ann::RaftIvfPQ<T, IdxT>::SearchParam& param)
-{
-  param.pq_param.n_probes = conf.at("nprobe");
-  if (conf.contains("internalDistanceDtype")) {
-    std::string type = conf.at("internalDistanceDtype");
-    if (type == "float") {
-      param.pq_param.internal_distance_dtype = CUDA_R_32F;
-    } else if (type == "half") {
-      param.pq_param.internal_distance_dtype = CUDA_R_16F;
-    } else {
-      throw std::runtime_error("internalDistanceDtype: '" + type +
-                               "', should be either 'float' or 'half'");
-    }
-  } else {
-    // set half as default type
-    param.pq_param.internal_distance_dtype = CUDA_R_16F;
-  }
-
-  if (conf.contains("smemLutDtype")) {
-    std::string type = conf.at("smemLutDtype");
-    if (type == "float") {
-      param.pq_param.lut_dtype = CUDA_R_32F;
-    } else if (type == "half") {
-      param.pq_param.lut_dtype = CUDA_R_16F;
-    } else if (type == "fp8") {
-      param.pq_param.lut_dtype = CUDA_R_8U;
-    } else {
-      throw std::runtime_error("smemLutDtype: '" + type +
-                               "', should be either 'float', 'half' or 'fp8'");
-    }
-  } else {
-    // set half as default
-    param.pq_param.lut_dtype = CUDA_R_16F;
-  }
-  if (conf.contains("refine_ratio")) {
-    param.refine_ratio = conf.at("refine_ratio");
-    if (param.refine_ratio < 1.0f) { throw std::runtime_error("refine_ratio should be >= 1.0"); }
-  }
-}
-#endif
-
-#ifdef RAFT_ANN_BENCH_USE_RAFT_CAGRA
-template <typename T, typename IdxT>
-void parse_build_param(const nlohmann::json& conf,
-                       typename raft::bench::ann::RaftCagra<T, IdxT>::BuildParam& param)
-{
-  if (conf.contains("graph_degree")) {
-    param.graph_degree              = conf.at("graph_degree");
-    param.intermediate_graph_degree = param.graph_degree * 2;
-  }
-  if (conf.contains("intermediate_graph_degree")) {
-    param.intermediate_graph_degree = conf.at("intermediate_graph_degree");
-  }
-  if (conf.contains("graph_build_algo")) {
-    if (conf.at("graph_build_algo") == "IVF_PQ") {
-      param.build_algo = raft::neighbors::cagra::graph_build_algo::IVF_PQ;
-    } else if (conf.at("graph_build_algo") == "NN_DESCENT") {
-      param.build_algo = raft::neighbors::cagra::graph_build_algo::NN_DESCENT;
-    }
-  }
-  if (conf.contains("nn_descent_niter")) { param.nn_descent_niter = conf.at("nn_descent_niter"); }
-}
-
-template <typename T, typename IdxT>
-void parse_search_param(const nlohmann::json& conf,
-                        typename raft::bench::ann::RaftCagra<T, IdxT>::SearchParam& param)
-{
-  if (conf.contains("itopk")) { param.p.itopk_size = conf.at("itopk"); }
-  if (conf.contains("search_width")) { param.p.search_width = conf.at("search_width"); }
-  if (conf.contains("max_iterations")) { param.p.max_iterations = conf.at("max_iterations"); }
-  if (conf.contains("algo")) {
-    if (conf.at("algo") == "single_cta") {
-      param.p.algo = raft::neighbors::experimental::cagra::search_algo::SINGLE_CTA;
-    } else if (conf.at("algo") == "multi_cta") {
-      param.p.algo = raft::neighbors::experimental::cagra::search_algo::MULTI_CTA;
-    } else if (conf.at("algo") == "multi_kernel") {
-      param.p.algo = raft::neighbors::experimental::cagra::search_algo::MULTI_KERNEL;
-    } else if (conf.at("algo") == "auto") {
-      param.p.algo = raft::neighbors::experimental::cagra::search_algo::AUTO;
-    } else {
-      std::string tmp = conf.at("algo");
-      THROW("Invalid value for algo: %s", tmp.c_str());
-    }
-  }
-}
-#endif
-
 template <typename T>
 std::unique_ptr<raft::bench::ann::ANN<T>> create_algo(const std::string& algo,
                                                       const std::string& distance,
@@ -224,6 +75,7 @@ std::unique_ptr<raft::bench::ann::ANN<T>> create_algo(const std::string& algo,
     ann = std::make_unique<raft::bench::ann::RaftCagra<T, uint32_t>>(metric, dim, param);
   }
 #endif
+
   if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); }
 
   return ann;
@@ -261,6 +113,7 @@ std::unique_ptr<typename raft::bench::ann::ANN<T>::AnnSearchParam> create_search
     return param;
   }
 #endif
+
   // else
   throw std::runtime_error("invalid algo: '" + algo + "'");
 }
diff --git a/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu b/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu
new file mode 100644
index 0000000000..ce6fa255b2
--- /dev/null
+++ b/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../common/ann_types.hpp"
+#include "raft_ann_bench_param_parser.h"
+#include "raft_cagra_hnswlib_wrapper.h"
+
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#define JSON_DIAGNOSTICS 1
+#include <nlohmann/json.hpp>
+
+namespace raft::bench::ann {
+
+template <typename T, typename IdxT>
+void parse_search_param(const nlohmann::json& conf,
+                        typename raft::bench::ann::RaftCagraHnswlib<T, IdxT>::SearchParam& param)
+{
+  param.ef = conf.at("ef");
+  if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); }
+}
+
+template <typename T>
+std::unique_ptr<raft::bench::ann::ANN<T>> create_algo(const std::string& algo,
+                                                      const std::string& distance,
+                                                      int dim,
+                                                      const nlohmann::json& conf,
+                                                      const std::vector<int>& dev_list)
+{
+  // stop compiler warning; not all algorithms support multi-GPU so it may not be used
+  (void)dev_list;
+
+  raft::bench::ann::Metric metric = parse_metric(distance);
+  std::unique_ptr<raft::bench::ann::ANN<T>> ann;
+
+  if constexpr (std::is_same_v<T, float> or std::is_same_v<T, std::uint8_t>) {
+    if (algo == "raft_cagra_hnswlib") {
+      typename raft::bench::ann::RaftCagraHnswlib<T, uint32_t>::BuildParam param;
+      parse_build_param<T, uint32_t>(conf, param);
+      ann = std::make_unique<raft::bench::ann::RaftCagraHnswlib<T, uint32_t>>(metric, dim, param);
+    }
+  }
+
+  if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); }
+
+  return ann;
+}
+
+template <typename T>
+std::unique_ptr<typename raft::bench::ann::ANN<T>::AnnSearchParam> create_search_param(
+  const std::string& algo, const nlohmann::json& conf)
+{
+  if (algo == "raft_cagra_hnswlib") {
+    auto param =
+      std::make_unique<typename raft::bench::ann::RaftCagraHnswlib<T, uint32_t>::SearchParam>();
+    parse_search_param<T, uint32_t>(conf, *param);
+    return param;
+  }
+
+  throw std::runtime_error("invalid algo: '" + algo + "'");
+}
+
+}  // namespace raft::bench::ann
+
+REGISTER_ALGO_INSTANCE(float);
+REGISTER_ALGO_INSTANCE(std::int8_t);
+REGISTER_ALGO_INSTANCE(std::uint8_t);
+
+#ifdef ANN_BENCH_BUILD_MAIN
+#include "../common/benchmark.hpp"
+int main(int argc, char** argv)
+{
+  rmm::mr::cuda_memory_resource cuda_mr;
+  // Construct a resource that uses a coalescing best-fit pool allocator
+  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr{&cuda_mr};
+  rmm::mr::set_current_device_resource(
+    &pool_mr);  // Updates the current device resource pointer to `pool_mr`
+  rmm::mr::device_memory_resource* mr =
+    rmm::mr::get_current_device_resource();  // Points to `pool_mr`
+  return raft::bench::ann::run_main(argc, argv);
+}
+#endif
diff --git a/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h
new file mode 100644
index 0000000000..432caecfcc
--- /dev/null
+++ b/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "../hnswlib/hnswlib_wrapper.h"
+#include "raft_cagra_wrapper.h"
+#include <memory>
+
+namespace raft::bench::ann {
+
+template <typename T, typename IdxT>
+class RaftCagraHnswlib : public ANN<T> {
+ public:
+  using typename ANN<T>::AnnSearchParam;
+  using BuildParam  = typename RaftCagra<T, IdxT>::BuildParam;
+  using SearchParam = typename HnswLib<T>::SearchParam;
+
+  RaftCagraHnswlib(Metric metric, int dim, const BuildParam& param, int concurrent_searches = 1)
+    : ANN<T>(metric, dim),
+      metric_(metric),
+      index_params_(param),
+      dimension_(dim),
+      handle_(cudaStreamPerThread)
+  {
+  }
+
+  ~RaftCagraHnswlib() noexcept {}
+
+  void build(const T* dataset, size_t nrow, cudaStream_t stream) final;
+
+  void set_search_param(const AnnSearchParam& param) override;
+
+  // TODO: if the number of results is less than k, the remaining elements of 'neighbors'
+  // will be filled with (size_t)-1
+  void search(const T* queries,
+              int batch_size,
+              int k,
+              size_t* neighbors,
+              float* distances,
+              cudaStream_t stream = 0) const override;
+
+  // to enable dataset access from GPU memory
+  AlgoProperty get_preference() const override
+  {
+    AlgoProperty property;
+    property.dataset_memory_type = MemoryType::HostMmap;
+    property.query_memory_type   = MemoryType::Host;
+    return property;
+  }
+  void save(const std::string& file) const override;
+  void load(const std::string&) override;
+
+ private:
+  raft::device_resources handle_;
+  Metric metric_;
+  BuildParam index_params_;
+  int dimension_;
+
+  std::unique_ptr<RaftCagra<T, IdxT>> cagra_build_;
+  std::unique_ptr<HnswLib<T>> hnswlib_search_;
+
+  Objective metric_objective_;
+};
+
+template <typename T, typename IdxT>
+void RaftCagraHnswlib<T, IdxT>::build(const T* dataset, size_t nrow, cudaStream_t stream)
+{
+  if (not cagra_build_) {
+    cagra_build_ = std::make_unique<RaftCagra<T, IdxT>>(metric_, dimension_, index_params_);
+  }
+  cagra_build_->build(dataset, nrow, stream);
+}
+
+template <typename T, typename IdxT>
+void RaftCagraHnswlib<T, IdxT>::set_search_param(const AnnSearchParam& param_)
+{
+  hnswlib_search_->set_search_param(param_);
+}
+
+template <typename T, typename IdxT>
+void RaftCagraHnswlib<T, IdxT>::save(const std::string& file) const
+{
+  cagra_build_->save_to_hnswlib(file);
+}
+
+template <typename T, typename IdxT>
+void RaftCagraHnswlib<T, IdxT>::load(const std::string& file)
+{
+  typename HnswLib<T>::BuildParam param;
+  // these values don't matter since we don't build with HnswLib
+  param.M               = 50;
+  param.ef_construction = 100;
+  if (not hnswlib_search_) {
+    hnswlib_search_ = std::make_unique<HnswLib<T>>(metric_, dimension_, param);
+  }
+  hnswlib_search_->load(file);
+  hnswlib_search_->set_base_layer_only();
+}
+
+template <typename T, typename IdxT>
+void RaftCagraHnswlib<T, IdxT>::search(
+  const T* queries, int batch_size, int k, size_t* neighbors, float* distances, cudaStream_t) const
+{
+  hnswlib_search_->search(queries, batch_size, k, neighbors, distances);
+}
+
+}  // namespace raft::bench::ann
diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h
index f1c8154b7c..bf526101be 100644
--- a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h
+++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h
@@ -19,6 +19,7 @@
 #include <fstream>
 #include <iostream>
 #include <memory>
+#include <optional>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/device_resources.hpp>
 #include <raft/core/logger.hpp>
@@ -28,6 +29,9 @@
 #include <raft/neighbors/cagra.cuh>
 #include <raft/neighbors/cagra_serialize.cuh>
 #include <raft/neighbors/cagra_types.hpp>
+#include <raft/neighbors/detail/cagra/cagra_build.cuh>
+#include <raft/neighbors/ivf_pq_types.hpp>
+#include <raft/neighbors/nn_descent_types.hpp>
 #include <raft/util/cudart_utils.hpp>
 #include <rmm/device_uvector.hpp>
 #include <stdexcept>
@@ -50,12 +54,20 @@ class RaftCagra : public ANN<T> {
     auto needs_dataset() const -> bool override { return true; }
   };
 
-  using BuildParam = raft::neighbors::cagra::index_params;
+  struct BuildParam {
+    raft::neighbors::cagra::index_params cagra_params;
+    std::optional<raft::neighbors::experimental::nn_descent::index_params> nn_descent_params =
+      std::nullopt;
+    std::optional<float> ivf_pq_refine_rate                                    = std::nullopt;
+    std::optional<raft::neighbors::ivf_pq::index_params> ivf_pq_build_params   = std::nullopt;
+    std::optional<raft::neighbors::ivf_pq::search_params> ivf_pq_search_params = std::nullopt;
+  };
 
   RaftCagra(Metric metric, int dim, const BuildParam& param, int concurrent_searches = 1)
     : ANN<T>(metric, dim), index_params_(param), dimension_(dim), handle_(cudaStreamPerThread)
   {
-    index_params_.metric = parse_metric_type(metric);
+    index_params_.cagra_params.metric         = parse_metric_type(metric);
+    index_params_.ivf_pq_build_params->metric = parse_metric_type(metric);
     RAFT_CUDA_TRY(cudaGetDevice(&device_));
   }
 
@@ -86,6 +98,7 @@ class RaftCagra : public ANN<T> {
   }
   void save(const std::string& file) const override;
   void load(const std::string&) override;
+  void save_to_hnswlib(const std::string& file) const;
 
  private:
   raft::device_resources handle_;
@@ -99,17 +112,19 @@ class RaftCagra : public ANN<T> {
 template <typename T, typename IdxT>
 void RaftCagra<T, IdxT>::build(const T* dataset, size_t nrow, cudaStream_t)
 {
-  if (raft::get_device_for_address(dataset) == -1) {
-    auto dataset_view =
-      raft::make_host_matrix_view<const T, int64_t>(dataset, IdxT(nrow), dimension_);
-    index_.emplace(raft::neighbors::cagra::build(handle_, index_params_, dataset_view));
-    return;
-  } else {
-    auto dataset_view =
-      raft::make_device_matrix_view<const T, int64_t>(dataset, IdxT(nrow), dimension_);
-    index_.emplace(raft::neighbors::cagra::build(handle_, index_params_, dataset_view));
-    return;
-  }
+  auto dataset_view =
+    raft::make_host_matrix_view<const T, int64_t>(dataset, IdxT(nrow), dimension_);
+
+  auto& params = index_params_.cagra_params;
+
+  index_.emplace(raft::neighbors::cagra::detail::build(handle_,
+                                                       params,
+                                                       dataset_view,
+                                                       index_params_.nn_descent_params,
+                                                       index_params_.ivf_pq_refine_rate,
+                                                       index_params_.ivf_pq_build_params,
+                                                       index_params_.ivf_pq_search_params));
+  return;
 }
 
 template <typename T, typename IdxT>
@@ -129,7 +144,13 @@ void RaftCagra<T, IdxT>::set_search_dataset(const T* dataset, size_t nrow)
 template <typename T, typename IdxT>
 void RaftCagra<T, IdxT>::save(const std::string& file) const
 {
-  raft::neighbors::cagra::serialize(handle_, file, *index_, false);
+  raft::neighbors::cagra::serialize<T, IdxT>(handle_, file, *index_);
+}
+
+template <typename T, typename IdxT>
+void RaftCagra<T, IdxT>::save_to_hnswlib(const std::string& file) const
+{
+  raft::neighbors::cagra::serialize_to_hnswlib<T, IdxT>(handle_, file, *index_);
 }
 
 template <typename T, typename IdxT>
diff --git a/cpp/cmake/patches/hnswlib.patch b/cpp/cmake/patches/hnswlib.patch
new file mode 100644
index 0000000000..32c1537c58
--- /dev/null
+++ b/cpp/cmake/patches/hnswlib.patch
@@ -0,0 +1,130 @@
+diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h
+index e95e0b5..f0fe50a 100644
+--- a/hnswlib/hnswalg.h
++++ b/hnswlib/hnswalg.h
+@@ -3,6 +3,7 @@
+ #include "visited_list_pool.h"
+ #include "hnswlib.h"
+ #include <atomic>
++#include <limits>
+ #include <random>
+ #include <stdlib.h>
+ #include <assert.h>
+@@ -16,6 +17,8 @@ namespace hnswlib {
+     template<typename dist_t>
+     class HierarchicalNSW : public AlgorithmInterface<dist_t> {
+     public:
++        bool base_layer_only{false};
++        int num_seeds=32;
+         static const tableint max_update_element_locks = 65536;
+         HierarchicalNSW(SpaceInterface<dist_t> *s) {
+         }
+@@ -56,7 +59,7 @@ namespace hnswlib {
+             visited_list_pool_ = new VisitedListPool(1, max_elements);
+ 
+             //initializations for special treatment of the first node
+-            enterpoint_node_ = -1;
++            enterpoint_node_ = std::numeric_limits<tableint>::max();
+             maxlevel_ = -1;
+ 
+             linkLists_ = (char **) malloc(sizeof(void *) * max_elements_);
+@@ -527,7 +530,7 @@ namespace hnswlib {
+                     tableint *datal = (tableint *) (data + 1);
+                     for (int i = 0; i < size; i++) {
+                         tableint cand = datal[i];
+-                        if (cand < 0 || cand > max_elements_)
++                        if (cand > max_elements_)
+                             throw std::runtime_error("cand error");
+                         dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_);
+ 
+@@ -1067,7 +1070,7 @@ namespace hnswlib {
+                             tableint *datal = (tableint *) (data + 1);
+                             for (int i = 0; i < size; i++) {
+                                 tableint cand = datal[i];
+-                                if (cand < 0 || cand > max_elements_)
++                                if (cand > max_elements_)
+                                     throw std::runtime_error("cand error");
+                                 dist_t d = fstdistfunc_(data_point, getDataByInternalId(cand), dist_func_param_);
+                                 if (d < curdist) {
+@@ -1119,28 +1122,41 @@ namespace hnswlib {
+             tableint currObj = enterpoint_node_;
+             dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_);
+ 
+-            for (int level = maxlevel_; level > 0; level--) {
+-                bool changed = true;
+-                while (changed) {
+-                    changed = false;
+-                    unsigned int *data;
++            if (base_layer_only) {
++                // You can increase the number of seeds when testing large-scale dataset, num_seeds = 48 for 100M-scale
++                for (int i = 0; i < num_seeds; i++) {
++                    tableint obj = i * (max_elements_ / num_seeds);
++                    dist_t dist = fstdistfunc_(query_data, getDataByInternalId(obj), dist_func_param_);
++                    if (dist < curdist) {
++                        curdist = dist;
++                        currObj = obj;
++                    }
++                }
++            }
++            else{
++                for (int level = maxlevel_; level > 0; level--) {
++                    bool changed = true;
++                    while (changed) {
++                        changed = false;
++                        unsigned int *data;
+ 
+-                    data = (unsigned int *) get_linklist(currObj, level);
+-                    int size = getListCount(data);
+-                    metric_hops++;
+-                    metric_distance_computations+=size;
++                        data = (unsigned int *) get_linklist(currObj, level);
++                        int size = getListCount(data);
++                        metric_hops++;
++                        metric_distance_computations+=size;
+ 
+-                    tableint *datal = (tableint *) (data + 1);
+-                    for (int i = 0; i < size; i++) {
+-                        tableint cand = datal[i];
+-                        if (cand < 0 || cand > max_elements_)
+-                            throw std::runtime_error("cand error");
+-                        dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_);
++                        tableint *datal = (tableint *) (data + 1);
++                        for (int i = 0; i < size; i++) {
++                            tableint cand = datal[i];
++                            if (cand > max_elements_)
++                                throw std::runtime_error("cand error");
++                            dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_);
+ 
+-                        if (d < curdist) {
+-                            curdist = d;
+-                            currObj = cand;
+-                            changed = true;
++                            if (d < curdist) {
++                                curdist = d;
++                                currObj = cand;
++                                changed = true;
++                            }
+                         }
+                     }
+                 }
+diff --git a/hnswlib/visited_list_pool.h b/hnswlib/visited_list_pool.h
+index 5e1a4a5..4195ebd 100644
+--- a/hnswlib/visited_list_pool.h
++++ b/hnswlib/visited_list_pool.h
+@@ -3,6 +3,7 @@
+ #include <mutex>
+ #include <string.h>
+ #include <deque>
++#include <limits>
+ 
+ namespace hnswlib {
+     typedef unsigned short int vl_type;
+@@ -14,7 +15,7 @@ namespace hnswlib {
+         unsigned int numelements;
+ 
+         VisitedList(int numelements1) {
+-            curV = -1;
++            curV = std::numeric_limits<vl_type>::max();
+             numelements = numelements1;
+             mass = new vl_type[numelements];
+         }
diff --git a/cpp/cmake/thirdparty/get_hnswlib.cmake b/cpp/cmake/thirdparty/get_hnswlib.cmake
index 94033e8333..a4ceacae38 100644
--- a/cpp/cmake/thirdparty/get_hnswlib.cmake
+++ b/cpp/cmake/thirdparty/get_hnswlib.cmake
@@ -26,6 +26,11 @@ function(find_and_configure_hnswlib)
                 COMMAND git clone --branch=v0.6.2 https://github.com/nmslib/hnswlib.git hnswlib-src
                 WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps )
 
+        message("SOURCE ${CMAKE_CURRENT_SOURCE_DIR}")
+        execute_process (
+                COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/hnswlib.patch
+                WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib-src
+        )
     endif ()
 
     include(cmake/modules/FindAVX.cmake)
diff --git a/cpp/include/raft/matrix/detail/select_radix.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh
index b3c07b9d3a..fa12005df2 100644
--- a/cpp/include/raft/matrix/detail/select_radix.cuh
+++ b/cpp/include/raft/matrix/detail/select_radix.cuh
@@ -22,6 +22,7 @@
 #include <raft/linalg/map.cuh>
 #include <raft/util/cudart_utils.hpp>
 #include <raft/util/device_atomics.cuh>
+#include <raft/util/integer_utils.hpp>
 #include <raft/util/pow2_utils.cuh>
 #include <raft/util/vectorized.cuh>
 
@@ -103,15 +104,27 @@ _RAFT_DEVICE int calc_bucket(T x, int start_bit, unsigned mask, bool select_min)
   return (twiddle_in(x, select_min) >> start_bit) & mask;
 }
 
-template <typename T, typename IdxT>
+// Strangely, RATIO_T has a strong impact on register usage and occupancy for sm80, e.g.
+// using RATIO_T=unsigned for radix_kernel decreases occupancy (with CUDA 12).
+// In the meanwhile, RATIO_T has no impact for sm90.
+template <typename T, typename IdxT, typename RATIO_T = float>
 _RAFT_HOST_DEVICE IdxT calc_buf_len(IdxT len)
 {
   // When writing is skipped, only read `in`(type T).
   // When writing is not skipped, read `in_buf`(T) and `in_idx_buf`(IdxT), and write `out_buf`(T)
   // and `out_idx_buf`(IdxT).
   // The ratio between these cases determines whether to skip writing and hence the buffer size.
-  constexpr float ratio = 2 + sizeof(IdxT) * 2.0 / sizeof(T);
-  return len / ratio;
+  constexpr RATIO_T ratio = 2 + sizeof(IdxT) * 2 / sizeof(T);
+  // Even such estimation is too conservative, so further decrease buf_len by 1/8
+  IdxT buf_len = len / (ratio * 8);
+
+  // one-block kernel splits one large buffer into smaller ones, so round buf size to 256 bytes to
+  // avoid alignment issues
+  static_assert(is_a_power_of_two(sizeof(T)));
+  static_assert(is_a_power_of_two(sizeof(IdxT)));
+  constexpr IdxT aligned = 256 / std::min(sizeof(T), sizeof(IdxT));
+  buf_len                = Pow2<aligned>::roundDown(buf_len);
+  return buf_len;
 }
 
 /**
@@ -208,6 +221,11 @@ struct alignas(128) Counter {
 /**
  * Fused filtering of the current pass and building histogram for the next pass
  * (see steps 4 & 1 in `radix_kernel` description).
+ *
+ * This function is more complicated than the one-block counterpart because this function handles
+ * the case of early stopping. When early stopping is triggered, it's desirable to do the final
+ * filtering in this function rather than in last_filter(), because this function is run by multiple
+ * blocks while last_filter is run by a single block.
  */
 template <typename T, typename IdxT, int BitsPerPass>
 _RAFT_DEVICE void filter_and_histogram(const T* in_buf,
@@ -397,7 +415,7 @@ _RAFT_DEVICE void last_filter(const T* in_buf,
   const int start_bit       = calc_start_bit<T, BitsPerPass>(pass);
 
   // changed in choose_bucket(); need to reload
-  const IdxT needed_num_of_kth = counter->k;
+  const IdxT num_of_kth_needed = counter->k;
   IdxT* p_out_cnt              = &counter->out_cnt;
   IdxT* p_out_back_cnt         = &counter->out_back_cnt;
   for (IdxT i = threadIdx.x; i < current_len; i += blockDim.x) {
@@ -412,7 +430,7 @@ _RAFT_DEVICE void last_filter(const T* in_buf,
       out_idx[pos] = in_idx_buf ? in_idx_buf[i] : i;
     } else if (bits == kth_value_bits) {
       IdxT back_pos = atomicAdd(p_out_back_cnt, static_cast<IdxT>(1));
-      if (back_pos < needed_num_of_kth) {
+      if (back_pos < num_of_kth_needed) {
         IdxT pos     = k - 1 - back_pos;
         out[pos]     = value;
         out_idx[pos] = in_idx_buf ? in_idx_buf[i] : i;
@@ -428,8 +446,8 @@ RAFT_KERNEL last_filter_kernel(const T* in,
                                const IdxT* in_idx_buf,
                                T* out,
                                IdxT* out_idx,
-                               IdxT len,
-                               IdxT k,
+                               const IdxT len,
+                               const IdxT k,
                                Counter<T, IdxT>* counters,
                                const bool select_min)
 {
@@ -454,14 +472,14 @@ RAFT_KERNEL last_filter_kernel(const T* in,
   constexpr int start_bit = calc_start_bit<T, BitsPerPass>(pass);
 
   const auto kth_value_bits    = counter->kth_value_bits;
-  const IdxT needed_num_of_kth = counter->k;
+  const IdxT num_of_kth_needed = counter->k;
   IdxT* p_out_cnt              = &counter->out_cnt;
   IdxT* p_out_back_cnt         = &counter->out_back_cnt;
 
   auto f = [k,
             select_min,
             kth_value_bits,
-            needed_num_of_kth,
+            num_of_kth_needed,
             p_out_cnt,
             p_out_back_cnt,
             in_idx_buf,
@@ -474,7 +492,7 @@ RAFT_KERNEL last_filter_kernel(const T* in,
       out_idx[pos] = in_idx_buf ? in_idx_buf[i] : i;
     } else if (bits == kth_value_bits) {
       IdxT back_pos = atomicAdd(p_out_back_cnt, static_cast<IdxT>(1));
-      if (back_pos < needed_num_of_kth) {
+      if (back_pos < num_of_kth_needed) {
         IdxT pos     = k - 1 - back_pos;
         out[pos]     = value;
         out_idx[pos] = in_idx_buf ? in_idx_buf[i] : i;
@@ -657,16 +675,35 @@ RAFT_KERNEL radix_kernel(const T* in,
 }
 
 template <typename T, typename IdxT, int BlockSize, typename Kernel>
-int calc_chunk_size(int batch_size, IdxT len, int sm_cnt, Kernel kernel)
+int calc_chunk_size(int batch_size, IdxT len, int sm_cnt, Kernel kernel, bool one_block)
 {
   int active_blocks;
   RAFT_CUDA_TRY(
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(&active_blocks, kernel, BlockSize, 0));
 
-  constexpr int items_per_thread = 32;
-  constexpr int num_waves        = 10;
-  int chunk_size =
-    std::max<int>(1, num_waves * sm_cnt * active_blocks * BlockSize * items_per_thread / len);
+  // The chunk size is chosen so that there is enough workload to fully utilize GPU.
+  // One full wave contains (sm_cnt * active_blocks) blocks, and 10 waves is an empirically safe
+  // estimation of enough workload. It also counteracts imbalance if some blocks run slower
+  // than others.
+  constexpr int num_waves = 10;
+  int chunk_size;
+  if (one_block) {
+    // For one-block version, one block processes one instance in the chunk. Just ensure that there
+    // are enough blocks.
+    chunk_size = num_waves * sm_cnt * active_blocks;
+  } else {
+    // One instance in the chunk contains len items and is processed by multiple blocks.
+    // The total number of items in a chunk (chunk_size * len) should be large enough that every
+    // thread has enough items to processes. So set it to num_waves * "max num of active threads"
+    // (sm_cnt * active_blocks * BlockSize) * items_per_thread.
+    //
+    // Also, the upper bound of the total number of items in a chunk is:
+    // 10 (num_waves) * ~100 (sm_cnt) * 2048 (active_blocks*BlockSize) * 32 (items_per_thread) =64M.
+    // So temporary buffer size required for one chunk won't be too large.
+    constexpr int items_per_thread = 32;
+    chunk_size =
+      std::max<int>(1, num_waves * sm_cnt * active_blocks * BlockSize * items_per_thread / len);
+  }
   return std::min(chunk_size, batch_size);
 }
 
@@ -709,17 +746,17 @@ unsigned calc_grid_dim(int batch_size, IdxT len, int sm_cnt)
 }
 
 template <typename T, typename IdxT>
-_RAFT_HOST_DEVICE void set_buf_pointers(const T* in,
-                                        const IdxT* in_idx,
-                                        T* buf1,
-                                        IdxT* idx_buf1,
-                                        T* buf2,
-                                        IdxT* idx_buf2,
-                                        int pass,
-                                        const T*& in_buf,
-                                        const IdxT*& in_idx_buf,
-                                        T*& out_buf,
-                                        IdxT*& out_idx_buf)
+_RAFT_HOST void set_buf_pointers(const T* in,
+                                 const IdxT* in_idx,
+                                 T* buf1,
+                                 IdxT* idx_buf1,
+                                 T* buf2,
+                                 IdxT* idx_buf2,
+                                 int pass,
+                                 const T*& in_buf,
+                                 const IdxT*& in_idx_buf,
+                                 T*& out_buf,
+                                 IdxT*& out_idx_buf)
 {
   if (pass == 0) {
     in_buf      = in;
@@ -744,6 +781,41 @@ _RAFT_HOST_DEVICE void set_buf_pointers(const T* in,
   }
 }
 
+template <typename T, typename IdxT>
+_RAFT_DEVICE void set_buf_pointers(const T* in,
+                                   const IdxT* in_idx,
+                                   char* bufs,
+                                   IdxT buf_len,
+                                   int pass,
+                                   const T*& in_buf,
+                                   const IdxT*& in_idx_buf,
+                                   T*& out_buf,
+                                   IdxT*& out_idx_buf)
+{
+  // bufs consists of 4 pieces in order: buf1, buf2, idx_buf1, idx_buf2
+  if (pass == 0) {
+    in_buf      = in;
+    in_idx_buf  = nullptr;
+    out_buf     = nullptr;
+    out_idx_buf = nullptr;
+  } else if (pass == 1) {
+    in_buf      = in;
+    in_idx_buf  = in_idx;
+    out_buf     = reinterpret_cast<T*>(bufs);
+    out_idx_buf = reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len);
+  } else if (pass % 2 == 0) {
+    in_buf      = reinterpret_cast<T*>(bufs);
+    in_idx_buf  = reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len);
+    out_buf     = const_cast<T*>(in_buf + buf_len);
+    out_idx_buf = const_cast<IdxT*>(in_idx_buf + buf_len);
+  } else {
+    out_buf     = reinterpret_cast<T*>(bufs);
+    out_idx_buf = reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len);
+    in_buf      = out_buf + buf_len;
+    in_idx_buf  = out_idx_buf + buf_len;
+  }
+}
+
 template <typename T, typename IdxT, int BitsPerPass, int BlockSize>
 void radix_topk(const T* in,
                 const IdxT* in_idx,
@@ -765,7 +837,7 @@ void radix_topk(const T* in,
 
   auto kernel = radix_kernel<T, IdxT, BitsPerPass, BlockSize, false>;
   const size_t max_chunk_size =
-    calc_chunk_size<T, IdxT, BlockSize>(batch_size, len, sm_cnt, kernel);
+    calc_chunk_size<T, IdxT, BlockSize>(batch_size, len, sm_cnt, kernel, false);
   if (max_chunk_size != static_cast<size_t>(batch_size)) {
     grid_dim = calc_grid_dim<T, IdxT, BitsPerPass, BlockSize>(max_chunk_size, len, sm_cnt);
   }
@@ -793,6 +865,7 @@ void radix_topk(const T* in,
     RAFT_CUDA_TRY(
       cudaMemsetAsync(counters.data(), 0, counters.size() * sizeof(Counter<T, IdxT>), stream));
     RAFT_CUDA_TRY(cudaMemsetAsync(histograms.data(), 0, histograms.size() * sizeof(IdxT), stream));
+    auto kernel = radix_kernel<T, IdxT, BitsPerPass, BlockSize, false>;
 
     const T* chunk_in        = in + offset * len;
     const IdxT* chunk_in_idx = in_idx ? (in_idx + offset * len) : nullptr;
@@ -866,6 +939,7 @@ _RAFT_DEVICE void filter_and_histogram_for_one_block(const T* in_buf,
                                                      IdxT* out_idx_buf,
                                                      T* out,
                                                      IdxT* out_idx,
+                                                     const IdxT previous_len,
                                                      Counter<T, IdxT>* counter,
                                                      IdxT* histogram,
                                                      bool select_min,
@@ -879,9 +953,8 @@ _RAFT_DEVICE void filter_and_histogram_for_one_block(const T* in_buf,
   if (threadIdx.x == 0) { *p_filter_cnt = 0; }
   __syncthreads();
 
-  const int start_bit     = calc_start_bit<T, BitsPerPass>(pass);
-  const unsigned mask     = calc_mask<T, BitsPerPass>(pass);
-  const IdxT previous_len = counter->previous_len;
+  const int start_bit = calc_start_bit<T, BitsPerPass>(pass);
+  const unsigned mask = calc_mask<T, BitsPerPass>(pass);
 
   if (pass == 0) {
     auto f = [histogram, select_min, start_bit, mask](T value, IdxT) {
@@ -889,6 +962,20 @@ _RAFT_DEVICE void filter_and_histogram_for_one_block(const T* in_buf,
       atomicAdd(histogram + bucket, static_cast<IdxT>(1));
     };
     vectorized_process(threadIdx.x, blockDim.x, in_buf, previous_len, f);
+  } else if (!out_buf) {
+    // not use vectorized_process here because it increases #registers a lot
+    const auto kth_value_bits    = counter->kth_value_bits;
+    const int previous_start_bit = calc_start_bit<T, BitsPerPass>(pass - 1);
+
+    for (IdxT i = threadIdx.x; i < previous_len; i += blockDim.x) {
+      const T value            = in_buf[i];
+      const auto previous_bits = (twiddle_in(value, select_min) >> previous_start_bit)
+                                 << previous_start_bit;
+      if (previous_bits == kth_value_bits) {
+        int bucket = calc_bucket<T, BitsPerPass>(value, start_bit, mask, select_min);
+        atomicAdd(histogram + bucket, static_cast<IdxT>(1));
+      }
+    }
   } else {
     // not use vectorized_process here because it increases #registers a lot
     IdxT* p_out_cnt              = &counter->out_cnt;
@@ -927,10 +1014,7 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in,
                                         T* out,
                                         IdxT* out_idx,
                                         const bool select_min,
-                                        T* buf1,
-                                        IdxT* idx_buf1,
-                                        T* buf2,
-                                        IdxT* idx_buf2)
+                                        char* bufs)
 {
   constexpr int num_buckets = calc_num_buckets<BitsPerPass>();
   __shared__ Counter<T, IdxT> counter;
@@ -951,22 +1035,30 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in,
   if (in_idx) { in_idx += batch_id * len; }
   out += batch_id * k;
   out_idx += batch_id * k;
-  buf1 += batch_id * len;
-  idx_buf1 += batch_id * len;
-  buf2 += batch_id * len;
-  idx_buf2 += batch_id * len;
-  const T* in_buf        = nullptr;
-  const IdxT* in_idx_buf = nullptr;
-  T* out_buf             = nullptr;
-  IdxT* out_idx_buf      = nullptr;
+  const IdxT buf_len = calc_buf_len<T, IdxT, unsigned>(len);
+  bufs += batch_id * buf_len * 2 * (sizeof(T) + sizeof(IdxT));
 
   constexpr int num_passes = calc_num_passes<T, BitsPerPass>();
   for (int pass = 0; pass < num_passes; ++pass) {
-    set_buf_pointers(
-      in, in_idx, buf1, idx_buf1, buf2, idx_buf2, pass, in_buf, in_idx_buf, out_buf, out_idx_buf);
-
-    IdxT current_len = counter.len;
-    IdxT current_k   = counter.k;
+    const T* in_buf;
+    const IdxT* in_idx_buf;
+    T* out_buf;
+    IdxT* out_idx_buf;
+    set_buf_pointers(in, in_idx, bufs, buf_len, pass, in_buf, in_idx_buf, out_buf, out_idx_buf);
+
+    const IdxT current_len = counter.len;
+    const IdxT current_k   = counter.k;
+    IdxT previous_len      = counter.previous_len;
+    if (previous_len > buf_len) {
+      in_buf       = in;
+      in_idx_buf   = in_idx;
+      previous_len = len;
+    }
+    if (current_len > buf_len) {
+      // so "out_buf==nullptr" denotes skipping writing buffer in current pass
+      out_buf     = nullptr;
+      out_idx_buf = nullptr;
+    }
 
     filter_and_histogram_for_one_block<T, IdxT, BitsPerPass>(in_buf,
                                                              in_idx_buf,
@@ -974,6 +1066,7 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in,
                                                              out_idx_buf,
                                                              out,
                                                              out_idx,
+                                                             previous_len,
                                                              &counter,
                                                              histogram,
                                                              select_min,
@@ -988,11 +1081,11 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in,
     __syncthreads();
 
     if (counter.len == counter.k || pass == num_passes - 1) {
-      last_filter<T, IdxT, BitsPerPass>(pass == 0 ? in : out_buf,
-                                        pass == 0 ? in_idx : out_idx_buf,
+      last_filter<T, IdxT, BitsPerPass>(out_buf ? out_buf : in,
+                                        out_buf ? out_idx_buf : in_idx,
                                         out,
                                         out_idx,
-                                        current_len,
+                                        out_buf ? current_len : len,
                                         k,
                                         &counter,
                                         select_min,
@@ -1022,21 +1115,17 @@ void radix_topk_one_block(const T* in,
 {
   static_assert(calc_num_passes<T, BitsPerPass>() > 1);
 
-  auto kernel = radix_topk_one_block_kernel<T, IdxT, BitsPerPass, BlockSize>;
+  auto kernel        = radix_topk_one_block_kernel<T, IdxT, BitsPerPass, BlockSize>;
+  const IdxT buf_len = calc_buf_len<T, IdxT, unsigned>(len);
   const size_t max_chunk_size =
-    calc_chunk_size<T, IdxT, BlockSize>(batch_size, len, sm_cnt, kernel);
+    calc_chunk_size<T, IdxT, BlockSize>(batch_size, len, sm_cnt, kernel, true);
 
   auto pool_guard =
-    raft::get_pool_memory_resource(mr,
-                                   max_chunk_size * len * 2 * (sizeof(T) + sizeof(IdxT)) +
-                                     256 * 4  // might need extra memory for alignment
-    );
+    raft::get_pool_memory_resource(mr, max_chunk_size * buf_len * 2 * (sizeof(T) + sizeof(IdxT)));
   if (pool_guard) { RAFT_LOG_DEBUG("radix::select_k: using pool memory resource"); }
 
-  rmm::device_uvector<T> buf1(len * max_chunk_size, stream, mr);
-  rmm::device_uvector<IdxT> idx_buf1(len * max_chunk_size, stream, mr);
-  rmm::device_uvector<T> buf2(len * max_chunk_size, stream, mr);
-  rmm::device_uvector<IdxT> idx_buf2(len * max_chunk_size, stream, mr);
+  rmm::device_uvector<char> bufs(
+    max_chunk_size * buf_len * 2 * (sizeof(T) + sizeof(IdxT)), stream, mr);
 
   for (size_t offset = 0; offset < static_cast<size_t>(batch_size); offset += max_chunk_size) {
     int chunk_size = std::min(max_chunk_size, batch_size - offset);
@@ -1047,10 +1136,7 @@ void radix_topk_one_block(const T* in,
                                                  out + offset * k,
                                                  out_idx + offset * k,
                                                  select_min,
-                                                 buf1.data(),
-                                                 idx_buf1.data(),
-                                                 buf2.data(),
-                                                 idx_buf2.data());
+                                                 bufs.data());
   }
 }
 
diff --git a/cpp/include/raft/neighbors/cagra.cuh b/cpp/include/raft/neighbors/cagra.cuh
index 1efb4da95e..384ed05e1f 100644
--- a/cpp/include/raft/neighbors/cagra.cuh
+++ b/cpp/include/raft/neighbors/cagra.cuh
@@ -224,22 +224,7 @@ void optimize(raft::resources const& res,
               mdspan<IdxT, matrix_extent<int64_t>, row_major, g_accessor> knn_graph,
               raft::host_matrix_view<IdxT, int64_t, row_major> new_graph)
 {
-  using internal_IdxT = typename std::make_unsigned<IdxT>::type;
-
-  auto new_graph_internal = raft::make_host_matrix_view<internal_IdxT, int64_t>(
-    reinterpret_cast<internal_IdxT*>(new_graph.data_handle()),
-    new_graph.extent(0),
-    new_graph.extent(1));
-
-  using g_accessor_internal =
-    host_device_accessor<std::experimental::default_accessor<internal_IdxT>, memory_type::host>;
-  auto knn_graph_internal =
-    mdspan<internal_IdxT, matrix_extent<int64_t>, row_major, g_accessor_internal>(
-      reinterpret_cast<internal_IdxT*>(knn_graph.data_handle()),
-      knn_graph.extent(0),
-      knn_graph.extent(1));
-
-  cagra::detail::graph::optimize(res, knn_graph_internal, new_graph_internal);
+  detail::optimize(res, knn_graph, new_graph);
 }
 
 /**
@@ -290,47 +275,7 @@ index<T, IdxT> build(raft::resources const& res,
                      const index_params& params,
                      mdspan<const T, matrix_extent<int64_t>, row_major, Accessor> dataset)
 {
-  size_t intermediate_degree = params.intermediate_graph_degree;
-  size_t graph_degree        = params.graph_degree;
-  if (intermediate_degree >= static_cast<size_t>(dataset.extent(0))) {
-    RAFT_LOG_WARN(
-      "Intermediate graph degree cannot be larger than dataset size, reducing it to %lu",
-      dataset.extent(0));
-    intermediate_degree = dataset.extent(0) - 1;
-  }
-  if (intermediate_degree < graph_degree) {
-    RAFT_LOG_WARN(
-      "Graph degree (%lu) cannot be larger than intermediate graph degree (%lu), reducing "
-      "graph_degree.",
-      graph_degree,
-      intermediate_degree);
-    graph_degree = intermediate_degree;
-  }
-
-  std::optional<raft::host_matrix<IdxT, int64_t>> knn_graph(
-    raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), intermediate_degree));
-
-  if (params.build_algo == graph_build_algo::IVF_PQ) {
-    build_knn_graph(res, dataset, knn_graph->view());
-
-  } else {
-    // Use nn-descent to build CAGRA knn graph
-    auto nn_descent_params                      = experimental::nn_descent::index_params();
-    nn_descent_params.graph_degree              = intermediate_degree;
-    nn_descent_params.intermediate_graph_degree = 1.5 * intermediate_degree;
-    nn_descent_params.max_iterations            = params.nn_descent_niter;
-    build_knn_graph<T, IdxT>(res, dataset, knn_graph->view(), nn_descent_params);
-  }
-
-  auto cagra_graph = raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), graph_degree);
-
-  optimize<IdxT>(res, knn_graph->view(), cagra_graph.view());
-
-  // free intermediate graph before trying to create the index
-  knn_graph.reset();
-
-  // Construct an index from dataset and optimized knn graph.
-  return index<T, IdxT>(res, params.metric, dataset, raft::make_const_mdspan(cagra_graph.view()));
+  return detail::build<T, IdxT, Accessor>(res, params, dataset);
 }
 
 /**
diff --git a/cpp/include/raft/neighbors/cagra_serialize.cuh b/cpp/include/raft/neighbors/cagra_serialize.cuh
index 0a806402d2..c801bc9eda 100644
--- a/cpp/include/raft/neighbors/cagra_serialize.cuh
+++ b/cpp/include/raft/neighbors/cagra_serialize.cuh
@@ -93,6 +93,70 @@ void serialize(raft::resources const& handle,
   detail::serialize(handle, filename, index, include_dataset);
 }
 
+/**
+ * Write the CAGRA built index as a base layer HNSW index to an output stream
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create an output stream
+ * std::ostream os(std::cout.rdbuf());
+ * // create an index with `auto index = cagra::build(...);`
+ * raft::serialize_to_hnswlib(handle, os, index);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices
+ *
+ * @param[in] handle the raft handle
+ * @param[in] os output stream
+ * @param[in] index CAGRA index
+ *
+ */
+template <typename T, typename IdxT>
+void serialize_to_hnswlib(raft::resources const& handle,
+                          std::ostream& os,
+                          const index<T, IdxT>& index)
+{
+  detail::serialize_to_hnswlib<T, IdxT>(handle, os, index);
+}
+
+/**
+ * Write the CAGRA built index as a base layer HNSW index to file
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @code{.cpp}
+ * #include <raft/core/resources.hpp>
+ *
+ * raft::resources handle;
+ *
+ * // create a string with a filepath
+ * std::string filename("/path/to/index");
+ * // create an index with `auto index = cagra::build(...);`
+ * raft::serialize_to_hnswlib(handle, filename, index);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices
+ *
+ * @param[in] handle the raft handle
+ * @param[in] filename the file name for saving the index
+ * @param[in] index CAGRA index
+ *
+ */
+template <typename T, typename IdxT>
+void serialize_to_hnswlib(raft::resources const& handle,
+                          const std::string& filename,
+                          const index<T, IdxT>& index)
+{
+  detail::serialize_to_hnswlib<T, IdxT>(handle, filename, index);
+}
+
 /**
  * Load index from input stream
  *
diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
index 40024a3deb..ddaf77a22f 100644
--- a/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
@@ -264,4 +264,86 @@ void build_knn_graph(raft::resources const& res,
   graph::sort_knn_graph(res, dataset, knn_graph_internal);
 }
 
+template <typename IdxT = uint32_t,
+          typename g_accessor =
+            host_device_accessor<std::experimental::default_accessor<IdxT>, memory_type::host>>
+void optimize(raft::resources const& res,
+              mdspan<IdxT, matrix_extent<int64_t>, row_major, g_accessor> knn_graph,
+              raft::host_matrix_view<IdxT, int64_t, row_major> new_graph)
+{
+  using internal_IdxT = typename std::make_unsigned<IdxT>::type;
+
+  auto new_graph_internal = raft::make_host_matrix_view<internal_IdxT, int64_t>(
+    reinterpret_cast<internal_IdxT*>(new_graph.data_handle()),
+    new_graph.extent(0),
+    new_graph.extent(1));
+
+  using g_accessor_internal =
+    host_device_accessor<std::experimental::default_accessor<internal_IdxT>, memory_type::host>;
+  auto knn_graph_internal =
+    mdspan<internal_IdxT, matrix_extent<int64_t>, row_major, g_accessor_internal>(
+      reinterpret_cast<internal_IdxT*>(knn_graph.data_handle()),
+      knn_graph.extent(0),
+      knn_graph.extent(1));
+
+  cagra::detail::graph::optimize(res, knn_graph_internal, new_graph_internal);
+}
+
+template <typename T,
+          typename IdxT = uint32_t,
+          typename Accessor =
+            host_device_accessor<std::experimental::default_accessor<T>, memory_type::host>>
+index<T, IdxT> build(
+  raft::resources const& res,
+  const index_params& params,
+  mdspan<const T, matrix_extent<int64_t>, row_major, Accessor> dataset,
+  std::optional<experimental::nn_descent::index_params> nn_descent_params = std::nullopt,
+  std::optional<float> refine_rate                                        = std::nullopt,
+  std::optional<ivf_pq::index_params> pq_build_params                     = std::nullopt,
+  std::optional<ivf_pq::search_params> search_params                      = std::nullopt)
+{
+  size_t intermediate_degree = params.intermediate_graph_degree;
+  size_t graph_degree        = params.graph_degree;
+  if (intermediate_degree >= static_cast<size_t>(dataset.extent(0))) {
+    RAFT_LOG_WARN(
+      "Intermediate graph degree cannot be larger than dataset size, reducing it to %lu",
+      dataset.extent(0));
+    intermediate_degree = dataset.extent(0) - 1;
+  }
+  if (intermediate_degree < graph_degree) {
+    RAFT_LOG_WARN(
+      "Graph degree (%lu) cannot be larger than intermediate graph degree (%lu), reducing "
+      "graph_degree.",
+      graph_degree,
+      intermediate_degree);
+    graph_degree = intermediate_degree;
+  }
+
+  std::optional<raft::host_matrix<IdxT, int64_t>> knn_graph(
+    raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), intermediate_degree));
+
+  if (params.build_algo == graph_build_algo::IVF_PQ) {
+    build_knn_graph(res, dataset, knn_graph->view(), refine_rate, pq_build_params, search_params);
+
+  } else {
+    // Use nn-descent to build CAGRA knn graph
+    if (!nn_descent_params) {
+      nn_descent_params                            = experimental::nn_descent::index_params();
+      nn_descent_params->graph_degree              = intermediate_degree;
+      nn_descent_params->intermediate_graph_degree = 1.5 * intermediate_degree;
+      nn_descent_params->max_iterations            = params.nn_descent_niter;
+    }
+    build_knn_graph<T, IdxT>(res, dataset, knn_graph->view(), *nn_descent_params);
+  }
+
+  auto cagra_graph = raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), graph_degree);
+
+  optimize<IdxT>(res, knn_graph->view(), cagra_graph.view());
+
+  // free intermediate graph before trying to create the index
+  knn_graph.reset();
+
+  // Construct an index from dataset and optimized knn graph.
+  return index<T, IdxT>(res, params.metric, dataset, raft::make_const_mdspan(cagra_graph.view()));
+}
 }  // namespace raft::neighbors::cagra::detail
diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
index af3513c212..439d10cd5e 100644
--- a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
@@ -16,12 +16,18 @@
 
 #pragma once
 
+#include <cstddef>
+#include <cstdint>
+#include <raft/core/host_mdarray.hpp>
 #include <raft/core/mdarray.hpp>
+#include <raft/core/mdspan_types.hpp>
 #include <raft/core/nvtx.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/serialize.hpp>
 #include <raft/neighbors/cagra_types.hpp>
 
 #include <fstream>
+#include <type_traits>
 
 namespace raft::neighbors::cagra::detail {
 
@@ -104,6 +110,129 @@ void serialize(raft::resources const& res,
   if (!of) { RAFT_FAIL("Error writing output %s", filename.c_str()); }
 }
 
+template <typename T, typename IdxT>
+void serialize_to_hnswlib(raft::resources const& res,
+                          std::ostream& os,
+                          const index<T, IdxT>& index_)
+{
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope("cagra::serialize_to_hnswlib");
+  RAFT_LOG_DEBUG("Saving CAGRA index to hnswlib format, size %zu, dim %u",
+                 static_cast<size_t>(index_.size()),
+                 index_.dim());
+
+  // offset_level_0
+  std::size_t offset_level_0 = 0;
+  os.write(reinterpret_cast<char*>(&offset_level_0), sizeof(std::size_t));
+  // max_element
+  std::size_t max_element = index_.size();
+  os.write(reinterpret_cast<char*>(&max_element), sizeof(std::size_t));
+  // curr_element_count
+  std::size_t curr_element_count = index_.size();
+  os.write(reinterpret_cast<char*>(&curr_element_count), sizeof(std::size_t));
+  // Example:M: 16, dim = 128, data_t = float, index_t = uint32_t, list_size_type = uint32_t,
+  // labeltype: size_t size_data_per_element_ = M * 2 * sizeof(index_t) + sizeof(list_size_type) +
+  // dim * sizeof(data_t) + sizeof(labeltype)
+  auto size_data_per_element =
+    static_cast<std::size_t>(index_.graph_degree() * 4 + 4 + index_.dim() * 4 + 8);
+  os.write(reinterpret_cast<char*>(&size_data_per_element), sizeof(std::size_t));
+  // label_offset
+  std::size_t label_offset = size_data_per_element - 8;
+  os.write(reinterpret_cast<char*>(&label_offset), sizeof(std::size_t));
+  // offset_data
+  auto offset_data = static_cast<std::size_t>(index_.graph_degree() * 4 + 4);
+  os.write(reinterpret_cast<char*>(&offset_data), sizeof(std::size_t));
+  // max_level
+  int max_level = 1;
+  os.write(reinterpret_cast<char*>(&max_level), sizeof(int));
+  // entrypoint_node
+  auto entrypoint_node = static_cast<int>(index_.size() / 2);
+  os.write(reinterpret_cast<char*>(&entrypoint_node), sizeof(int));
+  // max_M
+  auto max_M = static_cast<std::size_t>(index_.graph_degree() / 2);
+  os.write(reinterpret_cast<char*>(&max_M), sizeof(std::size_t));
+  // max_M0
+  std::size_t max_M0 = index_.graph_degree();
+  os.write(reinterpret_cast<char*>(&max_M0), sizeof(std::size_t));
+  // M
+  auto M = static_cast<std::size_t>(index_.graph_degree() / 2);
+  os.write(reinterpret_cast<char*>(&M), sizeof(std::size_t));
+  // mult, can be anything
+  double mult = 0.42424242;
+  os.write(reinterpret_cast<char*>(&mult), sizeof(double));
+  // efConstruction, can be anything
+  std::size_t efConstruction = 500;
+  os.write(reinterpret_cast<char*>(&efConstruction), sizeof(std::size_t));
+
+  auto dataset = index_.dataset();
+  // Remove padding before saving the dataset
+  auto host_dataset = make_host_matrix<T, int64_t>(dataset.extent(0), dataset.extent(1));
+  RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(),
+                                  sizeof(T) * host_dataset.extent(1),
+                                  dataset.data_handle(),
+                                  sizeof(T) * dataset.stride(0),
+                                  sizeof(T) * host_dataset.extent(1),
+                                  dataset.extent(0),
+                                  cudaMemcpyDefault,
+                                  resource::get_cuda_stream(res)));
+  resource::sync_stream(res);
+
+  auto graph = index_.graph();
+  auto host_graph =
+    raft::make_host_matrix<IdxT, int64_t, raft::row_major>(graph.extent(0), graph.extent(1));
+  raft::copy(host_graph.data_handle(),
+             graph.data_handle(),
+             graph.size(),
+             raft::resource::get_cuda_stream(res));
+  resource::sync_stream(res);
+
+  // Write one dataset and graph row at a time
+  for (std::size_t i = 0; i < index_.size(); i++) {
+    auto graph_degree = static_cast<int>(index_.graph_degree());
+    os.write(reinterpret_cast<char*>(&graph_degree), sizeof(int));
+
+    for (std::size_t j = 0; j < index_.graph_degree(); ++j) {
+      auto graph_elem = host_graph(i, j);
+      os.write(reinterpret_cast<char*>(&graph_elem), sizeof(IdxT));
+    }
+
+    auto data_row = host_dataset.data_handle() + (index_.dim() * i);
+    if constexpr (std::is_same_v<T, float>) {
+      for (std::size_t j = 0; j < index_.dim(); ++j) {
+        auto data_elem = host_dataset(i, j);
+        os.write(reinterpret_cast<char*>(&data_elem), sizeof(T));
+      }
+    } else if constexpr (std::is_same_v<T, std::int8_t> or std::is_same_v<T, std::uint8_t>) {
+      for (std::size_t j = 0; j < index_.dim(); ++j) {
+        auto data_elem = static_cast<int>(host_dataset(i, j));
+        os.write(reinterpret_cast<char*>(&data_elem), sizeof(int));
+      }
+    }
+
+    os.write(reinterpret_cast<char*>(&i), sizeof(std::size_t));
+  }
+
+  for (std::size_t i = 0; i < index_.size(); i++) {
+    // zeroes
+    auto zero = 0;
+    os.write(reinterpret_cast<char*>(&zero), sizeof(int));
+  }
+  // delete [] host_graph;
+}
+
+template <typename T, typename IdxT>
+void serialize_to_hnswlib(raft::resources const& res,
+                          const std::string& filename,
+                          const index<T, IdxT>& index_)
+{
+  std::ofstream of(filename, std::ios::out | std::ios::binary);
+  if (!of) { RAFT_FAIL("Cannot open file %s", filename.c_str()); }
+
+  detail::serialize_to_hnswlib<T, IdxT>(res, of, index_);
+
+  of.close();
+  if (!of) { RAFT_FAIL("Error writing output %s", filename.c_str()); }
+}
+
 /** Load an index from file.
  *
  * Experimental, both the API and the serialization format are subject to change.
diff --git a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
index 8fcba38883..192850cf2d 100644
--- a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
@@ -528,8 +528,9 @@ void optimize(raft::resources const& res,
     constexpr int _omp_chunk = 1024;
 #pragma omp parallel for schedule(dynamic, _omp_chunk)
     for (uint64_t j = 0; j < graph_size; j++) {
-      for (uint64_t _k = 0; _k < rev_graph_count.data_handle()[j]; _k++) {
-        uint64_t k = rev_graph_count.data_handle()[j] - 1 - _k;
+      uint64_t k = std::min(rev_graph_count.data_handle()[j], output_graph_degree);
+      while (k) {
+        k--;
         uint64_t i = rev_graph.data_handle()[k + (output_graph_degree * j)];
 
         uint64_t pos =
diff --git a/dependencies.yaml b/dependencies.yaml
index aba81d7ed9..419cd0c22e 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -436,16 +436,14 @@ dependencies:
     common:
       - output_types: [conda, pyproject]
         packages:
-          - dask>=2023.9.2
           - dask-cuda==23.12.*
-          - distributed>=2023.9.2
           - joblib>=0.11
           - numba>=0.57
           - *numpy
+          - rapids-dask-dependency==23.12.*
           - ucx-py==0.35.*
       - output_types: conda
         packages:
-          - dask-core>=2023.9.2
           - ucx>=1.13.0
           - ucx-proc=*=gpu
           - &ucx_py_conda ucx-py==0.35.*
diff --git a/docs/source/ann_benchmarks_dataset.md b/docs/source/ann_benchmarks_dataset.md
index 99a6bfbd3a..821345b07c 100644
--- a/docs/source/ann_benchmarks_dataset.md
+++ b/docs/source/ann_benchmarks_dataset.md
@@ -44,4 +44,20 @@ Commonly used datasets can be downloaded from two websites:
     # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced
     popd
     ```
-    Besides ground truth files for the whole billion-scale datasets, this site also provides ground truth files for the first 10M or 100M vectors of the base sets. This mean we can use these billion-scale datasets as million-scale datasets. To facilitate this, an optional parameter `subset_size` for dataset can be used. See the next step for further explanation.
\ No newline at end of file
+    Besides ground truth files for the whole billion-scale datasets, this site also provides ground truth files for the first 10M or 100M vectors of the base sets. This mean we can use these billion-scale datasets as million-scale datasets. To facilitate this, an optional parameter `subset_size` for dataset can be used. See the next step for further explanation.
+
+## Generate ground truth
+
+If you have a dataset, but no corresponding ground truth file, then you can generate ground trunth using the `generate_groundtruth` utility. Example usage:
+
+```bash
+# With existing query file
+python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin
+
+# With randomly generated queries
+python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.fbin --output=groundtruth_dir --queries=random --n_queries=10000
+
+# Using only a subset of the dataset. Define queries by randomly
+# selecting vectors from the (subset of the) dataset.
+python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.fbin --nrows=2000000 --output=groundtruth_dir --queries=random-choice --n_queries=10000
+```
\ No newline at end of file
diff --git a/docs/source/ann_benchmarks_param_tuning.md b/docs/source/ann_benchmarks_param_tuning.md
index d787a96955..4c95b9e520 100644
--- a/docs/source/ann_benchmarks_param_tuning.md
+++ b/docs/source/ann_benchmarks_param_tuning.md
@@ -46,14 +46,13 @@ IVF-pq is an inverted-file index, which partitions the vectors into a series of
 
 
 ### `raft_cagra`
-CAGRA uses a graph-based index, which creates an intermediate, approximate kNN graph using IVF-PQ and then further refining and optimizing to create a final kNN graph. This kNN graph is used by CAGRA as an index for search.
+<a id='raft-cagra'></a>CAGRA uses a graph-based index, which creates an intermediate, approximate kNN graph using IVF-PQ and then further refining and optimizing to create a final kNN graph. This kNN graph is used by CAGRA as an index for search.
 
 | Parameter                   | Type           | Required | Data Type                  | Default | Description                                                                                                                                                                       |
 |-----------------------------|----------------|----------|----------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | `graph_degree`              | `build_param`  | N        | Positive Integer >0        | 64 | Degree of the final kNN graph index. |
 | `intermediate_graph_degree` | `build_param`  | N        | Positive Integer >0        | 128 | Degree of the intermediate kNN graph. |
 | `graph_build_algo`          | `build_param`  | N | ["IVF_PQ", "NN_DESCENT"]   | "IVF_PQ" | Algorithm to use for search |
-| `nn_descent_niter`          | `build_param`  | N        | Positive Integer>0         | 20 | Number of iterations if using NN_DESCENT. |
 | `dataset_memory_type`       | `build_param`  | N | ["device", "host", "mmap"] | "device" | What memory type should the dataset reside?                                                                                                                                       |
 | `query_memory_type`         | `search_params` | N | ["device", "host", "mmap"] | "device | What memory type should the queries reside? |
 | `itopk`                     | `search_wdith`  | N        | Positive Integer >0        | 64 | Number of intermediate search results retained during the search. Higher values improve search accuracy at the cost of speed. |
@@ -61,6 +60,35 @@ CAGRA uses a graph-based index, which creates an intermediate, approximate kNN g
 | `max_iterations`            | `search_param`  | N        | Integer >=0                | 0 | Upper limit of search iterations. Auto select when 0. |
 | `algo`                      | `search_param`  | N        | string                     | "auto" | Algorithm to use for search. Possible values: {"auto", "single_cta", "multi_cta", "multi_kernel"} |
 
+To fine tune CAGRA index building we can customize IVF-PQ index builder options using the following settings. These take effect only if `graph_build_algo == "IVF_PQ"`. It is recommended to experiment using a separate IVF-PQ index to find the config that gives the largest QPS for large batch. Recall does not need to be very high, since CAGRA further optimizes the kNN neighbor graph. Some of the default values are derived from the dataset size which is assumed to be [n_vecs, dim].
+
+| Parameter              | Type           | Required | Data Type                        | Default | Description                                                                                                                                                                     |
+|------------------------|----------------|---|----------------------------------|---------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `ivf_pq_build_nlist`                | `build_param`  | N | Positive Integer >0              | n_vecs / 2500        | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. |
+| `ivf_pq_build_niter`                | `build_param`  | N | Positive Integer >0              | 25      | Number of k-means iterations to use when training the clusters.                                                                                                                 |
+| `ivf_pq_build_ratio`                | `build_param`  | N | Positive Integer >0              | 10      | `1/ratio` is the number of training points which should be used to train the clusters.                                                                                            |
+| `ivf_pq_build_pq_dim`               | `build_param`  | N | Positive Integer. Multiple of 8. | dim/2 rounded up to 8     | Dimensionality of the vector after product quantization. When 0, a heuristic is used to select this value. `pq_dim` * `pq_bits` must be a multiple of 8.                        |
+| `ivf_pq_build_pq_bits`              | `build_param`  | N | Positive Integer. [4-8]          | 8       | Bit length of the vector element after quantization.                                                                                                                            |
+| `ivf_pq_build_codebook_kind`        | `build_param`  | N | ["cluster", "subspace"]          | "subspace" | Type of codebook. See the [API docs](https://docs.rapids.ai/api/raft/nightly/cpp_api/neighbors_ivf_pq/#_CPPv412codebook_gen) for more detail                                 |
+| `ivf_pq_search_nprobe`               | `build_params` | N | Positive Integer >0              | min(2*dim, nlist)        | The closest number of clusters to search for each query vector.                                    |
+| `ivf_pq_search_internalDistanceDtype` | `build_params` | N | [`float`, `half`]                | `fp8`  | The precision to use for the distance computations. Lower precision can increase performance at the cost of accuracy.                                                           |
+| `ivf_pq_search_smemLutDtype`         | `build_params` | N | [`float`, `half`, `fp8`]         | `half`  | The precision to use for the lookup table in shared memory. Lower precision can increase performance at the cost of accuracy.                                                   |
+| `ivf_pq_search_refine_ratio`         | `build_params` | N| Positive Number >=0              | 2       | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors.           |
+
+Alternatively, if `graph_build_algo == "NN_DESCENT"`, then we can customize the following parameters
+| Parameter                   | Type           | Required | Data Type                  | Default | Description                                                                                                                                                                       |
+|-----------------------------|----------------|----------|----------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `nn_descent_niter`          | `build_param`  | N        | Positive Integer>0         | 20 | Number of NN Descent iterations. |
+| `nn_descent_intermediate_graph_degree`          | `build_param`  | N        | Positive Integer>0         | `intermediate_graph_degree` * 1.5 | Intermadiate graph degree during NN descent iterations |
+| `nn_descent_max_iterations`          | `build_param`  | N        | Positive Integer>0         | 20 | Alias for `nn_descent_niter` |
+| `nn_descent_termination_threshold`          | `build_param`  | N        | Positive float>0         | 0.0001 | Termination threshold for NN descent. |
+
+### `raft_cagra_hnswlib`
+This is a benchmark that enables interoperability between `CAGRA` built `HNSW` search. It uses the `CAGRA` built graph as the base layer of an `hnswlib` index to search queries only within the base layer (this is enabled with a simple patch to `hnswlib`).
+
+`build_param` : Same as `build_param` of [CAGRA](#raft-cagra)
+
+`search_param` : Same as `search_param` of [hnswlib](#hnswlib)
 
 ## FAISS Indexes
 
@@ -131,7 +159,7 @@ Use FAISS IVF-PQ index on CPU
 
 
 ## HNSW
-
+<a id='hnswlib'></a>
 ### `hnswlib`
 
 | Parameter        | Type            | Required | Data Type                            | Default | Description                                                                                                                                                                                                                                                                                       |
diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md
index cf4da30896..24fc3801d9 100644
--- a/docs/source/raft_ann_benchmarks.md
+++ b/docs/source/raft_ann_benchmarks.md
@@ -152,6 +152,11 @@ options:
   -f, --force           re-run algorithms even if their results already exist (default: False)
   -m SEARCH_MODE, --search-mode SEARCH_MODE
                         run search in 'latency' (measure individual batches) or 'throughput' (pipeline batches and measure end-to-end) mode (default: throughput)
+  -t SEARCH_THREADS, --search-threads SEARCH_THREADS
+                        specify the number threads to use for throughput benchmark. Single value or a pair of min and max separated by ':'. Example --search-threads=1:4. Power of 2 values between 'min' and 'max' will be used. If only 'min' is
+                        specified, then a single test is run with 'min' threads. By default min=1, max=<num hyper threads>. (default: None)
+  -r, --dry-run         dry-run mode will convert the yaml config for the specified algorithms and datasets to the json format that's consumed by the lower-level c++ binaries and then print the command to run execute the benchmarks but
+                        will not actually execute the command. (default: False)
 ```
 
 `dataset`: name of the dataset to be searched in [datasets.yaml](#yaml-dataset-config)
diff --git a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py
index dd338c0c45..fd6c2077e7 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py
+++ b/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py
@@ -17,9 +17,35 @@
 import argparse
 import json
 import os
+import sys
+import traceback
+import warnings
 
 import pandas as pd
 
+skip_build_cols = set(
+    [
+        "algo_name",
+        "index_name",
+        "time",
+        "name",
+        "family_index",
+        "per_family_instance_index",
+        "run_name",
+        "run_type",
+        "repetitions",
+        "repetition_index",
+        "iterations",
+        "real_time",
+        "time_unit",
+        "index_size",
+    ]
+)
+
+skip_search_cols = (
+    set(["recall", "qps", "items_per_second", "Recall"]) | skip_build_cols
+)
+
 
 def read_file(dataset, dataset_path, method):
     dir = os.path.join(dataset_path, dataset, "result", method)
@@ -33,36 +59,89 @@ def read_file(dataset, dataset_path, method):
 
 def convert_json_to_csv_build(dataset, dataset_path):
     for file, algo_name, df in read_file(dataset, dataset_path, "build"):
-        algo_name = algo_name.replace("_base", "")
-        df["name"] = df["name"].str.split("/").str[0]
-        write = pd.DataFrame(
-            {
-                "algo_name": [algo_name] * len(df),
-                "index_name": df["name"],
-                "time": df["real_time"],
-            }
-        )
-        filepath = os.path.normpath(file).split(os.sep)
-        filename = filepath[-1].split("-")[0] + ".csv"
-        write.to_csv(
-            os.path.join(f"{os.sep}".join(filepath[:-1]), filename),
-            index=False,
-        )
+        try:
+            algo_name = algo_name.replace("_base", "")
+            df["name"] = df["name"].str.split("/").str[0]
+            write = pd.DataFrame(
+                {
+                    "algo_name": [algo_name] * len(df),
+                    "index_name": df["name"],
+                    "time": df["real_time"],
+                }
+            )
+            for name in df:
+                if name not in skip_build_cols:
+                    write[name] = df[name]
+            filepath = os.path.normpath(file).split(os.sep)
+            filename = filepath[-1].split("-")[0] + ".csv"
+            write.to_csv(
+                os.path.join(f"{os.sep}".join(filepath[:-1]), filename),
+                index=False,
+            )
+        except Exception as e:
+            print(
+                "An error occurred processing file %s (%s). Skipping..."
+                % (file, e)
+            )
+            traceback.print_exc()
 
 
 def convert_json_to_csv_search(dataset, dataset_path):
     for file, algo_name, df in read_file(dataset, dataset_path, "search"):
-        algo_name = algo_name.replace("_base", "")
-        df["name"] = df["name"].str.split("/").str[0]
-        write = pd.DataFrame(
-            {
-                "algo_name": [algo_name] * len(df),
-                "index_name": df["name"],
-                "recall": df["Recall"],
-                "qps": df["items_per_second"],
-            }
-        )
-        write.to_csv(file.replace(".json", ".csv"), index=False)
+        try:
+            build_file = os.path.join(
+                dataset_path, dataset, "result", "build", f"{algo_name}.csv"
+            )
+            algo_name = algo_name.replace("_base", "")
+            df["name"] = df["name"].str.split("/").str[0]
+            write = pd.DataFrame(
+                {
+                    "algo_name": [algo_name] * len(df),
+                    "index_name": df["name"],
+                    "recall": df["Recall"],
+                    "qps": df["items_per_second"],
+                }
+            )
+            for name in df:
+                if name not in skip_search_cols:
+                    write[name] = df[name]
+
+            if os.path.exists(build_file):
+                build_df = pd.read_csv(build_file)
+                write_ncols = len(write.columns)
+                write["build time"] = None
+                write["build threads"] = None
+                write["build cpu_time"] = None
+                write["build GPU"] = None
+
+                for col_idx in range(5, len(build_df.columns)):
+                    col_name = build_df.columns[col_idx]
+                    write[col_name] = None
+
+                for s_index, search_row in write.iterrows():
+                    for b_index, build_row in build_df.iterrows():
+                        if search_row["index_name"] == build_row["index_name"]:
+                            write.iloc[s_index, write_ncols] = build_df.iloc[
+                                b_index, 2
+                            ]
+                            write.iloc[
+                                s_index, write_ncols + 1 :
+                            ] = build_df.iloc[b_index, 3:]
+                            break
+            else:
+                warnings.warn(
+                    f"Build CSV not found for {algo_name}, "
+                    f"build params won't be "
+                    "appended in the Search CSV"
+                )
+
+            write.to_csv(file.replace(".json", ".csv"), index=False)
+        except Exception as e:
+            print(
+                "An error occurred processing file %s (%s). Skipping..."
+                % (file, e)
+            )
+            traceback.print_exc()
 
 
 def main():
@@ -85,6 +164,9 @@ def main():
         default=default_dataset_path,
     )
 
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
     args = parser.parse_args()
 
     convert_json_to_csv_build(args.dataset, args.dataset_path)
diff --git a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py
new file mode 100644
index 0000000000..f4d97edea5
--- /dev/null
+++ b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py
@@ -0,0 +1,245 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import argparse
+import os
+import sys
+
+import cupy as cp
+import numpy as np
+import rmm
+from pylibraft.common import DeviceResources
+from pylibraft.neighbors.brute_force import knn
+from rmm.allocators.cupy import rmm_cupy_allocator
+
+from .utils import memmap_bin_file, suffix_from_dtype, write_bin
+
+
+def generate_random_queries(n_queries, n_features, dtype=np.float32):
+    print("Generating random queries")
+    if np.issubdtype(dtype, np.integer):
+        queries = cp.random.randint(
+            0, 255, size=(n_queries, n_features), dtype=dtype
+        )
+    else:
+        queries = cp.random.uniform(size=(n_queries, n_features)).astype(dtype)
+    return queries
+
+
+def choose_random_queries(dataset, n_queries):
+    print("Choosing random vector from dataset as query vectors")
+    query_idx = np.random.choice(
+        dataset.shape[0], size=(n_queries,), replace=False
+    )
+    return dataset[query_idx, :]
+
+
+def calc_truth(dataset, queries, k, metric="sqeuclidean"):
+    handle = DeviceResources()
+    n_samples = dataset.shape[0]
+    n = 500000  # batch size for processing neighbors
+    i = 0
+    indices = None
+    distances = None
+    queries = cp.asarray(queries, dtype=cp.float32)
+
+    while i < n_samples:
+        print("Step {0}/{1}:".format(i // n, n_samples // n))
+        n_batch = n if i + n <= n_samples else n_samples - i
+
+        X = cp.asarray(dataset[i : i + n_batch, :], cp.float32)
+
+        D, Ind = knn(
+            X,
+            queries,
+            k,
+            metric=metric,
+            handle=handle,
+            global_id_offset=i,  # shift neighbor index by offset i
+        )
+        handle.sync()
+
+        D, Ind = cp.asarray(D), cp.asarray(Ind)
+        if distances is None:
+            distances = D
+            indices = Ind
+        else:
+            distances = cp.concatenate([distances, D], axis=1)
+            indices = cp.concatenate([indices, Ind], axis=1)
+            idx = cp.argsort(distances, axis=1)[:, :k]
+            distances = cp.take_along_axis(distances, idx, axis=1)
+            indices = cp.take_along_axis(indices, idx, axis=1)
+
+        i += n_batch
+
+    return distances, indices
+
+
+def main():
+    pool = rmm.mr.PoolMemoryResource(
+        rmm.mr.CudaMemoryResource(), initial_pool_size=2**30
+    )
+    rmm.mr.set_current_device_resource(pool)
+    cp.cuda.set_allocator(rmm_cupy_allocator)
+
+    parser = argparse.ArgumentParser(
+        prog="generate_groundtruth",
+        description="Generate true neighbors using exact NN search. "
+        "The input and output files are in big-ann-benchmark's binary format.",
+        epilog="""Example usage
+    # With existing query file
+    python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.\
+fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin
+
+    # With randomly generated queries
+    python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.\
+fbin --output=groundtruth_dir --queries=random --n_queries=10000
+
+    # Using only a subset of the dataset. Define queries by randomly
+    # selecting vectors from the (subset of the) dataset.
+    python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.\
+fbin --nrows=2000000 --cols=128 --output=groundtruth_dir \
+--queries=random-choice --n_queries=10000
+    """,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+
+    parser.add_argument("dataset", type=str, help="input dataset file name")
+    parser.add_argument(
+        "--queries",
+        type=str,
+        default="random",
+        help="Queries file name, or one of 'random-choice' or 'random' "
+        "(default). 'random-choice': select n_queries vectors from the input "
+        "dataset. 'random': generate n_queries as uniform random numbers.",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="",
+        help="output directory name (default current dir)",
+    )
+
+    parser.add_argument(
+        "--n_queries",
+        type=int,
+        default=10000,
+        help="Number of quries to generate (if no query file is given). "
+        "Default: 10000.",
+    )
+
+    parser.add_argument(
+        "-N",
+        "--rows",
+        default=None,
+        type=int,
+        help="use only first N rows from dataset, by default the whole "
+        "dataset is used",
+    )
+    parser.add_argument(
+        "-D",
+        "--cols",
+        default=None,
+        type=int,
+        help="number of features (dataset columns). "
+        "Default: read from dataset file.",
+    )
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        help="Dataset dtype. When not specified, then derived from extension."
+        " Supported types: 'float32', 'float16', 'uint8', 'int8'",
+    )
+
+    parser.add_argument(
+        "-k",
+        type=int,
+        default=100,
+        help="Number of neighbors (per query) to calculate",
+    )
+    parser.add_argument(
+        "--metric",
+        type=str,
+        default="sqeuclidean",
+        help="Metric to use while calculating distances. Valid metrics are "
+        "those that are accepted by pylibraft.neighbors.brute_force.knn. Most"
+        " commonly used with RAFT ANN are 'sqeuclidean' and 'inner_product'",
+    )
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+    args = parser.parse_args()
+
+    if args.rows is not None:
+        print("Reading subset of the data, nrows=", args.rows)
+    else:
+        print("Reading whole dataset")
+
+    # Load input data
+    dataset = memmap_bin_file(
+        args.dataset, args.dtype, shape=(args.rows, args.cols)
+    )
+    n_features = dataset.shape[1]
+    dtype = dataset.dtype
+
+    print(
+        "Dataset size {:6.1f} GB, shape {}, dtype {}".format(
+            dataset.size * dataset.dtype.itemsize / 1e9,
+            dataset.shape,
+            np.dtype(dtype),
+        )
+    )
+
+    if len(args.output) > 0:
+        os.makedirs(args.output, exist_ok=True)
+
+    if args.queries == "random" or args.queries == "random-choice":
+        if args.n_queries is None:
+            raise RuntimeError(
+                "n_queries must be given to generate random queries"
+            )
+        if args.queries == "random":
+            queries = generate_random_queries(
+                args.n_queries, n_features, dtype
+            )
+        elif args.queries == "random-choice":
+            queries = choose_random_queries(dataset, args.n_queries)
+
+        queries_filename = os.path.join(
+            args.output, "queries" + suffix_from_dtype(dtype)
+        )
+        print("Writing queries file", queries_filename)
+        write_bin(queries_filename, queries)
+    else:
+        print("Reading queries from file", args.queries)
+        queries = memmap_bin_file(args.queries, dtype)
+
+    print("Calculating true nearest neighbors")
+    distances, indices = calc_truth(dataset, queries, args.k, args.metric)
+
+    write_bin(
+        os.path.join(args.output, "groundtruth.neighbors.ibin"),
+        indices.astype(np.uint32),
+    )
+    write_bin(
+        os.path.join(args.output, "groundtruth.distances.fbin"),
+        distances.astype(np.float32),
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/utils.py b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/utils.py
new file mode 100644
index 0000000000..3f2dd11a16
--- /dev/null
+++ b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/utils.py
@@ -0,0 +1,103 @@
+#
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+
+import numpy as np
+
+
+def dtype_from_filename(filename):
+    ext = os.path.splitext(filename)[1]
+    if ext == ".fbin":
+        return np.float32
+    if ext == ".hbin":
+        return np.float16
+    elif ext == ".ibin":
+        return np.int32
+    elif ext == ".u8bin":
+        return np.ubyte
+    elif ext == ".i8bin":
+        return np.byte
+    else:
+        raise RuntimeError("Not supported file extension" + ext)
+
+
+def suffix_from_dtype(dtype):
+    if dtype == np.float32:
+        return ".fbin"
+    if dtype == np.float16:
+        return ".hbin"
+    elif dtype == np.int32:
+        return ".ibin"
+    elif dtype == np.ubyte:
+        return ".u8bin"
+    elif dtype == np.byte:
+        return ".i8bin"
+    else:
+        raise RuntimeError("Not supported dtype extension" + dtype)
+
+
+def memmap_bin_file(
+    bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32
+):
+    extent_itemsize = np.dtype(size_dtype).itemsize
+    offset = int(extent_itemsize) * 2
+    if bin_file is None:
+        return None
+    if dtype is None:
+        dtype = dtype_from_filename(bin_file)
+
+    if mode[0] == "r":
+        a = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,))
+        if shape is None:
+            shape = (a[0], a[1])
+        else:
+            shape = tuple(
+                [
+                    aval if sval is None else sval
+                    for aval, sval in zip(a, shape)
+                ]
+            )
+
+        return np.memmap(
+            bin_file, mode=mode, dtype=dtype, offset=offset, shape=shape
+        )
+    elif mode[0] == "w":
+        if shape is None:
+            raise ValueError("Need to specify shape to map file in write mode")
+
+        print("creating file", bin_file)
+        dirname = os.path.dirname(bin_file)
+        if len(dirname) > 0:
+            os.makedirs(dirname, exist_ok=True)
+        a = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,))
+        a[0] = shape[0]
+        a[1] = shape[1]
+        a.flush()
+        del a
+        fp = np.memmap(
+            bin_file, mode="r+", dtype=dtype, offset=offset, shape=shape
+        )
+        return fp
+
+    # print('# {}: shape: {}, dtype: {}'.format(bin_file, shape, dtype))
+
+
+def write_bin(fname, data):
+    print("writing", fname, data.shape, data.dtype, "...")
+    with open(fname, "wb") as f:
+        np.asarray(data.shape, dtype=np.uint32).tofile(f)
+        data.tofile(f)
diff --git a/python/raft-ann-bench/src/raft-ann-bench/get_dataset/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/get_dataset/__main__.py
index 4e6a0119b4..0a6c37aabc 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/get_dataset/__main__.py
+++ b/python/raft-ann-bench/src/raft-ann-bench/get_dataset/__main__.py
@@ -16,6 +16,7 @@
 import argparse
 import os
 import subprocess
+import sys
 from urllib.request import urlretrieve
 
 
@@ -101,6 +102,10 @@ def main():
         help="normalize cosine distance to inner product",
         action="store_true",
     )
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
     args = parser.parse_args()
 
     download(args.dataset, args.normalize, args.dataset_path)
diff --git a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py
index 78f8aea8b8..c45ff5b14e 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py
+++ b/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py
@@ -22,6 +22,7 @@
 import argparse
 import itertools
 import os
+import sys
 from collections import OrderedDict
 
 import matplotlib as mpl
@@ -486,6 +487,9 @@ def main():
         action="store_true",
     )
 
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
     args = parser.parse_args()
 
     if args.algorithms:
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
index ac5d83e4c2..a33467b554 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
@@ -18,6 +18,9 @@
 import json
 import os
 import subprocess
+import sys
+import uuid
+import warnings
 from importlib import import_module
 
 import yaml
@@ -80,6 +83,7 @@ def run_build_and_search(
     force,
     build,
     search,
+    dry_run,
     k,
     batch_size,
     search_threads,
@@ -87,16 +91,16 @@ def run_build_and_search(
 ):
     for executable, ann_executable_path, algo in executables_to_run.keys():
         # Need to write temporary configuration
-        temp_conf_filename = f"temporary_{conf_filename}"
-        temp_conf_filepath = os.path.join(conf_filedir, temp_conf_filename)
-        with open(temp_conf_filepath, "w") as f:
+        temp_conf_filename = f"{conf_filename}_{algo}_{uuid.uuid1()}.json"
+        with open(temp_conf_filename, "w") as f:
             temp_conf = dict()
             temp_conf["dataset"] = conf_file["dataset"]
             temp_conf["search_basic_param"] = conf_file["search_basic_param"]
             temp_conf["index"] = executables_to_run[
                 (executable, ann_executable_path, algo)
             ]["index"]
-            json.dump(temp_conf, f)
+            json_str = json.dumps(temp_conf, indent=2)
+            f.write(json_str)
 
         legacy_result_folder = os.path.join(
             dataset_path, conf_file["dataset"]["name"], "result"
@@ -116,8 +120,20 @@ def run_build_and_search(
             ]
             if force:
                 cmd = cmd + ["--overwrite"]
-            cmd = cmd + [temp_conf_filepath]
-            subprocess.run(cmd, check=True)
+            cmd = cmd + [temp_conf_filename]
+
+            if dry_run:
+                print(
+                    "Benchmark command for %s:\n%s\n" % (algo, " ".join(cmd))
+                )
+            else:
+                try:
+                    subprocess.run(cmd, check=True)
+                except Exception as e:
+                    print("Error occurred running benchmark: %s" % e)
+                finally:
+                    if not search:
+                        os.remove(temp_conf_filename)
 
         if search:
             search_folder = os.path.join(legacy_result_folder, "search")
@@ -141,10 +157,18 @@ def run_build_and_search(
             if search_threads:
                 cmd = cmd + ["--threads=%s" % search_threads]
 
-            cmd = cmd + [temp_conf_filepath]
-            subprocess.run(cmd, check=True)
-
-        os.remove(temp_conf_filepath)
+            cmd = cmd + [temp_conf_filename]
+            if dry_run:
+                print(
+                    "Benchmark command for %s:\n%s\n" % (algo, " ".join(cmd))
+                )
+            else:
+                try:
+                    subprocess.run(cmd, check=True)
+                except Exception as e:
+                    print("Error occurred running benchmark: %s" % e)
+                finally:
+                    os.remove(temp_conf_filename)
 
 
 def main():
@@ -253,13 +277,27 @@ def main():
         "--search-threads",
         help="specify the number threads to use for throughput benchmark."
         " Single value or a pair of min and max separated by ':'. "
-        "Example --threads=1:4. Power of 2 values between 'min' "
+        "Example: --search-threads=1:4. Power of 2 values between 'min' "
         "and 'max' will be used. If only 'min' is specified, then a "
         "single test is run with 'min' threads. By default min=1, "
         "max=<num hyper threads>.",
         default=None,
     )
 
+    parser.add_argument(
+        "-r",
+        "--dry-run",
+        help="dry-run mode will convert the yaml config for the specified "
+        "algorithms and datasets to the json format that's consumed "
+        "by the lower-level c++ binaries and then print the command "
+        "to run execute the benchmarks but will not actually execute "
+        "the command.",
+        action="store_true",
+    )
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
     args = parser.parse_args()
 
     # If both build and search are not provided,
@@ -271,6 +309,8 @@ def main():
         build = args.build
         search = args.search
 
+    dry_run = args.dry_run
+
     mode = args.search_mode
     k = args.count
     batch_size = args.batch_size
@@ -334,7 +374,14 @@ def main():
     algos_conf = dict()
     for algo_f in algos_conf_fs:
         with open(algo_f, "r") as f:
-            algo = yaml.safe_load(f)
+            try:
+                algo = yaml.safe_load(f)
+            except Exception as e:
+                warnings.warn(
+                    f"Could not load YAML config {algo_f} due to "
+                    + e.with_traceback()
+                )
+                continue
             insert_algo = True
             insert_algo_group = False
             if filter_algos:
@@ -452,13 +499,14 @@ def add_algo_group(group_list):
 
     run_build_and_search(
         conf_file,
-        f"{args.dataset}.json",
+        f"{args.dataset}",
         conf_filedir,
         executables_to_run,
         args.dataset_path,
         args.force,
         build,
         search,
+        dry_run,
         k,
         batch_size,
         args.search_threads,
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/algos.yaml b/python/raft-ann-bench/src/raft-ann-bench/run/algos.yaml
index 7ea360e0c9..e382bdcba6 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/algos.yaml
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/algos.yaml
@@ -37,3 +37,6 @@ ggnn:
 hnswlib:
   executable: HNSWLIB_ANN_BENCH
   requires_gpu: false
+raft_cagra_hnswlib:
+  executable: RAFT_CAGRA_HNSWLIB_ANN_BENCH
+  requires_gpu: true
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_cagra.yaml b/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_cagra.yaml
index 0f80608eef..d8015da5c6 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_cagra.yaml
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_cagra.yaml
@@ -10,8 +10,3 @@ groups:
     search:
       itopk: [32, 64, 128, 256, 512]
       search_width: [1, 2, 4, 8, 16, 32, 64]
-
-
-
-
-
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_cagra_hnswlib.yaml b/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_cagra_hnswlib.yaml
new file mode 100644
index 0000000000..787675d65d
--- /dev/null
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_cagra_hnswlib.yaml
@@ -0,0 +1,11 @@
+name: raft_cagra_hnswlib
+constraints:
+  search: raft-ann-bench.constraints.hnswlib_search_constraints
+groups:
+  base:
+    build:
+      graph_degree: [32, 64, 128, 256]
+      intermediate_graph_degree: [32, 64, 96, 128]
+      graph_build_algo: ["NN_DESCENT"]
+    search:
+      ef: [10, 20, 40, 60, 80, 120, 200, 400, 600, 800]
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/wiki_all_1M.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/wiki_all_1M.json
index 6eb72a65a1..2d1ec1e322 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/conf/wiki_all_1M.json
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/wiki_all_1M.json
@@ -1,9 +1,10 @@
 {
   "dataset": {
     "name": "wiki_all_1M",
-    "base_file": "wiki_all_1M/base.88M.fbin",
+    "base_file": "wiki_all_1M/base.1M.fbin",
+    "subset_size": 1000000,
     "query_file": "wiki_all_1M/queries.fbin",
-    "groundtruth_neighbors_file": "wiki_all_1M/groundtruth.88M.neighbors.ibin",
+    "groundtruth_neighbors_file": "wiki_all_1M/groundtruth.1M.neighbors.ibin",
     "distance": "euclidean"
   },
   "search_basic_param": {
@@ -169,7 +170,22 @@
     {
       "name": "raft_cagra.dim32.multi_cta",
       "algo": "raft_cagra",
-      "build_param": { "graph_degree": 32, "intermediate_graph_degree": 48 },
+      "build_param": { "graph_degree": 32,
+        "intermediate_graph_degree": 48,
+        "graph_build_algo": "NN_DESCENT",
+        "ivf_pq_build_pq_dim": 32,
+        "ivf_pq_build_pq_bits": 8,
+        "ivf_pq_build_nlist": 16384,
+        "ivf_pq_build_niter": 10,
+        "ivf_pq_build_ratio": 10,
+        "ivf_pq_search_nprobe": 30,
+        "ivf_pq_search_internalDistanceDtype": "half",
+        "ivf_pq_search_smemLutDtype": "half",
+        "ivf_pq_search_refine_ratio": 8,
+        "nn_descent_max_iterations": 10,
+        "nn_descent_intermediate_graph_degree": 72,
+        "nn_descent_termination_threshold": 0.001
+      },
       "file": "wiki_all_1M/raft_cagra/dim32.ibin",
       "search_params": [
         { "itopk": 32, "search_width": 1, "max_iterations": 0, "algo": "multi_cta" },
diff --git a/python/raft-ann-bench/src/raft-ann-bench/split_groundtruth/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/split_groundtruth/__main__.py
index b886d40ea7..c65360ebb0 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/split_groundtruth/__main__.py
+++ b/python/raft-ann-bench/src/raft-ann-bench/split_groundtruth/__main__.py
@@ -16,6 +16,7 @@
 import argparse
 import os
 import subprocess
+import sys
 
 
 def split_groundtruth(groundtruth_filepath):
@@ -43,6 +44,10 @@ def main():
         help="Path to billion-scale dataset groundtruth file",
         required=True,
     )
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
     args = parser.parse_args()
 
     split_groundtruth(args.groundtruth)
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index 7abb6231a1..be030f839d 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -35,12 +35,11 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
     "dask-cuda==23.12.*",
-    "dask>=2023.9.2",
-    "distributed>=2023.9.2",
     "joblib>=0.11",
     "numba>=0.57",
     "numpy>=1.21",
     "pylibraft==23.12.*",
+    "rapids-dask-dependency==23.12.*",
     "ucx-py==0.35.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [