From 0f6ce2223b847b2992692763a07af9654e5feeb3 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 16 Nov 2023 15:16:03 +0100 Subject: [PATCH 1/6] ANN bench options to specify CAGRA graph and dataset locations (#1896) A CAGRA index can be constructed using existing device_mdarrays, in which case just references to these arrays are stored. This way allocations are managed outside the index, and we can customize how these allocations are made. This PR - modifies the CAGRA ANN bench wrapper to manage the allocations locally, - adds options for the json file to specify whether the graph / dataset is allocated in device / host_pinned / host_huge_page memory. Authors: - Corey J. Nolet (https://github.com/cjnolet) - Tamas Bela Feher (https://github.com/tfeher) - Robert Maynard (https://github.com/robertmaynard) Approvers: - Micka (https://github.com/lowener) - Divye Gala (https://github.com/divyegala) - Tamas Bela Feher (https://github.com/tfeher) URL: https://github.com/rapidsai/raft/pull/1896 --- cpp/bench/ann/src/common/benchmark.hpp | 20 ++- .../src/common/cuda_huge_page_resource.hpp | 132 ++++++++++++++++++ .../ann/src/common/cuda_pinned_resource.hpp | 130 +++++++++++++++++ .../src/raft/raft_ann_bench_param_parser.h | 21 +++ cpp/bench/ann/src/raft/raft_cagra_wrapper.h | 99 ++++++++++++- cpp/include/raft/neighbors/cagra_types.hpp | 27 +--- .../raft/neighbors/detail/cagra/utils.hpp | 34 +++++ cpp/include/raft/util/integer_utils.hpp | 2 +- docs/source/ann_benchmarks_param_tuning.md | 9 +- .../src/raft-ann-bench/run/__main__.py | 87 +++++++++++- 10 files changed, 522 insertions(+), 39 deletions(-) create mode 100644 cpp/bench/ann/src/common/cuda_huge_page_resource.hpp create mode 100644 cpp/bench/ann/src/common/cuda_pinned_resource.hpp diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index 7db5eab194..a2e77323c1 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -21,6 +21,7 @@ #include "util.hpp" #include +#include #include #include @@ -131,7 +132,7 @@ void bench_build(::benchmark::State& state, log_info("Overwriting file: %s", index.file.c_str()); } else { return state.SkipWithMessage( - "Index file already exists (use --overwrite to overwrite the index)."); + "Index file already exists (use --force to overwrite the index)."); } } @@ -380,7 +381,7 @@ inline void printf_usage() ::benchmark::PrintDefaultHelp(); fprintf(stdout, " [--build|--search] \n" - " [--overwrite]\n" + " [--force]\n" " [--data_prefix=]\n" " [--index_prefix=]\n" " [--override_kv=]\n" @@ -392,7 +393,7 @@ inline void printf_usage() " --build: build mode, will build index\n" " --search: search mode, will search using the built index\n" " one and only one of --build and --search should be specified\n" - " --overwrite: force overwriting existing index files\n" + " --force: force overwriting existing index files\n" " --data_prefix=:" " prepend to dataset file paths specified in the .json (default = " "'data/').\n" @@ -572,6 +573,8 @@ inline auto run_main(int argc, char** argv) -> int std::string mode = "latency"; std::string threads_arg_txt = ""; std::vector threads = {1, -1}; // min_thread, max_thread + std::string log_level_str = ""; + int raft_log_level = raft::logger::get(RAFT_NAME).get_level(); kv_series override_kv{}; char arg0_default[] = "benchmark"; // NOLINT @@ -589,14 +592,19 @@ inline auto run_main(int argc, char** argv) -> int std::ifstream conf_stream(conf_path); for (int i = 1; i < argc; i++) { - if 
(parse_bool_flag(argv[i], "--overwrite", force_overwrite) || + if (parse_bool_flag(argv[i], "--force", force_overwrite) || parse_bool_flag(argv[i], "--build", build_mode) || parse_bool_flag(argv[i], "--search", search_mode) || parse_string_flag(argv[i], "--data_prefix", data_prefix) || parse_string_flag(argv[i], "--index_prefix", index_prefix) || parse_string_flag(argv[i], "--mode", mode) || parse_string_flag(argv[i], "--override_kv", new_override_kv) || - parse_string_flag(argv[i], "--threads", threads_arg_txt)) { + parse_string_flag(argv[i], "--threads", threads_arg_txt) || + parse_string_flag(argv[i], "--raft_log_level", log_level_str)) { + if (!log_level_str.empty()) { + raft_log_level = std::stoi(log_level_str); + log_level_str = ""; + } if (!threads_arg_txt.empty()) { auto threads_arg = split(threads_arg_txt, ':'); threads[0] = std::stoi(threads_arg[0]); @@ -625,6 +633,8 @@ inline auto run_main(int argc, char** argv) -> int } } + raft::logger::get(RAFT_NAME).set_level(raft_log_level); + Objective metric_objective = Objective::LATENCY; if (mode == "throughput") { metric_objective = Objective::THROUGHPUT; } diff --git a/cpp/bench/ann/src/common/cuda_huge_page_resource.hpp b/cpp/bench/ann/src/common/cuda_huge_page_resource.hpp new file mode 100644 index 0000000000..9132db7c04 --- /dev/null +++ b/cpp/bench/ann/src/common/cuda_huge_page_resource.hpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include + +namespace raft::mr { +/** + * @brief `device_memory_resource` derived class that uses mmap to allocate memory. + * This class enables memory allocation using huge pages. + * It is assumed that the allocated memory is directly accessible on device. This currently only + * works on GH systems. + * + * TODO(tfeher): consider improving or removing this helper once we made progress with + * https://github.com/rapidsai/raft/issues/1819 + */ +class cuda_huge_page_resource final : public rmm::mr::device_memory_resource { + public: + cuda_huge_page_resource() = default; + ~cuda_huge_page_resource() override = default; + cuda_huge_page_resource(cuda_huge_page_resource const&) = default; + cuda_huge_page_resource(cuda_huge_page_resource&&) = default; + cuda_huge_page_resource& operator=(cuda_huge_page_resource const&) = default; + cuda_huge_page_resource& operator=(cuda_huge_page_resource&&) = default; + + /** + * @brief Query whether the resource supports use of non-null CUDA streams for + * allocation/deallocation. `cuda_huge_page_resource` does not support streams. + * + * @returns bool false + */ + [[nodiscard]] bool supports_streams() const noexcept override { return false; } + + /** + * @brief Query whether the resource supports the get_mem_info API. + * + * @return true + */ + [[nodiscard]] bool supports_get_mem_info() const noexcept override { return true; } + + private: + /** + * @brief Allocates memory of size at least `bytes` using cudaMalloc. 
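+ * (Note: in this huge-page variant the allocation is actually obtained from an anonymous mmap marked with madvise(MADV_HUGEPAGE), not from cudaMalloc; see do_allocate below.)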
+ * + * The returned pointer has at least 256B alignment. + * + * @note Stream argument is ignored + * + * @throws `rmm::bad_alloc` if the requested allocation could not be fulfilled + * + * @param bytes The size, in bytes, of the allocation + * @return void* Pointer to the newly allocated memory + */ + void* do_allocate(std::size_t bytes, rmm::cuda_stream_view) override + { + void* _addr{nullptr}; + _addr = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (_addr == MAP_FAILED) { RAFT_FAIL("huge_page_resource::MAP FAILED"); } + if (madvise(_addr, bytes, MADV_HUGEPAGE) == -1) { + munmap(_addr, bytes); + RAFT_FAIL("huge_page_resource::madvise MADV_HUGEPAGE"); + } + memset(_addr, 0, bytes); + return _addr; + } + + /** + * @brief Deallocate memory pointed to by \p p. + * + * @note Stream argument is ignored. + * + * @throws Nothing. + * + * @param p Pointer to be deallocated + */ + void do_deallocate(void* ptr, std::size_t size, rmm::cuda_stream_view) override + { + if (munmap(ptr, size) == -1) { RAFT_FAIL("huge_page_resource::munmap"); } + } + + /** + * @brief Compare this resource to another. + * + * Two cuda_huge_page_resources always compare equal, because they can each + * deallocate memory allocated by the other. + * + * @throws Nothing. + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equal + */ + [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override + { + return dynamic_cast(&other) != nullptr; + } + + /** + * @brief Get free and available memory for memory resource + * + * @throws `rmm::cuda_error` if unable to retrieve memory info. + * + * @return std::pair contaiing free_size and total_size of memory + */ + [[nodiscard]] std::pair do_get_mem_info( + rmm::cuda_stream_view) const override + { + std::size_t free_size{}; + std::size_t total_size{}; + RMM_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); + return std::make_pair(free_size, total_size); + } +}; +} // namespace raft::mr \ No newline at end of file diff --git a/cpp/bench/ann/src/common/cuda_pinned_resource.hpp b/cpp/bench/ann/src/common/cuda_pinned_resource.hpp new file mode 100644 index 0000000000..28ca691f86 --- /dev/null +++ b/cpp/bench/ann/src/common/cuda_pinned_resource.hpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include + +#include + +namespace raft::mr { +/** + * @brief `device_memory_resource` derived class that uses cudaMallocHost/Free for + * allocation/deallocation. + * + * This is almost the same as rmm::mr::host::pinned_memory_resource, but it has + * device_memory_resource as base class. Pinned memory can be accessed from device, + * and using this allocator we can create device_mdarray backed by pinned allocator. 
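+ * Note that device-side reads of pinned host memory travel over PCIe/NVLink, so using it for the dataset trades search throughput for the ability to hold data larger than device memory.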
+ * + * TODO(tfeher): it would be preferred to just rely on the existing allocator from rmm + * (pinned_memory_resource), but that is incompatible with the container_policy class + * for device matrix, because the latter expects a device_memory_resource. We shall + * revise this once we progress with Issue https://github.com/rapidsai/raft/issues/1819 + */ +class cuda_pinned_resource final : public rmm::mr::device_memory_resource { + public: + cuda_pinned_resource() = default; + ~cuda_pinned_resource() override = default; + cuda_pinned_resource(cuda_pinned_resource const&) = default; + cuda_pinned_resource(cuda_pinned_resource&&) = default; + cuda_pinned_resource& operator=(cuda_pinned_resource const&) = default; + cuda_pinned_resource& operator=(cuda_pinned_resource&&) = default; + + /** + * @brief Query whether the resource supports use of non-null CUDA streams for + * allocation/deallocation. `cuda_pinned_resource` does not support streams. + * + * @returns bool false + */ + [[nodiscard]] bool supports_streams() const noexcept override { return false; } + + /** + * @brief Query whether the resource supports the get_mem_info API. + * + * @return true + */ + [[nodiscard]] bool supports_get_mem_info() const noexcept override { return true; } + + private: + /** + * @brief Allocates memory of size at least `bytes` using cudaMalloc. + * + * The returned pointer has at least 256B alignment. + * + * @note Stream argument is ignored + * + * @throws `rmm::bad_alloc` if the requested allocation could not be fulfilled + * + * @param bytes The size, in bytes, of the allocation + * @return void* Pointer to the newly allocated memory + */ + void* do_allocate(std::size_t bytes, rmm::cuda_stream_view) override + { + void* ptr{nullptr}; + RMM_CUDA_TRY_ALLOC(cudaMallocHost(&ptr, bytes)); + return ptr; + } + + /** + * @brief Deallocate memory pointed to by \p p. + * + * @note Stream argument is ignored. + * + * @throws Nothing. + * + * @param p Pointer to be deallocated + */ + void do_deallocate(void* ptr, std::size_t, rmm::cuda_stream_view) override + { + RMM_ASSERT_CUDA_SUCCESS(cudaFreeHost(ptr)); + } + + /** + * @brief Compare this resource to another. + * + * Two cuda_pinned_resources always compare equal, because they can each + * deallocate memory allocated by the other. + * + * @throws Nothing. + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equal + */ + [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override + { + return dynamic_cast(&other) != nullptr; + } + + /** + * @brief Get free and available memory for memory resource + * + * @throws `rmm::cuda_error` if unable to retrieve memory info. 
+ * + * @return std::pair containing free_size and total_size of memory + */ + [[nodiscard]] std::pair do_get_mem_info( + rmm::cuda_stream_view) const override + { + std::size_t free_size{}; + std::size_t total_size{}; + RMM_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); + return std::make_pair(free_size, total_size); + } +}; +} // namespace raft::mr \ No newline at end of file diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h index 479a90e3b5..1eb0e53cc5 100644 --- a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h @@ -206,6 +206,21 @@ void parse_build_param(const nlohmann::json& conf, } } +raft::bench::ann::AllocatorType parse_allocator(std::string mem_type) +{ + if (mem_type == "device") { + return raft::bench::ann::AllocatorType::Device; + } else if (mem_type == "host_pinned") { + return raft::bench::ann::AllocatorType::HostPinned; + } else if (mem_type == "host_huge_page") { + return raft::bench::ann::AllocatorType::HostHugePage; + } + THROW( + "Invalid value for memory type %s, must be one of [\"device\", \"host_pinned\", " + "\"host_huge_page\"]", + mem_type.c_str()); +} + template void parse_search_param(const nlohmann::json& conf, typename raft::bench::ann::RaftCagra::SearchParam& param) @@ -227,5 +242,11 @@ void parse_search_param(const nlohmann::json& conf, THROW("Invalid value for algo: %s", tmp.c_str()); } } + if (conf.contains("graph_memory_type")) { + param.graph_mem = parse_allocator(conf.at("graph_memory_type")); + } + if (conf.contains("internal_dataset_memory_type")) { + param.dataset_mem = parse_allocator(conf.at("internal_dataset_memory_type")); + } } #endif diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h index bf526101be..a3e481ec5a 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h @@ -42,8 +42,15 @@ #include "raft_ann_bench_utils.h" #include +#include "../common/cuda_huge_page_resource.hpp" +#include "../common/cuda_pinned_resource.hpp" + +#include +#include + namespace raft::bench::ann { +enum class AllocatorType { HostPinned, HostHugePage, Device }; template class RaftCagra : public ANN { public: using typename ANN::AnnSearchParam; struct SearchParam : public AnnSearchParam { raft::neighbors::experimental::cagra::search_params p; + AllocatorType graph_mem = AllocatorType::Device; + AllocatorType dataset_mem = AllocatorType::Device; auto needs_dataset() const -> bool override { return true; } }; @@ -64,7 +73,16 @@ }; RaftCagra(Metric metric, int dim, const BuildParam& param, int concurrent_searches = 1) - : ANN(metric, dim), index_params_(param), dimension_(dim), handle_(cudaStreamPerThread) + : ANN(metric, dim), + index_params_(param), + dimension_(dim), + handle_(cudaStreamPerThread), + need_dataset_update_(true), + dataset_(make_device_matrix(handle_, 0, 0)), + graph_(make_device_matrix(handle_, 0, 0)), + input_dataset_v_(nullptr, 0, 0), + graph_mem_(AllocatorType::Device), + dataset_mem_(AllocatorType::Device) { index_params_.cagra_params.metric = parse_metric_type(metric); index_params_.ivf_pq_build_params->metric = parse_metric_type(metric); @@ -101,12 +119,28 @@ class RaftCagra : public ANN { void save_to_hnswlib(const std::string& file) const; private: + inline rmm::mr::device_memory_resource* get_mr(AllocatorType mem_type) + { + switch (mem_type) { + case 
(AllocatorType::HostPinned): return &mr_pinned_; + case (AllocatorType::HostHugePage): return &mr_huge_page_; + default: return rmm::mr::get_current_device_resource(); + } + } + raft ::mr::cuda_pinned_resource mr_pinned_; + raft ::mr::cuda_huge_page_resource mr_huge_page_; raft::device_resources handle_; + AllocatorType graph_mem_; + AllocatorType dataset_mem_; BuildParam index_params_; + bool need_dataset_update_; raft::neighbors::cagra::search_params search_params_; std::optional> index_; int device_; int dimension_; + raft::device_matrix graph_; + raft::device_matrix dataset_; + raft::device_matrix_view input_dataset_v_; }; template @@ -127,18 +161,77 @@ void RaftCagra::build(const T* dataset, size_t nrow, cudaStream_t) return; } +inline std::string allocator_to_string(AllocatorType mem_type) +{ + if (mem_type == AllocatorType::Device) { + return "device"; + } else if (mem_type == AllocatorType::HostPinned) { + return "host_pinned"; + } else if (mem_type == AllocatorType::HostHugePage) { + return "host_huge_page"; + } + return ""; +} + template void RaftCagra::set_search_param(const AnnSearchParam& param) { auto search_param = dynamic_cast(param); search_params_ = search_param.p; + if (search_param.graph_mem != graph_mem_) { + // Move graph to correct memory space + graph_mem_ = search_param.graph_mem; + RAFT_LOG_INFO("moving graph to new memory space: %s", allocator_to_string(graph_mem_).c_str()); + // We create a new graph and copy to it from existing graph + auto mr = get_mr(graph_mem_); + auto new_graph = make_device_mdarray( + handle_, mr, make_extents(index_->graph().extent(0), index_->graph_degree())); + + raft::copy(new_graph.data_handle(), + index_->graph().data_handle(), + index_->graph().size(), + resource::get_cuda_stream(handle_)); + + index_->update_graph(handle_, make_const_mdspan(new_graph.view())); + // update_graph() only stores a view in the index. We need to keep the graph object alive. + graph_ = std::move(new_graph); + } + + if (search_param.dataset_mem != dataset_mem_ || need_dataset_update_) { + dataset_mem_ = search_param.dataset_mem; + + // First free up existing memory + dataset_ = make_device_matrix(handle_, 0, 0); + index_->update_dataset(handle_, make_const_mdspan(dataset_.view())); + + // Allocate space using the correct memory resource. + RAFT_LOG_INFO("moving dataset to new memory space: %s", + allocator_to_string(dataset_mem_).c_str()); + + auto mr = get_mr(dataset_mem_); + raft::neighbors::cagra::detail::copy_with_padding(handle_, dataset_, input_dataset_v_, mr); + + index_->update_dataset(handle_, make_const_mdspan(dataset_.view())); + + // Ideally, instead of dataset_.view(), we should pass a strided matrix view to update. + // See Issue https://github.com/rapidsai/raft/issues/1972 for details. + // auto dataset_view = make_device_strided_matrix_view( + // dataset_.data_handle(), dataset_.extent(0), this->dim_, dataset_.extent(1)); + // index_->update_dataset(handle_, dataset_view); + need_dataset_update_ = false; + } } template void RaftCagra::set_search_dataset(const T* dataset, size_t nrow) { - index_->update_dataset(handle_, - raft::make_host_matrix_view(dataset, nrow, this->dim_)); + // It can happen that we are re-using a previous algo object which already has + // the dataset set. Check if we need update. 
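+ // Only the pointer and row count are compared here; the actual copy into the requested memory space is deferred until set_search_param() runs.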
+ if (static_cast(input_dataset_v_.extent(0)) != nrow || + input_dataset_v_.data_handle() != dataset) { + input_dataset_v_ = make_device_matrix_view(dataset, nrow, this->dim_); + need_dataset_update_ = true; + } } template diff --git a/cpp/include/raft/neighbors/cagra_types.hpp b/cpp/include/raft/neighbors/cagra_types.hpp index 4db08110b9..e8a0b8a7bd 100644 --- a/cpp/include/raft/neighbors/cagra_types.hpp +++ b/cpp/include/raft/neighbors/cagra_types.hpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -330,32 +331,8 @@ struct index : ann::index { void copy_padded(raft::resources const& res, mdspan, row_major, data_accessor> dataset) { - size_t padded_dim = round_up_safe(dataset.extent(1) * sizeof(T), 16) / sizeof(T); + detail::copy_with_padding(res, dataset_, dataset); - if ((dataset_.extent(0) != dataset.extent(0)) || - (static_cast(dataset_.extent(1)) != padded_dim)) { - // clear existing memory before allocating to prevent OOM errors on large datasets - if (dataset_.size()) { dataset_ = make_device_matrix(res, 0, 0); } - dataset_ = make_device_matrix(res, dataset.extent(0), padded_dim); - } - if (dataset_.extent(1) == dataset.extent(1)) { - raft::copy(dataset_.data_handle(), - dataset.data_handle(), - dataset.size(), - resource::get_cuda_stream(res)); - } else { - // copy with padding - RAFT_CUDA_TRY(cudaMemsetAsync( - dataset_.data_handle(), 0, dataset_.size() * sizeof(T), resource::get_cuda_stream(res))); - RAFT_CUDA_TRY(cudaMemcpy2DAsync(dataset_.data_handle(), - sizeof(T) * dataset_.extent(1), - dataset.data_handle(), - sizeof(T) * dataset.extent(1), - sizeof(T) * dataset.extent(1), - dataset.extent(0), - cudaMemcpyDefault, - resource::get_cuda_stream(res))); - } dataset_view_ = make_device_strided_matrix_view( dataset_.data_handle(), dataset_.extent(0), dataset.extent(1), dataset_.extent(1)); RAFT_LOG_DEBUG("CAGRA dataset strided matrix view %zux%zu, stride %zu", diff --git a/cpp/include/raft/neighbors/detail/cagra/utils.hpp b/cpp/include/raft/neighbors/detail/cagra/utils.hpp index 22cbe6bbac..5e57a9589f 100644 --- a/cpp/include/raft/neighbors/detail/cagra/utils.hpp +++ b/cpp/include/raft/neighbors/detail/cagra/utils.hpp @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include namespace raft::neighbors::cagra::detail { @@ -245,4 +247,36 @@ class host_matrix_view_from_device { device_matrix_view device_view_; T* host_ptr; }; + +// Copy matrix src to dst. pad rows with 0 if necessary to make them 16 byte aligned. 
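+// For example, a float matrix with 99 columns (396 B per row) is padded to 100 columns (400 B, a multiple of 16 B); the destination is (re)allocated from `mr` whenever its shape differs from the padded source, and the padding is zero-filled.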
+template +void copy_with_padding(raft::resources const& res, + raft::device_matrix& dst, + mdspan, row_major, data_accessor> src, + rmm::mr::device_memory_resource* mr = nullptr) +{ + if (!mr) { mr = rmm::mr::get_current_device_resource(); } + size_t padded_dim = round_up_safe(src.extent(1) * sizeof(T), 16) / sizeof(T); + + if ((dst.extent(0) != src.extent(0)) || (static_cast(dst.extent(1)) != padded_dim)) { + // clear existing memory before allocating to prevent OOM errors on large datasets + if (dst.size()) { dst = make_device_matrix(res, 0, 0); } + dst = make_device_mdarray(res, mr, make_extents(src.extent(0), padded_dim)); + } + if (dst.extent(1) == src.extent(1)) { + raft::copy(dst.data_handle(), src.data_handle(), src.size(), resource::get_cuda_stream(res)); + } else { + // copy with padding + RAFT_CUDA_TRY(cudaMemsetAsync( + dst.data_handle(), 0, dst.size() * sizeof(T), resource::get_cuda_stream(res))); + RAFT_CUDA_TRY(cudaMemcpy2DAsync(dst.data_handle(), + sizeof(T) * dst.extent(1), + src.data_handle(), + sizeof(T) * src.extent(1), + sizeof(T) * src.extent(1), + src.extent(0), + cudaMemcpyDefault, + resource::get_cuda_stream(res))); + } +} } // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/util/integer_utils.hpp b/cpp/include/raft/util/integer_utils.hpp index c14d4327c3..e9a386c426 100644 --- a/cpp/include/raft/util/integer_utils.hpp +++ b/cpp/include/raft/util/integer_utils.hpp @@ -36,7 +36,7 @@ namespace raft { * `modulus` is positive. */ template -inline S round_up_safe(S number_to_round, S modulus) +constexpr inline S round_up_safe(S number_to_round, S modulus) { auto remainder = number_to_round % modulus; if (remainder == 0) { return number_to_round; } diff --git a/docs/source/ann_benchmarks_param_tuning.md b/docs/source/ann_benchmarks_param_tuning.md index 4c95b9e520..dd74f030ad 100644 --- a/docs/source/ann_benchmarks_param_tuning.md +++ b/docs/source/ann_benchmarks_param_tuning.md @@ -53,12 +53,16 @@ IVF-pq is an inverted-file index, which partitions the vectors into a series of | `graph_degree` | `build_param` | N | Positive Integer >0 | 64 | Degree of the final kNN graph index. | | `intermediate_graph_degree` | `build_param` | N | Positive Integer >0 | 128 | Degree of the intermediate kNN graph. | | `graph_build_algo` | `build_param` | N | ["IVF_PQ", "NN_DESCENT"] | "IVF_PQ" | Algorithm to use for building the kNN graph | -| `dataset_memory_type` | `build_param` | N | ["device", "host", "mmap"] | "device" | What memory type should the dataset reside? | +| `dataset_memory_type` | `build_param` | N | ["device", "host", "mmap"] | "device" | What memory type should the dataset reside in while constructing the index? | | `query_memory_type` | `search_params` | N | ["device", "host", "mmap"] | "device" | What memory type should the queries reside in? | | `itopk` | `search_param` | N | Positive Integer >0 | 64 | Number of intermediate search results retained during the search. Higher values improve search accuracy at the cost of speed. | | `search_width` | `search_param` | N | Positive Integer >0 | 1 | Number of graph nodes to select as the starting point for the search in each iteration. | | `max_iterations` | `search_param` | N | Integer >=0 | 0 | Upper limit of search iterations. Auto select when 0. | | `algo` | `search_param` | N | string | "auto" | Algorithm to use for search. Possible values: {"auto", "single_cta", "multi_cta", "multi_kernel"} | +| `graph_memory_type` | `search_param` | N | string | "device" | Memory type to store the graph. Must be one of {"device", "host_pinned", "host_huge_page"}. | +| `internal_dataset_memory_type` | `search_param` | N | string | "device" | Memory type to store the dataset in the index. Must be one of {"device", "host_pinned", "host_huge_page"}. | + +The `graph_memory_type` or `internal_dataset_memory_type` options can be useful for large datasets that do not fit in device memory. Setting `internal_dataset_memory_type` to anything other than `device` has a negative impact on search speed. The `host_huge_page` option is only supported on systems with Heterogeneous Memory Management or on platforms that natively support GPU access to system allocated memory, for example Grace Hopper. To fine tune CAGRA index building we can customize IVF-PQ index builder options using the following settings. These take effect only if `graph_build_algo == "IVF_PQ"`. It is recommended to experiment using a separate IVF-PQ index to find the config that gives the largest QPS for large batch sizes. Recall does not need to be very high, since CAGRA further optimizes the kNN neighbor graph. Some of the default values are derived from the dataset size which is assumed to be [n_vecs, dim]. @@ -76,6 +80,7 @@ To fine tune CAGRA index building we can customize IVF-PQ index builder options | `ivf_pq_search_refine_ratio` | `build_params` | N | Positive Number >=0 | 2 | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors. | Alternatively, if `graph_build_algo == "NN_DESCENT"`, then we can customize the following parameters + | Parameter | Type | Required | Data Type | Default | Description | |-----------------------------|----------------|----------|----------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | `nn_descent_niter` | `build_param` | N | Positive Integer>0 | 20 | Number of NN Descent iterations. | @@ -170,4 +175,4 @@ Use FAISS IVF-PQ index on CPU | `ef` | `search_param` | Y | Positive Integer >0 | | Size of the dynamic list for the nearest neighbors used for search. Higher value leads to more accurate but slower search. Cannot be lower than `k`. | | `numThreads` | `search_params` | N | Positive Integer >0 | 1 | Number of threads to use for queries. | -Please refer to [HNSW algorithm parameters guide] from `hnswlib` to learn more about these arguments. \ No newline at end of file +Please refer to [HNSW algorithm parameters guide](https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md) from `hnswlib` to learn more about these arguments. 
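For illustration, a configuration entry exercising the two new search-time options might look like the following sketch (the index name and the remaining parameter values are placeholders rather than recommendations):

```json
{
  "name": "raft_cagra.graph_host_huge_page",
  "algo": "raft_cagra",
  "build_param": {"graph_degree": 64, "intermediate_graph_degree": 128},
  "search_params": [
    {
      "itopk": 64,
      "graph_memory_type": "host_huge_page",
      "internal_dataset_memory_type": "host_pinned"
    }
  ]
}
```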
\ No newline at end of file diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py index a33467b554..4611f39264 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py @@ -25,6 +25,21 @@ import yaml +log_levels = { + "off": 0, + "error": 1, + "warn": 2, + "info": 3, + "debug": 4, + "trace": 5, +} + + +def parse_log_level(level_str): + if level_str not in log_levels: + raise ValueError("Invalid log level: %s" % level_str) + return log_levels[level_str.lower()] + def positive_int(input_str: str) -> int: try: @@ -39,6 +54,53 @@ def positive_int(input_str: str) -> int: return i +def merge_build_files(build_dir, build_file, temp_build_file): + + build_dict = {} + + # If build file exists, read it + build_json_path = os.path.join(build_dir, build_file) + tmp_build_json_path = os.path.join(build_dir, temp_build_file) + if os.path.isfile(build_json_path): + try: + with open(build_json_path, "r") as f: + build_dict = json.load(f) + except Exception as e: + print( + "Error loading existing build file: %s (%s)" + % (build_json_path, e) + ) + + temp_build_dict = {} + if os.path.isfile(tmp_build_json_path): + with open(tmp_build_json_path, "r") as f: + temp_build_dict = json.load(f) + else: + raise ValueError("Temp build file not found: %s" % tmp_build_json_path) + + tmp_benchmarks = ( + temp_build_dict["benchmarks"] + if "benchmarks" in temp_build_dict + else {} + ) + benchmarks = build_dict["benchmarks"] if "benchmarks" in build_dict else {} + + # If the build time is absolute 0 then an error occurred + final_bench_dict = {} + for b in benchmarks: + if b["real_time"] > 0: + final_bench_dict[b["name"]] = b + + for tmp_bench in tmp_benchmarks: + if tmp_bench["real_time"] > 0: + final_bench_dict[tmp_bench["name"]] = tmp_bench + + temp_build_dict["benchmarks"] = [v for k, v in final_bench_dict.items()] + with open(build_json_path, "w") as f: + json_str = json.dumps(temp_build_dict, indent=2) + f.write(json_str) + + def validate_algorithm(algos_conf, algo, gpu_present): algos_conf_keys = set(algos_conf.keys()) if gpu_present: @@ -88,6 +150,7 @@ def run_build_and_search( batch_size, search_threads, mode="throughput", + raft_log_level="info", ): for executable, ann_executable_path, algo in executables_to_run.keys(): # Need to write temporary configuration @@ -109,6 +172,8 @@ def run_build_and_search( if build: build_folder = os.path.join(legacy_result_folder, "build") os.makedirs(build_folder, exist_ok=True) + build_file = f"{algo}.json" + temp_build_file = f"{build_file}.lock" cmd = [ ann_executable_path, "--build", @@ -116,10 +181,11 @@ def run_build_and_search( "--benchmark_out_format=json", "--benchmark_counters_tabular=true", "--benchmark_out=" - + f"{os.path.join(build_folder, f'{algo}.json')}", + + f"{os.path.join(build_folder, temp_build_file)}", + "--raft_log_level=" + f"{parse_log_level(raft_log_level)}", ] if force: - cmd = cmd + ["--overwrite"] + cmd = cmd + ["--force"] cmd = cmd + [temp_conf_filename] if dry_run: @@ -129,9 +195,13 @@ def run_build_and_search( else: try: subprocess.run(cmd, check=True) + merge_build_files( + build_folder, build_file, temp_build_file + ) except Exception as e: print("Error occurred running benchmark: %s" % e) finally: + os.remove(os.path.join(build_folder, temp_build_file)) if not search: os.remove(temp_conf_filename) @@ -150,9 +220,10 @@ def run_build_and_search( "--mode=%s" % mode, "--benchmark_out=" + 
f"{os.path.join(search_folder, f'{algo}.json')}", + "--raft_log_level=" + f"{parse_log_level(raft_log_level)}", ] if force: - cmd = cmd + ["--overwrite"] + cmd = cmd + ["--force"] if search_threads: cmd = cmd + ["--threads=%s" % search_threads] @@ -294,6 +365,15 @@ def main(): "the command.", action="store_true", ) + parser.add_argument( + "--raft-log-level", + help="Log level, possible values are " + "[off, error, warn, info, debug, trace]. " + "Default: 'info'. Note that 'debug' or more detailed " + "logging level requires that the library is compiled with " + "-DRAFT_ACTIVE_LEVEL= where >= ", + default="info", + ) if len(sys.argv) == 1: parser.print_help() @@ -511,6 +591,7 @@ def add_algo_group(group_list): batch_size, args.search_threads, mode, + args.raft_log_level, ) From 170429658dbc53e39a3789ae5f76b0bec0d02966 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 16 Nov 2023 18:40:56 +0100 Subject: [PATCH 2/6] Using `EXPORT_SET` in `rapids_find_package_root` (#2006) Authors: - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/raft/pull/2006 --- cpp/cmake/thirdparty/get_spdlog.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake index 7c8f3a4722..7be7804c7e 100644 --- a/cpp/cmake/thirdparty/get_spdlog.cmake +++ b/cpp/cmake/thirdparty/get_spdlog.cmake @@ -26,7 +26,7 @@ function(find_and_configure_spdlog) GLOBAL_TARGETS spdlog spdlog_header_only NAMESPACE spdlog::) include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root(BUILD spdlog [=[${CMAKE_CURRENT_LIST_DIR}]=] rmm-exports) + rapids_export_find_package_root(BUILD spdlog [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET rmm-exports) endif() endfunction() From a59405d65bcdf6458600a48a200cdb4b7c455306 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Thu, 16 Nov 2023 23:17:35 -0800 Subject: [PATCH 3/6] Add support for iterating over batches in bfknn (#1947) This adds support for iterating over batches of nearest neighbors in the brute force knn. This lets you query for the nearest neighbors, and then filter down the results - and if you have filtered out too many results, get the next batch of nearest neighbors. The challenge here is balancing memory consumption versus efficieny: we could store the results of the full gemm in the distance calculation - but for large indices this discards the benefits of using the tiling strategy and risks running OOM. Instead we exponentially grow the number of neighbors being returned, and also cache both the query norms and index norms between calls. Authors: - Ben Frederickson (https://github.com/benfred) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. 
Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1947 --- cpp/bench/ann/CMakeLists.txt | 5 +- .../distance/detail/distance_ops/l2_exp.cuh | 2 +- .../raft/neighbors/brute_force-ext.cuh | 2 +- .../raft/neighbors/brute_force-inl.cuh | 31 +---- cpp/include/raft/neighbors/brute_force.cuh | 68 ++++++++++ .../raft/neighbors/brute_force_types.hpp | 119 +++++++++++++++++- .../raft/neighbors/detail/knn_brute_force.cuh | 93 ++++++++++---- .../detail/knn_brute_force_batch_k_query.cuh | 98 +++++++++++++++ .../raft/neighbors/neighbors_types.hpp | 63 ++++++++++ .../neighbors/brute_force_knn_index_float.cu | 2 +- cpp/test/neighbors/tiled_knn.cu | 74 ++++++++++- 11 files changed, 497 insertions(+), 60 deletions(-) create mode 100644 cpp/include/raft/neighbors/detail/knn_brute_force_batch_k_query.cuh create mode 100644 cpp/include/raft/neighbors/neighbors_types.hpp diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index eb44e58cb5..5919de07e7 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -90,7 +90,7 @@ if(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ OR RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE OR RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT OR RAFT_ANN_BENCH_USE_RAFT_CAGRA - OR RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB + OR RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB ) set(RAFT_ANN_BENCH_USE_RAFT ON) endif() @@ -263,7 +263,8 @@ if(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB) ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib-src/hnswlib LINKS raft::compiled - CXXFLAGS "${HNSW_CXX_FLAGS}" + CXXFLAGS + "${HNSW_CXX_FLAGS}" ) endif() diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh index 5b4048c1c3..a218c85a0a 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh @@ -44,7 +44,7 @@ struct l2_exp_cutlass_op { __device__ l2_exp_cutlass_op() noexcept : sqrt(false) {} __device__ l2_exp_cutlass_op(bool isSqrt) noexcept : sqrt(isSqrt) {} - __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept + inline __device__ AccT operator()(DataT aNorm, DataT bNorm, DataT accVal) const noexcept { AccT outVal = aNorm + bNorm - DataT(2.0) * accVal; diff --git a/cpp/include/raft/neighbors/brute_force-ext.cuh b/cpp/include/raft/neighbors/brute_force-ext.cuh index b8c00616da..4c1f7ea21e 100644 --- a/cpp/include/raft/neighbors/brute_force-ext.cuh +++ b/cpp/include/raft/neighbors/brute_force-ext.cuh @@ -50,7 +50,7 @@ void search(raft::resources const& res, const index& idx, raft::device_matrix_view queries, raft::device_matrix_view neighbors, - raft::device_matrix_view distances) RAFT_EXPLICIT; + raft::device_matrix_view distances) RAFT_EXPLICIT; template & idx, raft::device_matrix_view queries, raft::device_matrix_view neighbors, - raft::device_matrix_view distances) + raft::device_matrix_view distances) { - RAFT_EXPECTS(neighbors.extent(1) == distances.extent(1), "Value of k must match for outputs"); - RAFT_EXPECTS(idx.dataset().extent(1) == queries.extent(1), - "Number of columns in queries must match brute force index"); - - auto k = neighbors.extent(1); - auto d = idx.dataset().extent(1); - - std::vector dataset = {const_cast(idx.dataset().data_handle())}; - std::vector sizes = {idx.dataset().extent(0)}; - std::vector norms; - if (idx.has_norms()) { norms.push_back(const_cast(idx.norms().data_handle())); } - - detail::brute_force_knn_impl(res, - dataset, - sizes, - d, - const_cast(queries.data_handle()), - 
queries.extent(0), - neighbors.data_handle(), - distances.data_handle(), - k, - true, - true, - nullptr, - idx.metric(), - idx.metric_arg(), - raft::identity_op(), - norms.size() ? &norms : nullptr); + raft::neighbors::detail::brute_force_search(res, idx, queries, neighbors, distances); } /** @} */ // end group brute_force_knn } // namespace raft::neighbors::brute_force diff --git a/cpp/include/raft/neighbors/brute_force.cuh b/cpp/include/raft/neighbors/brute_force.cuh index 6cebf4b52a..4ba9159556 100644 --- a/cpp/include/raft/neighbors/brute_force.cuh +++ b/cpp/include/raft/neighbors/brute_force.cuh @@ -14,6 +14,7 @@ * limitations under the License. */ #pragma once +#include #ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY #include "brute_force-inl.cuh" @@ -22,3 +23,70 @@ #ifdef RAFT_COMPILED #include "brute_force-ext.cuh" #endif + +#include + +namespace raft::neighbors::brute_force { +/** + * @brief Make a brute force query over batches of k + * + * This lets you query for batches of k. For example, you can get + * the first 100 neighbors, then the next 100 neighbors etc. + * + * Example usage: + * @code{.cpp} + * #include + * #include + * #include + + * // create a random dataset + * int n_rows = 10000; + * int n_cols = 10000; + + * raft::device_resources res; + * auto dataset = raft::make_device_matrix(res, n_rows, n_cols); + * auto labels = raft::make_device_vector(res, n_rows); + + * raft::make_blobs(res, dataset.view(), labels.view()); + * + * // create a brute_force knn index from the dataset + * auto index = raft::neighbors::brute_force::build(res, + * raft::make_const_mdspan(dataset.view())); + * + * // search the index in batches of 128 nearest neighbors + * auto search = raft::make_const_mdspan(dataset.view()); + * auto query = make_batch_k_query(res, index, search, 128); + * for (auto & batch: *query) { + * // batch.indices() and batch.distances() contain the information on the current batch + * } + * + * // we can also support variable sized batches - loaded up a different number + * // of neighbors at each iteration through the ::advance method + * int64_t batch_size = 128; + * query = make_batch_k_query(res, index, search, batch_size); + * for (auto it = query->begin(); it != query->end(); it.advance(batch_size)) { + * // batch.indices() and batch.distances() contain the information on the current batch + * + * batch_size += 16; // load up an extra 16 items in the next batch + * } + * @endcode + * + * @tparam T data element type + * @tparam IdxT type of the indices in the source dataset + * @param[in] res + * @param[in] index The index to query + * @param[in] query A device matrix view to query for [n_queries, index->dim()] + * @param[in] batch_size The size of each batch + */ + +template +std::shared_ptr> make_batch_k_query( + const raft::resources& res, + const raft::neighbors::brute_force::index& index, + raft::device_matrix_view query, + int64_t batch_size) +{ + return std::shared_ptr>( + new detail::gpu_batch_k_query(res, index, query, batch_size)); +} +} // namespace raft::neighbors::brute_force diff --git a/cpp/include/raft/neighbors/brute_force_types.hpp b/cpp/include/raft/neighbors/brute_force_types.hpp index f7030503f1..039599845e 100644 --- a/cpp/include/raft/neighbors/brute_force_types.hpp +++ b/cpp/include/raft/neighbors/brute_force_types.hpp @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -69,7 +70,7 @@ struct index : ann::index { return norms_view_.value(); } - /** Whether ot not this index has dataset norms */ + /** Whether or not this index has 
dataset norms */ [[nodiscard]] inline bool has_norms() const noexcept { return norms_view_.has_value(); } [[nodiscard]] inline T metric_arg() const noexcept { return metric_arg_; } @@ -160,6 +161,122 @@ struct index : ann::index { T metric_arg_; }; +/** + * @brief Interface for performing queries over values of k + * + * This interface lets you iterate over batches of k from a brute_force::index. + * This lets you do things like retrieve the first 100 neighbors for a query, + * apply post processing to remove any unwanted items and then if needed get the + * next 100 closest neighbors for the query. + * + * This query interface exposes C++ iterators through the ::begin and ::end, and + * is compatible with range based for loops. + * + * Note that this class is an abstract class without any cuda dependencies, meaning + * that it doesn't require a cuda compiler to use - but also means it can't be directly + * instantiated. See the raft::neighbors::brute_force::make_batch_k_query + * function for usage examples. + * + * @tparam T data element type + * @tparam IdxT type of the indices in the source dataset + */ +template +class batch_k_query { + public: + batch_k_query(const raft::resources& res, + int64_t index_size, + int64_t query_size, + int64_t batch_size) + : res(res), index_size(index_size), query_size(query_size), batch_size(batch_size) + { + } + virtual ~batch_k_query() {} + + using value_type = raft::neighbors::batch; + + class iterator { + public: + using value_type = raft::neighbors::batch; + using reference = const value_type&; + using pointer = const value_type*; + + iterator(const batch_k_query* query, int64_t offset = 0) + : current(query->res, 0, 0), batches(query->res, 0, 0), query(query), offset(offset) + { + query->load_batch(offset, query->batch_size, &batches); + query->slice_batch(batches, offset, query->batch_size, ¤t); + } + + reference operator*() const { return current; } + + pointer operator->() const { return ¤t; } + + iterator& operator++() + { + advance(query->batch_size); + return *this; + } + + iterator operator++(int) + { + iterator previous(*this); + operator++(); + return previous; + } + + /** + * @brief Advance the iterator, using a custom size for the next batch + * + * Using operator++ means that we will load up the same batch_size for each + * batch. This method allows us to get around this restriction, and load up + * arbitrary batch sizes on each iteration. + * See raft::neighbors::brute_force::make_batch_k_query for a usage example. 
+ * + * @param[in] next_batch_size: size of the next batch to load up + */ + void advance(int64_t next_batch_size) + { + offset = std::min(offset + current.batch_size(), query->index_size); + if (offset + next_batch_size > batches.batch_size()) { + query->load_batch(offset, next_batch_size, &batches); + } + query->slice_batch(batches, offset, next_batch_size, ¤t); + } + + friend bool operator==(const iterator& lhs, const iterator& rhs) + { + return (lhs.query == rhs.query) && (lhs.offset == rhs.offset); + }; + friend bool operator!=(const iterator& lhs, const iterator& rhs) { return !(lhs == rhs); }; + + protected: + // the current batch of data + value_type current; + + // the currently loaded group of data (containing multiple batches of data that we can iterate + // through) + value_type batches; + + const batch_k_query* query; + int64_t offset, current_batch_size; + }; + + iterator begin() const { return iterator(this); } + iterator end() const { return iterator(this, index_size); } + + protected: + // these two methods need cuda code, and are implemented in the subclass + virtual void load_batch(int64_t offset, + int64_t next_batch_size, + batch* output) const = 0; + virtual void slice_batch(const value_type& input, + int64_t offset, + int64_t batch_size, + value_type* output) const = 0; + + const raft::resources& res; + int64_t index_size, query_size, batch_size; +}; /** @} */ } // namespace raft::neighbors::brute_force diff --git a/cpp/include/raft/neighbors/detail/knn_brute_force.cuh b/cpp/include/raft/neighbors/detail/knn_brute_force.cuh index 5da4e77874..27ef00e385 100644 --- a/cpp/include/raft/neighbors/detail/knn_brute_force.cuh +++ b/cpp/include/raft/neighbors/detail/knn_brute_force.cuh @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -65,11 +66,12 @@ void tiled_brute_force_knn(const raft::resources& handle, ElementType* distances, // size (m, k) IndexType* indices, // size (m, k) raft::distance::DistanceType metric, - float metric_arg = 2.0, - size_t max_row_tile_size = 0, - size_t max_col_tile_size = 0, - DistanceEpilogue distance_epilogue = raft::identity_op(), - const ElementType* precomputed_index_norms = nullptr) + float metric_arg = 2.0, + size_t max_row_tile_size = 0, + size_t max_col_tile_size = 0, + DistanceEpilogue distance_epilogue = raft::identity_op(), + const ElementType* precomputed_index_norms = nullptr, + const ElementType* precomputed_search_norms = nullptr) { // Figure out the number of rows/cols to tile for size_t tile_rows = 0; @@ -98,18 +100,20 @@ void tiled_brute_force_knn(const raft::resources& handle, if (metric == raft::distance::DistanceType::L2Expanded || metric == raft::distance::DistanceType::L2SqrtExpanded || metric == raft::distance::DistanceType::CosineExpanded) { - search_norms.resize(m, stream); + if (!precomputed_search_norms) { search_norms.resize(m, stream); } if (!precomputed_index_norms) { index_norms.resize(n, stream); } // cosine needs the l2norm, where as l2 distances needs the squared norm if (metric == raft::distance::DistanceType::CosineExpanded) { - raft::linalg::rowNorm(search_norms.data(), - search, - d, - m, - raft::linalg::NormType::L2Norm, - true, - stream, - raft::sqrt_op{}); + if (!precomputed_search_norms) { + raft::linalg::rowNorm(search_norms.data(), + search, + d, + m, + raft::linalg::NormType::L2Norm, + true, + stream, + raft::sqrt_op{}); + } if (!precomputed_index_norms) { raft::linalg::rowNorm(index_norms.data(), index, @@ -121,9 +125,10 @@ void tiled_brute_force_knn(const 
raft::resources& handle, raft::sqrt_op{}); } } else { - raft::linalg::rowNorm( - search_norms.data(), search, d, m, raft::linalg::NormType::L2Norm, true, stream); - + if (!precomputed_search_norms) { + raft::linalg::rowNorm( + search_norms.data(), search, d, m, raft::linalg::NormType::L2Norm, true, stream); + } if (!precomputed_index_norms) { raft::linalg::rowNorm( index_norms.data(), index, d, n, raft::linalg::NormType::L2Norm, true, stream); @@ -184,7 +189,7 @@ void tiled_brute_force_knn(const raft::resources& handle, metric_arg); if (metric == raft::distance::DistanceType::L2Expanded || metric == raft::distance::DistanceType::L2SqrtExpanded) { - auto row_norms = search_norms.data(); + auto row_norms = precomputed_search_norms ? precomputed_search_norms : search_norms.data(); auto col_norms = precomputed_index_norms ? precomputed_index_norms : index_norms.data(); auto dist = temp_distances.data(); bool sqrt = metric == raft::distance::DistanceType::L2SqrtExpanded; @@ -201,7 +206,7 @@ void tiled_brute_force_knn(const raft::resources& handle, return distance_epilogue(val, row, col); }); } else if (metric == raft::distance::DistanceType::CosineExpanded) { - auto row_norms = search_norms.data(); + auto row_norms = precomputed_search_norms ? precomputed_search_norms : search_norms.data(); auto col_norms = precomputed_index_norms ? precomputed_index_norms : index_norms.data(); auto dist = temp_distances.data(); @@ -333,7 +338,8 @@ void brute_force_knn_impl( raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded, float metricArg = 0, DistanceEpilogue distance_epilogue = raft::identity_op(), - std::vector* input_norms = nullptr) + std::vector* input_norms = nullptr, + const value_t* search_norms = nullptr) { auto userStream = resource::get_cuda_stream(handle); @@ -376,7 +382,7 @@ void brute_force_knn_impl( } // currently we don't support col_major inside tiled_brute_force_knn, because - // of limitattions of the pairwise_distance API: + // of limitations of the pairwise_distance API: // 1) paiwise_distance takes a single 'isRowMajor' parameter - and we have // multiple options here (like rowMajorQuery/rowMajorIndex) // 2) because of tiling, we need to be able to set a custom stride in the PW @@ -428,7 +434,8 @@ void brute_force_knn_impl( rowMajorQuery, stream, metric, - input_norms ? (*input_norms)[i] : nullptr); + input_norms ? (*input_norms)[i] : nullptr, + search_norms); // Perform necessary post-processing if (metric == raft::distance::DistanceType::L2SqrtExpanded || @@ -478,7 +485,8 @@ void brute_force_knn_impl( 0, 0, distance_epilogue, - input_norms ? (*input_norms)[i] : nullptr); + input_norms ? 
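/* reuse norms precomputed by the caller: the index's stored norms and, for the batch API, query norms cached across calls */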
(*input_norms)[i] : nullptr, + search_norms); break; } } @@ -500,4 +508,43 @@ void brute_force_knn_impl( if (translations == nullptr) delete id_ranges; }; +template +void brute_force_search( + raft::resources const& res, + const raft::neighbors::brute_force::index& idx, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances, + std::optional> query_norms = std::nullopt) +{ + RAFT_EXPECTS(neighbors.extent(1) == distances.extent(1), "Value of k must match for outputs"); + RAFT_EXPECTS(idx.dataset().extent(1) == queries.extent(1), + "Number of columns in queries must match brute force index"); + + auto k = neighbors.extent(1); + auto d = idx.dataset().extent(1); + + std::vector dataset = {const_cast(idx.dataset().data_handle())}; + std::vector sizes = {idx.dataset().extent(0)}; + std::vector norms; + if (idx.has_norms()) { norms.push_back(const_cast(idx.norms().data_handle())); } + + brute_force_knn_impl(res, + dataset, + sizes, + d, + const_cast(queries.data_handle()), + queries.extent(0), + neighbors.data_handle(), + distances.data_handle(), + k, + true, + true, + nullptr, + idx.metric(), + idx.metric_arg(), + raft::identity_op(), + norms.size() ? &norms : nullptr, + query_norms ? query_norms->data_handle() : nullptr); +} } // namespace raft::neighbors::detail diff --git a/cpp/include/raft/neighbors/detail/knn_brute_force_batch_k_query.cuh b/cpp/include/raft/neighbors/detail/knn_brute_force_batch_k_query.cuh new file mode 100644 index 0000000000..384eacae79 --- /dev/null +++ b/cpp/include/raft/neighbors/detail/knn_brute_force_batch_k_query.cuh @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include + +namespace raft::neighbors::brute_force::detail { +template +class gpu_batch_k_query : public batch_k_query { + public: + gpu_batch_k_query(const raft::resources& res, + const raft::neighbors::brute_force::index& index, + raft::device_matrix_view query, + int64_t batch_size) + : batch_k_query(res, index.size(), query.extent(0), batch_size), + index(index), + query(query) + { + auto metric = index.metric(); + + // precompute query norms, and re-use across batches + if (metric == raft::distance::DistanceType::L2Expanded || + metric == raft::distance::DistanceType::L2SqrtExpanded || + metric == raft::distance::DistanceType::CosineExpanded) { + query_norms = make_device_vector(res, query.extent(0)); + + if (metric == raft::distance::DistanceType::CosineExpanded) { + raft::linalg::norm(res, + query, + query_norms->view(), + raft::linalg::NormType::L2Norm, + raft::linalg::Apply::ALONG_ROWS, + raft::sqrt_op{}); + } else { + raft::linalg::norm(res, + query, + query_norms->view(), + raft::linalg::NormType::L2Norm, + raft::linalg::Apply::ALONG_ROWS); + } + } + } + + protected: + void load_batch(int64_t offset, int64_t next_batch_size, batch* output) const override + { + if (offset >= index.size()) { return; } + + // we're aiming to load multiple batches here - since we don't know the max iteration + // grow the size we're loading exponentially + int64_t batch_size = std::min(std::max(offset * 2, next_batch_size * 2), this->index_size); + output->resize(this->res, this->query_size, batch_size); + + std::optional> query_norms_view; + if (query_norms) { query_norms_view = query_norms->view(); } + + raft::neighbors::detail::brute_force_search( + this->res, index, query, output->indices(), output->distances(), query_norms_view); + }; + + void slice_batch(const batch& input, + int64_t offset, + int64_t batch_size, + batch* output) const override + { + auto num_queries = input.indices().extent(0); + batch_size = std::min(batch_size, index.size() - offset); + + output->resize(this->res, num_queries, batch_size); + + if (!num_queries || !batch_size) { return; } + + matrix::slice_coordinates coords{0, offset, num_queries, offset + batch_size}; + matrix::slice(this->res, input.indices(), output->indices(), coords); + matrix::slice(this->res, input.distances(), output->distances(), coords); + } + + const raft::neighbors::brute_force::index& index; + raft::device_matrix_view query; + std::optional> query_norms; +}; +} // namespace raft::neighbors::brute_force::detail diff --git a/cpp/include/raft/neighbors/neighbors_types.hpp b/cpp/include/raft/neighbors/neighbors_types.hpp new file mode 100644 index 0000000000..d503779741 --- /dev/null +++ b/cpp/include/raft/neighbors/neighbors_types.hpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace raft::neighbors { + +/** A single batch of nearest neighbors in device memory */ +template +class batch { + public: + /** Create a new empty batch of data */ + batch(raft::resources const& res, int64_t rows, int64_t cols) + : indices_(make_device_matrix(res, rows, cols)), + distances_(make_device_matrix(res, rows, cols)) + { + } + + void resize(raft::resources const& res, int64_t rows, int64_t cols) + { + indices_ = make_device_matrix(res, rows, cols); + distances_ = make_device_matrix(res, rows, cols); + } + + /** Returns the indices for the batch */ + device_matrix_view indices() const + { + return raft::make_const_mdspan(indices_.view()); + } + device_matrix_view indices() { return indices_.view(); } + + /** Returns the distances for the batch */ + device_matrix_view distances() const + { + return raft::make_const_mdspan(distances_.view()); + } + device_matrix_view distances() { return distances_.view(); } + + /** Returns the size of the batch */ + int64_t batch_size() const { return indices().extent(1); } + + protected: + raft::device_matrix indices_; + raft::device_matrix distances_; +}; +} // namespace raft::neighbors diff --git a/cpp/src/neighbors/brute_force_knn_index_float.cu b/cpp/src/neighbors/brute_force_knn_index_float.cu index f2fda93a97..d4f902c087 100644 --- a/cpp/src/neighbors/brute_force_knn_index_float.cu +++ b/cpp/src/neighbors/brute_force_knn_index_float.cu @@ -36,4 +36,4 @@ template raft::neighbors::brute_force::index raft::neighbors::brute_force raft::resources const& res, raft::device_matrix_view dataset, raft::distance::DistanceType metric, - float metric_arg); \ No newline at end of file + float metric_arg); diff --git a/cpp/test/neighbors/tiled_knn.cu b/cpp/test/neighbors/tiled_knn.cu index ebde8e6d35..a84c9749d7 100644 --- a/cpp/test/neighbors/tiled_knn.cu +++ b/cpp/test/neighbors/tiled_knn.cu @@ -38,6 +38,7 @@ #include namespace raft::neighbors::brute_force { + struct TiledKNNInputs { int num_queries; int num_db_vecs; @@ -190,11 +191,13 @@ class TiledKNNTest : public ::testing::TestWithParam { metric, metric_arg); + auto query_view = raft::make_device_matrix_view( + search_queries.data(), params_.num_queries, params_.dim); + raft::neighbors::brute_force::search( handle_, idx, - raft::make_device_matrix_view( - search_queries.data(), params_.num_queries, params_.dim), + query_view, raft::make_device_matrix_view( raft_indices_.data(), params_.num_queries, params_.k), raft::make_device_matrix_view( @@ -209,6 +212,73 @@ class TiledKNNTest : public ::testing::TestWithParam { float(0.001), stream_, true)); + // also test out the batch api. 
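(We iterate with make_batch_k_query and compare every batch against the matching slice of a full search.)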
First get new reference results (all k, up to a certain
+  // max size)
+  auto all_size      = std::min(params_.num_db_vecs, 1024);
+  auto all_indices   = raft::make_device_matrix(handle_, num_queries, all_size);
+  auto all_distances = raft::make_device_matrix(handle_, num_queries, all_size);
+  raft::neighbors::brute_force::search(
+    handle_, idx, query_view, all_indices.view(), all_distances.view());
+
+  int64_t offset = 0;
+  auto query     = make_batch_k_query(handle_, idx, query_view, k_);
+  for (auto batch : *query) {
+    auto batch_size = batch.batch_size();
+    auto indices    = raft::make_device_matrix(handle_, num_queries, batch_size);
+    auto distances  = raft::make_device_matrix(handle_, num_queries, batch_size);
+
+    matrix::slice_coordinates coords{0, offset, num_queries, offset + batch_size};
+
+    matrix::slice(handle_, raft::make_const_mdspan(all_indices.view()), indices.view(), coords);
+    matrix::slice(
+      handle_, raft::make_const_mdspan(all_distances.view()), distances.view(), coords);
+
+    ASSERT_TRUE(raft::spatial::knn::devArrMatchKnnPair(indices.data_handle(),
+                                                       batch.indices().data_handle(),
+                                                       distances.data_handle(),
+                                                       batch.distances().data_handle(),
+                                                       num_queries,
+                                                       batch_size,
+                                                       float(0.001),
+                                                       stream_,
+                                                       true));
+
+    offset += batch_size;
+    if (offset + batch_size > all_size) break;
+  }
+
+  // also test out with variable batch sizes
+  offset             = 0;
+  int64_t batch_size = k_;
+  query              = make_batch_k_query(handle_, idx, query_view, batch_size);
+  for (auto it = query->begin(); it != query->end(); it.advance(batch_size)) {
+    // batch_size could be less than requested (in the case of the final batch); handle that here
+    ASSERT_TRUE(it->indices().extent(1) <= batch_size);
+    batch_size = it->indices().extent(1);
+
+    auto indices   = raft::make_device_matrix(handle_, num_queries, batch_size);
+    auto distances = raft::make_device_matrix(handle_, num_queries, batch_size);
+
+    matrix::slice_coordinates coords{0, offset, num_queries, offset + batch_size};
+    matrix::slice(handle_, raft::make_const_mdspan(all_indices.view()), indices.view(), coords);
+    matrix::slice(
+      handle_, raft::make_const_mdspan(all_distances.view()), distances.view(), coords);
+
+    ASSERT_TRUE(raft::spatial::knn::devArrMatchKnnPair(indices.data_handle(),
+                                                       it->indices().data_handle(),
+                                                       distances.data_handle(),
+                                                       it->distances().data_handle(),
+                                                       num_queries,
+                                                       batch_size,
+                                                       float(0.001),
+                                                       stream_,
+                                                       true));
+
+    offset += batch_size;
+    if (offset + batch_size > all_size) break;
+
+    batch_size += 2;
+  }
 }
 }

From 6e64e0fdce70c458232e52079d70035ead9462eb Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Fri, 17 Nov 2023 10:37:37 -0600
Subject: [PATCH 4/6] Enable build concurrency for nightly and merge triggers.
(#2011) --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 5a883b64ed..2435c477ca 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -22,7 +22,7 @@ on: default: nightly concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} cancel-in-progress: true jobs: From 52a5e4c9a57d927865dfb0d461580ad7ac24c98d Mon Sep 17 00:00:00 2001 From: Tarang Jain <40517122+tarang-jain@users.noreply.github.com> Date: Fri, 17 Nov 2023 15:11:26 -0800 Subject: [PATCH 5/6] [FEA] Helpers and CodePacker for IVF-PQ (#1826) - [x] Codepacking for compressed on-device (flat) PQ codes - [x] Testing Authors: - Tarang Jain (https://github.com/tarang-jain) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1826 --- .../raft/neighbors/detail/div_utils.hpp | 66 +++ .../raft/neighbors/detail/ivf_pq_build.cuh | 290 ++++++++++--- .../raft/neighbors/ivf_flat_codepacker.hpp | 39 +- .../raft/neighbors/ivf_flat_helpers.cuh | 35 ++ cpp/include/raft/neighbors/ivf_pq_helpers.cuh | 392 +++++++++++++++++- cpp/include/raft/neighbors/ivf_pq_types.hpp | 24 ++ cpp/test/neighbors/ann_ivf_pq.cuh | 104 +++++ 7 files changed, 866 insertions(+), 84 deletions(-) create mode 100644 cpp/include/raft/neighbors/detail/div_utils.hpp diff --git a/cpp/include/raft/neighbors/detail/div_utils.hpp b/cpp/include/raft/neighbors/detail/div_utils.hpp new file mode 100644 index 0000000000..0455d0ec9b --- /dev/null +++ b/cpp/include/raft/neighbors/detail/div_utils.hpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef _RAFT_HAS_CUDA +#include +#else +#include +#endif + +/** + * @brief A simple wrapper for raft::Pow2 which uses Pow2 utils only when available and regular + * integer division otherwise. This is done to allow a common interface for division arithmetic for + * non CUDA headers. + * + * @tparam Value_ a compile-time value representable as a power-of-two. 
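+ *
+ * For example, `div_utils<32>::mod(37)` evaluates to 5 and
+ * `div_utils<32>::roundDown(37)` to 32, whether or not CUDA is available.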
+ */ +namespace raft::neighbors::detail { +template +struct div_utils { + typedef decltype(Value_) Type; + static constexpr Type Value = Value_; + + template + static constexpr _RAFT_HOST_DEVICE inline auto roundDown(T x) + { +#if defined(_RAFT_HAS_CUDA) + return Pow2::roundDown(x); +#else + return raft::round_down_safe(x, Value_); +#endif + } + + template + static constexpr _RAFT_HOST_DEVICE inline auto mod(T x) + { +#if defined(_RAFT_HAS_CUDA) + return Pow2::mod(x); +#else + return x % Value_; +#endif + } + + template + static constexpr _RAFT_HOST_DEVICE inline auto div(T x) + { +#if defined(_RAFT_HAS_CUDA) + return Pow2::div(x); +#else + return x / Value_; +#endif + } +}; +} // namespace raft::neighbors::detail \ No newline at end of file diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index 33ed51ad05..e57133fc23 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -313,6 +313,59 @@ auto calculate_offsets_and_indices(IdxT n_rows, return max_cluster_size; } +template +void set_centers(raft::resources const& handle, index* index, const float* cluster_centers) +{ + auto stream = resource::get_cuda_stream(handle); + auto* device_memory = resource::get_workspace_resource(handle); + + // combine cluster_centers and their norms + RAFT_CUDA_TRY(cudaMemcpy2DAsync(index->centers().data_handle(), + sizeof(float) * index->dim_ext(), + cluster_centers, + sizeof(float) * index->dim(), + sizeof(float) * index->dim(), + index->n_lists(), + cudaMemcpyDefault, + stream)); + + rmm::device_uvector center_norms(index->n_lists(), stream, device_memory); + raft::linalg::rowNorm(center_norms.data(), + cluster_centers, + index->dim(), + index->n_lists(), + raft::linalg::L2Norm, + true, + stream); + RAFT_CUDA_TRY(cudaMemcpy2DAsync(index->centers().data_handle() + index->dim(), + sizeof(float) * index->dim_ext(), + center_norms.data(), + sizeof(float), + sizeof(float), + index->n_lists(), + cudaMemcpyDefault, + stream)); + + // Rotate cluster_centers + float alpha = 1.0; + float beta = 0.0; + linalg::gemm(handle, + true, + false, + index->rot_dim(), + index->n_lists(), + index->dim(), + &alpha, + index->rotation_matrix().data_handle(), + index->dim(), + cluster_centers, + index->dim(), + &beta, + index->centers_rot().data_handle(), + index->rot_dim(), + resource::get_cuda_stream(handle)); +} + template void transpose_pq_centers(const resources& handle, index& index, @@ -613,6 +666,100 @@ void unpack_list_data(raft::resources const& res, resource::get_cuda_stream(res)); } +/** + * A consumer for the `run_on_vector` that just flattens PQ codes + * into a tightly packed matrix. That is, the codes are not expanded to one code-per-byte. + */ +template +struct unpack_contiguous { + uint8_t* codes; + uint32_t code_size; + + /** + * Create a callable to be passed to `run_on_vector`. + * + * @param[in] codes flat compressed PQ codes + */ + __host__ __device__ inline unpack_contiguous(uint8_t* codes, uint32_t pq_dim) + : codes{codes}, code_size{raft::ceildiv(pq_dim * PqBits, 8)} + { + } + + /** Write j-th component (code) of the i-th vector into the output array. 
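+ *
+ * The output is tightly packed: the j-th code of the i-th vector occupies bits
+ * [j * PqBits, (j + 1) * PqBits) of the i-th `code_size`-byte record, so with
+ * PqBits = 4, for instance, two codes share each output byte.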
*/ + __host__ __device__ inline void operator()(uint8_t code, uint32_t i, uint32_t j) + { + bitfield_view_t code_view{codes + i * code_size}; + code_view[j] = code; + } +}; + +template +__launch_bounds__(BlockSize) RAFT_KERNEL unpack_contiguous_list_data_kernel( + uint8_t* out_codes, + device_mdspan::list_extents, row_major> in_list_data, + uint32_t n_rows, + uint32_t pq_dim, + std::variant offset_or_indices) +{ + run_on_list( + in_list_data, offset_or_indices, n_rows, pq_dim, unpack_contiguous(out_codes, pq_dim)); +} + +/** + * Unpack flat PQ codes from an existing list by the given offset. + * + * @param[out] codes flat compressed PQ codes [n_rows, ceildiv(pq_dim * pq_bits, 8)] + * @param[in] list_data the packed ivf::list data. + * @param[in] offset_or_indices how many records in the list to skip or the exact indices. + * @param[in] pq_bits codebook size (1 << pq_bits) + * @param[in] stream + */ +inline void unpack_contiguous_list_data( + uint8_t* codes, + device_mdspan::list_extents, row_major> list_data, + uint32_t n_rows, + uint32_t pq_dim, + std::variant offset_or_indices, + uint32_t pq_bits, + rmm::cuda_stream_view stream) +{ + if (n_rows == 0) { return; } + + constexpr uint32_t kBlockSize = 256; + dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); + dim3 threads(kBlockSize, 1, 1); + auto kernel = [pq_bits]() { + switch (pq_bits) { + case 4: return unpack_contiguous_list_data_kernel; + case 5: return unpack_contiguous_list_data_kernel; + case 6: return unpack_contiguous_list_data_kernel; + case 7: return unpack_contiguous_list_data_kernel; + case 8: return unpack_contiguous_list_data_kernel; + default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits); + } + }(); + kernel<<>>(codes, list_data, n_rows, pq_dim, offset_or_indices); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** Unpack the list data; see the public interface for the api and usage. */ +template +void unpack_contiguous_list_data(raft::resources const& res, + const index& index, + uint8_t* out_codes, + uint32_t n_rows, + uint32_t label, + std::variant offset_or_indices) +{ + unpack_contiguous_list_data(out_codes, + index.lists()[label]->data.view(), + n_rows, + index.pq_dim(), + offset_or_indices, + index.pq_bits(), + resource::get_cuda_stream(res)); +} + /** A consumer for the `run_on_list` and `run_on_vector` that approximates the original input data. */ struct reconstruct_vectors { @@ -850,6 +997,101 @@ void pack_list_data(raft::resources const& res, resource::get_cuda_stream(res)); } +/** + * A producer for the `write_vector` reads tightly packed flat codes. That is, + * the codes are not expanded to one code-per-byte. + */ +template +struct pack_contiguous { + const uint8_t* codes; + uint32_t code_size; + + /** + * Create a callable to be passed to `write_vector`. + * + * @param[in] codes flat compressed PQ codes + */ + __host__ __device__ inline pack_contiguous(const uint8_t* codes, uint32_t pq_dim) + : codes{codes}, code_size{raft::ceildiv(pq_dim * PqBits, 8)} + { + } + + /** Read j-th component (code) of the i-th vector from the source. 
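+ *
+ * Mirror image of `unpack_contiguous` above: the j-th code of the i-th vector
+ * is read from bits [j * PqBits, (j + 1) * PqBits) of the i-th record.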
*/ + __host__ __device__ inline auto operator()(uint32_t i, uint32_t j) -> uint8_t + { + bitfield_view_t code_view{const_cast(codes + i * code_size)}; + return uint8_t(code_view[j]); + } +}; + +template +__launch_bounds__(BlockSize) RAFT_KERNEL pack_contiguous_list_data_kernel( + device_mdspan::list_extents, row_major> list_data, + const uint8_t* codes, + uint32_t n_rows, + uint32_t pq_dim, + std::variant offset_or_indices) +{ + write_list( + list_data, offset_or_indices, n_rows, pq_dim, pack_contiguous(codes, pq_dim)); +} + +/** + * Write flat PQ codes into an existing list by the given offset. + * + * NB: no memory allocation happens here; the list must fit the data (offset + n_rows). + * + * @param[out] list_data the packed ivf::list data. + * @param[in] codes flat compressed PQ codes [n_rows, ceildiv(pq_dim * pq_bits, 8)] + * @param[in] offset_or_indices how many records in the list to skip or the exact indices. + * @param[in] pq_bits codebook size (1 << pq_bits) + * @param[in] stream + */ +inline void pack_contiguous_list_data( + device_mdspan::list_extents, row_major> list_data, + const uint8_t* codes, + uint32_t n_rows, + uint32_t pq_dim, + std::variant offset_or_indices, + uint32_t pq_bits, + rmm::cuda_stream_view stream) +{ + if (n_rows == 0) { return; } + + constexpr uint32_t kBlockSize = 256; + dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); + dim3 threads(kBlockSize, 1, 1); + auto kernel = [pq_bits]() { + switch (pq_bits) { + case 4: return pack_contiguous_list_data_kernel; + case 5: return pack_contiguous_list_data_kernel; + case 6: return pack_contiguous_list_data_kernel; + case 7: return pack_contiguous_list_data_kernel; + case 8: return pack_contiguous_list_data_kernel; + default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits); + } + }(); + kernel<<>>(list_data, codes, n_rows, pq_dim, offset_or_indices); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +void pack_contiguous_list_data(raft::resources const& res, + index* index, + const uint8_t* new_codes, + uint32_t n_rows, + uint32_t label, + std::variant offset_or_indices) +{ + pack_contiguous_list_data(index->lists()[label]->data.view(), + new_codes, + n_rows, + index->pq_dim(), + offset_or_indices, + index->pq_bits(), + resource::get_cuda_stream(res)); +} + /** * * A producer for the `write_list` and `write_vector` that encodes level-1 input vector residuals @@ -1634,35 +1876,6 @@ auto build(raft::resources const& handle, labels_view, utils::mapping()); - { - // combine cluster_centers and their norms - RAFT_CUDA_TRY(cudaMemcpy2DAsync(index.centers().data_handle(), - sizeof(float) * index.dim_ext(), - cluster_centers, - sizeof(float) * index.dim(), - sizeof(float) * index.dim(), - index.n_lists(), - cudaMemcpyDefault, - stream)); - - rmm::device_uvector center_norms(index.n_lists(), stream, device_memory); - raft::linalg::rowNorm(center_norms.data(), - cluster_centers, - index.dim(), - index.n_lists(), - raft::linalg::L2Norm, - true, - stream); - RAFT_CUDA_TRY(cudaMemcpy2DAsync(index.centers().data_handle() + index.dim(), - sizeof(float) * index.dim_ext(), - center_norms.data(), - sizeof(float), - sizeof(float), - index.n_lists(), - cudaMemcpyDefault, - stream)); - } - // Make rotation matrix make_rotation_matrix(handle, params.force_random_rotation, @@ -1670,24 +1883,7 @@ auto build(raft::resources const& handle, index.dim(), index.rotation_matrix().data_handle()); - // Rotate cluster_centers - float alpha = 1.0; - float beta = 0.0; - linalg::gemm(handle, - true, - false, 
- index.rot_dim(), - index.n_lists(), - index.dim(), - &alpha, - index.rotation_matrix().data_handle(), - index.dim(), - cluster_centers, - index.dim(), - &beta, - index.centers_rot().data_handle(), - index.rot_dim(), - stream); + set_centers(handle, &index, cluster_centers); // Train PQ codebooks switch (index.codebook_kind()) { diff --git a/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp b/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp index 4594332fdf..5379788ab4 100644 --- a/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp +++ b/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp @@ -19,36 +19,11 @@ #include #include #include +#include #include -#ifdef _RAFT_HAS_CUDA -#include -#else -#include -#endif - namespace raft::neighbors::ivf_flat::codepacker { -template -_RAFT_HOST_DEVICE inline auto roundDown(T x) -{ -#if defined(_RAFT_HAS_CUDA) - return Pow2::roundDown(x); -#else - return raft::round_down_safe(x, kIndexGroupSize); -#endif -} - -template -_RAFT_HOST_DEVICE inline auto mod(T x) -{ -#if defined(_RAFT_HAS_CUDA) - return Pow2::mod(x); -#else - return x % kIndexGroupSize; -#endif -} - /** * Write one flat code into a block by the given offset. The offset indicates the id of the record * in the list. This function interleaves the code and is intended to later copy the interleaved @@ -68,12 +43,12 @@ _RAFT_HOST_DEVICE void pack_1( const T* flat_code, T* block, uint32_t dim, uint32_t veclen, uint32_t offset) { // The data is written in interleaved groups of `index::kGroupSize` vectors - // using interleaved_group = Pow2; + using interleaved_group = neighbors::detail::div_utils; // Interleave dimensions of the source vector while recording it. // NB: such `veclen` is selected, that `dim % veclen == 0` - auto group_offset = roundDown(offset); - auto ingroup_id = mod(offset) * veclen; + auto group_offset = interleaved_group::roundDown(offset); + auto ingroup_id = interleaved_group::mod(offset) * veclen; for (uint32_t l = 0; l < dim; l += veclen) { for (uint32_t j = 0; j < veclen; j++) { @@ -100,11 +75,11 @@ _RAFT_HOST_DEVICE void unpack_1( const T* block, T* flat_code, uint32_t dim, uint32_t veclen, uint32_t offset) { // The data is written in interleaved groups of `index::kGroupSize` vectors - // using interleaved_group = Pow2; + using interleaved_group = neighbors::detail::div_utils; // NB: such `veclen` is selected, that `dim % veclen == 0` - auto group_offset = roundDown(offset); - auto ingroup_id = mod(offset) * veclen; + auto group_offset = interleaved_group::roundDown(offset); + auto ingroup_id = interleaved_group::mod(offset) * veclen; for (uint32_t l = 0; l < dim; l += veclen) { for (uint32_t j = 0; j < veclen; j++) { diff --git a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh index 096e8051c3..7a05c9991c 100644 --- a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh +++ b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh @@ -22,7 +22,10 @@ #include #include +#include + namespace raft::neighbors::ivf_flat::helpers { +using namespace raft::spatial::knn::detail; // NOLINT /** * @defgroup ivf_flat_helpers Helper functions for manipulationg IVF Flat Index * @{ @@ -106,5 +109,37 @@ void unpack( res, list_data, veclen, offset, codes); } } // namespace codepacker + +/** + * @brief Public helper API to reset the data and indices ptrs, and the list sizes. Useful for + * externally modifying the index without going through the build stage. The data and indices of the + * IVF lists will be lost. 
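+ * Only the list sizes and the data/indices pointers are zeroed; the list
+ * allocations themselves are untouched, so the index can be re-populated
+ * afterwards.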
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   raft::resources res;
+ *   using namespace raft::neighbors;
+ *   // use default index parameters
+ *   ivf_flat::index_params index_params;
+ *   // initialize an empty index
+ *   ivf_flat::index index(res, index_params, D);
+ *   // reset the index's state and list sizes
+ *   ivf_flat::helpers::reset_index(res, &index);
+ * @endcode
+ *
+ * @tparam IdxT
+ *
+ * @param[in] res raft resource
+ * @param[inout] index pointer to IVF-Flat index
+ */
+template
+void reset_index(const raft::resources& res, index* index)
+{
+  auto stream = resource::get_cuda_stream(res);
+
+  utils::memzero(index->list_sizes().data_handle(), index->list_sizes().size(), stream);
+  utils::memzero(index->data_ptrs().data_handle(), index->data_ptrs().size(), stream);
+  utils::memzero(index->inds_ptrs().data_handle(), index->inds_ptrs().size(), stream);
+}
 /** @} */
 }  // namespace raft::neighbors::ivf_flat::helpers
diff --git a/cpp/include/raft/neighbors/ivf_pq_helpers.cuh b/cpp/include/raft/neighbors/ivf_pq_helpers.cuh
index f00107f629..fec31f1c61 100644
--- a/cpp/include/raft/neighbors/ivf_pq_helpers.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq_helpers.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include
 #include
 #include
 #include
@@ -23,7 +24,10 @@
 #include
 #include
 
+#include
+
 namespace raft::neighbors::ivf_pq::helpers {
+using namespace raft::spatial::knn::detail;  // NOLINT
 /**
  * @defgroup ivf_pq_helpers Helper functions for manipulationg IVF PQ Index
  * @{
@@ -71,6 +75,53 @@ inline void unpack(
     codes, list_data, offset, pq_bits, resource::get_cuda_stream(res));
 }
 
+/**
+ * @brief Unpack `n_rows` consecutive records of a single list (cluster) in the compressed index
+ * starting at given `offset`. The output codes of a single vector are contiguous, not expanded to
+ * one code per byte, which means the output has ceildiv(pq_dim * pq_bits, 8) bytes per PQ encoded
+ * vector.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   raft::resources res;
+ *   auto list_data = index.lists()[label]->data.view();
+ *   // allocate the buffer for the output
+ *   uint32_t n_rows = 4;
+ *   auto codes = raft::make_device_matrix(
+ *     res, n_rows, raft::ceildiv(index.pq_dim() * index.pq_bits(), 8));
+ *   uint32_t offset = 0;
+ *   // unpack n_rows elements from the list
+ *   ivf_pq::helpers::codepacker::unpack_contiguous(
+ *     res, list_data, index.pq_bits(), offset, n_rows, index.pq_dim(), codes.data_handle());
+ * @endcode
+ *
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res raft resource
+ * @param[in] list_data block to read from
+ * @param[in] pq_bits bit length of encoded vector elements
+ * @param[in] offset
+ *   How many records in the list to skip.
+ * @param[in] n_rows How many records to unpack
+ * @param[in] pq_dim The dimensionality of the PQ compressed records
+ * @param[out] codes
+ *   the destination buffer [n_rows, ceildiv(pq_dim * pq_bits, 8)].
+ *   The length `n_rows` defines how many records to unpack;
+ *   `offset + n_rows` must not exceed the list size.
+ */
+inline void unpack_contiguous(
+  raft::resources const& res,
+  device_mdspan::list_extents, row_major> list_data,
+  uint32_t pq_bits,
+  uint32_t offset,
+  uint32_t n_rows,
+  uint32_t pq_dim,
+  uint8_t* codes)
+{
+  ivf_pq::detail::unpack_contiguous_list_data(
+    codes, list_data, n_rows, pq_dim, offset, pq_bits, resource::get_cuda_stream(res));
+}
+
 /**
  * Write flat PQ codes into an existing list by the given offset.
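+ * In contrast to `pack_contiguous` below, the input codes here are expanded
+ * to one code per byte, i.e. one byte per PQ dimension.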
* @@ -87,7 +138,7 @@ inline void unpack( * res, make_const_mdspan(codes.view()), index.pq_bits(), 42, list_data); * @endcode * - * @param[in] res + * @param[in] res raft resource * @param[in] codes flat PQ codes, one code per byte [n_vec, pq_dim] * @param[in] pq_bits bit length of encoded vector elements * @param[in] offset how many records to skip before writing the data into the list @@ -102,6 +153,47 @@ inline void pack( { ivf_pq::detail::pack_list_data(list_data, codes, offset, pq_bits, resource::get_cuda_stream(res)); } + +/** + * Write flat PQ codes into an existing list by the given offset. The input codes of a single vector + * are contiguous (not expanded to one code per byte). + * + * NB: no memory allocation happens here; the list must fit the data (offset + n_rows records). + * + * Usage example: + * @code{.cpp} + * raft::resources res; + * auto list_data = index.lists()[label]->data.view(); + * // allocate the buffer for the input codes + * auto codes = raft::make_device_matrix( + * res, n_rows, raft::ceildiv(index.pq_dim() * index.pq_bits(), 8)); + * ... prepare compressed vectors to pack into the list in codes ... + * // write codes into the list starting from the 42nd position. If the current size of the list + * // is greater than 42, this will overwrite the codes starting at this offset. + * ivf_pq::helpers::codepacker::pack_contiguous( + * res, codes.data_handle(), n_rows, index.pq_dim(), index.pq_bits(), 42, list_data); + * @endcode + * + * @param[in] res raft resource + * @param[in] codes flat PQ codes, [n_vec, ceildiv(pq_dim * pq_bits, 8)] + * @param[in] n_rows number of records + * @param[in] pq_dim + * @param[in] pq_bits bit length of encoded vector elements + * @param[in] offset how many records to skip before writing the data into the list + * @param[in] list_data block to write into + */ +inline void pack_contiguous( + raft::resources const& res, + const uint8_t* codes, + uint32_t n_rows, + uint32_t pq_dim, + uint32_t pq_bits, + uint32_t offset, + device_mdspan::list_extents, row_major> list_data) +{ + ivf_pq::detail::pack_contiguous_list_data( + list_data, codes, n_rows, pq_dim, offset, pq_bits, resource::get_cuda_stream(res)); +} } // namespace codepacker /** @@ -122,7 +214,7 @@ inline void pack( * ivf_pq::helpers::pack_list_data(res, &index, codes_to_pack, label, 42); * @endcode * - * @param[in] res + * @param[in] res raft resource * @param[inout] index IVF-PQ index. * @param[in] codes flat PQ codes, one code per byte [n_rows, pq_dim] * @param[in] label The id of the list (cluster) into which we write. @@ -138,6 +230,56 @@ void pack_list_data(raft::resources const& res, ivf_pq::detail::pack_list_data(res, index, codes, label, offset); } +/** + * Write flat PQ codes into an existing list by the given offset. Use this when the input + * vectors are PQ encoded and not expanded to one code per byte. + * + * The list is identified by its label. + * + * NB: no memory allocation happens here; the list into which the vectors are packed must fit offset + * + n_rows rows. + * + * Usage example: + * @code{.cpp} + * using namespace raft::neighbors; + * raft::resources res; + * // use default index parameters + * ivf_pq::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = ivf_pq::build(res, index_params, dataset, N, D); + * // allocate the buffer for n_rows input codes. Each vector occupies + * // raft::ceildiv(index.pq_dim() * index.pq_bits(), 8) bytes because + * // codes are compressed and without gaps. 
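+ *   // (for example, pq_dim() = 64 with pq_bits() = 8 gives 64 bytes per vector)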
+ * auto codes = raft::make_device_matrix( + * res, n_rows, raft::ceildiv(index.pq_dim() * index.pq_bits(), 8)); + * ... prepare the compressed vectors to pack into the list in codes ... + * // the first n_rows codes in the fourth IVF list are to be overwritten. + * uint32_t label = 3; + * // write codes into the list starting from the 0th position + * ivf_pq::helpers::pack_contiguous_list_data( + * res, &index, codes.data_handle(), n_rows, label, 0); + * @endcode + * + * @tparam IdxT + * + * @param[in] res raft resource + * @param[inout] index pointer to IVF-PQ index + * @param[in] codes flat contiguous PQ codes [n_rows, ceildiv(pq_dim * pq_bits, 8)] + * @param[in] n_rows how many records to pack + * @param[in] label The id of the list (cluster) into which we write. + * @param[in] offset how many records to skip before writing the data into the list + */ +template +void pack_contiguous_list_data(raft::resources const& res, + index* index, + uint8_t* codes, + uint32_t n_rows, + uint32_t label, + uint32_t offset) +{ + ivf_pq::detail::pack_contiguous_list_data(res, index, codes, n_rows, label, offset); +} + /** * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index * starting at given `offset`, one code per byte (independently of pq_bits). @@ -200,8 +342,8 @@ void unpack_list_data(raft::resources const& res, * * @tparam IdxT type of the indices in the source dataset * - * @param[in] res - * @param[in] index + * @param[in] res raft resource + * @param[in] index IVF-PQ index (passed by reference) * @param[in] in_cluster_indices * The offsets of the selected indices within the cluster. * @param[out] out_codes @@ -221,6 +363,53 @@ void unpack_list_data(raft::resources const& res, return ivf_pq::detail::unpack_list_data(res, index, out_codes, label, in_cluster_indices); } +/** + * @brief Unpack `n_rows` consecutive PQ encoded vectors of a single list (cluster) in the + * compressed index starting at given `offset`, not expanded to one code per byte. Each code in the + * output buffer occupies ceildiv(index.pq_dim() * index.pq_bits(), 8) bytes. + * + * Usage example: + * @code{.cpp} + * raft::resources res; + * // We will unpack the whole fourth cluster + * uint32_t label = 3; + * // Get the list size + * uint32_t list_size = 0; + * raft::update_host(&list_size, index.list_sizes().data_handle() + label, 1, + * raft::resource::get_cuda_stream(res)); raft::resource::sync_stream(res); + * // allocate the buffer for the output + * auto codes = raft::make_device_matrix(res, list_size, raft::ceildiv(index.pq_dim() * + * index.pq_bits(), 8)); + * // unpack the whole list + * ivf_pq::helpers::unpack_list_data(res, index, codes.data_handle(), list_size, label, 0); + * @endcode + * + * @tparam IdxT type of the indices in the source dataset + * + * @param[in] res raft resource + * @param[in] index IVF-PQ index (passed by reference) + * @param[out] out_codes + * the destination buffer [n_rows, ceildiv(index.pq_dim() * index.pq_bits(), 8)]. + * The length `n_rows` defines how many records to unpack, + * offset + n_rows must be smaller than or equal to the list size. + * @param[in] n_rows how many codes to unpack + * @param[in] label + * The id of the list (cluster) to decode. + * @param[in] offset + * How many records in the list to skip. 
+ */ +template +void unpack_contiguous_list_data(raft::resources const& res, + const index& index, + uint8_t* out_codes, + uint32_t n_rows, + uint32_t label, + uint32_t offset) +{ + return ivf_pq::detail::unpack_contiguous_list_data( + res, index, out_codes, n_rows, label, offset); +} + /** * @brief Decode `n_take` consecutive records of a single list (cluster) in the compressed index * starting at given `offset`. @@ -232,7 +421,7 @@ void unpack_list_data(raft::resources const& res, * // Get the list size * uint32_t list_size = 0; * raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, - * resource::get_cuda_stream(res)); resource::sync_stream(res); + * resource::get_cuda_stream(res)); resource::sync_stream(res); * // allocate the buffer for the output * auto decoded_vectors = raft::make_device_matrix(res, list_size, index.dim()); * // decode the whole list @@ -397,6 +586,7 @@ void extend_list(raft::resources const& res, * @endcode * * @tparam IdxT + * * @param[in] res * @param[inout] index * @param[in] label the id of the target list (cluster). @@ -407,5 +597,197 @@ void erase_list(raft::resources const& res, index* index, uint32_t label) ivf_pq::detail::erase_list(res, index, label); } +/** + * @brief Public helper API to reset the data and indices ptrs, and the list sizes. Useful for + * externally modifying the index without going through the build stage. The data and indices of the + * IVF lists will be lost. + * + * Usage example: + * @code{.cpp} + * raft::resources res; + * using namespace raft::neighbors; + * // use default index parameters + * ivf_pq::index_params index_params; + * // initialize an empty index + * ivf_pq::index index(res, index_params, D); + * // reset the index's state and list sizes + * ivf_pq::helpers::reset_index(res, &index); + * @endcode + * + * @tparam IdxT + * + * @param[in] res raft resource + * @param[inout] index pointer to IVF-PQ index + */ +template +void reset_index(const raft::resources& res, index* index) +{ + auto stream = resource::get_cuda_stream(res); + + utils::memzero( + index->accum_sorted_sizes().data_handle(), index->accum_sorted_sizes().size(), stream); + utils::memzero(index->list_sizes().data_handle(), index->list_sizes().size(), stream); + utils::memzero(index->data_ptrs().data_handle(), index->data_ptrs().size(), stream); + utils::memzero(index->inds_ptrs().data_handle(), index->inds_ptrs().size(), stream); +} + +/** + * @brief Public helper API exposing the computation of the index's rotation matrix. + * NB: This is to be used only when the rotation matrix is not already computed through + * raft::neighbors::ivf_pq::build. + * + * Usage example: + * @code{.cpp} + * raft::resources res; + * // use default index parameters + * ivf_pq::index_params index_params; + * // force random rotation + * index_params.force_random_rotation = true; + * // initialize an empty index + * raft::neighbors::ivf_pq::index index(res, index_params, D); + * // reset the index + * reset_index(res, &index); + * // compute the rotation matrix with random_rotation + * raft::neighbors::ivf_pq::helpers::make_rotation_matrix( + * res, &index, index_params.force_random_rotation); + * @endcode + * + * @tparam IdxT + * + * @param[in] res raft resource + * @param[inout] index pointer to IVF-PQ index + * @param[in] force_random_rotation whether to apply a random rotation matrix on the input data. See + * raft::neighbors::ivf_pq::index_params for more details. 
+ */
+template
+void make_rotation_matrix(raft::resources const& res,
+                          index* index,
+                          bool force_random_rotation)
+{
+  raft::neighbors::ivf_pq::detail::make_rotation_matrix(res,
+                                                        force_random_rotation,
+                                                        index->rot_dim(),
+                                                        index->dim(),
+                                                        index->rotation_matrix().data_handle());
+}
+
+/**
+ * @brief Public helper API for externally modifying the index's IVF centroids.
+ * NB: The index must be reset before this. Use raft::neighbors::ivf_pq::extend to construct IVF
+ * lists according to new centroids.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   raft::resources res;
+ *   // allocate the buffer for the input centers
+ *   auto cluster_centers = raft::make_device_matrix(res, index.n_lists(), index.dim());
+ *   ... prepare ivf centroids in cluster_centers ...
+ *   // reset the index
+ *   reset_index(res, &index);
+ *   // recompute the state of the index
+ *   raft::neighbors::ivf_pq::helpers::recompute_internal_state(res, &index);
+ *   // Write the IVF centroids
+ *   raft::neighbors::ivf_pq::helpers::set_centers(res, &index, cluster_centers);
+ * @endcode
+ *
+ * @tparam IdxT
+ *
+ * @param[in] res raft resource
+ * @param[inout] index pointer to IVF-PQ index
+ * @param[in] cluster_centers new cluster centers [index.n_lists(), index.dim()]
+ */
+template
+void set_centers(raft::resources const& res,
+                 index* index,
+                 device_matrix_view cluster_centers)
+{
+  RAFT_EXPECTS(cluster_centers.extent(0) == index->n_lists(),
+               "Number of rows in the new centers must be equal to the number of IVF lists");
+  RAFT_EXPECTS(cluster_centers.extent(1) == index->dim(),
+               "Number of columns in the new cluster centers and index dim are different");
+  RAFT_EXPECTS(index->size() == 0, "Index must be empty");
+  ivf_pq::detail::set_centers(res, index, cluster_centers.data_handle());
+}
+
+/**
+ * @brief Helper exposing the re-computation of list sizes and related arrays if IVF lists have
+ * been modified.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   raft::resources res;
+ *   // use default index parameters
+ *   ivf_pq::index_params index_params;
+ *   // initialize an empty index
+ *   ivf_pq::index index(res, index_params, D);
+ *   ivf_pq::helpers::reset_index(res, &index);
+ *   // resize the first IVF list to hold 5 records
+ *   auto spec = list_spec{
+ *     index.pq_bits(), index.pq_dim(), index.conservative_memory_allocation()};
+ *   uint32_t new_size = 5;
+ *   ivf::resize_list(res, list, spec, new_size, 0);
+ *   raft::update_device(index.list_sizes(), &new_size, 1, stream);
+ *   // recompute the internal state of the index
+ *   ivf_pq::helpers::recompute_internal_state(res, &index);
+ * @endcode
+ *
+ * @tparam IdxT
+ *
+ * @param[in] res raft resource
+ * @param[inout] index pointer to IVF-PQ index
+ */
+template
+void recompute_internal_state(const raft::resources& res, index* index)
+{
+  ivf_pq::detail::recompute_internal_state(res, *index);
+}
+
+/**
+ * @brief Public helper API for fetching a trained index's IVF centroids into a buffer that may be
+ * allocated on either host or device.
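+ * Each output row is one center of `index.dim()` floats; the extra norm
+ * component stored in `index.centers()` (which has `dim_ext()` columns) is
+ * stripped off by this copy.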
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   raft::resources res;
+ *   // allocate the buffer for the output centers
+ *   auto cluster_centers = raft::make_device_matrix(
+ *     res, index.n_lists(), index.dim());
+ *   // Extract the IVF centroids into the buffer
+ *   raft::neighbors::ivf_pq::helpers::extract_centers(res, index, cluster_centers.view());
+ * @endcode
+ *
+ * @tparam IdxT
+ *
+ * @param[in] res raft resource
+ * @param[in] index IVF-PQ index (passed by reference)
+ * @param[out] cluster_centers IVF cluster centers [index.n_lists(), index.dim()]
+ */
+template
+void extract_centers(raft::resources const& res,
+                     const index& index,
+                     raft::device_matrix_view cluster_centers)
+{
+  RAFT_EXPECTS(cluster_centers.extent(0) == index.n_lists(),
+               "Number of rows in the output buffer for cluster centers must be equal to the "
+               "number of IVF lists");
+  RAFT_EXPECTS(
+    cluster_centers.extent(1) == index.dim(),
+    "Number of columns in the output buffer for cluster centers and index dim are different");
+  auto stream = resource::get_cuda_stream(res);
+  RAFT_CUDA_TRY(cudaMemcpy2DAsync(cluster_centers.data_handle(),
+                                  sizeof(float) * index.dim(),
+                                  index.centers().data_handle(),
+                                  sizeof(float) * index.dim_ext(),
+                                  sizeof(float) * index.dim(),
+                                  index.n_lists(),
+                                  cudaMemcpyDefault,
+                                  stream));
+}
 /** @} */
 }  // namespace raft::neighbors::ivf_pq::helpers
diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp
index 24df77b35a..45ab18c84f 100644
--- a/cpp/include/raft/neighbors/ivf_pq_types.hpp
+++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp
@@ -487,6 +487,30 @@ struct index : ann::index {
     return centers_rot_.view();
   }
 
+  /** Fetch the size of a particular IVF list in bytes using the list extents.
+   *
+   * Usage example:
+   * @code{.cpp}
+   *   raft::resources res;
+   *   // use default index params
+   *   ivf_pq::index_params index_params;
+   *   // extend the IVF lists while building the index
+   *   index_params.add_data_on_build = true;
+   *   // create and fill the index from a [N, D] dataset
+   *   auto index = raft::neighbors::ivf_pq::build(res, index_params, dataset, N, D);
+   *   // Fetch the size of the fourth list
+   *   uint32_t size = index.get_list_size_in_bytes(3);
+   * @endcode
+   *
+   * @param[in] label list ID
+   */
+  inline auto get_list_size_in_bytes(uint32_t label) -> uint32_t
+  {
+    RAFT_EXPECTS(label < this->n_lists(),
+                 "Expected label to be less than number of lists in the index");
+    auto& list_data = this->lists()[label]->data;
+    return list_data.size();
+  }
+
  private:
   raft::distance::DistanceType metric_;
   codebook_gen codebook_kind_;
diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index bdb83ecfdc..eb30b60eca 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -380,6 +380,83 @@ class ivf_pq_test : public ::testing::TestWithParam {
     list_data_size,
     Compare{}));
   }
+  void check_packing_contiguous(index* index, uint32_t label)
+  {
+    auto old_list = index->lists()[label];
+    auto n_rows   = old_list->size.load();
+
+    if (n_rows == 0) { return; }
+
+    auto codes   = make_device_matrix(handle_, n_rows, index->pq_dim());
+    auto indices = make_device_vector(handle_, n_rows);
+    copy(indices.data_handle(), old_list->indices.data_handle(), n_rows, stream_);
+
+    uint32_t code_size = ceildiv(index->pq_dim() * index->pq_bits(), 8);
+
+    auto codes_compressed = make_device_matrix(handle_, n_rows, code_size);
+
+    ivf_pq::helpers::unpack_contiguous_list_data(
+      handle_, *index, codes_compressed.data_handle(),
n_rows, label, 0);
+    ivf_pq::helpers::erase_list(handle_, index, label);
+    ivf_pq::detail::extend_list_prepare(handle_, index, make_const_mdspan(indices.view()), label);
+    ivf_pq::helpers::pack_contiguous_list_data(
+      handle_, index, codes_compressed.data_handle(), n_rows, label, 0);
+    ivf_pq::helpers::recompute_internal_state(handle_, index);
+
+    auto& new_list = index->lists()[label];
+    ASSERT_NE(old_list.get(), new_list.get())
+      << "The old list should have been shared and retained after ivf_pq index has erased the "
+         "corresponding cluster.";
+    auto list_data_size = (n_rows / ivf_pq::kIndexGroupSize) * new_list->data.extent(1) *
+                          new_list->data.extent(2) * new_list->data.extent(3);
+
+    ASSERT_TRUE(old_list->data.size() >= list_data_size);
+    ASSERT_TRUE(new_list->data.size() >= list_data_size);
+    ASSERT_TRUE(devArrMatch(old_list->data.data_handle(),
+                            new_list->data.data_handle(),
+                            list_data_size,
+                            Compare{}));
+
+    // Pack a few vectors back to the list.
+    uint32_t row_offset = 9;
+    uint32_t n_vec      = 3;
+    ASSERT_TRUE(row_offset + n_vec < n_rows);
+    size_t offset      = row_offset * code_size;
+    auto codes_to_pack = make_device_matrix_view(
+      codes_compressed.data_handle() + offset, n_vec, index->pq_dim());
+    ivf_pq::helpers::pack_contiguous_list_data(
+      handle_, index, codes_to_pack.data_handle(), n_vec, label, row_offset);
+    ASSERT_TRUE(devArrMatch(old_list->data.data_handle(),
+                            new_list->data.data_handle(),
+                            list_data_size,
+                            Compare{}));
+
+    // Another test with the API that takes list_data directly
+    auto list_data  = index->lists()[label]->data.view();
+    uint32_t n_take = 4;
+    ASSERT_TRUE(row_offset + n_take < n_rows);
+    auto codes2 = raft::make_device_matrix(handle_, n_take, code_size);
+    ivf_pq::helpers::codepacker::unpack_contiguous(handle_,
+                                                   list_data,
+                                                   index->pq_bits(),
+                                                   row_offset,
+                                                   n_take,
+                                                   index->pq_dim(),
+                                                   codes2.data_handle());
+
+    // Write it back
+    ivf_pq::helpers::codepacker::pack_contiguous(handle_,
+                                                 codes2.data_handle(),
+                                                 n_vec,
+                                                 index->pq_dim(),
+                                                 index->pq_bits(),
+                                                 row_offset,
+                                                 list_data);
+    ASSERT_TRUE(devArrMatch(old_list->data.data_handle(),
+                            new_list->data.data_handle(),
+                            list_data_size,
+                            Compare{}));
+  }
 
 template
 void run(BuildIndex build_index)
 {
@@ -398,6 +475,7 @@ class ivf_pq_test : public ::testing::TestWithParam {
       case 1: {
         // Dump and re-write codes for one label
         check_packing(&index, label);
+        check_packing_contiguous(&index, label);
       } break;
       default: {
         // check a small subset of data in a randomly chosen cluster to see if the data
@@ -962,6 +1040,32 @@ inline auto special_cases() -> test_cases_t
     x.search_params.n_probes = 100;
   });
 
+  ADD_CASE({
+    x.num_db_vecs                = 4335;
+    x.dim                        = 4;
+    x.num_queries                = 100000;
+    x.k                          = 12;
+    x.index_params.metric        = distance::DistanceType::L2Expanded;
+    x.index_params.codebook_kind = ivf_pq::codebook_gen::PER_SUBSPACE;
+    x.index_params.pq_dim        = 2;
+    x.index_params.pq_bits       = 8;
+    x.index_params.n_lists       = 69;
+    x.search_params.n_probes     = 69;
+  });
+
+  ADD_CASE({
+    x.num_db_vecs                = 4335;
+    x.dim                        = 4;
+    x.num_queries                = 100000;
+    x.k                          = 12;
+    x.index_params.metric        = distance::DistanceType::L2Expanded;
+    x.index_params.codebook_kind = ivf_pq::codebook_gen::PER_CLUSTER;
+    x.index_params.pq_dim        = 2;
+    x.index_params.pq_bits       = 8;
+    x.index_params.n_lists       = 69;
+    x.search_params.n_probes     = 69;
+  });
+
   return xs;
 }
 
From 3a7f33f57e35c3076c0f1527c3310ffcd66a866c Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Fri, 17 Nov 2023 20:57:13 -0600
Subject: [PATCH 6/6] pylibraft only depends on numpy at runtime, not build time.
(#2013) pylibraft specifies its numpy pinning incorrectly in conda packages. It is only a runtime dependency, not a build dependency. Because of run-exports from numpy, the resulting pinning on pylibraft requires a very new numpy version, and is thus causing cuml to fall back to old raft builds. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Dante Gama Dessavre (https://github.com/dantegd) - Mark Harris (https://github.com/harrism) - Vyas Ramasubramani (https://github.com/vyasr) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/raft/pull/2013 --- conda/recipes/pylibraft/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml index 454cac0d77..b8a088d0f3 100644 --- a/conda/recipes/pylibraft/meta.yaml +++ b/conda/recipes/pylibraft/meta.yaml @@ -48,7 +48,6 @@ requirements: - cython >=3.0.0 - libraft {{ version }} - libraft-headers {{ version }} - - numpy >=1.21 - python x.x - rmm ={{ minor_version }} - scikit-build >=0.13.1 @@ -60,6 +59,7 @@ requirements: {% endif %} - libraft {{ version }} - libraft-headers {{ version }} + - numpy >=1.21 - python x.x - rmm ={{ minor_version }}