From cf2885c9d0b8a5d839378939a29154a4d165fefe Mon Sep 17 00:00:00 2001
From: Ray Douglass
Date: Wed, 11 Dec 2024 13:11:32 -0500
Subject: [PATCH 1/2] Update Changelog [skip ci]

---
 CHANGELOG.md | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7ce4a14c3..ed9429d55 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,67 @@
+# cuvs 24.12.00 (11 Dec 2024)
+
+## 🚨 Breaking Changes
+
+- HNSW CPU Hierarchy ([#465](https://github.com/rapidsai/cuvs/pull/465)) [@divyegala](https://github.com/divyegala)
+- Use dashes in cuvs-bench package name. ([#417](https://github.com/rapidsai/cuvs/pull/417)) [@bdice](https://github.com/bdice)
+
+## 🐛 Bug Fixes
+
+- Skip IVF-PQ packing test for lists with not enough data ([#512](https://github.com/rapidsai/cuvs/pull/512)) [@achirkin](https://github.com/achirkin)
+- [BUG] Fix CAGRA filter ([#489](https://github.com/rapidsai/cuvs/pull/489)) [@enp1s0](https://github.com/enp1s0)
+- Add `kIsSingleSource` to `PairwiseDistanceEpilogueElementwise` ([#485](https://github.com/rapidsai/cuvs/pull/485)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Fix include errors, header, and unsafe locks in iface.hpp ([#467](https://github.com/rapidsai/cuvs/pull/467)) [@achirkin](https://github.com/achirkin)
+- Fix an OOB error in device-side cuvs::neighbors::refine and CAGRA kern_prune ([#460](https://github.com/rapidsai/cuvs/pull/460)) [@achirkin](https://github.com/achirkin)
+- Put a ceiling on cuda-python ([#445](https://github.com/rapidsai/cuvs/pull/445)) [@bdice](https://github.com/bdice)
+- Enable NVTX in cuvs-cagra-search component ([#439](https://github.com/rapidsai/cuvs/pull/439)) [@achirkin](https://github.com/achirkin)
+- BUG: CAGRA multi-cta illegal access with bad queries ([#438](https://github.com/rapidsai/cuvs/pull/438)) [@achirkin](https://github.com/achirkin)
+- Fix index overflow in edge cases of CAGRA graph optimize ([#435](https://github.com/rapidsai/cuvs/pull/435)) [@achirkin](https://github.com/achirkin)
+- Fix correct call to brute force in generate groundtruth of cuvs-bench ([#427](https://github.com/rapidsai/cuvs/pull/427)) [@dantegd](https://github.com/dantegd)
+- Use Python for sccache hit rate computation. ([#420](https://github.com/rapidsai/cuvs/pull/420)) [@bdice](https://github.com/bdice)
+- Add `click` package to `cuvs-bench` conda recipe ([#408](https://github.com/rapidsai/cuvs/pull/408)) [@divyegala](https://github.com/divyegala)
+- Fix NVTX annotations ([#400](https://github.com/rapidsai/cuvs/pull/400)) [@achirkin](https://github.com/achirkin)
+
+## 📖 Documentation
+
+- [Doc] Fix CAGRA search sample code ([#484](https://github.com/rapidsai/cuvs/pull/484)) [@enp1s0](https://github.com/enp1s0)
+- Fix broken link in README.md references ([#473](https://github.com/rapidsai/cuvs/pull/473)) [@Azurethi](https://github.com/Azurethi)
+- Adding tech stack to docs ([#448](https://github.com/rapidsai/cuvs/pull/448)) [@cjnolet](https://github.com/cjnolet)
+- Fix Question Retrieval notebook ([#352](https://github.com/rapidsai/cuvs/pull/352)) [@lowener](https://github.com/lowener)
+
+## 🚀 New Features
+
+- Add C++ API scalar quantization ([#494](https://github.com/rapidsai/cuvs/pull/494)) [@mfoerste4](https://github.com/mfoerste4)
+- HNSW CPU Hierarchy ([#465](https://github.com/rapidsai/cuvs/pull/465)) [@divyegala](https://github.com/divyegala)
+- Add serialization API to brute-force ([#461](https://github.com/rapidsai/cuvs/pull/461)) [@lowener](https://github.com/lowener)
+- Add Question Retrieval notebook using Milvus ([#451](https://github.com/rapidsai/cuvs/pull/451)) [@lowener](https://github.com/lowener)
+- Migrate feature diff for NN Descent from RAFT to cuVS ([#421](https://github.com/rapidsai/cuvs/pull/421)) [@divyegala](https://github.com/divyegala)
+- Add --no-lap-sync cmd option to ann-bench ([#405](https://github.com/rapidsai/cuvs/pull/405)) [@achirkin](https://github.com/achirkin)
+- Add `InnerProduct` and `CosineExpanded` metric support in NN Descent ([#177](https://github.com/rapidsai/cuvs/pull/177)) [@divyegala](https://github.com/divyegala)
+
+## 🛠️ Improvements
+
+- Update cuvs to match raft's cutlass changes ([#516](https://github.com/rapidsai/cuvs/pull/516)) [@vyasr](https://github.com/vyasr)
+- add a README for wheels ([#504](https://github.com/rapidsai/cuvs/pull/504)) [@jameslamb](https://github.com/jameslamb)
+- Move check_input_array from pylibraft ([#474](https://github.com/rapidsai/cuvs/pull/474)) [@benfred](https://github.com/benfred)
+- use different wheel-size thresholds based on CUDA version ([#469](https://github.com/rapidsai/cuvs/pull/469)) [@jameslamb](https://github.com/jameslamb)
+- Modify cuvs-bench to be able to generate ground truth in CPU systems ([#466](https://github.com/rapidsai/cuvs/pull/466)) [@dantegd](https://github.com/dantegd)
+- enforce wheel size limits, README formatting in CI ([#464](https://github.com/rapidsai/cuvs/pull/464)) [@jameslamb](https://github.com/jameslamb)
+- Moving spectral embedding and kernel gramm APIs to cuVS ([#463](https://github.com/rapidsai/cuvs/pull/463)) [@cjnolet](https://github.com/cjnolet)
+- Migrate sparse knn and distances code from raft ([#457](https://github.com/rapidsai/cuvs/pull/457)) [@benfred](https://github.com/benfred)
+- Don't presume pointers location infers usability. ([#441](https://github.com/rapidsai/cuvs/pull/441)) [@robertmaynard](https://github.com/robertmaynard)
+- call `enable_testing` in root CMakeLists.txt ([#437](https://github.com/rapidsai/cuvs/pull/437)) [@robertmaynard](https://github.com/robertmaynard)
+- CAGRA tech debt: distance descriptor and workspace memory ([#436](https://github.com/rapidsai/cuvs/pull/436)) [@achirkin](https://github.com/achirkin)
+- Add ci run_ scripts needed for build infra ([#434](https://github.com/rapidsai/cuvs/pull/434)) [@robertmaynard](https://github.com/robertmaynard)
+- Use environment variables in cache hit rate computation. ([#422](https://github.com/rapidsai/cuvs/pull/422)) [@bdice](https://github.com/bdice)
+- Use dashes in cuvs-bench package name. ([#417](https://github.com/rapidsai/cuvs/pull/417)) [@bdice](https://github.com/bdice)
+- We need to enable the c_api by default ([#416](https://github.com/rapidsai/cuvs/pull/416)) [@robertmaynard](https://github.com/robertmaynard)
+- print sccache stats in builds ([#413](https://github.com/rapidsai/cuvs/pull/413)) [@jameslamb](https://github.com/jameslamb)
+- make conda installs in CI stricter ([#406](https://github.com/rapidsai/cuvs/pull/406)) [@jameslamb](https://github.com/jameslamb)
+- Ivf c example ([#404](https://github.com/rapidsai/cuvs/pull/404)) [@abner-ma](https://github.com/abner-ma)
+- Prune workflows based on changed files ([#392](https://github.com/rapidsai/cuvs/pull/392)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- [WIP] Add pinned memory resource to C API ([#311](https://github.com/rapidsai/cuvs/pull/311)) [@ajit283](https://github.com/ajit283)
+- Dynamic Batching ([#261](https://github.com/rapidsai/cuvs/pull/261)) [@achirkin](https://github.com/achirkin)
+
 # cuvs 24.10.00 (9 Oct 2024)
 
 ## 🐛 Bug Fixes

From ef16a9e7fa7af418019e8cc7bcdd33828aee9f67 Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Thu, 12 Dec 2024 00:04:37 +0100
Subject: [PATCH 2/2] Fix Grace-specific issues in CAGRA (#527)

Fix Grace-specific test failures:

1. Add stream syncs at the places where host-allocated memory may be
   destructed while still being accessed by the GPU, to avoid relying on
   stream-ordered semantics of memory allocations.
2. Fix a bug in the tests: a CAGRA index produced by `cagra::build` is not
   guaranteed to be owning. The tests assumed otherwise; when a host dataset
   is passed and it is accessible on the device (as is the case on Grace),
   the created index ends up non-owning. The lifetime of the host dataset in
   the tests was shorter than that of the index, which led to invalid host
   accesses from the device.
3. Fix a bug in `dataset_serialize.hpp`: the `deserialize_strided` function
   constructed a non-owning strided dataset whenever the host data was
   accessible by the GPU. The fix adds a move-semantics overload of
   `make_strided_dataset` that always owns the passed data (either by moving
   the mdarray or by copying it).

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/527
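To make the second point concrete, here is a minimal sketch (not part of the
patch; the shape, element types, and fill step are hypothetical) of the
lifetime rule the tests violated. On a coherent platform such as Grace,
`cagra::build` may reference the host buffer directly instead of copying it
to the device, so the host dataset must outlive the index:

```cpp
#include <cuvs/neighbors/cagra.hpp>
#include <raft/core/host_mdarray.hpp>
#include <raft/core/resources.hpp>

#include <cstdint>

void build_from_host(raft::resources const& res)
{
  using namespace cuvs::neighbors;
  cagra::index_params params;
  int64_t n_rows = 1024, dim = 64;  // hypothetical dataset shape

  // The host dataset is declared in the enclosing scope, so it lives at least
  // as long as the index built from it. If it were declared in a narrower
  // scope, a non-owning index could keep a dangling reference to it.
  auto database_host = raft::make_host_matrix<float, int64_t>(n_rows, dim);
  // ... fill database_host ...
  auto host_view = raft::make_host_matrix_view<const float, int64_t>(
    database_host.data_handle(), n_rows, dim);

  auto index = cagra::build(res, params, host_view);
  // `index` may be non-owning here: keep `database_host` alive while using it.
}
```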
---
 cpp/include/cuvs/neighbors/common.hpp          | 78 ++++++++++++++++++-
 cpp/src/neighbors/detail/ann_utils.cuh         | 11 +++
 cpp/src/neighbors/detail/cagra/utils.hpp       | 16 +++-
 .../neighbors/detail/dataset_serialize.hpp     |  2 +-
 cpp/test/neighbors/ann_cagra.cuh               | 23 +++---
 5 files changed, 115 insertions(+), 15 deletions(-)

diff --git a/cpp/include/cuvs/neighbors/common.hpp b/cpp/include/cuvs/neighbors/common.hpp
index 60b8cc122..bd9ea4834 100644
--- a/cpp/include/cuvs/neighbors/common.hpp
+++ b/cpp/include/cuvs/neighbors/common.hpp
@@ -264,6 +264,77 @@ auto make_strided_dataset(const raft::resources& res, const SrcT& src, uint32_t
   return std::make_unique<out_owning_type>(std::move(out_array), out_layout);
 }
 
+/**
+ * @brief Construct a strided matrix from any mdarray.
+ *
+ * This function constructs an owning device matrix and copies the data.
+ * When the data is copied, padding elements are filled with zeroes.
+ *
+ * @tparam DataT
+ * @tparam IdxT
+ * @tparam LayoutPolicy
+ * @tparam ContainerPolicy
+ *
+ * @param[in] res raft resources handle
+ * @param[in] src the source mdarray or mdspan
+ * @param[in] required_stride the leading dimension (in elements)
+ * @return owning current-device-accessible strided matrix
+ */
+template <typename DataT, typename IdxT, typename LayoutPolicy, typename ContainerPolicy>
+auto make_strided_dataset(
+  const raft::resources& res,
+  raft::mdarray<DataT, raft::matrix_extent<IdxT>, LayoutPolicy, ContainerPolicy>&& src,
+  uint32_t required_stride) -> std::unique_ptr<strided_dataset<DataT, IdxT>>
+{
+  using value_type            = DataT;
+  using index_type            = IdxT;
+  using layout_type           = LayoutPolicy;
+  using container_policy_type = ContainerPolicy;
+  static_assert(std::is_same_v<layout_type, raft::layout_right> ||
+                  std::is_same_v<layout_type, raft::layout_right_padded<value_type>> ||
+                  std::is_same_v<layout_type, raft::layout_stride>,
+                "The input must be row-major");
+  RAFT_EXPECTS(src.extent(1) <= required_stride,
+               "The input row length must be not larger than the desired stride.");
+  const uint32_t src_stride = src.stride(0) > 0 ? src.stride(0) : src.extent(1);
+  const bool stride_matches = required_stride == src_stride;
+
+  auto out_layout =
+    raft::make_strided_layout(src.extents(), std::array<index_type, 2>{required_stride, 1});
+
+  using out_mdarray_type          = raft::device_matrix<value_type, index_type>;
+  using out_layout_type           = typename out_mdarray_type::layout_type;
+  using out_container_policy_type = typename out_mdarray_type::container_policy_type;
+  using out_owning_type =
+    owning_dataset<value_type, index_type, out_layout_type, out_container_policy_type>;
+
+  if constexpr (std::is_same_v<layout_type, out_layout_type> &&
+                std::is_same_v<container_policy_type, out_container_policy_type>) {
+    if (stride_matches) {
+      // Everything matches, we can own the mdarray
+      return std::make_unique<out_owning_type>(std::move(src), out_layout);
+    }
+  }
+  // Something is wrong: have to make a copy and produce an owning dataset
+  auto out_array =
+    raft::make_device_matrix<value_type, index_type>(res, src.extent(0), required_stride);
+
+  RAFT_CUDA_TRY(cudaMemsetAsync(out_array.data_handle(),
+                                0,
+                                out_array.size() * sizeof(value_type),
+                                raft::resource::get_cuda_stream(res)));
+  RAFT_CUDA_TRY(cudaMemcpy2DAsync(out_array.data_handle(),
+                                  sizeof(value_type) * required_stride,
+                                  src.data_handle(),
+                                  sizeof(value_type) * src_stride,
+                                  sizeof(value_type) * src.extent(1),
+                                  src.extent(0),
+                                  cudaMemcpyDefault,
+                                  raft::resource::get_cuda_stream(res)));
+
+  return std::make_unique<out_owning_type>(std::move(out_array), out_layout);
+}
+
 /**
  * @brief Construct a strided matrix from any mdarray or mdspan.
 *
@@ -278,14 +349,15 @@ auto make_strided_dataset(const raft::resources& res, const SrcT& src, uint32_t
  * @return maybe owning current-device-accessible strided matrix
  */
 template <typename SrcT>
-auto make_aligned_dataset(const raft::resources& res, const SrcT& src, uint32_t align_bytes = 16)
+auto make_aligned_dataset(const raft::resources& res, SrcT src, uint32_t align_bytes = 16)
   -> std::unique_ptr<strided_dataset<typename SrcT::value_type, typename SrcT::index_type>>
 {
-  using value_type = typename SrcT::value_type;
+  using source_type = std::remove_cv_t<std::remove_reference_t<SrcT>>;
+  using value_type  = typename source_type::value_type;
   constexpr size_t kSize = sizeof(value_type);
   uint32_t required_stride =
     raft::round_up_safe(src.extent(1) * kSize, std::lcm(align_bytes, kSize)) / kSize;
-  return make_strided_dataset(res, src, required_stride);
+  return make_strided_dataset(res, std::forward<SrcT>(src), required_stride);
 }
 /**
  * @brief VPQ compressed dataset.
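As a usage sketch of the overload added above (not part of the patch; the
sizes are arbitrary): when the source is already a device mdarray with a
matching stride, the data is moved and no copy happens; otherwise an owning,
zero-padded copy is created. Either way the returned dataset owns its memory:

```cpp
#include <cuvs/neighbors/common.hpp>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/resources.hpp>

#include <cstdint>
#include <utility>

void owning_dataset_sketch(raft::resources const& res)
{
  // Stride matches the row length: the mdarray is moved into the dataset.
  auto dev   = raft::make_device_matrix<float, int64_t>(res, 1000, 64);
  auto owned = cuvs::neighbors::make_strided_dataset(res, std::move(dev), /*required_stride=*/64);

  // Requested stride is larger: a zero-padded owning copy is allocated.
  auto dev2   = raft::make_device_matrix<float, int64_t>(res, 1000, 64);
  auto padded = cuvs::neighbors::make_strided_dataset(res, std::move(dev2), /*required_stride=*/72);
}
```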
diff --git a/cpp/src/neighbors/detail/ann_utils.cuh b/cpp/src/neighbors/detail/ann_utils.cuh
index 652d41c85..529356351 100644
--- a/cpp/src/neighbors/detail/ann_utils.cuh
+++ b/cpp/src/neighbors/detail/ann_utils.cuh
@@ -403,6 +403,17 @@ struct batch_load_iterator {
 
   /** A single batch of data residing in device memory. */
   struct batch {
+    ~batch() noexcept
+    {
+      /*
+        If there's no copy, there's no allocation owned by the batch.
+        If there's no allocation, there's no guarantee that the device pointer is stream-ordered.
+        If there's no stream order guarantee, we must synchronize with the stream before the batch
+        is destroyed to make sure all GPU operations in that stream finish earlier.
+       */
+      if (!does_copy()) { RAFT_CUDA_TRY_NO_THROW(cudaStreamSynchronize(stream_)); }
+    }
+
     /** Logical width of a single row in a batch, in elements of type `T`. */
     [[nodiscard]] auto row_width() const -> size_type { return row_width_; }
     /** Logical offset of the batch, in rows (`row_width()`) */
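The destructor above encodes a general rule worth noting: an object that
merely borrows a device-visible pointer has no stream-ordered allocation to
piggyback on, so it must synchronize its stream before the borrowed memory
can be safely released. A stand-alone sketch of the same idiom (hypothetical
type, not from this patch):

```cpp
#include <cuda_runtime_api.h>

// Borrows caller-owned memory that GPU work on `stream` may still be reading.
struct borrowed_device_span {
  const float* ptr;
  cudaStream_t stream;

  ~borrowed_device_span() noexcept
  {
    // No owned, stream-ordered allocation: nothing orders the caller's free
    // after in-flight kernels, so block until the stream drains.
    cudaStreamSynchronize(stream);
  }
};
```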
diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp
index 0f8309328..9f95c5b1c 100644
--- a/cpp/src/neighbors/detail/cagra/utils.hpp
+++ b/cpp/src/neighbors/detail/cagra/utils.hpp
@@ -179,7 +179,7 @@ class device_matrix_view_from_host {
  public:
   device_matrix_view_from_host(raft::resources const& res,
                                raft::host_matrix_view<T, IdxT> host_view)
-    : host_view_(host_view)
+    : res_(res), host_view_(host_view)
   {
     cudaPointerAttributes attr;
     RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, host_view.data_handle()));
@@ -199,6 +199,17 @@ class device_matrix_view_from_host {
     }
   }
 
+  ~device_matrix_view_from_host() noexcept
+  {
+    /*
+      If there's no copy, there's no allocation owned by this struct.
+      If there's no allocation, there's no guarantee that the device pointer is stream-ordered.
+      If there's no stream order guarantee, we must synchronize with the stream before the struct
+      is destroyed to make sure all GPU operations in that stream finish earlier.
+     */
+    if (!allocated_memory()) { raft::resource::sync_stream(res_); }
+  }
+
   raft::device_matrix_view<T, IdxT> view()
   {
     return raft::make_device_matrix_view<T, IdxT>(
       device_ptr, host_view_.extent(0), host_view_.extent(1));
   }
 
   T* data_handle() { return device_ptr; }
 
-  bool allocated_memory() const { return device_mem_.has_value(); }
+  [[nodiscard]] bool allocated_memory() const { return device_mem_.has_value(); }
 
  private:
+  const raft::resources& res_;
   std::optional<raft::device_matrix<T, IdxT>> device_mem_;
   raft::host_matrix_view<T, IdxT> host_view_;
   T* device_ptr;
diff --git a/cpp/src/neighbors/detail/dataset_serialize.hpp b/cpp/src/neighbors/detail/dataset_serialize.hpp
index 40d9df930..0ecc2cf5d 100644
--- a/cpp/src/neighbors/detail/dataset_serialize.hpp
+++ b/cpp/src/neighbors/detail/dataset_serialize.hpp
@@ -140,7 +140,7 @@ auto deserialize_strided(raft::resources const& res, std::istream& is)
   auto stride     = raft::deserialize_scalar<uint32_t>(res, is);
   auto host_array = raft::make_host_matrix<DataT, IdxT>(n_rows, dim);
   raft::deserialize_mdspan(res, is, host_array.view());
-  return make_strided_dataset(res, host_array, stride);
+  return make_strided_dataset(res, std::move(host_array), stride);
 }
 
 template <typename DataT, typename IdxT>
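The one-line `std::move` above is what routes `deserialize_strided` to the
new always-owning overload. A sketch of why it matters (hypothetical caller;
`make_strided_dataset` and `strided_dataset` are the entities from
`common.hpp` above): the host buffer is function-local, so a borrowed view of
it would dangle as soon as the function returns:

```cpp
#include <cuvs/neighbors/common.hpp>
#include <raft/core/host_mdarray.hpp>
#include <raft/core/resources.hpp>

#include <cstdint>
#include <memory>
#include <utility>

auto load_dataset(raft::resources const& res, int64_t n_rows, int64_t dim, uint32_t stride)
  -> std::unique_ptr<cuvs::neighbors::strided_dataset<float, int64_t>>
{
  auto host_array = raft::make_host_matrix<float, int64_t>(n_rows, dim);
  // ... fill host_array, e.g. from a file ...

  // `host_array` dies when this function returns. Passing it as an lvalue
  // could yield a non-owning dataset on coherent systems (the Grace bug);
  // moving selects the overload that always takes or copies ownership.
  return cuvs::neighbors::make_strided_dataset(res, std::move(host_array), stride);
}
```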
diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh
index 8d5701439..c1cd3ca09 100644
--- a/cpp/test/neighbors/ann_cagra.cuh
+++ b/cpp/test/neighbors/ann_cagra.cuh
@@ -389,12 +389,13 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {
       (const DataT*)database.data(), ps.n_rows, ps.dim);
 
     {
+      std::optional<raft::host_matrix<DataT, int64_t>> database_host{std::nullopt};
       cagra::index<DataT, IdxT> index(handle_, index_params.metric);
       if (ps.host_dataset) {
-        auto database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
-        raft::copy(database_host.data_handle(), database.data(), database.size(), stream_);
+        database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
+        raft::copy(database_host->data_handle(), database.data(), database.size(), stream_);
         auto database_host_view = raft::make_host_matrix_view<const DataT, int64_t>(
-          (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim);
+          (const DataT*)database_host->data_handle(), ps.n_rows, ps.dim);
         index = cagra::build(handle_, index_params, database_host_view);
       } else {
@@ -567,13 +568,16 @@ class AnnCagraAddNodesTest : public ::testing::TestWithParam<AnnCagraInputs> {
     auto initial_database_view = raft::make_device_matrix_view<const DataT, int64_t>(
       (const DataT*)database.data(), initial_database_size, ps.dim);
 
+    std::optional<raft::host_matrix<DataT, int64_t>> database_host{std::nullopt};
     cagra::index<DataT, IdxT> index(handle_);
     if (ps.host_dataset) {
-      auto database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
+      database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
       raft::copy(
-        database_host.data_handle(), database.data(), initial_database_view.size(), stream_);
+        database_host->data_handle(), database.data(), initial_database_view.size(), stream_);
       auto database_host_view = raft::make_host_matrix_view<const DataT, int64_t>(
-        (const DataT*)database_host.data_handle(), initial_database_size, ps.dim);
+        (const DataT*)database_host->data_handle(), initial_database_size, ps.dim);
+      // NB: database_host must live no less than the index, because the index _may_be_
+      // non-owning
       index = cagra::build(handle_, index_params, database_host_view);
     } else {
       index = cagra::build(handle_, index_params, initial_database_view);
     }
@@ -763,12 +767,13 @@ class AnnCagraFilterTest : public ::testing::TestWithParam<AnnCagraInputs> {
     auto database_view = raft::make_device_matrix_view<const DataT, int64_t>(
       (const DataT*)database.data(), ps.n_rows, ps.dim);
 
+    std::optional<raft::host_matrix<DataT, int64_t>> database_host{std::nullopt};
     cagra::index<DataT, IdxT> index(handle_);
     if (ps.host_dataset) {
-      auto database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
-      raft::copy(database_host.data_handle(), database.data(), database.size(), stream_);
+      database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
+      raft::copy(database_host->data_handle(), database.data(), database.size(), stream_);
       auto database_host_view = raft::make_host_matrix_view<const DataT, int64_t>(
-        (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim);
+        (const DataT*)database_host->data_handle(), ps.n_rows, ps.dim);
       index = cagra::build(handle_, index_params, database_host_view);
     } else {
       index = cagra::build(handle_, index_params, database_view);