From 80d007c3620905aa50b27222035e92c6df371179 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Thu, 16 May 2024 00:20:52 +0200 Subject: [PATCH 01/10] Enable random subsampling --- cpp/include/cuvs/neighbors/ivf_pq.h | 9 ++ cpp/include/cuvs/neighbors/ivf_pq.hpp | 9 ++ cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh | 148 ++++++---------------- 3 files changed, 59 insertions(+), 107 deletions(-) diff --git a/cpp/include/cuvs/neighbors/ivf_pq.h b/cpp/include/cuvs/neighbors/ivf_pq.h index c4ebc31f4..5d560413c 100644 --- a/cpp/include/cuvs/neighbors/ivf_pq.h +++ b/cpp/include/cuvs/neighbors/ivf_pq.h @@ -116,6 +116,15 @@ struct cuvsIvfPqIndexParams { * flag to `true` if you prefer to use as little GPU memory for the database as possible. */ bool conservative_memory_allocation; + + /** + * The max number of data points to use per PQ code during PQ codebook training. Using more data + * points per PQ code may increase the quality of PQ codebook but may also increase the build + * time. The parameter is applied to both PQ codebook generation methods, i.e., PER_SUBSPACE and + * PER_CLUSTER. In both cases, we will use `pq_book_size * max_train_points_per_pq_code` training + * points to train each codebook. + */ + uint32_t max_train_points_per_pq_code = 256; }; typedef struct cuvsIvfPqIndexParams* cuvsIvfPqIndexParams_t; diff --git a/cpp/include/cuvs/neighbors/ivf_pq.hpp b/cpp/include/cuvs/neighbors/ivf_pq.hpp index f013615de..f57c55fe3 100644 --- a/cpp/include/cuvs/neighbors/ivf_pq.hpp +++ b/cpp/include/cuvs/neighbors/ivf_pq.hpp @@ -100,6 +100,15 @@ struct index_params : ann::index_params { */ bool conservative_memory_allocation = false; + /** + * The max number of data points to use per PQ code during PQ codebook training. Using more data + * points per PQ code may increase the quality of PQ codebook but may also increase the build + * time. The parameter is applied to both PQ codebook generation methods, i.e., PER_SUBSPACE and + * PER_CLUSTER. In both cases, we will use `pq_book_size * max_train_points_per_pq_code` training + * points to train each codebook. + */ + uint32_t max_train_points_per_pq_code = 256; + /** * Creates index_params based on shape of the input dataset. * Usage example: diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh index 3f2f145b0..d6ac246ab 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -67,51 +68,6 @@ using namespace cuvs::spatial::knn::detail; // NOLINT using internal_extents_t = int64_t; // The default mdspan extent type used internally. -template -__launch_bounds__(BlockDim) static __global__ void copy_warped_kernel( - T* out, uint32_t ld_out, const S* in, uint32_t ld_in, uint32_t n_cols, size_t n_rows) -{ - using warp = raft::Pow2; - size_t row_ix = warp::div(size_t(threadIdx.x) + size_t(BlockDim) * size_t(blockIdx.x)); - uint32_t i = warp::mod(threadIdx.x); - if (row_ix >= n_rows) return; - out += row_ix * ld_out; - in += row_ix * ld_in; - auto f = utils::mapping{}; - for (uint32_t col_ix = i; col_ix < n_cols; col_ix += warp::Value) { - auto x = f(in[col_ix]); - __syncwarp(); - out[col_ix] = x; - } -} - -/** - * raft::copy the data one warp-per-row: - * - * 1. load the data per-warp - * 2. apply the `utils::mapping{}` - * 3. sync within warp - * 4. store the data. 
- * - * Assuming sizeof(T) >= sizeof(S) and the data is properly aligned (see the usage in `build`), this - * allows to re-structure the data within rows in-place. - */ -template -void copy_warped(T* out, - uint32_t ld_out, - const S* in, - uint32_t ld_in, - uint32_t n_cols, - size_t n_rows, - rmm::cuda_stream_view stream) -{ - constexpr uint32_t kBlockDim = 128; - dim3 threads(kBlockDim, 1, 1); - dim3 blocks(raft::div_rounding_up_safe(n_rows, kBlockDim / raft::WarpSize), 1, 1); - copy_warped_kernel - <<>>(out, ld_out, in, ld_in, n_cols, n_rows); -} - /** * @brief Compute residual vectors from the source dataset given by selected indices. * @@ -357,14 +313,19 @@ void train_per_subset(raft::resources const& handle, const float* trainset, // [n_rows, dim] const uint32_t* labels, // [n_rows] uint32_t kmeans_n_iters, + uint32_t max_train_points_per_pq_code, rmm::mr::device_memory_resource* managed_memory) { auto stream = raft::resource::get_cuda_stream(handle); auto device_memory = raft::resource::get_workspace_resource(handle); rmm::device_uvector pq_centers_tmp(index.pq_centers().size(), stream, device_memory); - rmm::device_uvector sub_trainset(n_rows * size_t(index.pq_len()), stream, device_memory); - rmm::device_uvector sub_labels(n_rows, stream, device_memory); + // Subsampling the train set for codebook generation based on max_train_points_per_pq_code. + size_t big_enough = max_train_points_per_pq_code * size_t(index.pq_book_size()); + auto pq_n_rows = uint32_t(std::min(big_enough, n_rows)); + rmm::device_uvector sub_trainset( + pq_n_rows * size_t(index.pq_len()), stream, device_memory); + rmm::device_uvector sub_labels(pq_n_rows, stream, device_memory); rmm::device_uvector pq_cluster_sizes(index.pq_book_size(), stream, device_memory); @@ -375,7 +336,7 @@ void train_per_subset(raft::resources const& handle, // Get the rotated cluster centers for each training vector. // This will be subtracted from the input vectors afterwards. utils::copy_selected( - n_rows, + pq_n_rows, index.pq_len(), index.centers_rot().data_handle() + index.pq_len() * j, labels, @@ -391,7 +352,7 @@ void train_per_subset(raft::resources const& handle, true, false, index.pq_len(), - n_rows, + pq_n_rows, index.dim(), &alpha, index.rotation_matrix().data_handle() + index.dim() * index.pq_len() * j, @@ -405,13 +366,13 @@ void train_per_subset(raft::resources const& handle, // train PQ codebook for this subspace auto sub_trainset_view = raft::make_device_matrix_view( - sub_trainset.data(), n_rows, index.pq_len()); + sub_trainset.data(), pq_n_rows, index.pq_len()); auto centers_tmp_view = raft::make_device_matrix_view( pq_centers_tmp.data() + index.pq_book_size() * index.pq_len() * j, index.pq_book_size(), index.pq_len()); auto sub_labels_view = - raft::make_device_vector_view(sub_labels.data(), n_rows); + raft::make_device_vector_view(sub_labels.data(), pq_n_rows); auto cluster_sizes_view = raft::make_device_vector_view( pq_cluster_sizes.data(), index.pq_book_size()); raft::cluster::kmeans_balanced_params kmeans_params; @@ -435,6 +396,7 @@ void train_per_cluster(raft::resources const& handle, const float* trainset, // [n_rows, dim] const uint32_t* labels, // [n_rows] uint32_t kmeans_n_iters, + uint32_t max_train_points_per_pq_code, rmm::mr::device_memory_resource* managed_memory) { auto stream = raft::resource::get_cuda_stream(handle); @@ -482,9 +444,11 @@ void train_per_cluster(raft::resources const& handle, indices + cluster_offsets[l], device_memory); - // limit the cluster size to bound the training time. 
+ // limit the cluster size to bound the training time based on max_train_points_per_pq_code + // If pq_book_size is less than pq_dim, use max_train_points_per_pq_code per pq_dim instead // [sic] we interpret the data as pq_len-dimensional - size_t big_enough = 256ul * std::max(index.pq_book_size(), index.pq_dim()); + size_t big_enough = + max_train_points_per_pq_code * std::max(index.pq_book_size(), index.pq_dim()); size_t available_rows = size_t(cluster_size) * size_t(index.pq_dim()); auto pq_n_rows = uint32_t(std::min(big_enough, available_rows)); // train PQ codebook for this cluster @@ -1684,6 +1648,7 @@ auto build(raft::resources const& handle, utils::memzero(index.inds_ptrs().data_handle(), index.inds_ptrs().size(), stream); { + raft::random::RngState random_state{137}; auto trainset_ratio = std::max( 1, size_t(n_rows) / std::max(params.kmeans_trainset_fraction * n_rows, index.n_lists())); @@ -1694,57 +1659,25 @@ auto build(raft::resources const& handle, // Besides just sampling, we transform the input dataset into floats to make it easier // to use gemm operations from cublas. - rmm::device_uvector trainset(n_rows_train * index.dim(), stream, device_memory); - // TODO: a proper sampling + auto trainset = raft::make_device_matrix(handle, n_rows_train, dim); + if constexpr (std::is_same_v) { - RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(), - sizeof(T) * index.dim(), - dataset, - sizeof(T) * index.dim() * trainset_ratio, - sizeof(T) * index.dim(), - n_rows_train, - cudaMemcpyDefault, - stream)); + raft::matrix::detail::sample_rows( + handle, random_state, dataset, n_rows, trainset.view()); } else { - size_t dim = index.dim(); - cudaPointerAttributes dataset_attr; - RAFT_CUDA_TRY(cudaPointerGetAttributes(&dataset_attr, dataset)); - if (dataset_attr.devicePointer != nullptr) { - // data is available on device: just run the kernel to raft::copy and map the data - auto p = reinterpret_cast(dataset_attr.devicePointer); - auto trainset_view = - raft::make_device_vector_view(trainset.data(), dim * n_rows_train); - raft::linalg::map_offset( - handle, trainset_view, [p, trainset_ratio, dim] __device__(size_t i) { - auto col = i % dim; - return utils::mapping{}(p[(i - col) * size_t(trainset_ratio) + col]); - }); - } else { - // data is not available: first raft::copy, then map inplace - auto trainset_tmp = reinterpret_cast(reinterpret_cast(trainset.data()) + - (sizeof(float) - sizeof(T)) * index.dim()); - // We raft::copy the data in strides, one row at a time, and place the smaller rows of type - // T at the end of float rows. - RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset_tmp, - sizeof(float) * index.dim(), - dataset, - sizeof(T) * index.dim() * trainset_ratio, - sizeof(T) * index.dim(), - n_rows_train, - cudaMemcpyDefault, - stream)); - // Transform the input `{T -> float}`, one row per warp. - // The threads in each warp raft::copy the data synchronously; this and the layout of the - // data (content is aligned to the end of the rows) together allow doing the transform - // in-place. - copy_warped(trainset.data(), - index.dim(), - trainset_tmp, - index.dim() * sizeof(float) / sizeof(T), - index.dim(), - n_rows_train, - stream); - } + // TODO(tfeher): Enable codebook generation with any type T, and then remove trainset tmp. + // TODO(tfeher): After https://github.com/rapidsai/raft/pull/2194 is merged, change this + // to use large workspace allocator. 
+ auto trainset_tmp = + raft::make_device_matrix(handle, n_rows_train, dim); + raft::matrix::detail::sample_rows( + handle, random_state, dataset, n_rows, trainset_tmp.view()); + + raft::linalg::unaryOp(trainset.data_handle(), + trainset_tmp.data_handle(), + trainset.size(), + utils::mapping{}, + raft::resource::get_cuda_stream(handle)); } // NB: here cluster_centers is used as if it is [n_clusters, data_dim] not [n_clusters, @@ -1754,9 +1687,8 @@ auto build(raft::resources const& handle, auto cluster_centers = cluster_centers_buf.data(); // Train balanced hierarchical kmeans clustering - auto trainset_const_view = raft::make_device_matrix_view( - trainset.data(), n_rows_train, index.dim()); - auto centers_view = raft::make_device_matrix_view( + auto trainset_const_view = raft::make_const_mdspan(trainset.view()); + auto centers_view = raft::make_device_matrix_view( cluster_centers, index.n_lists(), index.dim()); raft::cluster::kmeans_balanced_params kmeans_params; kmeans_params.n_iters = params.kmeans_n_iters; @@ -1792,18 +1724,20 @@ auto build(raft::resources const& handle, train_per_subset(handle, index, n_rows_train, - trainset.data(), + trainset.data_handle(), labels.data(), params.kmeans_n_iters, + params.max_train_points_per_pq_code, &managed_memory_upstream); break; case codebook_gen::PER_CLUSTER: train_per_cluster(handle, index, n_rows_train, - trainset.data(), + trainset.data_handle(), labels.data(), params.kmeans_n_iters, + params.max_train_points_per_pq_code, &managed_memory_upstream); break; default: RAFT_FAIL("Unreachable code"); From 86455ac781a8a4c0c5f5aab576386f2bc6eac3e0 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Thu, 16 May 2024 01:32:38 +0200 Subject: [PATCH 02/10] replace pointer based internal interface of IVF_PQ with mdspan --- .../neighbors/detail/cagra/cagra_build.cuh | 3 +- cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh | 35 ++++++------------- 2 files changed, 12 insertions(+), 26 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 9d31fc6f0..1e3fc0c0d 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -91,8 +91,7 @@ void build_knn_graph( }(); RAFT_LOG_DEBUG("# Building IVF-PQ index %s", model_name.c_str()); - auto index = cuvs::neighbors::ivf_pq::detail::build( - res, *build_params, dataset.data_handle(), dataset.extent(0), dataset.extent(1)); + auto index = cuvs::neighbors::ivf_pq::detail::build(res, *build_params, dataset); // // search top (k + 1) neighbors diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh index d6ac246ab..64224e59c 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh @@ -1622,13 +1622,14 @@ auto extend(raft::resources const& handle, return ext_index; } -template +template auto build(raft::resources const& handle, const index_params& params, - const T* dataset, - IdxT n_rows, - uint32_t dim) -> index + raft::mdspan, raft::row_major, accessor> dataset) + -> index { + IdxT n_rows = dataset.extent(0); + IdxT dim = dataset.extent(1); raft::common::nvtx::range fun_scope( "ivf_pq::build(%zu, %u)", size_t(n_rows), dim); static_assert(std::is_same_v || std::is_same_v || std::is_same_v || @@ -1662,16 +1663,14 @@ auto build(raft::resources const& handle, auto trainset = raft::make_device_matrix(handle, n_rows_train, dim); if constexpr (std::is_same_v) { - raft::matrix::detail::sample_rows( - handle, 
random_state, dataset, n_rows, trainset.view()); + raft::matrix::sample_rows(handle, random_state, dataset, trainset.view()); } else { // TODO(tfeher): Enable codebook generation with any type T, and then remove trainset tmp. // TODO(tfeher): After https://github.com/rapidsai/raft/pull/2194 is merged, change this // to use large workspace allocator. auto trainset_tmp = raft::make_device_matrix(handle, n_rows_train, dim); - raft::matrix::detail::sample_rows( - handle, random_state, dataset, n_rows, trainset_tmp.view()); + raft::matrix::sample_rows(handle, random_state, dataset, trainset_tmp.view()); raft::linalg::unaryOp(trainset.data_handle(), trainset_tmp.data_handle(), @@ -1746,30 +1745,18 @@ auto build(raft::resources const& handle, // add the data if necessary if (params.add_data_on_build) { - detail::extend(handle, &index, dataset, nullptr, n_rows); + detail::extend(handle, &index, dataset.data_handle(), nullptr, n_rows); } return index; } -template -auto build(raft::resources const& handle, - const index_params& params, - raft::device_matrix_view dataset) -> index -{ - IdxT n_rows = dataset.extent(0); - IdxT dim = dataset.extent(1); - return build(handle, params, dataset.data_handle(), n_rows, dim); -} - -template +template void build(raft::resources const& handle, const index_params& params, - raft::device_matrix_view dataset, + raft::mdspan, raft::row_major, accessor> dataset, index* index) { - IdxT n_rows = dataset.extent(0); - IdxT dim = dataset.extent(1); - *index = build(handle, params, dataset.data_handle(), n_rows, dim); + *index = build(handle, params, dataset); } template From d1effdd43dad0d23a634cb3ec551fb5ca3c66daf Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Thu, 16 May 2024 01:40:56 +0200 Subject: [PATCH 03/10] Update python API --- python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx | 24 +++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx index a1ca6768a..94499702e 100644 --- a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx +++ b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx @@ -95,11 +95,27 @@ cdef class IndexParams: default, if `dim == rot_dim`, the rotation transform is initialized with the identity matrix. When `force_random_rotation == True`, a random orthogonal transform + matrix is generated regardless of the values of `dim` and `pq_dim`. add_data_on_build : bool, default = True After training the coarse and fine quantizers, we will populate the index with the dataset if add_data_on_build == True, otherwise the index is left empty, and the extend method can be used to add new vectors to the index. + conservative_memory_allocation : bool, default = True + By default, the algorithm allocates more space than necessary for + individual clusters (`list_data`). This allows to amortize the cost + of memory allocation and reduce the number of data copies during + repeated calls to `extend` (extending the database). + To disable this behavior and use as little GPU memory for the + database as possible, set this flat to `True`. + max_train_points_per_pq_code : int, default = 256 + The max number of data points to use per PQ code during PQ codebook + training. Using more data points per PQ code may increase the + quality of PQ codebook but may also increase the build time. The + parameter is applied to both PQ codebook generation methods, i.e., + PER_SUBSPACE and PER_CLUSTER. 
In both cases, we will use + pq_book_size * max_train_points_per_pq_code training points to + train each codebook. """ cdef cuvsIvfPqIndexParams* params @@ -122,7 +138,8 @@ cdef class IndexParams: codebook_kind="subspace", force_random_rotation=False, add_data_on_build=True, - conservative_memory_allocation=False): + conservative_memory_allocation=False, + max_train_points_per_pq_code=256): self.params.n_lists = n_lists self._metric = metric self.params.metric = DISTANCE_TYPES[metric] @@ -141,6 +158,8 @@ cdef class IndexParams: self.params.add_data_on_build = add_data_on_build self.params.conservative_memory_allocation = \ conservative_memory_allocation + self.params.max_train_points_per_pq_code = \ + max_train_points_per_pq_code @property def metric(self): @@ -190,6 +209,9 @@ cdef class IndexParams: def conservative_memory_allocation(self): return self.params.conservative_memory_allocation + @property + def max_train_points_per_pq_code(self): + return self.params.max_train_points_per_pq_code cdef class Index: """ From b74b0ac1db3fc0c6f125c36583c713f1e417b36e Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Thu, 16 May 2024 10:00:50 +0200 Subject: [PATCH 04/10] remove default value from C interface --- cpp/include/cuvs/neighbors/ivf_pq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cuvs/neighbors/ivf_pq.h b/cpp/include/cuvs/neighbors/ivf_pq.h index 5d560413c..c79768bc7 100644 --- a/cpp/include/cuvs/neighbors/ivf_pq.h +++ b/cpp/include/cuvs/neighbors/ivf_pq.h @@ -124,7 +124,7 @@ struct cuvsIvfPqIndexParams { * PER_CLUSTER. In both cases, we will use `pq_book_size * max_train_points_per_pq_code` training * points to train each codebook. */ - uint32_t max_train_points_per_pq_code = 256; + uint32_t max_train_points_per_pq_code; }; typedef struct cuvsIvfPqIndexParams* cuvsIvfPqIndexParams_t; From 08aeb52a575ed22afaff69d1e72d2ee0898d81f3 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Wed, 29 May 2024 17:53:44 +0200 Subject: [PATCH 05/10] handle new param in C API --- cpp/src/neighbors/ivf_pq_c.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/neighbors/ivf_pq_c.cpp b/cpp/src/neighbors/ivf_pq_c.cpp index 14a879123..c481f5303 100644 --- a/cpp/src/neighbors/ivf_pq_c.cpp +++ b/cpp/src/neighbors/ivf_pq_c.cpp @@ -46,6 +46,7 @@ void* _build(cuvsResources_t res, cuvsIvfPqIndexParams params, DLManagedTensor* static_cast((int)params.codebook_kind); build_params.force_random_rotation = params.force_random_rotation; build_params.conservative_memory_allocation = params.conservative_memory_allocation; + build_params.max_train_points_per_pq_code = params.max_train_points_per_pq_code; auto dataset = dataset_tensor->dl_tensor; auto dim = dataset.shape[0]; From e1712d5646e5412dd5217b517c98e52b7a9f11c6 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Sun, 30 Jun 2024 21:22:44 +0200 Subject: [PATCH 06/10] fix merge error --- cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh index 1204846d5..cc86fe347 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh @@ -1709,7 +1709,7 @@ auto build(raft::resources const& handle, raft::matrix::sample_rows(handle, random_state, dataset, trainset.view()); } else { // TODO(tfeher): Enable codebook generation with any type T, and then remove trainset tmp. 
- auto trainset_tmp = raft::make_device_mdarray( + auto trainset_tmp = raft::make_device_mdarray( handle, big_memory_resource, raft::make_extents(n_rows_train, dim)); raft::matrix::sample_rows(handle, random_state, dataset, trainset_tmp.view()); From 7637cefcbd68a81dbf978d33c705ea3259ae9255 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Wed, 10 Jul 2024 12:25:16 +0200 Subject: [PATCH 07/10] Add missing declaration to cython wrappers --- python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd index 17d2e4030..9f6a456f1 100644 --- a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd +++ b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd @@ -50,6 +50,7 @@ cdef extern from "cuvs/neighbors/ivf_pq.h" nogil: codebook_gen codebook_kind bool force_random_rotation bool conservative_memory_allocation + uint32_t max_train_points_per_pq_code ctypedef cuvsIvfPqIndexParams* cuvsIvfPqIndexParams_t From 304d5c76b78183838f8143d7d3330d2f64c5d039 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 30 Jul 2024 01:39:45 +0200 Subject: [PATCH 08/10] restore merge error of devcontainer configs --- .devcontainer/cuda11.8-conda/devcontainer.json | 18 ++++-------------- .devcontainer/cuda11.8-pip/devcontainer.json | 18 ++++-------------- .devcontainer/cuda12.5-conda/devcontainer.json | 18 ++++-------------- .devcontainer/cuda12.5-pip/devcontainer.json | 18 ++++-------------- 4 files changed, 16 insertions(+), 56 deletions(-) diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 4a62e6f76..9cb608e52 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -13,25 +13,15 @@ "--name", "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-conda" ], - "hostRequirements": { - "gpu": "optional" - }, + "hostRequirements": {"gpu": "optional"}, "features": { "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda11.8-envs}" - ], - "postAttachCommand": [ - "/bin/bash", - "-c", - "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi" - ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda11.8-envs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cuvs,type=bind,consistency=consistent", "mounts": [ @@ -49,4 +39,4 @@ ] } } -} \ No newline at end of file +} diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 7f7eab819..733f23f78 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -13,9 +13,7 @@ "--name", "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-pip" ], - "hostRequirements": { - "gpu": "optional" - }, + "hostRequirements": {"gpu": "optional"}, "features": { "ghcr.io/rapidsai/devcontainers/features/ucx:24.8": { "version": "1.15.0" @@ -34,16 +32,8 @@ "ghcr.io/rapidsai/devcontainers/features/cuda", "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs}" - ], - "postAttachCommand": [ - "/bin/bash", - "-c", - "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi" - ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cuvs,type=bind,consistency=consistent", "mounts": [ @@ -60,4 +50,4 @@ ] } } -} \ No newline at end of file +} diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json index ca414bab7..db825a269 100644 --- a/.devcontainer/cuda12.5-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -13,25 +13,15 @@ "--name", "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-conda" ], - "hostRequirements": { - "gpu": "optional" - }, + "hostRequirements": {"gpu": "optional"}, "features": { "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}" - ], - "postAttachCommand": [ - "/bin/bash", - "-c", - "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi" - ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cuvs,type=bind,consistency=consistent", "mounts": [ @@ -49,4 +39,4 @@ ] } } -} \ No newline at end of file +} diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json index ca9b09cef..fe6ec3dd2 100644 --- a/.devcontainer/cuda12.5-pip/devcontainer.json +++ b/.devcontainer/cuda12.5-pip/devcontainer.json @@ -13,9 +13,7 @@ "--name", "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-pip" ], - "hostRequirements": { - "gpu": "optional" - }, + "hostRequirements": {"gpu": "optional"}, "features": { "ghcr.io/rapidsai/devcontainers/features/ucx:24.8": { "version": "1.15.0" @@ -34,16 +32,8 @@ "ghcr.io/rapidsai/devcontainers/features/cuda", "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs}" - ], - "postAttachCommand": [ - "/bin/bash", - "-c", - "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi" - ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cuvs,type=bind,consistency=consistent", "mounts": [ @@ -60,4 +50,4 @@ ] } } -} \ No newline at end of file +} From 2d72131d6f570ec609deca4e4ec221e4f8bf2bc3 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 30 Jul 2024 23:16:39 +0200 Subject: [PATCH 09/10] Add Rust setter for max_train_points_per_pq_code --- rust/cuvs/src/ivf_pq/index_params.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/rust/cuvs/src/ivf_pq/index_params.rs b/rust/cuvs/src/ivf_pq/index_params.rs index e38c0945f..321821bc3 100644 --- a/rust/cuvs/src/ivf_pq/index_params.rs +++ b/rust/cuvs/src/ivf_pq/index_params.rs @@ -124,6 +124,18 @@ impl IndexParams { self } + /// The max number of data points to use per PQ code during PQ codebook training. Using more data + /// points per PQ code may increase the quality of PQ codebook but may also increase the build + /// time. The parameter is applied to both PQ codebook generation methods, i.e., PER_SUBSPACE and + /// PER_CLUSTER. In both cases, we will use `pq_book_size * max_train_points_per_pq_code` training + /// points to train each codebook. 
+ pub fn set_max_train_points_per_pq_code(self, max_pq_points: u32)-> IndexParams { + unsafe { + (*self.0).max_train_points_per_pq_code = max_pq_points; + } + self + } + /// After training the coarse and fine quantizers, we will populate /// the index with the dataset if add_data_on_build == true, otherwise /// the index is left empty, and the extend method can be used From 27c2389a5c19cbad054a1377e0ebbbf23e0ad284 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Thu, 1 Aug 2024 02:52:17 +0200 Subject: [PATCH 10/10] Add initializer in C wrapper for max_train_points_per_pq_code --- cpp/src/neighbors/ivf_pq_c.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/neighbors/ivf_pq_c.cpp b/cpp/src/neighbors/ivf_pq_c.cpp index 6f6fdbe62..256d760e6 100644 --- a/cpp/src/neighbors/ivf_pq_c.cpp +++ b/cpp/src/neighbors/ivf_pq_c.cpp @@ -215,7 +215,8 @@ extern "C" cuvsError_t cuvsIvfPqIndexParamsCreate(cuvsIvfPqIndexParams_t* params .pq_dim = 0, .codebook_kind = codebook_gen::PER_SUBSPACE, .force_random_rotation = false, - .conservative_memory_allocation = false}; + .conservative_memory_allocation = false, + .max_train_points_per_pq_code = 256}; }); }
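
For reference, a minimal sketch of exercising the new `max_train_points_per_pq_code` knob through the C++ API, assuming the public cuvs::neighbors::ivf_pq::build overload that takes a raft::device_matrix_view (consistent with the headers touched in this series); the function name and the values 1024 and 64 are illustrative only, not part of the patches.

#include <cuvs/neighbors/ivf_pq.hpp>
#include <raft/core/device_mdspan.hpp>
#include <raft/core/resources.hpp>

// Build an IVF-PQ index while capping the PQ codebook training set.
auto build_with_subsampling(
  raft::resources const& res,
  raft::device_matrix_view<const float, int64_t, raft::row_major> dataset)
{
  cuvs::neighbors::ivf_pq::index_params params;
  params.n_lists = 1024;  // number of coarse clusters (illustrative)
  // Use at most 64 training points per PQ code instead of the default 256,
  // trading some codebook quality for a shorter build time.
  params.max_train_points_per_pq_code = 64;
  return cuvs::neighbors::ivf_pq::build(res, params, dataset);
}

With the defaults (pq_bits = 8, i.e. a 256-entry codebook, and max_train_points_per_pq_code = 256), each codebook is trained on at most 256 * 256 = 65,536 sampled rows per subspace or cluster; lowering the parameter as above shrinks that bound proportionally.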