From 80d007c3620905aa50b27222035e92c6df371179 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Thu, 16 May 2024 00:20:52 +0200 Subject: [PATCH 01/10] Enable random subsampling --- cpp/include/cuvs/neighbors/ivf_pq.h | 9 ++ cpp/include/cuvs/neighbors/ivf_pq.hpp | 9 ++ cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh | 148 ++++++---------------- 3 files changed, 59 insertions(+), 107 deletions(-) diff --git a/cpp/include/cuvs/neighbors/ivf_pq.h b/cpp/include/cuvs/neighbors/ivf_pq.h index c4ebc31f4..5d560413c 100644 --- a/cpp/include/cuvs/neighbors/ivf_pq.h +++ b/cpp/include/cuvs/neighbors/ivf_pq.h @@ -116,6 +116,15 @@ struct cuvsIvfPqIndexParams { * flag to `true` if you prefer to use as little GPU memory for the database as possible. */ bool conservative_memory_allocation; + + /** + * The max number of data points to use per PQ code during PQ codebook training. Using more data + * points per PQ code may increase the quality of PQ codebook but may also increase the build + * time. The parameter is applied to both PQ codebook generation methods, i.e., PER_SUBSPACE and + * PER_CLUSTER. In both cases, we will use `pq_book_size * max_train_points_per_pq_code` training + * points to train each codebook. + */ + uint32_t max_train_points_per_pq_code = 256; }; typedef struct cuvsIvfPqIndexParams* cuvsIvfPqIndexParams_t; diff --git a/cpp/include/cuvs/neighbors/ivf_pq.hpp b/cpp/include/cuvs/neighbors/ivf_pq.hpp index f013615de..f57c55fe3 100644 --- a/cpp/include/cuvs/neighbors/ivf_pq.hpp +++ b/cpp/include/cuvs/neighbors/ivf_pq.hpp @@ -100,6 +100,15 @@ struct index_params : ann::index_params { */ bool conservative_memory_allocation = false; + /** + * The max number of data points to use per PQ code during PQ codebook training. Using more data + * points per PQ code may increase the quality of PQ codebook but may also increase the build + * time. The parameter is applied to both PQ codebook generation methods, i.e., PER_SUBSPACE and + * PER_CLUSTER. In both cases, we will use `pq_book_size * max_train_points_per_pq_code` training + * points to train each codebook. + */ + uint32_t max_train_points_per_pq_code = 256; + /** * Creates index_params based on shape of the input dataset. * Usage example: diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh index 3f2f145b0..d6ac246ab 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -67,51 +68,6 @@ using namespace cuvs::spatial::knn::detail; // NOLINT using internal_extents_t = int64_t; // The default mdspan extent type used internally. -template -__launch_bounds__(BlockDim) static __global__ void copy_warped_kernel( - T* out, uint32_t ld_out, const S* in, uint32_t ld_in, uint32_t n_cols, size_t n_rows) -{ - using warp = raft::Pow2; - size_t row_ix = warp::div(size_t(threadIdx.x) + size_t(BlockDim) * size_t(blockIdx.x)); - uint32_t i = warp::mod(threadIdx.x); - if (row_ix >= n_rows) return; - out += row_ix * ld_out; - in += row_ix * ld_in; - auto f = utils::mapping{}; - for (uint32_t col_ix = i; col_ix < n_cols; col_ix += warp::Value) { - auto x = f(in[col_ix]); - __syncwarp(); - out[col_ix] = x; - } -} - -/** - * raft::copy the data one warp-per-row: - * - * 1. load the data per-warp - * 2. apply the `utils::mapping{}` - * 3. sync within warp - * 4. store the data. 
- * - * Assuming sizeof(T) >= sizeof(S) and the data is properly aligned (see the usage in `build`), this - * allows to re-structure the data within rows in-place. - */ -template -void copy_warped(T* out, - uint32_t ld_out, - const S* in, - uint32_t ld_in, - uint32_t n_cols, - size_t n_rows, - rmm::cuda_stream_view stream) -{ - constexpr uint32_t kBlockDim = 128; - dim3 threads(kBlockDim, 1, 1); - dim3 blocks(raft::div_rounding_up_safe(n_rows, kBlockDim / raft::WarpSize), 1, 1); - copy_warped_kernel - <<>>(out, ld_out, in, ld_in, n_cols, n_rows); -} - /** * @brief Compute residual vectors from the source dataset given by selected indices. * @@ -357,14 +313,19 @@ void train_per_subset(raft::resources const& handle, const float* trainset, // [n_rows, dim] const uint32_t* labels, // [n_rows] uint32_t kmeans_n_iters, + uint32_t max_train_points_per_pq_code, rmm::mr::device_memory_resource* managed_memory) { auto stream = raft::resource::get_cuda_stream(handle); auto device_memory = raft::resource::get_workspace_resource(handle); rmm::device_uvector pq_centers_tmp(index.pq_centers().size(), stream, device_memory); - rmm::device_uvector sub_trainset(n_rows * size_t(index.pq_len()), stream, device_memory); - rmm::device_uvector sub_labels(n_rows, stream, device_memory); + // Subsampling the train set for codebook generation based on max_train_points_per_pq_code. + size_t big_enough = max_train_points_per_pq_code * size_t(index.pq_book_size()); + auto pq_n_rows = uint32_t(std::min(big_enough, n_rows)); + rmm::device_uvector sub_trainset( + pq_n_rows * size_t(index.pq_len()), stream, device_memory); + rmm::device_uvector sub_labels(pq_n_rows, stream, device_memory); rmm::device_uvector pq_cluster_sizes(index.pq_book_size(), stream, device_memory); @@ -375,7 +336,7 @@ void train_per_subset(raft::resources const& handle, // Get the rotated cluster centers for each training vector. // This will be subtracted from the input vectors afterwards. utils::copy_selected( - n_rows, + pq_n_rows, index.pq_len(), index.centers_rot().data_handle() + index.pq_len() * j, labels, @@ -391,7 +352,7 @@ void train_per_subset(raft::resources const& handle, true, false, index.pq_len(), - n_rows, + pq_n_rows, index.dim(), &alpha, index.rotation_matrix().data_handle() + index.dim() * index.pq_len() * j, @@ -405,13 +366,13 @@ void train_per_subset(raft::resources const& handle, // train PQ codebook for this subspace auto sub_trainset_view = raft::make_device_matrix_view( - sub_trainset.data(), n_rows, index.pq_len()); + sub_trainset.data(), pq_n_rows, index.pq_len()); auto centers_tmp_view = raft::make_device_matrix_view( pq_centers_tmp.data() + index.pq_book_size() * index.pq_len() * j, index.pq_book_size(), index.pq_len()); auto sub_labels_view = - raft::make_device_vector_view(sub_labels.data(), n_rows); + raft::make_device_vector_view(sub_labels.data(), pq_n_rows); auto cluster_sizes_view = raft::make_device_vector_view( pq_cluster_sizes.data(), index.pq_book_size()); raft::cluster::kmeans_balanced_params kmeans_params; @@ -435,6 +396,7 @@ void train_per_cluster(raft::resources const& handle, const float* trainset, // [n_rows, dim] const uint32_t* labels, // [n_rows] uint32_t kmeans_n_iters, + uint32_t max_train_points_per_pq_code, rmm::mr::device_memory_resource* managed_memory) { auto stream = raft::resource::get_cuda_stream(handle); @@ -482,9 +444,11 @@ void train_per_cluster(raft::resources const& handle, indices + cluster_offsets[l], device_memory); - // limit the cluster size to bound the training time. 
+ // limit the cluster size to bound the training time based on max_train_points_per_pq_code + // If pq_book_size is less than pq_dim, use max_train_points_per_pq_code per pq_dim instead // [sic] we interpret the data as pq_len-dimensional - size_t big_enough = 256ul * std::max(index.pq_book_size(), index.pq_dim()); + size_t big_enough = + max_train_points_per_pq_code * std::max(index.pq_book_size(), index.pq_dim()); size_t available_rows = size_t(cluster_size) * size_t(index.pq_dim()); auto pq_n_rows = uint32_t(std::min(big_enough, available_rows)); // train PQ codebook for this cluster @@ -1684,6 +1648,7 @@ auto build(raft::resources const& handle, utils::memzero(index.inds_ptrs().data_handle(), index.inds_ptrs().size(), stream); { + raft::random::RngState random_state{137}; auto trainset_ratio = std::max( 1, size_t(n_rows) / std::max(params.kmeans_trainset_fraction * n_rows, index.n_lists())); @@ -1694,57 +1659,25 @@ auto build(raft::resources const& handle, // Besides just sampling, we transform the input dataset into floats to make it easier // to use gemm operations from cublas. - rmm::device_uvector trainset(n_rows_train * index.dim(), stream, device_memory); - // TODO: a proper sampling + auto trainset = raft::make_device_matrix(handle, n_rows_train, dim); + if constexpr (std::is_same_v) { - RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(), - sizeof(T) * index.dim(), - dataset, - sizeof(T) * index.dim() * trainset_ratio, - sizeof(T) * index.dim(), - n_rows_train, - cudaMemcpyDefault, - stream)); + raft::matrix::detail::sample_rows( + handle, random_state, dataset, n_rows, trainset.view()); } else { - size_t dim = index.dim(); - cudaPointerAttributes dataset_attr; - RAFT_CUDA_TRY(cudaPointerGetAttributes(&dataset_attr, dataset)); - if (dataset_attr.devicePointer != nullptr) { - // data is available on device: just run the kernel to raft::copy and map the data - auto p = reinterpret_cast(dataset_attr.devicePointer); - auto trainset_view = - raft::make_device_vector_view(trainset.data(), dim * n_rows_train); - raft::linalg::map_offset( - handle, trainset_view, [p, trainset_ratio, dim] __device__(size_t i) { - auto col = i % dim; - return utils::mapping{}(p[(i - col) * size_t(trainset_ratio) + col]); - }); - } else { - // data is not available: first raft::copy, then map inplace - auto trainset_tmp = reinterpret_cast(reinterpret_cast(trainset.data()) + - (sizeof(float) - sizeof(T)) * index.dim()); - // We raft::copy the data in strides, one row at a time, and place the smaller rows of type - // T at the end of float rows. - RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset_tmp, - sizeof(float) * index.dim(), - dataset, - sizeof(T) * index.dim() * trainset_ratio, - sizeof(T) * index.dim(), - n_rows_train, - cudaMemcpyDefault, - stream)); - // Transform the input `{T -> float}`, one row per warp. - // The threads in each warp raft::copy the data synchronously; this and the layout of the - // data (content is aligned to the end of the rows) together allow doing the transform - // in-place. - copy_warped(trainset.data(), - index.dim(), - trainset_tmp, - index.dim() * sizeof(float) / sizeof(T), - index.dim(), - n_rows_train, - stream); - } + // TODO(tfeher): Enable codebook generation with any type T, and then remove trainset tmp. + // TODO(tfeher): After https://github.com/rapidsai/raft/pull/2194 is merged, change this + // to use large workspace allocator. 
+ auto trainset_tmp = + raft::make_device_matrix(handle, n_rows_train, dim); + raft::matrix::detail::sample_rows( + handle, random_state, dataset, n_rows, trainset_tmp.view()); + + raft::linalg::unaryOp(trainset.data_handle(), + trainset_tmp.data_handle(), + trainset.size(), + utils::mapping{}, + raft::resource::get_cuda_stream(handle)); } // NB: here cluster_centers is used as if it is [n_clusters, data_dim] not [n_clusters, @@ -1754,9 +1687,8 @@ auto build(raft::resources const& handle, auto cluster_centers = cluster_centers_buf.data(); // Train balanced hierarchical kmeans clustering - auto trainset_const_view = raft::make_device_matrix_view( - trainset.data(), n_rows_train, index.dim()); - auto centers_view = raft::make_device_matrix_view( + auto trainset_const_view = raft::make_const_mdspan(trainset.view()); + auto centers_view = raft::make_device_matrix_view( cluster_centers, index.n_lists(), index.dim()); raft::cluster::kmeans_balanced_params kmeans_params; kmeans_params.n_iters = params.kmeans_n_iters; @@ -1792,18 +1724,20 @@ auto build(raft::resources const& handle, train_per_subset(handle, index, n_rows_train, - trainset.data(), + trainset.data_handle(), labels.data(), params.kmeans_n_iters, + params.max_train_points_per_pq_code, &managed_memory_upstream); break; case codebook_gen::PER_CLUSTER: train_per_cluster(handle, index, n_rows_train, - trainset.data(), + trainset.data_handle(), labels.data(), params.kmeans_n_iters, + params.max_train_points_per_pq_code, &managed_memory_upstream); break; default: RAFT_FAIL("Unreachable code"); From 86455ac781a8a4c0c5f5aab576386f2bc6eac3e0 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Thu, 16 May 2024 01:32:38 +0200 Subject: [PATCH 02/10] replace pointer based internal interface of IVF_PQ with mdspan --- .../neighbors/detail/cagra/cagra_build.cuh | 3 +- cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh | 35 ++++++------------- 2 files changed, 12 insertions(+), 26 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 9d31fc6f0..1e3fc0c0d 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -91,8 +91,7 @@ void build_knn_graph( }(); RAFT_LOG_DEBUG("# Building IVF-PQ index %s", model_name.c_str()); - auto index = cuvs::neighbors::ivf_pq::detail::build( - res, *build_params, dataset.data_handle(), dataset.extent(0), dataset.extent(1)); + auto index = cuvs::neighbors::ivf_pq::detail::build(res, *build_params, dataset); // // search top (k + 1) neighbors diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh index d6ac246ab..64224e59c 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh @@ -1622,13 +1622,14 @@ auto extend(raft::resources const& handle, return ext_index; } -template +template auto build(raft::resources const& handle, const index_params& params, - const T* dataset, - IdxT n_rows, - uint32_t dim) -> index + raft::mdspan, raft::row_major, accessor> dataset) + -> index { + IdxT n_rows = dataset.extent(0); + IdxT dim = dataset.extent(1); raft::common::nvtx::range fun_scope( "ivf_pq::build(%zu, %u)", size_t(n_rows), dim); static_assert(std::is_same_v || std::is_same_v || std::is_same_v || @@ -1662,16 +1663,14 @@ auto build(raft::resources const& handle, auto trainset = raft::make_device_matrix(handle, n_rows_train, dim); if constexpr (std::is_same_v) { - raft::matrix::detail::sample_rows( - handle, 
random_state, dataset, n_rows, trainset.view()); + raft::matrix::sample_rows(handle, random_state, dataset, trainset.view()); } else { // TODO(tfeher): Enable codebook generation with any type T, and then remove trainset tmp. // TODO(tfeher): After https://github.com/rapidsai/raft/pull/2194 is merged, change this // to use large workspace allocator. auto trainset_tmp = raft::make_device_matrix(handle, n_rows_train, dim); - raft::matrix::detail::sample_rows( - handle, random_state, dataset, n_rows, trainset_tmp.view()); + raft::matrix::sample_rows(handle, random_state, dataset, trainset_tmp.view()); raft::linalg::unaryOp(trainset.data_handle(), trainset_tmp.data_handle(), @@ -1746,30 +1745,18 @@ auto build(raft::resources const& handle, // add the data if necessary if (params.add_data_on_build) { - detail::extend(handle, &index, dataset, nullptr, n_rows); + detail::extend(handle, &index, dataset.data_handle(), nullptr, n_rows); } return index; } -template -auto build(raft::resources const& handle, - const index_params& params, - raft::device_matrix_view dataset) -> index -{ - IdxT n_rows = dataset.extent(0); - IdxT dim = dataset.extent(1); - return build(handle, params, dataset.data_handle(), n_rows, dim); -} - -template +template void build(raft::resources const& handle, const index_params& params, - raft::device_matrix_view dataset, + raft::mdspan, raft::row_major, accessor> dataset, index* index) { - IdxT n_rows = dataset.extent(0); - IdxT dim = dataset.extent(1); - *index = build(handle, params, dataset.data_handle(), n_rows, dim); + *index = build(handle, params, dataset); } template From d1effdd43dad0d23a634cb3ec551fb5ca3c66daf Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Thu, 16 May 2024 01:40:56 +0200 Subject: [PATCH 03/10] Update python API --- python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx | 24 +++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx index a1ca6768a..94499702e 100644 --- a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx +++ b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx @@ -95,11 +95,27 @@ cdef class IndexParams: default, if `dim == rot_dim`, the rotation transform is initialized with the identity matrix. When `force_random_rotation == True`, a random orthogonal transform + matrix is generated regardless of the values of `dim` and `pq_dim`. add_data_on_build : bool, default = True After training the coarse and fine quantizers, we will populate the index with the dataset if add_data_on_build == True, otherwise the index is left empty, and the extend method can be used to add new vectors to the index. + conservative_memory_allocation : bool, default = True + By default, the algorithm allocates more space than necessary for + individual clusters (`list_data`). This allows to amortize the cost + of memory allocation and reduce the number of data copies during + repeated calls to `extend` (extending the database). + To disable this behavior and use as little GPU memory for the + database as possible, set this flat to `True`. + max_train_points_per_pq_code : int, default = 256 + The max number of data points to use per PQ code during PQ codebook + training. Using more data points per PQ code may increase the + quality of PQ codebook but may also increase the build time. The + parameter is applied to both PQ codebook generation methods, i.e., + PER_SUBSPACE and PER_CLUSTER. 
In both cases, we will use + pq_book_size * max_train_points_per_pq_code training points to + train each codebook. """ cdef cuvsIvfPqIndexParams* params @@ -122,7 +138,8 @@ cdef class IndexParams: codebook_kind="subspace", force_random_rotation=False, add_data_on_build=True, - conservative_memory_allocation=False): + conservative_memory_allocation=False, + max_train_points_per_pq_code=256): self.params.n_lists = n_lists self._metric = metric self.params.metric = DISTANCE_TYPES[metric] @@ -141,6 +158,8 @@ cdef class IndexParams: self.params.add_data_on_build = add_data_on_build self.params.conservative_memory_allocation = \ conservative_memory_allocation + self.params.max_train_points_per_pq_code = \ + max_train_points_per_pq_code @property def metric(self): @@ -190,6 +209,9 @@ cdef class IndexParams: def conservative_memory_allocation(self): return self.params.conservative_memory_allocation + @property + def max_train_points_per_pq_code(self): + return self.params.max_train_points_per_pq_code cdef class Index: """ From b74b0ac1db3fc0c6f125c36583c713f1e417b36e Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Thu, 16 May 2024 10:00:50 +0200 Subject: [PATCH 04/10] remove default value from C interface --- cpp/include/cuvs/neighbors/ivf_pq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cuvs/neighbors/ivf_pq.h b/cpp/include/cuvs/neighbors/ivf_pq.h index 5d560413c..c79768bc7 100644 --- a/cpp/include/cuvs/neighbors/ivf_pq.h +++ b/cpp/include/cuvs/neighbors/ivf_pq.h @@ -124,7 +124,7 @@ struct cuvsIvfPqIndexParams { * PER_CLUSTER. In both cases, we will use `pq_book_size * max_train_points_per_pq_code` training * points to train each codebook. */ - uint32_t max_train_points_per_pq_code = 256; + uint32_t max_train_points_per_pq_code; }; typedef struct cuvsIvfPqIndexParams* cuvsIvfPqIndexParams_t; From 08aeb52a575ed22afaff69d1e72d2ee0898d81f3 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Wed, 29 May 2024 17:53:44 +0200 Subject: [PATCH 05/10] handle new param in C API --- cpp/src/neighbors/ivf_pq_c.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/neighbors/ivf_pq_c.cpp b/cpp/src/neighbors/ivf_pq_c.cpp index 14a879123..c481f5303 100644 --- a/cpp/src/neighbors/ivf_pq_c.cpp +++ b/cpp/src/neighbors/ivf_pq_c.cpp @@ -46,6 +46,7 @@ void* _build(cuvsResources_t res, cuvsIvfPqIndexParams params, DLManagedTensor* static_cast((int)params.codebook_kind); build_params.force_random_rotation = params.force_random_rotation; build_params.conservative_memory_allocation = params.conservative_memory_allocation; + build_params.max_train_points_per_pq_code = params.max_train_points_per_pq_code; auto dataset = dataset_tensor->dl_tensor; auto dim = dataset.shape[0]; From e1712d5646e5412dd5217b517c98e52b7a9f11c6 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Sun, 30 Jun 2024 21:22:44 +0200 Subject: [PATCH 06/10] fix merge error --- cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh index 1204846d5..cc86fe347 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh @@ -1709,7 +1709,7 @@ auto build(raft::resources const& handle, raft::matrix::sample_rows(handle, random_state, dataset, trainset.view()); } else { // TODO(tfeher): Enable codebook generation with any type T, and then remove trainset tmp. 
- auto trainset_tmp = raft::make_device_mdarray( + auto trainset_tmp = raft::make_device_mdarray( handle, big_memory_resource, raft::make_extents(n_rows_train, dim)); raft::matrix::sample_rows(handle, random_state, dataset, trainset_tmp.view()); From 7637cefcbd68a81dbf978d33c705ea3259ae9255 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Wed, 10 Jul 2024 12:25:16 +0200 Subject: [PATCH 07/10] Add missing declaration to cython wrappers --- python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd index 17d2e4030..9f6a456f1 100644 --- a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd +++ b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pxd @@ -50,6 +50,7 @@ cdef extern from "cuvs/neighbors/ivf_pq.h" nogil: codebook_gen codebook_kind bool force_random_rotation bool conservative_memory_allocation + uint32_t max_train_points_per_pq_code ctypedef cuvsIvfPqIndexParams* cuvsIvfPqIndexParams_t From 304d5c76b78183838f8143d7d3330d2f64c5d039 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 30 Jul 2024 01:39:45 +0200 Subject: [PATCH 08/10] restore merge error of devcontainer configs --- .devcontainer/cuda11.8-conda/devcontainer.json | 18 ++++-------------- .devcontainer/cuda11.8-pip/devcontainer.json | 18 ++++-------------- .devcontainer/cuda12.5-conda/devcontainer.json | 18 ++++-------------- .devcontainer/cuda12.5-pip/devcontainer.json | 18 ++++-------------- 4 files changed, 16 insertions(+), 56 deletions(-) diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 4a62e6f76..9cb608e52 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -13,25 +13,15 @@ "--name", "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-conda" ], - "hostRequirements": { - "gpu": "optional" - }, + "hostRequirements": {"gpu": "optional"}, "features": { "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda11.8-envs}" - ], - "postAttachCommand": [ - "/bin/bash", - "-c", - "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi" - ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda11.8-envs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cuvs,type=bind,consistency=consistent", "mounts": [ @@ -49,4 +39,4 @@ ] } } -} \ No newline at end of file +} diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 7f7eab819..733f23f78 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -13,9 +13,7 @@ "--name", "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-pip" ], - "hostRequirements": { - "gpu": "optional" - }, + "hostRequirements": {"gpu": "optional"}, "features": { "ghcr.io/rapidsai/devcontainers/features/ucx:24.8": { "version": "1.15.0" @@ -34,16 +32,8 @@ "ghcr.io/rapidsai/devcontainers/features/cuda", "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs}" - ], - "postAttachCommand": [ - "/bin/bash", - "-c", - "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi" - ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cuvs,type=bind,consistency=consistent", "mounts": [ @@ -60,4 +50,4 @@ ] } } -} \ No newline at end of file +} diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json index ca414bab7..db825a269 100644 --- a/.devcontainer/cuda12.5-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -13,25 +13,15 @@ "--name", "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-conda" ], - "hostRequirements": { - "gpu": "optional" - }, + "hostRequirements": {"gpu": "optional"}, "features": { "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}" - ], - "postAttachCommand": [ - "/bin/bash", - "-c", - "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi" - ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cuvs,type=bind,consistency=consistent", "mounts": [ @@ -49,4 +39,4 @@ ] } } -} \ No newline at end of file +} diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json index ca9b09cef..fe6ec3dd2 100644 --- a/.devcontainer/cuda12.5-pip/devcontainer.json +++ b/.devcontainer/cuda12.5-pip/devcontainer.json @@ -13,9 +13,7 @@ "--name", "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-pip" ], - "hostRequirements": { - "gpu": "optional" - }, + "hostRequirements": {"gpu": "optional"}, "features": { "ghcr.io/rapidsai/devcontainers/features/ucx:24.8": { "version": "1.15.0" @@ -34,16 +32,8 @@ "ghcr.io/rapidsai/devcontainers/features/cuda", "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": [ - "/bin/bash", - "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs}" - ], - "postAttachCommand": [ - "/bin/bash", - "-c", - "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi" - ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cuvs,type=bind,consistency=consistent", "mounts": [ @@ -60,4 +50,4 @@ ] } } -} \ No newline at end of file +} From 2d72131d6f570ec609deca4e4ec221e4f8bf2bc3 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 30 Jul 2024 23:16:39 +0200 Subject: [PATCH 09/10] Add Rust setter for max_train_points_per_pq_code --- rust/cuvs/src/ivf_pq/index_params.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/rust/cuvs/src/ivf_pq/index_params.rs b/rust/cuvs/src/ivf_pq/index_params.rs index e38c0945f..321821bc3 100644 --- a/rust/cuvs/src/ivf_pq/index_params.rs +++ b/rust/cuvs/src/ivf_pq/index_params.rs @@ -124,6 +124,18 @@ impl IndexParams { self } + /// The max number of data points to use per PQ code during PQ codebook training. Using more data + /// points per PQ code may increase the quality of PQ codebook but may also increase the build + /// time. The parameter is applied to both PQ codebook generation methods, i.e., PER_SUBSPACE and + /// PER_CLUSTER. In both cases, we will use `pq_book_size * max_train_points_per_pq_code` training + /// points to train each codebook. 
+ pub fn set_max_train_points_per_pq_code(self, max_pq_points: u32)-> IndexParams { + unsafe { + (*self.0).max_train_points_per_pq_code = max_pq_points; + } + self + } + /// After training the coarse and fine quantizers, we will populate /// the index with the dataset if add_data_on_build == true, otherwise /// the index is left empty, and the extend method can be used From 27c2389a5c19cbad054a1377e0ebbbf23e0ad284 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Thu, 1 Aug 2024 02:52:17 +0200 Subject: [PATCH 10/10] Add initializer in C wrapper for max_train_points_per_pq_code --- cpp/src/neighbors/ivf_pq_c.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/neighbors/ivf_pq_c.cpp b/cpp/src/neighbors/ivf_pq_c.cpp index 6f6fdbe62..256d760e6 100644 --- a/cpp/src/neighbors/ivf_pq_c.cpp +++ b/cpp/src/neighbors/ivf_pq_c.cpp @@ -215,7 +215,8 @@ extern "C" cuvsError_t cuvsIvfPqIndexParamsCreate(cuvsIvfPqIndexParams_t* params .pq_dim = 0, .codebook_kind = codebook_gen::PER_SUBSPACE, .force_random_rotation = false, - .conservative_memory_allocation = false}; + .conservative_memory_allocation = false, + .max_train_points_per_pq_code = 256}; }); }
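
For reference, a minimal sketch of exercising the new `max_train_points_per_pq_code` knob through the C++ API, assuming the public cuvs::neighbors::ivf_pq::build overload that takes a raft::device_matrix_view (consistent with the headers touched in this series); the function name and the values 1024 and 64 are illustrative only, not part of the patches.

#include <cuvs/neighbors/ivf_pq.hpp>
#include <raft/core/device_mdspan.hpp>
#include <raft/core/resources.hpp>

// Build an IVF-PQ index while capping the PQ codebook training set.
auto build_with_subsampling(
  raft::resources const& res,
  raft::device_matrix_view<const float, int64_t, raft::row_major> dataset)
{
  cuvs::neighbors::ivf_pq::index_params params;
  params.n_lists = 1024;  // number of coarse clusters (illustrative)
  // Use at most 64 training points per PQ code instead of the default 256,
  // trading some codebook quality for a shorter build time.
  params.max_train_points_per_pq_code = 64;
  return cuvs::neighbors::ivf_pq::build(res, params, dataset);
}

With the defaults (pq_bits = 8, i.e. a 256-entry codebook, and max_train_points_per_pq_code = 256), each codebook is trained on at most 256 * 256 = 65,536 sampled rows per subspace or cluster; lowering the parameter as above shrinks that bound proportionally.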