diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ce4a14c3..ed9429d55 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,67 @@ +# cuvs 24.12.00 (11 Dec 2024) + +## 🚨 Breaking Changes + +- HNSW CPU Hierarchy ([#465](https://github.com/rapidsai/cuvs/pull/465)) [@divyegala](https://github.com/divyegala) +- Use dashes in cuvs-bench package name. ([#417](https://github.com/rapidsai/cuvs/pull/417)) [@bdice](https://github.com/bdice) + +## 🐛 Bug Fixes + +- Skip IVF-PQ packing test for lists with not enough data ([#512](https://github.com/rapidsai/cuvs/pull/512)) [@achirkin](https://github.com/achirkin) +- [BUG] Fix CAGRA filter ([#489](https://github.com/rapidsai/cuvs/pull/489)) [@enp1s0](https://github.com/enp1s0) +- Add `kIsSingleSource` to `PairwiseDistanceEpilogueElementwise` ([#485](https://github.com/rapidsai/cuvs/pull/485)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Fix include errors, header, and unsafe locks in iface.hpp ([#467](https://github.com/rapidsai/cuvs/pull/467)) [@achirkin](https://github.com/achirkin) +- Fix an OOB error in device-side cuvs::neighbors::refine and CAGRA kern_prune ([#460](https://github.com/rapidsai/cuvs/pull/460)) [@achirkin](https://github.com/achirkin) +- Put a ceiling on cuda-python ([#445](https://github.com/rapidsai/cuvs/pull/445)) [@bdice](https://github.com/bdice) +- Enable NVTX in cuvs-cagra-search component ([#439](https://github.com/rapidsai/cuvs/pull/439)) [@achirkin](https://github.com/achirkin) +- BUG: CAGRA multi-cta illegal access with bad queries ([#438](https://github.com/rapidsai/cuvs/pull/438)) [@achirkin](https://github.com/achirkin) +- Fix index overflow in edge cases of CAGRA graph optimize ([#435](https://github.com/rapidsai/cuvs/pull/435)) [@achirkin](https://github.com/achirkin) +- Fix correct call to brute force in generate groundtruth of cuvs-bench ([#427](https://github.com/rapidsai/cuvs/pull/427)) [@dantegd](https://github.com/dantegd) +- Use Python for sccache hit rate computation. ([#420](https://github.com/rapidsai/cuvs/pull/420)) [@bdice](https://github.com/bdice) +- Add `click` package to `cuvs-bench` conda recipe ([#408](https://github.com/rapidsai/cuvs/pull/408)) [@divyegala](https://github.com/divyegala) +- Fix NVTX annotations ([#400](https://github.com/rapidsai/cuvs/pull/400)) [@achirkin](https://github.com/achirkin) + +## 📖 Documentation + +- [Doc] Fix CAGRA search sample code ([#484](https://github.com/rapidsai/cuvs/pull/484)) [@enp1s0](https://github.com/enp1s0) +- Fix broken link in README.md references ([#473](https://github.com/rapidsai/cuvs/pull/473)) [@Azurethi](https://github.com/Azurethi) +- Adding tech stack to docs ([#448](https://github.com/rapidsai/cuvs/pull/448)) [@cjnolet](https://github.com/cjnolet) +- Fix Question Retrieval notebook ([#352](https://github.com/rapidsai/cuvs/pull/352)) [@lowener](https://github.com/lowener) + +## 🚀 New Features + +- Add C++ API scalar quantization ([#494](https://github.com/rapidsai/cuvs/pull/494)) [@mfoerste4](https://github.com/mfoerste4) +- HNSW CPU Hierarchy ([#465](https://github.com/rapidsai/cuvs/pull/465)) [@divyegala](https://github.com/divyegala) +- Add serialization API to brute-force ([#461](https://github.com/rapidsai/cuvs/pull/461)) [@lowener](https://github.com/lowener) +- Add Question Retrieval notebook using Milvus ([#451](https://github.com/rapidsai/cuvs/pull/451)) [@lowener](https://github.com/lowener) +- Migrate feature diff for NN Descent from RAFT to cuVS ([#421](https://github.com/rapidsai/cuvs/pull/421)) [@divyegala](https://github.com/divyegala) +- Add --no-lap-sync cmd option to ann-bench ([#405](https://github.com/rapidsai/cuvs/pull/405)) [@achirkin](https://github.com/achirkin) +- Add `InnerProduct` and `CosineExpanded` metric support in NN Descent ([#177](https://github.com/rapidsai/cuvs/pull/177)) [@divyegala](https://github.com/divyegala) + +## 🛠️ Improvements + +- Update cuvs to match raft's cutlass changes ([#516](https://github.com/rapidsai/cuvs/pull/516)) [@vyasr](https://github.com/vyasr) +- add a README for wheels ([#504](https://github.com/rapidsai/cuvs/pull/504)) [@jameslamb](https://github.com/jameslamb) +- Move check_input_array from pylibraft ([#474](https://github.com/rapidsai/cuvs/pull/474)) [@benfred](https://github.com/benfred) +- use different wheel-size thresholds based on CUDA version ([#469](https://github.com/rapidsai/cuvs/pull/469)) [@jameslamb](https://github.com/jameslamb) +- Modify cuvs-bench to be able to generate ground truth in CPU systems ([#466](https://github.com/rapidsai/cuvs/pull/466)) [@dantegd](https://github.com/dantegd) +- enforce wheel size limits, README formatting in CI ([#464](https://github.com/rapidsai/cuvs/pull/464)) [@jameslamb](https://github.com/jameslamb) +- Moving spectral embedding and kernel gramm APIs to cuVS ([#463](https://github.com/rapidsai/cuvs/pull/463)) [@cjnolet](https://github.com/cjnolet) +- Migrate sparse knn and distances code from raft ([#457](https://github.com/rapidsai/cuvs/pull/457)) [@benfred](https://github.com/benfred) +- Don't presume pointers location infers usability. ([#441](https://github.com/rapidsai/cuvs/pull/441)) [@robertmaynard](https://github.com/robertmaynard) +- call `enable_testing` in root CMakeLists.txt ([#437](https://github.com/rapidsai/cuvs/pull/437)) [@robertmaynard](https://github.com/robertmaynard) +- CAGRA tech debt: distance descriptor and workspace memory ([#436](https://github.com/rapidsai/cuvs/pull/436)) [@achirkin](https://github.com/achirkin) +- Add ci run_ scripts needed for build infra ([#434](https://github.com/rapidsai/cuvs/pull/434)) [@robertmaynard](https://github.com/robertmaynard) +- Use environment variables in cache hit rate computation. ([#422](https://github.com/rapidsai/cuvs/pull/422)) [@bdice](https://github.com/bdice) +- Use dashes in cuvs-bench package name. ([#417](https://github.com/rapidsai/cuvs/pull/417)) [@bdice](https://github.com/bdice) +- We need to enable the c_api by default ([#416](https://github.com/rapidsai/cuvs/pull/416)) [@robertmaynard](https://github.com/robertmaynard) +- print sccache stats in builds ([#413](https://github.com/rapidsai/cuvs/pull/413)) [@jameslamb](https://github.com/jameslamb) +- make conda installs in CI stricter ([#406](https://github.com/rapidsai/cuvs/pull/406)) [@jameslamb](https://github.com/jameslamb) +- Ivf c example ([#404](https://github.com/rapidsai/cuvs/pull/404)) [@abner-ma](https://github.com/abner-ma) +- Prune workflows based on changed files ([#392](https://github.com/rapidsai/cuvs/pull/392)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- [WIP] Add pinned memory resource to C API ([#311](https://github.com/rapidsai/cuvs/pull/311)) [@ajit283](https://github.com/ajit283) +- Dynamic Batching ([#261](https://github.com/rapidsai/cuvs/pull/261)) [@achirkin](https://github.com/achirkin) + # cuvs 24.10.00 (9 Oct 2024) ## 🐛 Bug Fixes diff --git a/cpp/include/cuvs/neighbors/common.hpp b/cpp/include/cuvs/neighbors/common.hpp index 60b8cc122..bd9ea4834 100644 --- a/cpp/include/cuvs/neighbors/common.hpp +++ b/cpp/include/cuvs/neighbors/common.hpp @@ -264,6 +264,77 @@ auto make_strided_dataset(const raft::resources& res, const SrcT& src, uint32_t return std::make_unique<out_owning_type>(std::move(out_array), out_layout); } +/** + * @brief Contstruct a strided matrix from any mdarray. + * + * This function constructs an owning device matrix and copies the data. + * When the data is copied, padding elements are filled with zeroes. + * + * @tparam DataT + * @tparam IdxT + * @tparam LayoutPolicy + * @tparam ContainerPolicy + * + * @param[in] res raft resources handle + * @param[in] src the source mdarray or mdspan + * @param[in] required_stride the leading dimension (in elements) + * @return owning current-device-accessible strided matrix + */ +template <typename DataT, typename IdxT, typename LayoutPolicy, typename ContainerPolicy> +auto make_strided_dataset( + const raft::resources& res, + raft::mdarray<DataT, raft::matrix_extent<IdxT>, LayoutPolicy, ContainerPolicy>&& src, + uint32_t required_stride) -> std::unique_ptr<strided_dataset<DataT, IdxT>> +{ + using value_type = DataT; + using index_type = IdxT; + using layout_type = LayoutPolicy; + using container_policy_type = ContainerPolicy; + static_assert(std::is_same_v<layout_type, raft::layout_right> || + std::is_same_v<layout_type, raft::layout_right_padded<value_type>> || + std::is_same_v<layout_type, raft::layout_stride>, + "The input must be row-major"); + RAFT_EXPECTS(src.extent(1) <= required_stride, + "The input row length must be not larger than the desired stride."); + const uint32_t src_stride = src.stride(0) > 0 ? src.stride(0) : src.extent(1); + const bool stride_matches = required_stride == src_stride; + + auto out_layout = + raft::make_strided_layout(src.extents(), std::array<index_type, 2>{required_stride, 1}); + + using out_mdarray_type = raft::device_matrix<value_type, index_type>; + using out_layout_type = typename out_mdarray_type::layout_type; + using out_container_policy_type = typename out_mdarray_type::container_policy_type; + using out_owning_type = + owning_dataset<value_type, index_type, out_layout_type, out_container_policy_type>; + + if constexpr (std::is_same_v<layout_type, out_layout_type> && + std::is_same_v<container_policy_type, out_container_policy_type>) { + if (stride_matches) { + // Everything matches, we can own the mdarray + return std::make_unique<out_owning_type>(std::move(src), out_layout); + } + } + // Something is wrong: have to make a copy and produce an owning dataset + auto out_array = + raft::make_device_matrix<value_type, index_type>(res, src.extent(0), required_stride); + + RAFT_CUDA_TRY(cudaMemsetAsync(out_array.data_handle(), + 0, + out_array.size() * sizeof(value_type), + raft::resource::get_cuda_stream(res))); + RAFT_CUDA_TRY(cudaMemcpy2DAsync(out_array.data_handle(), + sizeof(value_type) * required_stride, + src.data_handle(), + sizeof(value_type) * src_stride, + sizeof(value_type) * src.extent(1), + src.extent(0), + cudaMemcpyDefault, + raft::resource::get_cuda_stream(res))); + + return std::make_unique<out_owning_type>(std::move(out_array), out_layout); +} + /** * @brief Contstruct a strided matrix from any mdarray or mdspan. * @@ -278,14 +349,15 @@ auto make_strided_dataset(const raft::resources& res, const SrcT& src, uint32_t * @return maybe owning current-device-accessible strided matrix */ template <typename SrcT> -auto make_aligned_dataset(const raft::resources& res, const SrcT& src, uint32_t align_bytes = 16) +auto make_aligned_dataset(const raft::resources& res, SrcT src, uint32_t align_bytes = 16) -> std::unique_ptr<strided_dataset<typename SrcT::value_type, typename SrcT::index_type>> { - using value_type = typename SrcT::value_type; + using source_type = std::remove_cv_t<std::remove_reference_t<SrcT>>; + using value_type = typename source_type::value_type; constexpr size_t kSize = sizeof(value_type); uint32_t required_stride = raft::round_up_safe<size_t>(src.extent(1) * kSize, std::lcm(align_bytes, kSize)) / kSize; - return make_strided_dataset(res, src, required_stride); + return make_strided_dataset(res, std::forward<SrcT>(src), required_stride); } /** * @brief VPQ compressed dataset. diff --git a/cpp/src/neighbors/detail/ann_utils.cuh b/cpp/src/neighbors/detail/ann_utils.cuh index 652d41c85..529356351 100644 --- a/cpp/src/neighbors/detail/ann_utils.cuh +++ b/cpp/src/neighbors/detail/ann_utils.cuh @@ -403,6 +403,17 @@ struct batch_load_iterator { /** A single batch of data residing in device memory. */ struct batch { + ~batch() noexcept + { + /* + If there's no copy, there's no allocation owned by the batch. + If there's no allocation, there's no guarantee that the device pointer is stream-ordered. + If there's no stream order guarantee, we must synchronize with the stream before the batch is + destroyed to make sure all GPU operations in that stream finish earlier. + */ + if (!does_copy()) { RAFT_CUDA_TRY_NO_THROW(cudaStreamSynchronize(stream_)); } + } + /** Logical width of a single row in a batch, in elements of type `T`. */ [[nodiscard]] auto row_width() const -> size_type { return row_width_; } /** Logical offset of the batch, in rows (`row_width()`) */ diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp index 0f8309328..9f95c5b1c 100644 --- a/cpp/src/neighbors/detail/cagra/utils.hpp +++ b/cpp/src/neighbors/detail/cagra/utils.hpp @@ -179,7 +179,7 @@ class device_matrix_view_from_host { public: device_matrix_view_from_host(raft::resources const& res, raft::host_matrix_view<T, IdxT> host_view) - : host_view_(host_view) + : res_(res), host_view_(host_view) { cudaPointerAttributes attr; RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, host_view.data_handle())); @@ -199,6 +199,17 @@ class device_matrix_view_from_host { } } + ~device_matrix_view_from_host() noexcept + { + /* + If there's no copy, there's no allocation owned by this struct. + If there's no allocation, there's no guarantee that the device pointer is stream-ordered. + If there's no stream order guarantee, we must synchronize with the stream before the struct is + destroyed to make sure all GPU operations in that stream finish earlier. + */ + if (!allocated_memory()) { raft::resource::sync_stream(res_); } + } + raft::device_matrix_view<T, IdxT> view() { return raft::make_device_matrix_view<T, IdxT>( @@ -207,9 +218,10 @@ class device_matrix_view_from_host { T* data_handle() { return device_ptr; } - bool allocated_memory() const { return device_mem_.has_value(); } + [[nodiscard]] bool allocated_memory() const { return device_mem_.has_value(); } private: + const raft::resources& res_; std::optional<raft::device_matrix<T, IdxT>> device_mem_; raft::host_matrix_view<T, IdxT> host_view_; T* device_ptr; diff --git a/cpp/src/neighbors/detail/dataset_serialize.hpp b/cpp/src/neighbors/detail/dataset_serialize.hpp index 40d9df930..0ecc2cf5d 100644 --- a/cpp/src/neighbors/detail/dataset_serialize.hpp +++ b/cpp/src/neighbors/detail/dataset_serialize.hpp @@ -140,7 +140,7 @@ auto deserialize_strided(raft::resources const& res, std::istream& is) auto stride = raft::deserialize_scalar<uint32_t>(res, is); auto host_array = raft::make_host_matrix<DataT, IdxT>(n_rows, dim); raft::deserialize_mdspan(res, is, host_array.view()); - return make_strided_dataset(res, host_array, stride); + return make_strided_dataset(res, std::move(host_array), stride); } template <typename MathT, typename IdxT> diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh index 8d5701439..c1cd3ca09 100644 --- a/cpp/test/neighbors/ann_cagra.cuh +++ b/cpp/test/neighbors/ann_cagra.cuh @@ -389,12 +389,13 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> { (const DataT*)database.data(), ps.n_rows, ps.dim); { + std::optional<raft::host_matrix<DataT, int64_t>> database_host{std::nullopt}; cagra::index<DataT, IdxT> index(handle_, index_params.metric); if (ps.host_dataset) { - auto database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim); - raft::copy(database_host.data_handle(), database.data(), database.size(), stream_); + database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim); + raft::copy(database_host->data_handle(), database.data(), database.size(), stream_); auto database_host_view = raft::make_host_matrix_view<const DataT, int64_t>( - (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim); + (const DataT*)database_host->data_handle(), ps.n_rows, ps.dim); index = cagra::build(handle_, index_params, database_host_view); } else { @@ -567,13 +568,16 @@ class AnnCagraAddNodesTest : public ::testing::TestWithParam<AnnCagraInputs> { auto initial_database_view = raft::make_device_matrix_view<const DataT, int64_t>( (const DataT*)database.data(), initial_database_size, ps.dim); + std::optional<raft::host_matrix<DataT, int64_t>> database_host{std::nullopt}; cagra::index<DataT, IdxT> index(handle_); if (ps.host_dataset) { - auto database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim); + database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim); raft::copy( - database_host.data_handle(), database.data(), initial_database_view.size(), stream_); + database_host->data_handle(), database.data(), initial_database_view.size(), stream_); auto database_host_view = raft::make_host_matrix_view<const DataT, int64_t>( - (const DataT*)database_host.data_handle(), initial_database_size, ps.dim); + (const DataT*)database_host->data_handle(), initial_database_size, ps.dim); + // NB: database_host must live no less than the index, because the index _may_be_ + // non-onwning index = cagra::build(handle_, index_params, database_host_view); } else { index = cagra::build(handle_, index_params, initial_database_view); @@ -763,12 +767,13 @@ class AnnCagraFilterTest : public ::testing::TestWithParam<AnnCagraInputs> { auto database_view = raft::make_device_matrix_view<const DataT, int64_t>( (const DataT*)database.data(), ps.n_rows, ps.dim); + std::optional<raft::host_matrix<DataT, int64_t>> database_host{std::nullopt}; cagra::index<DataT, IdxT> index(handle_); if (ps.host_dataset) { - auto database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim); - raft::copy(database_host.data_handle(), database.data(), database.size(), stream_); + database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim); + raft::copy(database_host->data_handle(), database.data(), database.size(), stream_); auto database_host_view = raft::make_host_matrix_view<const DataT, int64_t>( - (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim); + (const DataT*)database_host->data_handle(), ps.n_rows, ps.dim); index = cagra::build(handle_, index_params, database_host_view); } else { index = cagra::build(handle_, index_params, database_view);