diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 25475fc6f2..eaab637338 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -565,7 +565,6 @@ if(RAFT_COMPILE_LIBRARY) src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu - src/util/memory_pool.cpp ) set_target_properties( raft_objs diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h index 40c1ecfa5e..72a2c0bb05 100644 --- a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h +++ b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -130,8 +131,8 @@ class configured_raft_resources { { } - configured_raft_resources(configured_raft_resources&&) = default; - configured_raft_resources& operator=(configured_raft_resources&&) = default; + configured_raft_resources(configured_raft_resources&&) = delete; + configured_raft_resources& operator=(configured_raft_resources&&) = delete; ~configured_raft_resources() = default; configured_raft_resources(const configured_raft_resources& res) : configured_raft_resources{res.shared_res_} diff --git a/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu b/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu index 709b08db76..d9ef1d74a3 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu +++ b/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu @@ -20,6 +20,7 @@ #include #include +#include #define JSON_DIAGNOSTICS 1 #include @@ -89,10 +90,11 @@ int main(int argc, char** argv) // and is initially sized to half of free device memory. rmm::mr::pool_memory_resource pool_mr{ &cuda_mr, rmm::percent_of_free_device_memory(50)}; - rmm::mr::set_current_device_resource( - &pool_mr); // Updates the current device resource pointer to `pool_mr` - rmm::mr::device_memory_resource* mr = - rmm::mr::get_current_device_resource(); // Points to `pool_mr` - return raft::bench::ann::run_main(argc, argv); + // Updates the current device resource pointer to `pool_mr` + auto old_mr = rmm::mr::set_current_device_resource(&pool_mr); + auto ret = raft::bench::ann::run_main(argc, argv); + // Restores the current device resource pointer to its previous value + rmm::mr::set_current_device_resource(old_mr); + return ret; } #endif diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h index 70fd22001e..46da8c52e6 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h @@ -36,7 +36,7 @@ #include #include -#include +#include #include #include @@ -138,7 +138,7 @@ class RaftCagra : public ANN, public AnnGPU { std::shared_ptr> dataset_; std::shared_ptr> input_dataset_v_; - inline rmm::mr::device_memory_resource* get_mr(AllocatorType mem_type) + inline rmm::device_async_resource_ref get_mr(AllocatorType mem_type) { switch (mem_type) { case (AllocatorType::HostPinned): return &mr_pinned_; diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h index 7f2996d77a..48d2b9de80 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h @@ -134,7 +134,14 @@ void RaftIvfFlatGpu::search( const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const { static_assert(sizeof(size_t) == sizeof(IdxT), "IdxT is incompatible with size_t"); - raft::neighbors::ivf_flat::search( - handle_, search_params_, *index_, queries, batch_size, 
k, (IdxT*)neighbors, distances); + raft::neighbors::ivf_flat::search(handle_, + search_params_, + *index_, + queries, + batch_size, + k, + (IdxT*)neighbors, + distances, + resource::get_workspace_resource(handle_)); } } // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h index 5d8b682264..1d73bd2e51 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h @@ -32,9 +32,6 @@ #include #include -#include -#include - #include namespace raft::bench::ann { diff --git a/cpp/bench/prims/common/benchmark.hpp b/cpp/bench/prims/common/benchmark.hpp index 4ecad6df3d..3ce43cc1e7 100644 --- a/cpp/bench/prims/common/benchmark.hpp +++ b/cpp/bench/prims/common/benchmark.hpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include diff --git a/cpp/bench/prims/matrix/gather.cu b/cpp/bench/prims/matrix/gather.cu index 078f9e6198..876e47525c 100644 --- a/cpp/bench/prims/matrix/gather.cu +++ b/cpp/bench/prims/matrix/gather.cu @@ -24,6 +24,7 @@ #include #include +#include #include namespace raft::bench::matrix { diff --git a/cpp/bench/prims/neighbors/knn.cuh b/cpp/bench/prims/neighbors/knn.cuh index aea7168142..6499078623 100644 --- a/cpp/bench/prims/neighbors/knn.cuh +++ b/cpp/bench/prims/neighbors/knn.cuh @@ -27,10 +27,12 @@ #include #include +#include #include #include #include #include +#include #include @@ -101,7 +103,7 @@ struct device_resource { if (managed_) { delete res_; } } - [[nodiscard]] auto get() const -> rmm::mr::device_memory_resource* { return res_; } + [[nodiscard]] auto get() const -> rmm::device_async_resource_ref { return res_; } private: const bool managed_; @@ -158,8 +160,15 @@ struct ivf_flat_knn { IdxT* out_idxs) { search_params.n_probes = 20; - raft::neighbors::ivf_flat::search( - handle, search_params, *index, search_items, ps.n_queries, ps.k, out_idxs, out_dists); + raft::neighbors::ivf_flat::search(handle, + search_params, + *index, + search_items, + ps.n_queries, + ps.k, + out_idxs, + out_dists, + resource::get_workspace_resource(handle)); } }; diff --git a/cpp/bench/prims/random/subsample.cu b/cpp/bench/prims/random/subsample.cu index 4c8ca2bf31..70a9c65e0d 100644 --- a/cpp/bench/prims/random/subsample.cu +++ b/cpp/bench/prims/random/subsample.cu @@ -27,6 +27,7 @@ #include #include +#include #include #include diff --git a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh index 6d3f430e88..0a5a3ba5aa 100644 --- a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh @@ -43,15 +43,14 @@ #include #include -#include -#include #include -#include +#include #include #include #include +#include #include #include @@ -91,7 +90,7 @@ inline std::enable_if_t> predict_core( const MathT* dataset_norm, IdxT n_rows, LabelT* labels, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto stream = resource::get_cuda_stream(handle); switch (params.metric) { @@ -263,10 +262,9 @@ void calc_centers_and_sizes(const raft::resources& handle, const LabelT* labels, bool reset_counters, MappingOpT mapping_op, - rmm::mr::device_memory_resource* mr = nullptr) + rmm::device_async_resource_ref mr) { auto stream = resource::get_cuda_stream(handle); - if (mr == nullptr) { mr = resource::get_workspace_resource(handle); } if (!reset_counters) { raft::linalg::matrixVectorOp( @@ -322,12 +320,12 @@ void compute_norm(const 
raft::resources& handle, IdxT dim, IdxT n_rows, MappingOpT mapping_op, - rmm::mr::device_memory_resource* mr = nullptr) + std::optional mr = std::nullopt) { common::nvtx::range fun_scope("compute_norm"); auto stream = resource::get_cuda_stream(handle); - if (mr == nullptr) { mr = resource::get_workspace_resource(handle); } - rmm::device_uvector mapped_dataset(0, stream, mr); + rmm::device_uvector mapped_dataset( + 0, stream, mr.value_or(resource::get_workspace_resource(handle))); const MathT* dataset_ptr = nullptr; @@ -338,7 +336,7 @@ void compute_norm(const raft::resources& handle, linalg::unaryOp(mapped_dataset.data(), dataset, n_rows * dim, mapping_op, stream); - dataset_ptr = (const MathT*)mapped_dataset.data(); + dataset_ptr = static_cast(mapped_dataset.data()); } raft::linalg::rowNorm( @@ -376,22 +374,22 @@ void predict(const raft::resources& handle, IdxT n_rows, LabelT* labels, MappingOpT mapping_op, - rmm::mr::device_memory_resource* mr = nullptr, - const MathT* dataset_norm = nullptr) + std::optional mr = std::nullopt, + const MathT* dataset_norm = nullptr) { auto stream = resource::get_cuda_stream(handle); common::nvtx::range fun_scope( "predict(%zu, %u)", static_cast(n_rows), n_clusters); - if (mr == nullptr) { mr = resource::get_workspace_resource(handle); } + auto mem_res = mr.value_or(resource::get_workspace_resource(handle)); auto [max_minibatch_size, _mem_per_row] = calc_minibatch_size(n_clusters, n_rows, dim, params.metric, std::is_same_v); rmm::device_uvector cur_dataset( - std::is_same_v ? 0 : max_minibatch_size * dim, stream, mr); + std::is_same_v ? 0 : max_minibatch_size * dim, stream, mem_res); bool need_compute_norm = dataset_norm == nullptr && (params.metric == raft::distance::DistanceType::L2Expanded || params.metric == raft::distance::DistanceType::L2SqrtExpanded); rmm::device_uvector cur_dataset_norm( - need_compute_norm ? max_minibatch_size : 0, stream, mr); + need_compute_norm ? max_minibatch_size : 0, stream, mem_res); const MathT* dataset_norm_ptr = nullptr; auto cur_dataset_ptr = cur_dataset.data(); for (IdxT offset = 0; offset < n_rows; offset += max_minibatch_size) { @@ -407,7 +405,7 @@ void predict(const raft::resources& handle, // Compute the norm now if it hasn't been pre-computed. 
if (need_compute_norm) { compute_norm( - handle, cur_dataset_norm.data(), cur_dataset_ptr, dim, minibatch_size, mapping_op, mr); + handle, cur_dataset_norm.data(), cur_dataset_ptr, dim, minibatch_size, mapping_op, mem_res); dataset_norm_ptr = cur_dataset_norm.data(); } else if (dataset_norm != nullptr) { dataset_norm_ptr = dataset_norm + offset; @@ -422,7 +420,7 @@ void predict(const raft::resources& handle, dataset_norm_ptr, minibatch_size, labels + offset, - mr); + mem_res); } } @@ -530,7 +528,7 @@ auto adjust_centers(MathT* centers, MathT threshold, MappingOpT mapping_op, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* device_memory) -> bool + rmm::device_async_resource_ref device_memory) -> bool { common::nvtx::range fun_scope( "adjust_centers(%zu, %u)", static_cast(n_rows), n_clusters); @@ -628,7 +626,7 @@ void balancing_em_iters(const raft::resources& handle, uint32_t balancing_pullback, MathT balancing_threshold, MappingOpT mapping_op, - rmm::mr::device_memory_resource* device_memory) + rmm::device_async_resource_ref device_memory) { auto stream = resource::get_cuda_stream(handle); uint32_t balancing_counter = balancing_pullback; @@ -711,7 +709,7 @@ void build_clusters(const raft::resources& handle, LabelT* cluster_labels, CounterT* cluster_sizes, MappingOpT mapping_op, - rmm::mr::device_memory_resource* device_memory, + rmm::device_async_resource_ref device_memory, const MathT* dataset_norm = nullptr) { auto stream = resource::get_cuda_stream(handle); @@ -853,8 +851,8 @@ auto build_fine_clusters(const raft::resources& handle, IdxT fine_clusters_nums_max, MathT* cluster_centers, MappingOpT mapping_op, - rmm::mr::device_memory_resource* managed_memory, - rmm::mr::device_memory_resource* device_memory) -> IdxT + rmm::device_async_resource_ref managed_memory, + rmm::device_async_resource_ref device_memory) -> IdxT { auto stream = resource::get_cuda_stream(handle); rmm::device_uvector mc_trainset_ids_buf(mesocluster_size_max, stream, managed_memory); @@ -971,7 +969,7 @@ void build_hierarchical(const raft::resources& handle, // TODO: Remove the explicit managed memory- we shouldn't be creating this on the user's behalf. 
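Most call sites need no changes for these signature updates, because a pointer to a concrete resource (such as the `managed_memory` declared just below, or `rmm::mr::get_current_device_resource()`) converts implicitly to `rmm::device_async_resource_ref`. A minimal sketch of that conversion, with `make_scratch` as an illustrative helper that is not part of this patch:

#include <cstddef>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/managed_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

// Hypothetical helper: allocates through whatever resource the caller hands in.
void make_scratch(std::size_t n, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
{
  rmm::device_uvector<float> scratch(n, stream, mr);  // device_uvector accepts a resource_ref directly
  // ... use scratch ...
}

void caller(rmm::cuda_stream_view stream)
{
  rmm::mr::managed_memory_resource managed;
  make_scratch(1024, stream, &managed);                                // pointer converts implicitly
  make_scratch(1024, stream, rmm::mr::get_current_device_resource());  // so does the raw-pointer getter
}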
rmm::mr::managed_memory_resource managed_memory; - rmm::mr::device_memory_resource* device_memory = resource::get_workspace_resource(handle); + rmm::device_async_resource_ref device_memory = resource::get_workspace_resource(handle); auto [max_minibatch_size, mem_per_row] = calc_minibatch_size(n_clusters, n_rows, dim, params.metric, std::is_same_v); diff --git a/cpp/include/raft/cluster/kmeans_balanced.cuh b/cpp/include/raft/cluster/kmeans_balanced.cuh index 8cd7730814..a1a182608b 100644 --- a/cpp/include/raft/cluster/kmeans_balanced.cuh +++ b/cpp/include/raft/cluster/kmeans_balanced.cuh @@ -358,7 +358,8 @@ void calc_centers_and_sizes(const raft::resources& handle, X.extent(0), labels.data_handle(), reset_counters, - mapping_op); + mapping_op, + resource::get_workspace_resource(handle)); } } // namespace helpers diff --git a/cpp/include/raft/core/device_container_policy.hpp b/cpp/include/raft/core/device_container_policy.hpp index 8c6eff582b..18d8b77364 100644 --- a/cpp/include/raft/core/device_container_policy.hpp +++ b/cpp/include/raft/core/device_container_policy.hpp @@ -31,7 +31,8 @@ #include #include -#include +#include +#include #include @@ -117,7 +118,7 @@ class device_uvector { */ explicit device_uvector(std::size_t size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) : data_{size, stream, mr} { } @@ -164,19 +165,11 @@ class device_uvector_policy { public: auto create(raft::resources const& res, size_t n) -> container_type { - if (mr_ == nullptr) { - // NB: not using the workspace resource by default! - // The workspace resource is for short-lived temporary allocations. - return container_type(n, resource::get_cuda_stream(res)); - } else { - return container_type(n, resource::get_cuda_stream(res), mr_); - } + return container_type(n, resource::get_cuda_stream(res), mr_); } constexpr device_uvector_policy() = default; - constexpr explicit device_uvector_policy(rmm::mr::device_memory_resource* mr) noexcept : mr_(mr) - { - } + explicit device_uvector_policy(rmm::device_async_resource_ref mr) noexcept : mr_(mr) {} [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference { @@ -192,7 +185,7 @@ class device_uvector_policy { [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } private: - rmm::mr::device_memory_resource* mr_{nullptr}; + rmm::device_async_resource_ref mr_{rmm::mr::get_current_device_resource()}; }; } // namespace raft diff --git a/cpp/include/raft/core/device_mdarray.hpp b/cpp/include/raft/core/device_mdarray.hpp index 855642cd76..a34f6e2e02 100644 --- a/cpp/include/raft/core/device_mdarray.hpp +++ b/cpp/include/raft/core/device_mdarray.hpp @@ -21,6 +21,8 @@ #include #include +#include + #include namespace raft { @@ -107,7 +109,7 @@ template auto make_device_mdarray(raft::resources const& handle, - rmm::mr::device_memory_resource* mr, + rmm::device_async_resource_ref mr, extents exts) { using mdarray_t = device_mdarray; diff --git a/cpp/include/raft/core/device_resources.hpp b/cpp/include/raft/core/device_resources.hpp index 366e387fdd..496c65d91f 100644 --- a/cpp/include/raft/core/device_resources.hpp +++ b/cpp/include/raft/core/device_resources.hpp @@ -37,6 +37,7 @@ #include #include +#include #include diff --git a/cpp/include/raft/distance/detail/masked_nn.cuh b/cpp/include/raft/distance/detail/masked_nn.cuh index 3e3699766f..951e030cbd 100644 --- a/cpp/include/raft/distance/detail/masked_nn.cuh +++ 
b/cpp/include/raft/distance/detail/masked_nn.cuh @@ -256,9 +256,8 @@ void masked_l2_nn_impl(raft::resources const& handle, static_assert(P::Mblk == 64, "masked_l2_nn_impl only supports a policy with 64 rows per block."); // Get stream and workspace memory resource - rmm::mr::device_memory_resource* ws_mr = - dynamic_cast(resource::get_workspace_resource(handle)); auto stream = resource::get_cuda_stream(handle); + auto ws_mr = resource::get_workspace_resource(handle); // Acquire temporary buffers and initialize to zero: // 1) Adjacency matrix bitfield diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh index d580ea72c1..9f3be7ce0e 100644 --- a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh @@ -28,11 +28,18 @@ namespace raft { namespace linalg { namespace detail { -template +template struct ReductionThinPolicy { - static constexpr int LogicalWarpSize = warpSize; - static constexpr int RowsPerBlock = rpb; - static constexpr int ThreadsPerBlock = LogicalWarpSize * RowsPerBlock; + static_assert(tpb % warpSize == 0); + + static constexpr int LogicalWarpSize = warpSize; + static constexpr int ThreadsPerBlock = tpb; + static constexpr int RowsPerLogicalWarp = rpw; + static constexpr int NumLogicalWarps = ThreadsPerBlock / LogicalWarpSize; + static constexpr int RowsPerBlock = NumLogicalWarps * RowsPerLogicalWarp; + + // Whether D (run-time arg) will be smaller than warpSize (compile-time parameter) + static constexpr bool NoSequentialReduce = noLoop; }; template (blockIdx.x)); - if (i >= N) return; + /* The strategy to achieve near-SOL memory bandwidth differs based on D: + * - For small D, we need to process multiple rows per logical warp in order to have + * multiple loads per thread and increase bytes in flight and amortize latencies. + * - For large D, we start with a sequential reduction. The compiler partially unrolls + * that loop (e.g. first a loop of stride 16, then 8, 4, and 1). + */ + IdxType i0 = threadIdx.y + (Policy::RowsPerBlock * static_cast(blockIdx.x)); + if (i0 >= N) return; - OutType acc = init; - for (IdxType j = threadIdx.x; j < D; j += Policy::LogicalWarpSize) { - acc = reduce_op(acc, main_op(data[j + (D * i)], j)); + OutType acc[Policy::RowsPerLogicalWarp]; +#pragma unroll + for (int k = 0; k < Policy::RowsPerLogicalWarp; k++) { + acc[k] = init; } - acc = raft::logicalWarpReduce(acc, reduce_op); - if (threadIdx.x == 0) { + + if constexpr (Policy::NoSequentialReduce) { + IdxType j = threadIdx.x; + if (j < D) { +#pragma unroll + for (IdxType k = 0; k < Policy::RowsPerLogicalWarp; k++) { + // Only the first row is known to be within bounds. Clamp to avoid out-of-mem read. + const IdxType i = raft::min(i0 + k * Policy::NumLogicalWarps, N - 1); + acc[k] = reduce_op(acc[k], main_op(data[j + (D * i)], j)); + } + } + } else { + for (IdxType j = threadIdx.x; j < D; j += Policy::LogicalWarpSize) { +#pragma unroll + for (IdxType k = 0; k < Policy::RowsPerLogicalWarp; k++) { + const IdxType i = raft::min(i0 + k * Policy::NumLogicalWarps, N - 1); + acc[k] = reduce_op(acc[k], main_op(data[j + (D * i)], j)); + } + } + } + + /* This vector reduction has two benefits compared to naive separate reductions: + * - It avoids the LSU bottleneck when the number of columns is around 32 (e.g. for 32, 5 shuffles + * are required and there is no initial sequential reduction to amortize that cost). 
+ * - It distributes the outputs to multiple threads, enabling a coalesced store when the number of + * rows per logical warp and logical warp size are equal. + */ + raft::logicalWarpReduceVector( + acc, threadIdx.x, reduce_op); + + constexpr int reducOutVecWidth = + std::max(1, Policy::RowsPerLogicalWarp / Policy::LogicalWarpSize); + constexpr int reducOutGroupSize = + std::max(1, Policy::LogicalWarpSize / Policy::RowsPerLogicalWarp); + constexpr int reducNumGroups = Policy::LogicalWarpSize / reducOutGroupSize; + + if (threadIdx.x % reducOutGroupSize == 0) { + const int groupId = threadIdx.x / reducOutGroupSize; if (inplace) { - dots[i] = final_op(reduce_op(dots[i], acc)); +#pragma unroll + for (int k = 0; k < reducOutVecWidth; k++) { + const int reductionId = k * reducNumGroups + groupId; + const IdxType i = i0 + reductionId * Policy::NumLogicalWarps; + if (i < N) { dots[i] = final_op(reduce_op(dots[i], acc[k])); } + } } else { - dots[i] = final_op(acc); +#pragma unroll + for (int k = 0; k < reducOutVecWidth; k++) { + const int reductionId = k * reducNumGroups + groupId; + const IdxType i = i0 + reductionId * Policy::NumLogicalWarps; + if (i < N) { dots[i] = final_op(acc[k]); } + } } } } @@ -89,8 +149,12 @@ void coalescedReductionThin(OutType* dots, FinalLambda final_op = raft::identity_op()) { common::nvtx::range fun_scope( - "coalescedReductionThin<%d,%d>", Policy::LogicalWarpSize, Policy::RowsPerBlock); - dim3 threads(Policy::LogicalWarpSize, Policy::RowsPerBlock, 1); + "coalescedReductionThin<%d,%d,%d,%d>", + Policy::LogicalWarpSize, + Policy::ThreadsPerBlock, + Policy::RowsPerLogicalWarp, + static_cast(Policy::NoSequentialReduce)); + dim3 threads(Policy::LogicalWarpSize, Policy::NumLogicalWarps, 1); dim3 blocks(ceildiv(N, Policy::RowsPerBlock), 1, 1); coalescedReductionThinKernel <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); @@ -115,19 +179,28 @@ void coalescedReductionThinDispatcher(OutType* dots, FinalLambda final_op = raft::identity_op()) { if (D <= IdxType(2)) { - coalescedReductionThin>( + coalescedReductionThin>( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else if (D <= IdxType(4)) { - coalescedReductionThin>( + coalescedReductionThin>( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else if (D <= IdxType(8)) { - coalescedReductionThin>( + coalescedReductionThin>( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else if (D <= IdxType(16)) { - coalescedReductionThin>( + coalescedReductionThin>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } else if (D <= IdxType(32)) { + coalescedReductionThin>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } else if (D < IdxType(128)) { + coalescedReductionThin>( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else { - coalescedReductionThin>( + // For D=128 (included) and above, the 4x-unrolled loading loop is used + // and multiple rows per warp are counter-productive in terms of cache-friendliness + // and register use. 
+ coalescedReductionThin>( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } } @@ -319,10 +392,10 @@ void coalescedReductionThickDispatcher(OutType* dots, // Note: multiple elements per thread to take advantage of the sequential reduction and loop // unrolling if (D < IdxType(32768)) { - coalescedReductionThick, ReductionThinPolicy<32, 4>>( + coalescedReductionThick, ReductionThinPolicy<32, 128, 1>>( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else { - coalescedReductionThick, ReductionThinPolicy<32, 4>>( + coalescedReductionThick, ReductionThinPolicy<32, 128, 1>>( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } } diff --git a/cpp/include/raft/matrix/detail/select_k-ext.cuh b/cpp/include/raft/matrix/detail/select_k-ext.cuh index 506cbffcb9..6db1a5acac 100644 --- a/cpp/include/raft/matrix/detail/select_k-ext.cuh +++ b/cpp/include/raft/matrix/detail/select_k-ext.cuh @@ -20,9 +20,6 @@ #include #include // RAFT_EXPLICIT -#include // rmm:cuda_stream_view -#include // rmm::mr::device_memory_resource - #include // __half #include // uint32_t diff --git a/cpp/include/raft/matrix/detail/select_radix.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh index 83d4845c31..9480c8e202 100644 --- a/cpp/include/raft/matrix/detail/select_radix.cuh +++ b/cpp/include/raft/matrix/detail/select_radix.cuh @@ -29,9 +29,9 @@ #include #include +#include #include -#include -#include +#include #include #include @@ -894,14 +894,12 @@ void radix_topk(const T* in, unsigned grid_dim, int sm_cnt, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) { // TODO: is it possible to relax this restriction? static_assert(calc_num_passes() > 1); constexpr int num_buckets = calc_num_buckets(); - if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); } - auto kernel = radix_kernel; const size_t max_chunk_size = calc_chunk_size(batch_size, len, sm_cnt, kernel, false); @@ -1179,7 +1177,7 @@ void radix_topk_one_block(const T* in, const IdxT* len_i, int sm_cnt, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { static_assert(calc_num_passes() > 1); diff --git a/cpp/include/raft/matrix/detail/select_warpsort.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh index 2cb32585d5..7da659291c 100644 --- a/cpp/include/raft/matrix/detail/select_warpsort.cuh +++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh @@ -27,8 +27,9 @@ #include #include +#include #include -#include +#include #include #include @@ -1032,7 +1033,7 @@ void select_k_(int num_of_block, IdxT* out_idx, bool select_min, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { rmm::device_uvector tmp_val(num_of_block * k * batch_size, stream, mr); rmm::device_uvector tmp_idx(num_of_block * k * batch_size, stream, mr); diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh index d91e45257e..d63f865c39 100644 --- a/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh @@ -34,6 +34,8 @@ #include #include +#include + #include #include #include @@ -124,7 +126,7 @@ void build_knn_graph(raft::resources const& res, bool first = true; const auto start_clock = std::chrono::system_clock::now(); - rmm::mr::device_memory_resource* device_memory = 
raft::resource::get_workspace_resource(res); + rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource(res); raft::spatial::knn::detail::utils::batch_load_iterator vec_batches( dataset.data_handle(), diff --git a/cpp/include/raft/neighbors/detail/cagra/utils.hpp b/cpp/include/raft/neighbors/detail/cagra/utils.hpp index 265cbfdceb..ece95a7cb7 100644 --- a/cpp/include/raft/neighbors/detail/cagra/utils.hpp +++ b/cpp/include/raft/neighbors/detail/cagra/utils.hpp @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include @@ -261,9 +261,8 @@ template void copy_with_padding(raft::resources const& res, raft::device_matrix& dst, mdspan, row_major, data_accessor> src, - rmm::mr::device_memory_resource* mr = nullptr) + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) { - if (!mr) { mr = rmm::mr::get_current_device_resource(); } size_t padded_dim = round_up_safe(src.extent(1) * sizeof(T), 16) / sizeof(T); if ((dst.extent(0) != src.extent(0)) || (static_cast(dst.extent(1)) != padded_dim)) { diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh index 350b82ede7..c14b0e810f 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh @@ -20,6 +20,8 @@ #include // none_ivf_sample_filter #include // RAFT_EXPLICIT +#include + #include #include // uintX_t @@ -37,8 +39,8 @@ void search(raft::resources const& handle, uint32_t k, IdxT* neighbors, float* distances, - rmm::mr::device_memory_resource* mr = nullptr, - IvfSampleFilterT sample_filter = IvfSampleFilterT()) RAFT_EXPLICIT; + rmm::device_async_resource_ref mr, + IvfSampleFilterT sample_filter = IvfSampleFilterT()) RAFT_EXPLICIT; } // namespace raft::neighbors::ivf_flat::detail @@ -54,7 +56,7 @@ void search(raft::resources const& handle, uint32_t k, \ IdxT* neighbors, \ float* distances, \ - rmm::mr::device_memory_resource* mr, \ + rmm::device_async_resource_ref mr, \ IvfSampleFilterT sample_filter) instantiate_raft_neighbors_ivf_flat_detail_search( diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh index 441fb76b2f..388dd60f14 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh @@ -30,7 +30,7 @@ #include // none_ivf_sample_filter #include // utils::mapping -#include // rmm::device_memory_resource +#include namespace raft::neighbors::ivf_flat::detail { @@ -48,7 +48,7 @@ void search_impl(raft::resources const& handle, bool select_min, IdxT* neighbors, AccT* distances, - rmm::mr::device_memory_resource* search_mr, + rmm::device_async_resource_ref search_mr, IvfSampleFilterT sample_filter) { auto stream = resource::get_cuda_stream(handle); @@ -276,13 +276,12 @@ inline void search(raft::resources const& handle, uint32_t k, IdxT* neighbors, float* distances, - rmm::mr::device_memory_resource* mr = nullptr, - IvfSampleFilterT sample_filter = IvfSampleFilterT()) + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource(), + IvfSampleFilterT sample_filter = IvfSampleFilterT()) { common::nvtx::range fun_scope( "ivf_flat::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim()); - if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); } RAFT_EXPECTS(params.n_probes > 0, "n_probes (number of clusters to probe in the search) must be positive."); 
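Call sites in this diff now forward the handle's workspace resource explicitly instead of relying on a null-pointer default. A minimal sketch of such a caller against the raw-pointer overload shown here (the wrapper name is illustrative, and the resource accessor header path is an assumption):

#include <cstdint>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/neighbors/ivf_flat.cuh>

// Illustrative caller: the index, query buffer and output buffers are assumed to be
// built/allocated elsewhere.
template <typename T, typename IdxT>
void search_with_workspace(raft::resources const& handle,
                           raft::neighbors::ivf_flat::search_params const& params,
                           raft::neighbors::ivf_flat::index<T, IdxT> const& index,
                           const T* queries,
                           uint32_t n_queries,
                           uint32_t k,
                           IdxT* neighbors,
                           float* distances)
{
  // Scratch allocations inside search are drawn from the handle's workspace resource.
  raft::neighbors::ivf_flat::search(handle,
                                    params,
                                    index,
                                    queries,
                                    n_queries,
                                    k,
                                    neighbors,
                                    distances,
                                    raft::resource::get_workspace_resource(handle));
}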
auto n_probes = std::min(params.n_probes, index.n_lists()); diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index 8e3f7dbaf3..24574642ef 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -171,7 +172,7 @@ void select_residuals(raft::resources const& handle, const float* center, // [dim] const T* dataset, // [.., dim] const IdxT* row_ids, // [n_rows] - rmm::mr::device_memory_resource* device_memory + rmm::device_async_resource_ref device_memory ) { @@ -225,7 +226,7 @@ void flat_compute_residuals( device_matrix_view centers, // [n_lists, dim_ext] const T* dataset, // [n_rows, dim] std::variant labels, // [n_rows] - rmm::mr::device_memory_resource* device_memory) + rmm::device_async_resource_ref device_memory) { auto stream = resource::get_cuda_stream(handle); auto dim = rotation_matrix.extent(1); @@ -397,7 +398,7 @@ void train_per_subset(raft::resources const& handle, const float* trainset, // [n_rows, dim] const uint32_t* labels, // [n_rows] uint32_t kmeans_n_iters, - rmm::mr::device_memory_resource* managed_memory) + rmm::device_async_resource_ref managed_memory) { auto stream = resource::get_cuda_stream(handle); auto device_memory = resource::get_workspace_resource(handle); @@ -475,7 +476,7 @@ void train_per_cluster(raft::resources const& handle, const float* trainset, // [n_rows, dim] const uint32_t* labels, // [n_rows] uint32_t kmeans_n_iters, - rmm::mr::device_memory_resource* managed_memory) + rmm::device_async_resource_ref managed_memory) { auto stream = resource::get_cuda_stream(handle); auto device_memory = resource::get_workspace_resource(handle); @@ -1325,7 +1326,7 @@ void process_and_fill_codes(raft::resources const& handle, std::variant src_offset_or_indices, const uint32_t* new_labels, IdxT n_rows, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto new_vectors_residual = make_device_mdarray(handle, mr, make_extents(n_rows, index.rot_dim())); @@ -1516,7 +1517,7 @@ void extend(raft::resources const& handle, std::is_same_v, "Unsupported data type"); - rmm::mr::device_memory_resource* device_memory = raft::resource::get_workspace_resource(handle); + rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource(handle); // The spec defines how the clusters look like auto spec = list_spec{ diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh index 4c5da38092..87e6d0a774 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh @@ -45,8 +45,7 @@ #include #include -#include -#include +#include #include #include @@ -76,7 +75,7 @@ void select_clusters(raft::resources const& handle, raft::distance::DistanceType metric, const T* queries, // [n_queries, dim] const float* cluster_centers, // [n_lists, dim_ext] - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { common::nvtx::range fun_scope( "ivf_pq::search::select_clusters(n_probes = %u, n_queries = %u, n_lists = %u, dim = %u)", diff --git a/cpp/include/raft/neighbors/detail/knn_brute_force.cuh b/cpp/include/raft/neighbors/detail/knn_brute_force.cuh index adcb639301..daa2798b00 100644 --- a/cpp/include/raft/neighbors/detail/knn_brute_force.cuh +++ b/cpp/include/raft/neighbors/detail/knn_brute_force.cuh @@ -38,7 +38,6 @@ 
#include #include -#include #include #include diff --git a/cpp/include/raft/neighbors/ivf_flat-ext.cuh b/cpp/include/raft/neighbors/ivf_flat-ext.cuh index a1783dfcfd..12ab0dc3a6 100644 --- a/cpp/include/raft/neighbors/ivf_flat-ext.cuh +++ b/cpp/include/raft/neighbors/ivf_flat-ext.cuh @@ -22,7 +22,7 @@ #include // raft::neighbors::ivf_flat::index #include // RAFT_EXPLICIT -#include // rmm::mr::device_memory_resource +#include #include // int64_t @@ -109,8 +109,8 @@ void search_with_filtering(raft::resources const& handle, uint32_t k, IdxT* neighbors, float* distances, - rmm::mr::device_memory_resource* mr = nullptr, - IvfSampleFilterT sample_filter = IvfSampleFilterT()) RAFT_EXPLICIT; + rmm::device_async_resource_ref mr, + IvfSampleFilterT sample_filter = IvfSampleFilterT()) RAFT_EXPLICIT; template void search(raft::resources const& handle, @@ -121,7 +121,7 @@ void search(raft::resources const& handle, uint32_t k, IdxT* neighbors, float* distances, - rmm::mr::device_memory_resource* mr = nullptr) RAFT_EXPLICIT; + rmm::device_async_resource_ref mr) RAFT_EXPLICIT; template void search_with_filtering(raft::resources const& handle, @@ -240,7 +240,7 @@ instantiate_raft_neighbors_ivf_flat_extend(uint8_t, int64_t); uint32_t k, \ IdxT* neighbors, \ float* distances, \ - rmm::mr::device_memory_resource* mr); \ + rmm::device_async_resource_ref mr); \ \ extern template void raft::neighbors::ivf_flat::search( \ raft::resources const& handle, \ diff --git a/cpp/include/raft/neighbors/ivf_flat-inl.cuh b/cpp/include/raft/neighbors/ivf_flat-inl.cuh index ed1d320795..ea7cff7060 100644 --- a/cpp/include/raft/neighbors/ivf_flat-inl.cuh +++ b/cpp/include/raft/neighbors/ivf_flat-inl.cuh @@ -24,7 +24,7 @@ #include #include -#include +#include namespace raft::neighbors::ivf_flat { @@ -462,8 +462,8 @@ void search_with_filtering(raft::resources const& handle, uint32_t k, IdxT* neighbors, float* distances, - rmm::mr::device_memory_resource* mr = nullptr, - IvfSampleFilterT sample_filter = IvfSampleFilterT()) + rmm::device_async_resource_ref mr, + IvfSampleFilterT sample_filter = IvfSampleFilterT()) { raft::neighbors::ivf_flat::detail::search( handle, params, index, queries, n_queries, k, neighbors, distances, mr, sample_filter); @@ -520,7 +520,7 @@ void search(raft::resources const& handle, uint32_t k, IdxT* neighbors, float* distances, - rmm::mr::device_memory_resource* mr = nullptr) + rmm::device_async_resource_ref mr) { raft::neighbors::ivf_flat::detail::search(handle, params, diff --git a/cpp/include/raft/neighbors/ivf_pq-ext.cuh b/cpp/include/raft/neighbors/ivf_pq-ext.cuh index 160a2753a5..620f4a244f 100644 --- a/cpp/include/raft/neighbors/ivf_pq-ext.cuh +++ b/cpp/include/raft/neighbors/ivf_pq-ext.cuh @@ -21,8 +21,6 @@ #include // raft::neighbors::ivf_pq::index #include // RAFT_EXPLICIT -#include // rmm::mr::device_memory_resource - #include // int64_t #ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY @@ -105,33 +103,6 @@ void search(raft::resources const& handle, IdxT* neighbors, float* distances) RAFT_EXPLICIT; -template -[[deprecated( - "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void -search_with_filtering(raft::resources const& handle, - const raft::neighbors::ivf_pq::search_params& params, - const index& idx, - const T* queries, - uint32_t n_queries, - uint32_t k, - IdxT* neighbors, - float* distances, - rmm::mr::device_memory_resource* mr, - IvfSampleFilterT sample_filter = IvfSampleFilterT{}) RAFT_EXPLICIT; - -template -[[deprecated( - "Drop the `mr` argument and use 
`raft::resource::set_workspace_resource` instead")]] void -search(raft::resources const& handle, - const raft::neighbors::ivf_pq::search_params& params, - const index& idx, - const T* queries, - uint32_t n_queries, - uint32_t k, - IdxT* neighbors, - float* distances, - rmm::mr::device_memory_resource* mr) RAFT_EXPLICIT; - } // namespace raft::neighbors::ivf_pq #endif // RAFT_EXPLICIT_INSTANTIATE_ONLY @@ -209,8 +180,7 @@ instantiate_raft_neighbors_ivf_pq_extend(uint8_t, int64_t); uint32_t n_queries, \ uint32_t k, \ IdxT* neighbors, \ - float* distances, \ - rmm::mr::device_memory_resource* mr); \ + float* distances); \ \ extern template void raft::neighbors::ivf_pq::search( \ raft::resources const& handle, \ diff --git a/cpp/include/raft/neighbors/ivf_pq-inl.cuh b/cpp/include/raft/neighbors/ivf_pq-inl.cuh index a893153e1a..77c4bb8553 100644 --- a/cpp/include/raft/neighbors/ivf_pq-inl.cuh +++ b/cpp/include/raft/neighbors/ivf_pq-inl.cuh @@ -24,8 +24,6 @@ #include #include -#include - #include // shared_ptr namespace raft::neighbors::ivf_pq { @@ -403,38 +401,6 @@ void search_with_filtering(raft::resources const& handle, detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances, sample_filter); } -/** - * This function is deprecated and will be removed in a future. - * Please drop the `mr` argument and use `raft::resource::set_workspace_resource` instead. - */ -template -[[deprecated( - "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void -search_with_filtering(raft::resources const& handle, - const search_params& params, - const index& idx, - const T* queries, - uint32_t n_queries, - uint32_t k, - IdxT* neighbors, - float* distances, - rmm::mr::device_memory_resource* mr, - IvfSampleFilterT sample_filter = IvfSampleFilterT{}) -{ - if (mr != nullptr) { - // Shallow copy of the resource with the automatic lifespan: - // change the workspace resource temporarily - raft::resources res_local(handle); - resource::set_workspace_resource( - res_local, std::shared_ptr{mr, void_op{}}); - return search_with_filtering( - res_local, params, idx, queries, n_queries, k, neighbors, distances, sample_filter); - } else { - return search_with_filtering( - handle, params, idx, queries, n_queries, k, neighbors, distances, sample_filter); - } -} - /** * @brief Search ANN using the constructed index. * @@ -446,16 +412,13 @@ search_with_filtering(raft::resources const& handle, * eliminate entirely allocations happening within `search`: * @code{.cpp} * ... - * // Create a pooling memory resource with a pre-defined initial size. - * rmm::mr::pool_memory_resource mr( - * rmm::mr::get_current_device_resource(), 1024 * 1024); * // use default search parameters * ivf_pq::search_params search_params; * // Use the same allocator across multiple searches to reduce the number of * // cuda memory allocations - * ivf_pq::search(handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, &mr); - * ivf_pq::search(handle, search_params, index, queries2, N2, K, out_inds2, out_dists2, &mr); - * ivf_pq::search(handle, search_params, index, queries3, N3, K, out_inds3, out_dists3, &mr); + * ivf_pq::search(handle, search_params, index, queries1, N1, K, out_inds1, out_dists1); + * ivf_pq::search(handle, search_params, index, queries2, N2, K, out_inds2, out_dists2); + * ivf_pq::search(handle, search_params, index, queries3, N3, K, out_inds3, out_dists3); * ... 
* @endcode * The exact size of the temporary buffer depends on multiple factors and is an implementation @@ -496,33 +459,4 @@ void search(raft::resources const& handle, raft::neighbors::filtering::none_ivf_sample_filter{}); } -/** - * This function is deprecated and will be removed in a future. - * Please drop the `mr` argument and use `raft::resource::set_workspace_resource` instead. - */ -template -[[deprecated( - "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void -search(raft::resources const& handle, - const search_params& params, - const index& idx, - const T* queries, - uint32_t n_queries, - uint32_t k, - IdxT* neighbors, - float* distances, - rmm::mr::device_memory_resource* mr) -{ - return search_with_filtering(handle, - params, - idx, - queries, - n_queries, - k, - neighbors, - distances, - mr, - raft::neighbors::filtering::none_ivf_sample_filter{}); -} - } // namespace raft::neighbors::ivf_pq diff --git a/cpp/include/raft/random/detail/multi_variable_gaussian.cuh b/cpp/include/raft/random/detail/multi_variable_gaussian.cuh index e88cbbdeea..c33bb8c348 100644 --- a/cpp/include/raft/random/detail/multi_variable_gaussian.cuh +++ b/cpp/include/raft/random/detail/multi_variable_gaussian.cuh @@ -31,10 +31,10 @@ #include #include - -#include +#include #include +#include #include #include #include @@ -278,7 +278,7 @@ class multi_variable_gaussian_setup_token; template multi_variable_gaussian_setup_token build_multi_variable_gaussian_token_impl( raft::resources const& handle, - rmm::mr::device_memory_resource& mem_resource, + rmm::device_async_resource_ref mem_resource, const int dim, const multi_variable_gaussian_decomposition_method method); @@ -294,7 +294,7 @@ class multi_variable_gaussian_setup_token { template friend multi_variable_gaussian_setup_token build_multi_variable_gaussian_token_impl( raft::resources const& handle, - rmm::mr::device_memory_resource& mem_resource, + rmm::device_async_resource_ref mem_resource, const int dim, const multi_variable_gaussian_decomposition_method method); @@ -321,7 +321,7 @@ class multi_variable_gaussian_setup_token { // Constructor, only for use by friend functions. // Hiding this will let us change the implementation in the future. 
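Because `rmm::device_async_resource_ref` is a lightweight, non-owning view, it can be stored by value, as this token does with `mem_resource_`, provided the referenced resource outlives the holder. A small sketch of that pattern, with `scratch_owner` as an illustrative class name:

#include <cstddef>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/resource_ref.hpp>

class scratch_owner {
 public:
  explicit scratch_owner(rmm::device_async_resource_ref mr) : mr_{mr} {}

  auto allocate(std::size_t n, rmm::cuda_stream_view stream)
  {
    // Allocation goes through whatever resource the ref points at; the caller
    // keeps that resource alive for the lifetime of this object.
    return rmm::device_uvector<char>{n, stream, mr_};
  }

 private:
  rmm::device_async_resource_ref mr_;  // non-owning view, cheap to copy
};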
multi_variable_gaussian_setup_token(raft::resources const& handle, - rmm::mr::device_memory_resource& mem_resource, + rmm::device_async_resource_ref mem_resource, const int dim, const multi_variable_gaussian_decomposition_method method) : impl_(std::make_unique>( @@ -378,14 +378,14 @@ class multi_variable_gaussian_setup_token { private: std::unique_ptr> impl_; raft::resources const& handle_; - rmm::mr::device_memory_resource& mem_resource_; + rmm::device_async_resource_ref mem_resource_; int dim_ = 0; auto allocate_workspace() const { const auto num_elements = impl_->get_workspace_size(); return rmm::device_uvector{ - num_elements, resource::get_cuda_stream(handle_), &mem_resource_}; + num_elements, resource::get_cuda_stream(handle_), mem_resource_}; } int dim() const { return dim_; } @@ -394,7 +394,7 @@ class multi_variable_gaussian_setup_token { template multi_variable_gaussian_setup_token build_multi_variable_gaussian_token_impl( raft::resources const& handle, - rmm::mr::device_memory_resource& mem_resource, + rmm::device_async_resource_ref mem_resource, const int dim, const multi_variable_gaussian_decomposition_method method) { @@ -414,7 +414,7 @@ void compute_multi_variable_gaussian_impl( template void compute_multi_variable_gaussian_impl( raft::resources const& handle, - rmm::mr::device_memory_resource& mem_resource, + rmm::device_async_resource_ref mem_resource, std::optional> x, raft::device_matrix_view P, raft::device_matrix_view X, diff --git a/cpp/include/raft/random/multi_variable_gaussian.cuh b/cpp/include/raft/random/multi_variable_gaussian.cuh index ab3f433422..4b37e1ff65 100644 --- a/cpp/include/raft/random/multi_variable_gaussian.cuh +++ b/cpp/include/raft/random/multi_variable_gaussian.cuh @@ -24,6 +24,8 @@ #include #include +#include + namespace raft::random { /** @@ -33,7 +35,7 @@ namespace raft::random { template void multi_variable_gaussian(raft::resources const& handle, - rmm::mr::device_memory_resource& mem_resource, + rmm::device_async_resource_ref mem_resource, std::optional> x, raft::device_matrix_view P, raft::device_matrix_view X, @@ -49,12 +51,8 @@ void multi_variable_gaussian(raft::resources const& handle, raft::device_matrix_view X, const multi_variable_gaussian_decomposition_method method) { - rmm::mr::device_memory_resource* mem_resource_ptr = rmm::mr::get_current_device_resource(); - RAFT_EXPECTS(mem_resource_ptr != nullptr, - "compute_multi_variable_gaussian: " - "rmm::mr::get_current_device_resource() returned null; " - "please report this bug to the RAPIDS RAFT developers."); - detail::compute_multi_variable_gaussian_impl(handle, *mem_resource_ptr, x, P, X, method); + detail::compute_multi_variable_gaussian_impl( + handle, rmm::mr::get_current_device_resource(), x, P, X, method); } /** @} */ diff --git a/cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh b/cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh index 08bdfa6f30..922356b040 100644 --- a/cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh +++ b/cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh @@ -21,9 +21,6 @@ #include #include // RAFT_EXPLICIT -#include // rmm:cuda_stream_view -#include // rmm::mr::device_memory_resource - #include // __half #include // uint32_t diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh index 041ab225f9..351bcd5531 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh @@ -108,8 +108,15 @@ void 
approx_knn_search(raft::resources const& handle, if (index->ivf_flat()) { ivf_flat::search_params params; params.n_probes = index->nprobe; - ivf_flat::search( - handle, params, *(index->ivf_flat()), query_array, n, k, indices, distances); + ivf_flat::search(handle, + params, + *(index->ivf_flat()), + query_array, + n, + k, + indices, + distances, + resource::get_workspace_resource(handle)); } else if (index->ivf_pq) { neighbors::ivf_pq::search_params params; params.n_probes = index->nprobe; diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh index d862e586e3..920249172f 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -416,7 +417,7 @@ struct batch_load_iterator { size_type row_width, size_type batch_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) : stream_(stream), buf_(0, stream, mr), source_(source), @@ -502,7 +503,7 @@ struct batch_load_iterator { size_type row_width, size_type batch_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) : cur_batch_(new batch(source, n_rows, row_width, batch_size, stream, mr)), cur_pos_(0) { } diff --git a/cpp/include/raft/util/cudart_utils.hpp b/cpp/include/raft/util/cudart_utils.hpp index e5ce15e8a3..2b334d1bbf 100644 --- a/cpp/include/raft/util/cudart_utils.hpp +++ b/cpp/include/raft/util/cudart_utils.hpp @@ -18,7 +18,6 @@ #include #include -#include #include diff --git a/cpp/include/raft/util/memory_pool-ext.hpp b/cpp/include/raft/util/memory_pool-ext.hpp deleted file mode 100644 index 030a9c681e..0000000000 --- a/cpp/include/raft/util/memory_pool-ext.hpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include // rmm::mr::device_memory_resource - -#include // size_t -#include // std::unique_ptr - -namespace raft { - -std::unique_ptr get_pool_memory_resource( - rmm::mr::device_memory_resource*& mr, size_t initial_size); - -} // namespace raft diff --git a/cpp/include/raft/util/memory_pool-inl.hpp b/cpp/include/raft/util/memory_pool-inl.hpp deleted file mode 100644 index bd7e0186b3..0000000000 --- a/cpp/include/raft/util/memory_pool-inl.hpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include // RAFT_INLINE_CONDITIONAL - -#include -#include -#include -#include - -#include -#include - -namespace raft { - -/** - * @defgroup memory_pool Memory Pool - * @{ - */ -/** - * @brief Get a pointer to a pooled memory resource within the scope of the lifetime of the returned - * unique pointer. - * - * This function is useful in the code where multiple repeated allocations/deallocations are - * expected. - * Use case example: - * @code{.cpp} - * void my_func(..., size_t n, rmm::mr::device_memory_resource* mr = nullptr) { - * auto pool_guard = raft::get_pool_memory_resource(mr, 2 * n * sizeof(float)); - * if (pool_guard){ - * RAFT_LOG_INFO("Created a pool"); - * } else { - * RAFT_LOG_INFO("Using the current default or explicitly passed device memory resource"); - * } - * rmm::device_uvector x(n, stream, mr); - * rmm::device_uvector y(n, stream, mr); - * ... - * } - * @endcode - * Here, the new memory resource would be created within the function scope if the passed `mr` is - * null and the default resource is not a pool. After the call, `mr` contains a valid memory - * resource in any case. - * - * @param[inout] mr if not null do nothing; otherwise get the current device resource and wrap it - * into a `pool_memory_resource` if necessary and return the pointer to the result. - * @param initial_size if a new memory pool is created, this would be its initial size (rounded up - * to 256 bytes). - * - * @return if a new memory pool is created, it returns a unique_ptr to it; - * this managed pointer controls the lifetime of the created memory resource. - */ -RAFT_INLINE_CONDITIONAL std::unique_ptr get_pool_memory_resource( - rmm::mr::device_memory_resource*& mr, size_t initial_size) -{ - using pool_res_t = rmm::mr::pool_memory_resource; - std::unique_ptr pool_res{nullptr}; - if (mr) return pool_res; - mr = rmm::mr::get_current_device_resource(); - if (!dynamic_cast(mr) && - !dynamic_cast*>(mr) && - !dynamic_cast*>(mr)) { - pool_res = std::make_unique( - mr, rmm::align_down(initial_size, rmm::CUDA_ALLOCATION_ALIGNMENT)); - mr = pool_res.get(); - } - return pool_res; -} - -/** @} */ -} // namespace raft diff --git a/cpp/include/raft/util/memory_pool.hpp b/cpp/include/raft/util/memory_pool.hpp deleted file mode 100644 index c9d25ecb1f..0000000000 --- a/cpp/include/raft/util/memory_pool.hpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "memory_pool-ext.hpp" - -#if !defined(RAFT_COMPILED) -#include "memory_pool-inl.hpp" -#endif // RAFT_COMPILED diff --git a/cpp/include/raft/util/pow2_utils.cuh b/cpp/include/raft/util/pow2_utils.cuh index 68b35837b6..0c740ac5f6 100644 --- a/cpp/include/raft/util/pow2_utils.cuh +++ b/cpp/include/raft/util/pow2_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,15 @@ namespace raft { +/** + * Checks whether an integer is a power of 2. + */ +template +constexpr HDI std::enable_if_t, bool> is_pow2(T v) +{ + return (v && !(v & (v - 1))); +} + /** * @brief Fast arithmetics and alignment checks for power-of-two values known at compile time. * @@ -33,7 +42,7 @@ struct Pow2 { static constexpr Type Mask = Value - 1; static_assert(std::is_integral::value, "Value must be integral."); - static_assert(Value && !(Value & Mask), "Value must be power of two."); + static_assert(is_pow2(Value), "Value must be power of two."); #define Pow2_FUNC_QUALIFIER static constexpr __host__ __device__ __forceinline__ #define Pow2_WHEN_INTEGRAL(I) std::enable_if_t diff --git a/cpp/include/raft/util/reduction.cuh b/cpp/include/raft/util/reduction.cuh index 2c2b1aa228..c0d3da7609 100644 --- a/cpp/include/raft/util/reduction.cuh +++ b/cpp/include/raft/util/reduction.cuh @@ -39,8 +39,8 @@ DI T logicalWarpReduce(T val, ReduceLambda reduce_op) { #pragma unroll for (int i = logicalWarpSize / 2; i > 0; i >>= 1) { - T tmp = shfl_xor(val, i); - val = reduce_op(val, tmp); + const T tmp = shfl_xor(val, i, logicalWarpSize); + val = reduce_op(val, tmp); } return val; } @@ -197,4 +197,104 @@ DI i_t binaryBlockReduce(i_t val, i_t* shmem) } } +/** + * @brief Executes a collaborative vector reduction per sub-warp + * + * This uses fewer shuffles than naively reducing each element independently. + * Better performance is achieved with a larger vector width, up to vecWidth == warpSize/2. + * For example, for logicalWarpSize == 32 and vecWidth == 16, the naive method requires 80 + * shuffles, this one only 31, 2.58x fewer. + * + * However, the output of the reduction is not broadcasted. The vector is modified in place and + * each thread holds a part of the output vector. The outputs are distributed in a round-robin + * pattern between the threads to facilitate coalesced IO. There are 2 possible layouts based on + * which of logicalWarpSize and vecWidth is larger: + * - If vecWidth >= logicalWarpSize, each thread has vecWidth/logicalWarpSize outputs. + * - If logicalWarpSize > vecWidth, logicalWarpSize/vecWidth threads have a copy of the same output. + * + * Example 1: logicalWarpSize == 4, vecWidth == 8, v = a+b+c+d + * IN OUT + * lane 0 | a0 a1 a2 a3 a4 a5 a6 a7 | v0 v4 - - - - - - + * lane 1 | b0 b1 b2 b3 b4 b5 b6 b7 | v1 v5 - - - - - - + * lane 2 | c0 c1 c2 c3 c4 c5 c6 c7 | v2 v6 - - - - - - + * lane 3 | d0 d1 d2 d3 d4 d5 d6 d7 | v3 v7 - - - - - - + * + * Example 2: logicalWarpSize == 8, vecWidth == 4, v = a+b+c+d+e+f+g+h + * IN OUT + * lane 0 | a0 a1 a2 a3 | v0 - - - + * lane 1 | b0 b1 b2 b3 | v0 - - - + * lane 2 | c0 c1 c2 c3 | v1 - - - + * lane 3 | d0 d1 d2 d3 | v1 - - - + * lane 4 | e0 e1 e2 e3 | v2 - - - + * lane 5 | f0 f1 f2 f3 | v2 - - - + * lane 6 | g0 g1 g2 g3 | v3 - - - + * lane 7 | h0 h1 h2 h3 | v3 - - - + * + * @tparam logicalWarpSize Sub-warp size. 
Must be 2, 4, 8, 16 or 32. + * @tparam vecWidth Vector width. Must be a power of two. + * @tparam T Vector element type. + * @tparam ReduceLambda Reduction operator type. + * @param[in,out] acc Pointer to a vector of size vecWidth or more in registers + * @param[in] lane_id Lane id between 0 and logicalWarpSize-1 + * @param[in] reduce_op Reduction operator, assumed to be commutative and associative. + */ +template +DI void logicalWarpReduceVector(T* acc, int lane_id, ReduceLambda reduce_op) +{ + static_assert(vecWidth > 0, "Vec width must be strictly positive."); + static_assert(!(vecWidth & (vecWidth - 1)), "Vec width must be a power of two."); + static_assert(logicalWarpSize >= 2 && logicalWarpSize <= 32, + "Logical warp size must be between 2 and 32"); + static_assert(!(logicalWarpSize & (logicalWarpSize - 1)), + "Logical warp size must be a power of two."); + + constexpr int shflStride = logicalWarpSize / 2; + constexpr int nextWarpSize = logicalWarpSize / 2; + + // One step of the butterfly reduction, applied to each element of the vector. +#pragma unroll + for (int k = 0; k < vecWidth; k++) { + const T tmp = shfl_xor(acc[k], shflStride, logicalWarpSize); + acc[k] = reduce_op(acc[k], tmp); + } + + constexpr int nextVecWidth = std::max(1, vecWidth / 2); + + /* Split into 2 smaller logical warps and distribute half of the data to each for the next step. + * The distribution pattern is designed so that at the end the outputs are coalesced/round-robin. + * The idea is to distribute contiguous "chunks" of the vectors based on the new warp size. These + * chunks will be halved in the next step and so on. + * + * Example for logicalWarpSize == 4, vecWidth == 8: + * lane 0 | 0 1 2 3 4 5 6 7 | [0 1] [4 5] - - - - | [0] [4] - - - - - - + * lane 1 | 0 1 2 3 4 5 6 7 | [0 1] [4 5] - - - - | [1] [5] - - - - - - + * lane 2 | 0 1 2 3 4 5 6 7 | [2 3] [6 7] - - - - | [2] [6] - - - - - - + * lane 3 | 0 1 2 3 4 5 6 7 | [2 3] [6 7] - - - - | [3] [7] - - - - - - + * chunkSize=2 chunkSize=1 + */ + if constexpr (nextVecWidth < vecWidth) { + T tmp[nextVecWidth]; + const bool firstHalf = (lane_id % logicalWarpSize) < nextWarpSize; + constexpr int chunkSize = std::min(nextVecWidth, nextWarpSize); + constexpr int numChunks = nextVecWidth / chunkSize; +#pragma unroll + for (int c = 0; c < numChunks; c++) { +#pragma unroll + for (int i = 0; i < chunkSize; i++) { + const int k = c * chunkSize + i; + tmp[k] = firstHalf ? acc[2 * c * chunkSize + i] : acc[(2 * c + 1) * chunkSize + i]; + } + } +#pragma unroll + for (int k = 0; k < nextVecWidth; k++) { + acc[k] = tmp[k]; + } + } + + // Recursively call with smaller sub-warps and possibly smaller vector width. 
+ if constexpr (nextWarpSize > 1) { + logicalWarpReduceVector(acc, lane_id % nextWarpSize, reduce_op); + } +} + } // namespace raft diff --git a/cpp/internal/raft_internal/neighbors/ivf_pq_search_test-ext.cuh b/cpp/internal/raft_internal/neighbors/ivf_pq_search_test-ext.cuh index 7a65e2d2f8..1e6f4f9976 100644 --- a/cpp/internal/raft_internal/neighbors/ivf_pq_search_test-ext.cuh +++ b/cpp/internal/raft_internal/neighbors/ivf_pq_search_test-ext.cuh @@ -25,6 +25,8 @@ #include +#include + #include // int64_t #define instantiate_raft_neighbors_ivf_pq_search(T, IdxT) \ @@ -44,8 +46,7 @@ uint32_t n_queries, \ uint32_t k, \ IdxT* neighbors, \ - float* distances, \ - rmm::mr::device_memory_resource* mr); \ + float* distances); \ \ extern template void raft::neighbors::ivf_pq::search( \ raft::resources const& handle, \ diff --git a/cpp/internal/raft_internal/neighbors/naive_knn.cuh b/cpp/internal/raft_internal/neighbors/naive_knn.cuh index 79206c7a43..c14a8e3e9f 100644 --- a/cpp/internal/raft_internal/neighbors/naive_knn.cuh +++ b/cpp/internal/raft_internal/neighbors/naive_knn.cuh @@ -23,9 +23,7 @@ #include #include -#include #include -#include namespace raft::neighbors { diff --git a/cpp/src/neighbors/detail/ivf_flat_search.cu b/cpp/src/neighbors/detail/ivf_flat_search.cu index 9d39607750..336bea19b6 100644 --- a/cpp/src/neighbors/detail/ivf_flat_search.cu +++ b/cpp/src/neighbors/detail/ivf_flat_search.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ #include #include +#include + #define instantiate_raft_neighbors_ivf_flat_detail_search(T, IdxT, IvfSampleFilterT) \ template void raft::neighbors::ivf_flat::detail::search( \ raft::resources const& handle, \ @@ -27,7 +29,7 @@ uint32_t k, \ IdxT* neighbors, \ float* distances, \ - rmm::mr::device_memory_resource* mr, \ + rmm::device_async_resource_ref mr, \ IvfSampleFilterT sample_filter) instantiate_raft_neighbors_ivf_flat_detail_search( diff --git a/cpp/src/neighbors/ivf_flat_00_generate.py b/cpp/src/neighbors/ivf_flat_00_generate.py index d987a4e17d..7b55cad4de 100644 --- a/cpp/src/neighbors/ivf_flat_00_generate.py +++ b/cpp/src/neighbors/ivf_flat_00_generate.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,7 @@ # limitations under the License. header = """/* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -127,8 +127,8 @@ search_macro = """ #define instantiate_raft_neighbors_ivf_flat_search(T, IdxT) \\ - template void raft::neighbors::ivf_flat::search( \\ - raft::resources const& handle, \\ + template void raft::neighbors::ivf_flat::search( \\ + raft::resources const& handle, \\ const raft::neighbors::ivf_flat::search_params& params, \\ const raft::neighbors::ivf_flat::index& index, \\ const T* queries, \\ @@ -136,10 +136,10 @@ uint32_t k, \\ IdxT* neighbors, \\ float* distances, \\ - rmm::mr::device_memory_resource* mr ); \\ + rmm::device_async_resource_ref mr); \\ \\ - template void raft::neighbors::ivf_flat::search( \\ - raft::resources const& handle, \\ + template void raft::neighbors::ivf_flat::search( \\ + raft::resources const& handle, \\ const raft::neighbors::ivf_flat::search_params& params, \\ const raft::neighbors::ivf_flat::index& index, \\ raft::device_matrix_view queries, \\ diff --git a/cpp/src/neighbors/ivf_flat_search_float_int64_t.cu b/cpp/src/neighbors/ivf_flat_search_float_int64_t.cu index 03dcfee817..e5cfe14e3f 100644 --- a/cpp/src/neighbors/ivf_flat_search_float_int64_t.cu +++ b/cpp/src/neighbors/ivf_flat_search_float_int64_t.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,8 @@ #include +#include + #define instantiate_raft_neighbors_ivf_flat_search(T, IdxT) \ template void raft::neighbors::ivf_flat::search( \ raft::resources const& handle, \ @@ -35,7 +37,7 @@ uint32_t k, \ IdxT* neighbors, \ float* distances, \ - rmm::mr::device_memory_resource* mr); \ + rmm::device_async_resource_ref mr); \ \ template void raft::neighbors::ivf_flat::search( \ raft::resources const& handle, \ diff --git a/cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu index 7646081183..35792a78a8 100644 --- a/cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu +++ b/cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,8 @@ #include +#include + #define instantiate_raft_neighbors_ivf_flat_search(T, IdxT) \ template void raft::neighbors::ivf_flat::search( \ raft::resources const& handle, \ @@ -35,7 +37,7 @@ uint32_t k, \ IdxT* neighbors, \ float* distances, \ - rmm::mr::device_memory_resource* mr); \ + rmm::device_async_resource_ref mr); \ \ template void raft::neighbors::ivf_flat::search( \ raft::resources const& handle, \ diff --git a/cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu index 5d2effd385..663e52cb99 100644 --- a/cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu +++ b/cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,6 +25,8 @@ #include +#include + #define instantiate_raft_neighbors_ivf_flat_search(T, IdxT) \ template void raft::neighbors::ivf_flat::search( \ raft::resources const& handle, \ @@ -35,7 +37,7 @@ uint32_t k, \ IdxT* neighbors, \ float* distances, \ - rmm::mr::device_memory_resource* mr); \ + rmm::device_async_resource_ref mr); \ \ template void raft::neighbors::ivf_flat::search( \ raft::resources const& handle, \ diff --git a/cpp/src/neighbors/ivfpq_search_float_int64_t.cu b/cpp/src/neighbors/ivfpq_search_float_int64_t.cu index e56c107735..2d15167099 100644 --- a/cpp/src/neighbors/ivfpq_search_float_int64_t.cu +++ b/cpp/src/neighbors/ivfpq_search_float_int64_t.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ #include #include // raft::neighbors::ivf_pq::index +#include + #define instantiate_raft_neighbors_ivf_pq_search(T, IdxT) \ template void raft::neighbors::ivf_pq::search( \ raft::resources const& handle, \ @@ -34,8 +36,7 @@ uint32_t n_queries, \ uint32_t k, \ IdxT* neighbors, \ - float* distances, \ - rmm::mr::device_memory_resource* mr) + float* distances) instantiate_raft_neighbors_ivf_pq_search(float, int64_t); diff --git a/cpp/src/neighbors/ivfpq_search_half_int64_t.cu b/cpp/src/neighbors/ivfpq_search_half_int64_t.cu index c9f2e6fdd5..c9a380e21f 100644 --- a/cpp/src/neighbors/ivfpq_search_half_int64_t.cu +++ b/cpp/src/neighbors/ivfpq_search_half_int64_t.cu @@ -17,6 +17,8 @@ #include #include // raft::neighbors::ivf_pq::index +#include + #include #define instantiate_raft_neighbors_ivf_pq_search(T, IdxT) \ @@ -36,8 +38,7 @@ uint32_t n_queries, \ uint32_t k, \ IdxT* neighbors, \ - float* distances, \ - rmm::mr::device_memory_resource* mr) + float* distances) instantiate_raft_neighbors_ivf_pq_search(half, int64_t); diff --git a/cpp/src/neighbors/ivfpq_search_int8_t_int64_t.cu b/cpp/src/neighbors/ivfpq_search_int8_t_int64_t.cu index 1efe4f7fb2..e85c98d8dd 100644 --- a/cpp/src/neighbors/ivfpq_search_int8_t_int64_t.cu +++ b/cpp/src/neighbors/ivfpq_search_int8_t_int64_t.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ #include #include // raft::neighbors::ivf_pq::index +#include + #define instantiate_raft_neighbors_ivf_pq_search(T, IdxT) \ template void raft::neighbors::ivf_pq::search( \ raft::resources const& handle, \ @@ -34,8 +36,7 @@ uint32_t n_queries, \ uint32_t k, \ IdxT* neighbors, \ - float* distances, \ - rmm::mr::device_memory_resource* mr) + float* distances) instantiate_raft_neighbors_ivf_pq_search(int8_t, int64_t); diff --git a/cpp/src/neighbors/ivfpq_search_uint8_t_int64_t.cu b/cpp/src/neighbors/ivfpq_search_uint8_t_int64_t.cu index e746391443..42653254e9 100644 --- a/cpp/src/neighbors/ivfpq_search_uint8_t_int64_t.cu +++ b/cpp/src/neighbors/ivfpq_search_uint8_t_int64_t.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,8 @@ #include #include // raft::neighbors::ivf_pq::index +#include + #define instantiate_raft_neighbors_ivf_pq_search(T, IdxT) \ template void raft::neighbors::ivf_pq::search( \ raft::resources const& handle, \ @@ -34,8 +36,7 @@ uint32_t n_queries, \ uint32_t k, \ IdxT* neighbors, \ - float* distances, \ - rmm::mr::device_memory_resource* mr) + float* distances) instantiate_raft_neighbors_ivf_pq_search(uint8_t, int64_t); diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 4d17aacffd..752dffdc16 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -188,7 +188,6 @@ if(BUILD_TESTS) test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu test/ext_headers/raft_distance_fused_l2_nn.cu test/ext_headers/raft_neighbors_ivf_pq.cu - test/ext_headers/raft_util_memory_pool.cpp test/ext_headers/raft_neighbors_ivf_flat.cu test/ext_headers/raft_core_logger.cpp test/ext_headers/raft_neighbors_refine.cu diff --git a/cpp/test/core/device_resources_manager.cpp b/cpp/test/core/device_resources_manager.cpp index b9b8996a09..c63d5896e5 100644 --- a/cpp/test/core/device_resources_manager.cpp +++ b/cpp/test/core/device_resources_manager.cpp @@ -115,16 +115,10 @@ TEST(DeviceResourcesManager, ObeysSetters) auto* mr = dynamic_cast*>( rmm::mr::get_current_device_resource()); - rmm::device_async_resource_ref workspace_mr = - dynamic_cast*>( - res.get_workspace_resource()) - ->get_upstream_resource(); + if (upstream_mrs[i % devices.size()] != nullptr) { // Expect that the current memory resource is a pool memory resource as requested EXPECT_NE(mr, nullptr); - - // We cannot easily check the type of a resource_ref - (void)workspace_mr; } { diff --git a/cpp/test/ext_headers/00_generate.py b/cpp/test/ext_headers/00_generate.py index 682cadbe89..d9c766979b 100644 --- a/cpp/test/ext_headers/00_generate.py +++ b/cpp/test/ext_headers/00_generate.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ copyright_notice = """ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -49,7 +49,6 @@ "raft/spatial/knn/detail/fused_l2_knn-ext.cuh", "raft/distance/fused_l2_nn-ext.cuh", "raft/neighbors/ivf_pq-ext.cuh", - "raft/util/memory_pool-ext.hpp", "raft/neighbors/ivf_flat-ext.cuh", "raft/core/logger-ext.hpp", "raft/neighbors/refine-ext.cuh", diff --git a/cpp/test/ext_headers/raft_util_memory_pool.cpp b/cpp/test/ext_headers/raft_util_memory_pool.cpp deleted file mode 100644 index 11a024b958..0000000000 --- a/cpp/test/ext_headers/raft_util_memory_pool.cpp +++ /dev/null @@ -1,27 +0,0 @@ - -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by 00_generate.py - * - * Make changes there and run in this directory: - * - * > python 00_generate.py - * - */ - -#include diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu index 2061f28d36..28f5ff5f60 100644 --- a/cpp/test/linalg/coalesced_reduction.cu +++ b/cpp/test/linalg/coalesced_reduction.cu @@ -39,7 +39,8 @@ struct coalescedReductionInputs { template ::std::ostream& operator<<(::std::ostream& os, const coalescedReductionInputs& dims) { - return os; + return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", " + << dims.seed; } // Or else, we get the following compilation error @@ -113,15 +114,40 @@ class coalescedReductionTest : public ::testing::TestWithParam dots_act; }; -const std::vector> inputsf = {{0.000002f, 1024, 32, 1234ULL}, - {0.000002f, 1024, 64, 1234ULL}, - {0.000002f, 1024, 128, 1234ULL}, - {0.000002f, 1024, 256, 1234ULL}}; - -const std::vector> inputsd = {{0.000000001, 1024, 32, 1234ULL}, - {0.000000001, 1024, 64, 1234ULL}, - {0.000000001, 1024, 128, 1234ULL}, - {0.000000001, 1024, 256, 1234ULL}}; +// Note: it's important to have a variety of rows/columns combinations to test all possible code +// paths: thin (few cols or many rows), medium, thick (many cols, very few rows). + +const std::vector> inputsf = {{0.000002f, 50, 2, 1234ULL}, + {0.000002f, 50, 3, 1234ULL}, + {0.000002f, 50, 7, 1234ULL}, + {0.000002f, 50, 9, 1234ULL}, + {0.000002f, 50, 20, 1234ULL}, + {0.000002f, 50, 55, 1234ULL}, + {0.000002f, 50, 100, 1234ULL}, + {0.000002f, 50, 270, 1234ULL}, + {0.000002f, 10000, 3, 1234ULL}, + {0.000002f, 10000, 9, 1234ULL}, + {0.000002f, 10000, 20, 1234ULL}, + {0.000002f, 10000, 55, 1234ULL}, + {0.000002f, 10000, 100, 1234ULL}, + {0.000002f, 10000, 270, 1234ULL}, + {0.0001f, 10, 25000, 1234ULL}}; + +const std::vector> inputsd = {{0.000000001, 50, 2, 1234ULL}, + {0.000000001, 50, 3, 1234ULL}, + {0.000000001, 50, 7, 1234ULL}, + {0.000000001, 50, 9, 1234ULL}, + {0.000000001, 50, 20, 1234ULL}, + {0.000000001, 50, 55, 1234ULL}, + {0.000000001, 50, 100, 1234ULL}, + {0.000000001, 50, 270, 1234ULL}, + {0.000000001, 10000, 3, 1234ULL}, + {0.000000001, 10000, 9, 1234ULL}, + {0.000000001, 10000, 20, 1234ULL}, + {0.000000001, 10000, 55, 1234ULL}, + {0.000000001, 10000, 100, 1234ULL}, + {0.000000001, 10000, 270, 1234ULL}, + {0.0000001, 10, 25000, 1234ULL}}; typedef coalescedReductionTest coalescedReductionTestF; TEST_P(coalescedReductionTestF, Result) diff --git a/cpp/test/matrix/select_k.cuh b/cpp/test/matrix/select_k.cuh index 7f9b7b3fc3..f22f4f5fa7 100644 --- a/cpp/test/matrix/select_k.cuh +++ b/cpp/test/matrix/select_k.cuh @@ -25,7 +25,6 @@ #include #include -#include #include diff --git a/cpp/test/neighbors/ann_ivf_pq/ivf_pq_search_float_uint32_t.cu b/cpp/test/neighbors/ann_ivf_pq/ivf_pq_search_float_uint32_t.cu index 942d0fcc44..00baa59f58 100644 --- a/cpp/test/neighbors/ann_ivf_pq/ivf_pq_search_float_uint32_t.cu +++ b/cpp/test/neighbors/ann_ivf_pq/ivf_pq_search_float_uint32_t.cu @@ -37,8 +37,7 @@ uint32_t n_queries, \ uint32_t k, \ IdxT* neighbors, \ - float* distances, \ - rmm::mr::device_memory_resource* mr) + float* distances) instantiate_raft_neighbors_ivf_pq_search(float, uint32_t); diff --git a/cpp/test/neighbors/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh index 3e0bead665..2139e97428 100644 --- a/cpp/test/neighbors/ann_utils.cuh +++ b/cpp/test/neighbors/ann_utils.cuh @@ -28,10 +28,6 @@ #include -#include -#include -#include - #include #include diff 
--git a/cpp/test/random/multi_variable_gaussian.cu b/cpp/test/random/multi_variable_gaussian.cu index 62bad8e543..bed9515a53 100644 --- a/cpp/test/random/multi_variable_gaussian.cu +++ b/cpp/test/random/multi_variable_gaussian.cu @@ -25,6 +25,7 @@ #include #include +#include #include @@ -287,10 +288,8 @@ class MVGMdspanTest : public ::testing::TestWithParam> { raft::device_matrix_view P_view(P_d.data(), dim, dim); raft::device_matrix_view X_view(X_d.data(), dim, nPoints); - rmm::mr::device_memory_resource* mem_resource_ptr = rmm::mr::get_current_device_resource(); - ASSERT_TRUE(mem_resource_ptr != nullptr); raft::random::multi_variable_gaussian( - handle, *mem_resource_ptr, x_view, P_view, X_view, method); + handle, rmm::mr::get_current_device_resource(), x_view, P_view, X_view, method); // saving the mean of the randoms in Rand_mean //@todo can be swapped with a API that calculates mean diff --git a/cpp/test/util/device_atomics.cu b/cpp/test/util/device_atomics.cu index c5bb0ad3b6..086d1f4152 100644 --- a/cpp/test/util/device_atomics.cu +++ b/cpp/test/util/device_atomics.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include
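A brief usage sketch for the new raft::logicalWarpReduceVector primitive added in cpp/include/raft/util/reduction.cuh (illustration only, not part of the patch). The kernel below assumes the template signature implied by the recursive call, raft::logicalWarpReduceVector<logicalWarpSize, vecWidth>(acc, lane_id, reduce_op), and assumes raft::add_op from raft/core/operators.hpp as the reduction operator. It mirrors Example 1 from the new doc comment: logical warps of 4 threads cooperatively sum 8-wide vectors, and each lane ends up holding two of the eight outputs in the round-robin layout described there.

// Hypothetical sketch: reduce each 4x8 tile of a row-major input into one 8-wide sum vector.
// Assumes blockDim.x is a multiple of 4 and the grid is sized so that every thread maps to a
// valid tile, keeping all lanes of each physical warp converged for the shuffles.
#include <raft/core/operators.hpp>   // raft::add_op (assumed reduction operator)
#include <raft/util/reduction.cuh>   // raft::logicalWarpReduceVector

__global__ void tile_sum_kernel(const float* in, float* out)
{
  constexpr int kLogicalWarp = 4;  // logicalWarpSize
  constexpr int kVecWidth    = 8;  // vecWidth
  const int lane = threadIdx.x % kLogicalWarp;
  const int tile = (blockIdx.x * blockDim.x + threadIdx.x) / kLogicalWarp;

  // Each lane loads its own 8-element slice of the 32-element tile.
  float acc[kVecWidth];
#pragma unroll
  for (int k = 0; k < kVecWidth; k++) {
    acc[k] = in[(tile * kLogicalWarp + lane) * kVecWidth + k];
  }

  // Element-wise sum of the four 8-wide vectors held by the logical warp.
  // Afterwards lane 0 holds v0 and v4, lane 1 holds v1 and v5, etc. (Example 1 above).
  raft::logicalWarpReduceVector<kLogicalWarp, kVecWidth>(acc, lane, raft::add_op{});

  // The round-robin output layout makes this store coalesced: out[tile * 8 + j] = v_j.
#pragma unroll
  for (int i = 0; i < kVecWidth / kLogicalWarp; i++) {
    out[tile * kVecWidth + i * kLogicalWarp + lane] = acc[i];
  }
}

Compared with reducing each of the eight elements independently via logicalWarpReduce, this uses the butterfly pattern of the new primitive, trading a broadcast result for fewer shuffles and a distributed, coalesced output.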