diff --git a/cpp/src/prims/key_store.cuh b/cpp/src/prims/key_store.cuh index 6d135b4e94e..907ca36ef4a 100644 --- a/cpp/src/prims/key_store.cuh +++ b/cpp/src/prims/key_store.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,6 +35,8 @@ namespace cugraph { namespace detail { +using cuco_storage_type = cuco::storage<1>; ///< cuco window storage type + template struct key_binary_search_contains_op_t { using key_type = typename thrust::iterator_traits::value_type; @@ -70,9 +72,8 @@ struct key_binary_search_store_device_view_t { template struct key_cuco_store_contains_device_view_t { - using key_type = typename ViewType::key_type; - using cuco_store_device_ref_type = - typename ViewType::cuco_store_type::ref_type; + using key_type = typename ViewType::key_type; + using cuco_store_device_ref_type = typename ViewType::cuco_set_type::ref_type; static_assert(!ViewType::binary_search); @@ -88,9 +89,8 @@ struct key_cuco_store_contains_device_view_t { template struct key_cuco_store_insert_device_view_t { - using key_type = typename ViewType::key_type; - using cuco_store_device_ref_type = - typename ViewType::cuco_store_type::ref_type; + using key_type = typename ViewType::key_type; + using cuco_store_device_ref_type = typename ViewType::cuco_set_type::ref_type; static_assert(!ViewType::binary_search); @@ -147,16 +147,17 @@ class key_cuco_store_view_t { static constexpr bool binary_search = false; - using cuco_store_type = cuco::experimental::static_set< - key_t, - cuco::experimental::extent, - cuda::thread_scope_device, - thrust::equal_to, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>, - rmm::mr::stream_allocator_adaptor>>; + using cuco_set_type = + cuco::static_set, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>, + cuco_storage_type>; - key_cuco_store_view_t(cuco_store_type const* store) : cuco_store_(store) {} + key_cuco_store_view_t(cuco_set_type const* store) : cuco_store_(store) {} template void contains(QueryKeyIterator key_first, @@ -167,17 +168,14 @@ class key_cuco_store_view_t { cuco_store_->contains(key_first, key_last, value_first, stream); } - auto cuco_store_contains_device_ref() const - { - return cuco_store_->ref(cuco::experimental::contains); - } + auto cuco_store_contains_device_ref() const { return cuco_store_->ref(cuco::contains); } - auto cuco_store_insert_device_ref() const { return cuco_store_->ref(cuco::experimental::insert); } + auto cuco_store_insert_device_ref() const { return cuco_store_->ref(cuco::insert); } key_t invalid_key() const { return cuco_store_->get_empty_key_sentinel(); } private: - cuco_store_type const* cuco_store_{}; + cuco_set_type const* cuco_store_{}; }; template @@ -240,14 +238,15 @@ class key_cuco_store_t { public: using key_type = key_t; - using cuco_store_type = cuco::experimental::static_set< - key_t, - cuco::experimental::extent, - cuda::thread_scope_device, - thrust::equal_to, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>, - rmm::mr::stream_allocator_adaptor>>; + using cuco_set_type = + cuco::static_set, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>, + cuco_storage_type>; key_cuco_store_t(rmm::cuda_stream_view stream) {} @@ -306,7 +305,7 @@ class key_cuco_store_t { return keys; } - cuco_store_type const* cuco_store_ptr() const { return cuco_store_.get(); } + cuco_set_type const* cuco_store_ptr() const { return cuco_store_.get(); } key_t invalid_key() const { return cuco_store_->empty_key_sentinel(); } @@ -324,17 +323,19 @@ class key_cuco_store_t { auto stream_adapter = rmm::mr::make_stream_allocator_adaptor( rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()), stream); - cuco_store_ = std::make_unique( - cuco_size, - cuco::sentinel::empty_key{invalid_key}, - thrust::equal_to{}, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>{}, - stream_adapter, - stream.value()); + cuco_store_ = + std::make_unique(cuco_size, + cuco::sentinel::empty_key{invalid_key}, + thrust::equal_to{}, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>{}, + cuco::thread_scope_device, + cuco_storage_type{}, + stream_adapter, + stream.value()); } - std::unique_ptr cuco_store_{nullptr}; + std::unique_ptr cuco_store_{nullptr}; size_t capacity_{0}; size_t size_{0}; // caching as cuco_store_->size() is expensive (this scans the entire slots to diff --git a/cpp/src/prims/kv_store.cuh b/cpp/src/prims/kv_store.cuh index f17441ad6ab..be4fde2fbff 100644 --- a/cpp/src/prims/kv_store.cuh +++ b/cpp/src/prims/kv_store.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -49,6 +49,8 @@ namespace cugraph { namespace detail { +using cuco_storage_type = cuco::storage<1>; ///< cuco window storage type + template struct kv_binary_search_find_op_t { using key_type = typename thrust::iterator_traits::value_type; @@ -86,18 +88,19 @@ struct kv_binary_search_contains_op_t { template struct kv_cuco_insert_and_increment_t { - using key_type = typename thrust::iterator_traits::value_type; - using cuco_store_type = cuco::experimental::static_map< - key_type, - size_t, - cuco::experimental::extent, - cuda::thread_scope_device, - thrust::equal_to, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>, - rmm::mr::stream_allocator_adaptor>>; - - typename cuco_store_type::ref_type device_ref{}; + using key_type = typename thrust::iterator_traits::value_type; + using cuco_set_type = + cuco::static_map, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>, + cuco_storage_type>; + + typename cuco_set_type::ref_type device_ref{}; KeyIterator key_first{}; size_t* counter{nullptr}; size_t invalid_idx{}; @@ -109,7 +112,7 @@ struct kv_cuco_insert_and_increment_t { if (inserted) { cuda::atomic_ref atomic_counter(*counter); auto idx = atomic_counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed); - using ref_type = typename cuco_store_type::ref_type; + using ref_type = typename cuco_set_type::ref_type; cuda::atomic_ref ref( (*iter).second); ref.store(idx, cuda::std::memory_order_relaxed); @@ -122,18 +125,19 @@ struct kv_cuco_insert_and_increment_t { template struct kv_cuco_insert_if_and_increment_t { - using key_type = typename thrust::iterator_traits::value_type; - using cuco_store_type = cuco::experimental::static_map< - key_type, - size_t, - cuco::experimental::extent, - cuda::thread_scope_device, - thrust::equal_to, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>, - rmm::mr::stream_allocator_adaptor>>; - - typename cuco_store_type::ref_type device_ref{}; + using key_type = typename thrust::iterator_traits::value_type; + using cuco_set_type = + cuco::static_map, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>, + cuco_storage_type>; + + typename cuco_set_type::ref_type device_ref{}; KeyIterator key_first{}; StencilIterator stencil_first{}; PredOp pred_op{}; @@ -149,7 +153,7 @@ struct kv_cuco_insert_if_and_increment_t { if (inserted) { cuda::atomic_ref atomic_counter(*counter); auto idx = atomic_counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed); - using ref_type = typename cuco_store_type::ref_type; + using ref_type = typename cuco_set_type::ref_type; cuda::atomic_ref ref( (*iter).second); ref.store(idx, cuda::std::memory_order_relaxed); @@ -162,23 +166,24 @@ struct kv_cuco_insert_if_and_increment_t { template struct kv_cuco_insert_and_assign_t { - using cuco_store_type = cuco::experimental::static_map< - key_t, - std::conditional_t, value_t, size_t>, - cuco::experimental::extent, - cuda::thread_scope_device, - thrust::equal_to, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>, - rmm::mr::stream_allocator_adaptor>>; - - typename cuco_store_type::ref_type device_ref{}; + using cuco_set_type = + cuco::static_map, value_t, size_t>, + cuco::extent, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>, + cuco_storage_type>; + + typename cuco_set_type::ref_type device_ref{}; __device__ void operator()(thrust::tuple pair) { auto [iter, inserted] = device_ref.insert_and_find(pair); if (!inserted) { - using ref_type = typename cuco_store_type::ref_type; + using ref_type = typename cuco_set_type::ref_type; cuda::atomic_ref ref( (*iter).second); ref.store(thrust::get<1>(pair), cuda::std::memory_order_relaxed); @@ -220,10 +225,9 @@ struct kv_binary_search_store_device_view_t { template struct kv_cuco_store_find_device_view_t { - using key_type = typename ViewType::key_type; - using value_type = typename ViewType::value_type; - using cuco_store_device_ref_type = - typename ViewType::cuco_store_type::ref_type; + using key_type = typename ViewType::key_type; + using value_type = typename ViewType::value_type; + using cuco_store_device_ref_type = typename ViewType::cuco_set_type::ref_type; static_assert(!ViewType::binary_search); @@ -336,25 +340,26 @@ class kv_cuco_store_view_t { static constexpr bool binary_search = false; - using cuco_store_type = cuco::experimental::static_map< - key_t, - std::conditional_t, value_type, size_t>, - cuco::experimental::extent, - cuda::thread_scope_device, - thrust::equal_to, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>, - rmm::mr::stream_allocator_adaptor>>; + using cuco_set_type = + cuco::static_map, value_type, size_t>, + cuco::extent, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>, + cuco_storage_type>; template - kv_cuco_store_view_t(cuco_store_type const* store, + kv_cuco_store_view_t(cuco_set_type const* store, std::enable_if_t, int32_t> = 0) : cuco_store_(store) { } template - kv_cuco_store_view_t(cuco_store_type const* store, + kv_cuco_store_view_t(cuco_set_type const* store, ValueIterator value_first, type invalid_value, std::enable_if_t, int32_t> = 0) @@ -392,7 +397,7 @@ class kv_cuco_store_view_t { cuco_store_->contains(key_first, key_last, value_first, stream.value()); } - auto cuco_store_find_device_ref() const { return cuco_store_->ref(cuco::experimental::find); } + auto cuco_store_find_device_ref() const { return cuco_store_->ref(cuco::find); } template std::enable_if_t, ValueIterator> store_value_first() const @@ -412,7 +417,7 @@ class kv_cuco_store_view_t { } private: - cuco_store_type const* cuco_store_{}; + cuco_set_type const* cuco_store_{}; std::conditional_t, ValueIterator, std::byte /* dummy */> store_value_first_{}; @@ -531,15 +536,16 @@ class kv_cuco_store_t { std::invoke_result_t), value_buffer_type&>; - using cuco_store_type = cuco::experimental::static_map< - key_t, - std::conditional_t, value_t, size_t>, - cuco::experimental::extent, - cuda::thread_scope_device, - thrust::equal_to, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>, - rmm::mr::stream_allocator_adaptor>>; + using cuco_set_type = + cuco::static_map, value_t, size_t>, + cuco::extent, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>, + cuco_storage_type>; kv_cuco_store_t(rmm::cuda_stream_view stream) {} @@ -588,7 +594,7 @@ class kv_cuco_store_t { // requires placing the atomic variable on managed memory and this adds additional // complication. rmm::device_scalar counter(old_store_value_size, stream); - auto mutable_device_ref = cuco_store_->ref(cuco::experimental::insert_and_find); + auto mutable_device_ref = cuco_store_->ref(cuco::insert_and_find); rmm::device_uvector store_value_offsets(num_keys, stream); thrust::tabulate( rmm::exec_policy(stream), @@ -629,7 +635,7 @@ class kv_cuco_store_t { // requires placing the atomic variable on managed memory and this adds additional // complication. rmm::device_scalar counter(old_store_value_size, stream); - auto mutable_device_ref = cuco_store_->ref(cuco::experimental::insert_and_find); + auto mutable_device_ref = cuco_store_->ref(cuco::insert_and_find); rmm::device_uvector store_value_offsets(num_keys, stream); thrust::tabulate(rmm::exec_policy(stream), store_value_offsets.begin(), @@ -665,8 +671,8 @@ class kv_cuco_store_t { if constexpr (std::is_arithmetic_v) { auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(key_first, value_first)); // FIXME: a temporary solution till insert_and_assign is added to - // cuco::experimental::static_map - auto mutable_device_ref = cuco_store_->ref(cuco::experimental::insert_and_find); + // cuco::static_map + auto mutable_device_ref = cuco_store_->ref(cuco::insert_and_find); thrust::for_each(rmm::exec_policy(stream), pair_first, pair_first + num_keys, @@ -679,7 +685,7 @@ class kv_cuco_store_t { // requires placing the atomic variable on managed memory and this adds additional // complication. rmm::device_scalar counter(old_store_value_size, stream); - auto mutable_device_ref = cuco_store_->ref(cuco::experimental::insert_and_find); + auto mutable_device_ref = cuco_store_->ref(cuco::insert_and_find); rmm::device_uvector store_value_offsets(num_keys, stream); thrust::tabulate( rmm::exec_policy(stream), @@ -731,20 +737,19 @@ class kv_cuco_store_t { })), stream); - thrust::for_each( - rmm::exec_policy(stream), - kv_indices.begin(), - kv_indices.end(), - [key_first, - value_first, - store_value_first = get_dataframe_buffer_begin(store_values_), - device_ref = cuco_store_->ref(cuco::experimental::find)] __device__(auto kv_idx) { - size_t store_value_offset{}; - auto found = device_ref.find(*(key_first + kv_idx)); - assert(found != device_ref.end()); - store_value_offset = (*found).second; - *(store_value_first + store_value_offset) = *(value_first + kv_idx); - }); + thrust::for_each(rmm::exec_policy(stream), + kv_indices.begin(), + kv_indices.end(), + [key_first, + value_first, + store_value_first = get_dataframe_buffer_begin(store_values_), + device_ref = cuco_store_->ref(cuco::find)] __device__(auto kv_idx) { + size_t store_value_offset{}; + auto found = device_ref.find(*(key_first + kv_idx)); + assert(found != device_ref.end()); + store_value_offset = (*found).second; + *(store_value_first + store_value_offset) = *(value_first + kv_idx); + }); } } @@ -783,7 +788,7 @@ class kv_cuco_store_t { return std::make_tuple(std::move(retrieved_keys), std::move(retrieved_values)); } - cuco_store_type const* cuco_store_ptr() const { return cuco_store_.get(); } + cuco_set_type const* cuco_store_ptr() const { return cuco_store_.get(); } template std::enable_if_t, const_value_iterator> store_value_first() const @@ -821,23 +826,25 @@ class kv_cuco_store_t { auto stream_adapter = rmm::mr::make_stream_allocator_adaptor( rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()), stream); if constexpr (std::is_arithmetic_v) { - cuco_store_ = std::make_unique( - cuco_size, - cuco::sentinel::empty_key{invalid_key}, - cuco::sentinel::empty_value{invalid_value}, - thrust::equal_to{}, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>{}, - stream_adapter, - stream.value()); + cuco_store_ = + std::make_unique(cuco_size, + cuco::sentinel::empty_key{invalid_key}, + cuco::sentinel::empty_value{invalid_value}, + thrust::equal_to{}, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>{}, + cuco::thread_scope_device, + cuco_storage_type{}, + stream_adapter, + stream.value()); } else { - cuco_store_ = std::make_unique( + cuco_store_ = std::make_unique( cuco_size, cuco::sentinel::empty_key{invalid_key}, cuco::sentinel::empty_value{std::numeric_limits::max()}, thrust::equal_to{}, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>{}, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>{}, stream_adapter, stream); store_values_ = allocate_dataframe_buffer(0, stream); @@ -845,7 +852,7 @@ class kv_cuco_store_t { } } - std::unique_ptr cuco_store_{nullptr}; + std::unique_ptr cuco_store_{nullptr}; std::conditional_t, decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})), std::byte /* dummy */> @@ -857,7 +864,7 @@ class kv_cuco_store_t { size_t size_{ 0}; // caching as cuco_store_->size() is expensive (this scans the entire slots to handle // user inserts through a device reference (and currently this is an upper bound (this - // will become exact once we fully switch to cuco::experimental::static_map and use the + // will become exact once we fully switch to cuco::static_map and use the // static_map class's insert_and_assign function; this function will be added soon) };