diff --git a/cpp/src/prims/key_store.cuh b/cpp/src/prims/key_store.cuh index 6d135b4e94e..3dd406a8930 100644 --- a/cpp/src/prims/key_store.cuh +++ b/cpp/src/prims/key_store.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -72,7 +72,7 @@ template struct key_cuco_store_contains_device_view_t { using key_type = typename ViewType::key_type; using cuco_store_device_ref_type = - typename ViewType::cuco_store_type::ref_type; + typename ViewType::cuco_store_type::ref_type; static_assert(!ViewType::binary_search); @@ -88,9 +88,8 @@ struct key_cuco_store_contains_device_view_t { template struct key_cuco_store_insert_device_view_t { - using key_type = typename ViewType::key_type; - using cuco_store_device_ref_type = - typename ViewType::cuco_store_type::ref_type; + using key_type = typename ViewType::key_type; + using cuco_store_device_ref_type = typename ViewType::cuco_store_type::ref_type; static_assert(!ViewType::binary_search); @@ -147,14 +146,14 @@ class key_cuco_store_view_t { static constexpr bool binary_search = false; - using cuco_store_type = cuco::experimental::static_set< - key_t, - cuco::experimental::extent, - cuda::thread_scope_device, - thrust::equal_to, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>, - rmm::mr::stream_allocator_adaptor>>; + using cuco_store_type = + cuco::static_set, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>>; key_cuco_store_view_t(cuco_store_type const* store) : cuco_store_(store) {} @@ -167,12 +166,9 @@ class key_cuco_store_view_t { cuco_store_->contains(key_first, key_last, value_first, stream); } - auto cuco_store_contains_device_ref() const - { - return cuco_store_->ref(cuco::experimental::contains); - } + auto cuco_store_contains_device_ref() const { return cuco_store_->ref(cuco::contains); } - auto cuco_store_insert_device_ref() const { return cuco_store_->ref(cuco::experimental::insert); } + auto cuco_store_insert_device_ref() const { return cuco_store_->ref(cuco::insert); } key_t invalid_key() const { return cuco_store_->get_empty_key_sentinel(); } @@ -240,14 +236,14 @@ class key_cuco_store_t { public: using key_type = key_t; - using cuco_store_type = cuco::experimental::static_set< - key_t, - cuco::experimental::extent, - cuda::thread_scope_device, - thrust::equal_to, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>, - rmm::mr::stream_allocator_adaptor>>; + using cuco_store_type = + cuco::static_set, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>>; key_cuco_store_t(rmm::cuda_stream_view stream) {} @@ -324,14 +320,14 @@ class key_cuco_store_t { auto stream_adapter = rmm::mr::make_stream_allocator_adaptor( rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()), stream); - cuco_store_ = std::make_unique( - cuco_size, - cuco::sentinel::empty_key{invalid_key}, - thrust::equal_to{}, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>{}, - stream_adapter, - stream.value()); + cuco_store_ = + std::make_unique(cuco_size, + cuco::sentinel::empty_key{invalid_key}, + thrust::equal_to{}, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>{}, + stream_adapter, + stream.value()); } std::unique_ptr cuco_store_{nullptr}; diff --git a/cpp/src/prims/kv_store.cuh b/cpp/src/prims/kv_store.cuh index f17441ad6ab..5333c8cb713 100644 --- a/cpp/src/prims/kv_store.cuh +++ b/cpp/src/prims/kv_store.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -84,20 +84,23 @@ struct kv_binary_search_contains_op_t { } }; +using cuco_storage_type = cuco::storage<1>; + template struct kv_cuco_insert_and_increment_t { - using key_type = typename thrust::iterator_traits::value_type; - using cuco_store_type = cuco::experimental::static_map< - key_type, - size_t, - cuco::experimental::extent, - cuda::thread_scope_device, - thrust::equal_to, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>, - rmm::mr::stream_allocator_adaptor>>; - - typename cuco_store_type::ref_type device_ref{}; + using key_type = typename thrust::iterator_traits::value_type; + using cuco_store_type = + cuco::static_map, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>, + cuco_storage_type>; + + typename cuco_store_type::ref_type device_ref{}; KeyIterator key_first{}; size_t* counter{nullptr}; size_t invalid_idx{}; @@ -109,7 +112,7 @@ struct kv_cuco_insert_and_increment_t { if (inserted) { cuda::atomic_ref atomic_counter(*counter); auto idx = atomic_counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed); - using ref_type = typename cuco_store_type::ref_type; + using ref_type = typename cuco_store_type::ref_type; cuda::atomic_ref ref( (*iter).second); ref.store(idx, cuda::std::memory_order_relaxed); @@ -122,18 +125,19 @@ struct kv_cuco_insert_and_increment_t { template struct kv_cuco_insert_if_and_increment_t { - using key_type = typename thrust::iterator_traits::value_type; - using cuco_store_type = cuco::experimental::static_map< - key_type, - size_t, - cuco::experimental::extent, - cuda::thread_scope_device, - thrust::equal_to, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>, - rmm::mr::stream_allocator_adaptor>>; - - typename cuco_store_type::ref_type device_ref{}; + using key_type = typename thrust::iterator_traits::value_type; + using cuco_store_type = + cuco::static_map, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>, + cuco_storage_type>; + + typename cuco_store_type::ref_type device_ref{}; KeyIterator key_first{}; StencilIterator stencil_first{}; PredOp pred_op{}; @@ -149,7 +153,7 @@ struct kv_cuco_insert_if_and_increment_t { if (inserted) { cuda::atomic_ref atomic_counter(*counter); auto idx = atomic_counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed); - using ref_type = typename cuco_store_type::ref_type; + using ref_type = typename cuco_store_type::ref_type; cuda::atomic_ref ref( (*iter).second); ref.store(idx, cuda::std::memory_order_relaxed); @@ -162,23 +166,23 @@ struct kv_cuco_insert_if_and_increment_t { template struct kv_cuco_insert_and_assign_t { - using cuco_store_type = cuco::experimental::static_map< - key_t, - std::conditional_t, value_t, size_t>, - cuco::experimental::extent, - cuda::thread_scope_device, - thrust::equal_to, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>, - rmm::mr::stream_allocator_adaptor>>; - - typename cuco_store_type::ref_type device_ref{}; + using cuco_store_type = + cuco::static_map, value_t, size_t>, + cuco::extent, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>>; + + typename cuco_store_type::ref_type device_ref{}; __device__ void operator()(thrust::tuple pair) { auto [iter, inserted] = device_ref.insert_and_find(pair); if (!inserted) { - using ref_type = typename cuco_store_type::ref_type; + using ref_type = typename cuco_store_type::ref_type; cuda::atomic_ref ref( (*iter).second); ref.store(thrust::get<1>(pair), cuda::std::memory_order_relaxed); @@ -220,10 +224,9 @@ struct kv_binary_search_store_device_view_t { template struct kv_cuco_store_find_device_view_t { - using key_type = typename ViewType::key_type; - using value_type = typename ViewType::value_type; - using cuco_store_device_ref_type = - typename ViewType::cuco_store_type::ref_type; + using key_type = typename ViewType::key_type; + using value_type = typename ViewType::value_type; + using cuco_store_device_ref_type = typename ViewType::cuco_store_type::ref_type; static_assert(!ViewType::binary_search); @@ -336,15 +339,15 @@ class kv_cuco_store_view_t { static constexpr bool binary_search = false; - using cuco_store_type = cuco::experimental::static_map< - key_t, - std::conditional_t, value_type, size_t>, - cuco::experimental::extent, - cuda::thread_scope_device, - thrust::equal_to, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>, - rmm::mr::stream_allocator_adaptor>>; + using cuco_store_type = + cuco::static_map, value_type, size_t>, + cuco::extent, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>>; template kv_cuco_store_view_t(cuco_store_type const* store, @@ -392,7 +395,7 @@ class kv_cuco_store_view_t { cuco_store_->contains(key_first, key_last, value_first, stream.value()); } - auto cuco_store_find_device_ref() const { return cuco_store_->ref(cuco::experimental::find); } + auto cuco_store_find_device_ref() const { return cuco_store_->ref(cuco::find); } template std::enable_if_t, ValueIterator> store_value_first() const @@ -531,15 +534,15 @@ class kv_cuco_store_t { std::invoke_result_t), value_buffer_type&>; - using cuco_store_type = cuco::experimental::static_map< - key_t, - std::conditional_t, value_t, size_t>, - cuco::experimental::extent, - cuda::thread_scope_device, - thrust::equal_to, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>, - rmm::mr::stream_allocator_adaptor>>; + using cuco_store_type = + cuco::static_map, value_t, size_t>, + cuco::extent, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>>; kv_cuco_store_t(rmm::cuda_stream_view stream) {} @@ -588,7 +591,7 @@ class kv_cuco_store_t { // requires placing the atomic variable on managed memory and this adds additional // complication. rmm::device_scalar counter(old_store_value_size, stream); - auto mutable_device_ref = cuco_store_->ref(cuco::experimental::insert_and_find); + auto mutable_device_ref = cuco_store_->ref(cuco::insert_and_find); rmm::device_uvector store_value_offsets(num_keys, stream); thrust::tabulate( rmm::exec_policy(stream), @@ -629,7 +632,7 @@ class kv_cuco_store_t { // requires placing the atomic variable on managed memory and this adds additional // complication. rmm::device_scalar counter(old_store_value_size, stream); - auto mutable_device_ref = cuco_store_->ref(cuco::experimental::insert_and_find); + auto mutable_device_ref = cuco_store_->ref(cuco::insert_and_find); rmm::device_uvector store_value_offsets(num_keys, stream); thrust::tabulate(rmm::exec_policy(stream), store_value_offsets.begin(), @@ -665,8 +668,8 @@ class kv_cuco_store_t { if constexpr (std::is_arithmetic_v) { auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(key_first, value_first)); // FIXME: a temporary solution till insert_and_assign is added to - // cuco::experimental::static_map - auto mutable_device_ref = cuco_store_->ref(cuco::experimental::insert_and_find); + // cuco::static_map + auto mutable_device_ref = cuco_store_->ref(cuco::insert_and_find); thrust::for_each(rmm::exec_policy(stream), pair_first, pair_first + num_keys, @@ -679,7 +682,7 @@ class kv_cuco_store_t { // requires placing the atomic variable on managed memory and this adds additional // complication. rmm::device_scalar counter(old_store_value_size, stream); - auto mutable_device_ref = cuco_store_->ref(cuco::experimental::insert_and_find); + auto mutable_device_ref = cuco_store_->ref(cuco::insert_and_find); rmm::device_uvector store_value_offsets(num_keys, stream); thrust::tabulate( rmm::exec_policy(stream), @@ -731,20 +734,19 @@ class kv_cuco_store_t { })), stream); - thrust::for_each( - rmm::exec_policy(stream), - kv_indices.begin(), - kv_indices.end(), - [key_first, - value_first, - store_value_first = get_dataframe_buffer_begin(store_values_), - device_ref = cuco_store_->ref(cuco::experimental::find)] __device__(auto kv_idx) { - size_t store_value_offset{}; - auto found = device_ref.find(*(key_first + kv_idx)); - assert(found != device_ref.end()); - store_value_offset = (*found).second; - *(store_value_first + store_value_offset) = *(value_first + kv_idx); - }); + thrust::for_each(rmm::exec_policy(stream), + kv_indices.begin(), + kv_indices.end(), + [key_first, + value_first, + store_value_first = get_dataframe_buffer_begin(store_values_), + device_ref = cuco_store_->ref(cuco::find)] __device__(auto kv_idx) { + size_t store_value_offset{}; + auto found = device_ref.find(*(key_first + kv_idx)); + assert(found != device_ref.end()); + store_value_offset = (*found).second; + *(store_value_first + store_value_offset) = *(value_first + kv_idx); + }); } } @@ -821,23 +823,27 @@ class kv_cuco_store_t { auto stream_adapter = rmm::mr::make_stream_allocator_adaptor( rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()), stream); if constexpr (std::is_arithmetic_v) { - cuco_store_ = std::make_unique( - cuco_size, - cuco::sentinel::empty_key{invalid_key}, - cuco::sentinel::empty_value{invalid_value}, - thrust::equal_to{}, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>{}, - stream_adapter, - stream.value()); + cuco_store_ = + std::make_unique(cuco_size, + cuco::sentinel::empty_key{invalid_key}, + cuco::sentinel::empty_value{invalid_value}, + thrust::equal_to{}, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>{}, + cuco::thread_scope_device, + cuco_storage_type{}, + stream_adapter, + stream.value()); } else { cuco_store_ = std::make_unique( cuco_size, cuco::sentinel::empty_key{invalid_key}, cuco::sentinel::empty_value{std::numeric_limits::max()}, thrust::equal_to{}, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>{}, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>{}, + cuco::thread_scope_device, + cuco_storage_type{}, stream_adapter, stream); store_values_ = allocate_dataframe_buffer(0, stream); @@ -857,7 +863,7 @@ class kv_cuco_store_t { size_t size_{ 0}; // caching as cuco_store_->size() is expensive (this scans the entire slots to handle // user inserts through a device reference (and currently this is an upper bound (this - // will become exact once we fully switch to cuco::experimental::static_map and use the + // will become exact once we fully switch to cuco::static_map and use the // static_map class's insert_and_assign function; this function will be added soon) };