From 7abfd2afa5f11de1c2d74c39d628760cc402725b Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Fri, 2 Aug 2024 17:44:19 +0000 Subject: [PATCH 01/19] Add histogram bench --- cpp/benchmarks/CMakeLists.txt | 5 +- cpp/benchmarks/groupby/group_histogram.cpp | 86 ++++++++++++++++++++++ cpp/benchmarks/reduction/histogram.cpp | 74 +++++++++++++++++++ 3 files changed, 163 insertions(+), 2 deletions(-) create mode 100644 cpp/benchmarks/groupby/group_histogram.cpp create mode 100644 cpp/benchmarks/reduction/histogram.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index ff431c7f260..60f80704e9a 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -211,7 +211,8 @@ ConfigureBench( reduction/reduce.cpp reduction/scan.cpp ) ConfigureNVBench( - REDUCTION_NVBENCH reduction/rank.cpp reduction/scan_structs.cpp reduction/segmented_reduce.cpp + REDUCTION_NVBENCH reduction/histogram.cpp reduction/rank.cpp reduction/scan_structs.cpp + reduction/segmented_reduce.cpp ) # ################################################################################################## @@ -231,7 +232,7 @@ ConfigureBench( ) ConfigureNVBench( - GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_max_multithreaded.cpp + GROUPBY_NVBENCH groupby/group_histogram.cpp groupby/group_max.cpp groupby/group_max_multithreaded.cpp groupby/group_nunique.cpp groupby/group_rank.cpp groupby/group_struct_keys.cpp ) diff --git a/cpp/benchmarks/groupby/group_histogram.cpp b/cpp/benchmarks/groupby/group_histogram.cpp new file mode 100644 index 00000000000..9fae1ecab39 --- /dev/null +++ b/cpp/benchmarks/groupby/group_histogram.cpp @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include + +template +void groupby_histogram_helper(nvbench::state& state, + cudf::size_type num_rows, + cudf::size_type cardinality, + double null_probability) +{ + auto const keys = [&] { + data_profile const profile = + data_profile_builder() + .cardinality(cardinality) + .no_validity() + .distribution(cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + return create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + }(); + + auto const vals = [&] { + auto builder = data_profile_builder().cardinality(0).distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + if (null_probability > 0) { + builder.null_probability(null_probability); + } else { + builder.no_validity(); + } + return create_random_column( + cudf::type_to_id(), row_count{num_rows}, data_profile{builder}); + }(); + + auto keys_view = keys->view(); + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys_view})); + + std::vector requests; + requests.emplace_back(cudf::groupby::aggregation_request()); + requests[0].values = vals->view(); + requests[0].aggregations.push_back(cudf::make_histogram_aggregation()); + + auto const mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { + auto const result = gb_obj.aggregate(requests); + }); + auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(num_rows) / elapsed_time / 1'000'000., "Mrows/s"); + state.add_buffer_size(mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +template +void bench_groupby_histogram(nvbench::state& state, nvbench::type_list) +{ + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const null_probability = state.get_float64("null_probability"); + + groupby_histogram_helper(state, num_rows, cardinality, null_probability); +} + +NVBENCH_BENCH_TYPES(bench_groupby_histogram, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("groupby_histogram") + .add_float64_axis("null_probability", {0, 0.1, 0.9}) + .add_int64_axis("cardinality", {10, 20, 50, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000}) + .add_int64_power_of_two_axis("num_rows", {12, 18, 24}); + diff --git a/cpp/benchmarks/reduction/histogram.cpp b/cpp/benchmarks/reduction/histogram.cpp new file mode 100644 index 00000000000..8e8e21dae0b --- /dev/null +++ b/cpp/benchmarks/reduction/histogram.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cudf/aggregation.hpp" +#include "cudf/detail/aggregation/aggregation.hpp" + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +template +static void nvbench_reduction_histogram(nvbench::state& state, nvbench::type_list) +{ + auto const dtype = cudf::type_to_id(); + + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const null_probability = state.get_float64("null_probability"); + + if (cardinality > num_rows) { + state.skip("cardinality > num_rows"); + return; + } + + data_profile const profile = data_profile_builder() + .null_probability(null_probability) + .cardinality(cardinality) + .distribution(dtype, distribution_id::UNIFORM, 0, num_rows); + + auto const input = create_random_column(dtype, row_count{num_rows}, profile); + auto agg = cudf::make_histogram_aggregation(); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + auto result = cudf::reduce(*input, *agg, input->type(), stream_view); + }); + + state.add_element_count(input->size()); +} + +using data_type = nvbench::type_list; + +NVBENCH_BENCH_TYPES(nvbench_reduction_histogram, NVBENCH_TYPE_AXES(data_type)) + .set_name("histogram") + .add_float64_axis("null_probability", {0.1}) + .add_int64_axis("cardinality", {1}) + .add_int64_axis("num_rows", + { + 10000, // 10k + 100000, // 100k + 1000000, // 1M + 10000000, // 10M + 100000000, // 100M + }); From e0aa78bb22447509b51a8a715dd5ccd530a49924 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Fri, 2 Aug 2024 17:45:35 +0000 Subject: [PATCH 02/19] refactor histogram using static_map insert_or_apply --- .../cudf/reduction/detail/histogram.hpp | 2 +- cpp/src/groupby/sort/group_histogram.cu | 7 + cpp/src/reductions/histogram.cu | 159 +++++------------- 3 files changed, 54 insertions(+), 114 deletions(-) diff --git a/cpp/include/cudf/reduction/detail/histogram.hpp b/cpp/include/cudf/reduction/detail/histogram.hpp index f23c5a14e33..745f8aa46cc 100644 --- a/cpp/include/cudf/reduction/detail/histogram.hpp +++ b/cpp/include/cudf/reduction/detail/histogram.hpp @@ -36,7 +36,7 @@ namespace cudf::reduction::detail { * @param partial_counts An optional column containing count for each row * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate memory of the returned objects - * @return A pair of array contains the (stable-order) indices of the distinct rows in the input + * @return A pair of array contains the indices of the distinct rows in the input * table, and their corresponding distinct counts */ [[nodiscard]] std::pair>, std::unique_ptr> diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu index 1000ec0d470..4d893232532 100644 --- a/cpp/src/groupby/sort/group_histogram.cu +++ b/cpp/src/groupby/sort/group_histogram.cu @@ -29,6 +29,7 @@ #include #include +#include namespace cudf::groupby::detail { @@ -57,6 +58,12 @@ std::unique_ptr build_histogram(column_view const& values, auto [distinct_indices, distinct_counts] = cudf::reduction::detail::compute_row_frequencies(labeled_values, partial_counts, stream, mr); + // compute_row_frequencies does not provide stable ordering + thrust::sort_by_key(rmm::exec_policy(stream), + distinct_indices->begin(), + distinct_indices->end(), + distinct_counts->mutable_view().begin()); + // Gather the distinct rows for the output histogram. auto out_table = cudf::detail::gather(labeled_values, *distinct_indices, diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu index bebb9d14923..af1187a65e5 100644 --- a/cpp/src/reductions/histogram.cu +++ b/cpp/src/reductions/histogram.cu @@ -23,10 +23,12 @@ #include +#include #include #include #include #include +#include #include #include @@ -38,69 +40,6 @@ namespace { // Always use 64-bit signed integer for storing count. using histogram_count_type = int64_t; -/** - * @brief The functor to accumulate the frequency of each distinct rows in the input table. - */ -template -struct reduce_fn : cudf::detail::reduce_by_row_fn_base { - CountType const* d_partial_output; - - reduce_fn(MapView const& d_map, - KeyHasher const& d_hasher, - KeyEqual const& d_equal, - CountType* const d_output, - CountType const* const d_partial_output) - : cudf::detail::reduce_by_row_fn_base{d_map, - d_hasher, - d_equal, - d_output}, - d_partial_output{d_partial_output} - { - } - - // Count the number of rows in each group of rows that are compared equal. - __device__ void operator()(size_type const idx) const - { - auto const increment = d_partial_output ? d_partial_output[idx] : CountType{1}; - auto const count = - cuda::atomic_ref(*this->get_output_ptr(idx)); - count.fetch_add(increment, cuda::std::memory_order_relaxed); - } -}; - -/** - * @brief The builder to construct an instance of `reduce_fn` functor. - */ -template -struct reduce_func_builder { - CountType const* const d_partial_output; - - reduce_func_builder(CountType const* const d_partial_output) : d_partial_output{d_partial_output} - { - } - - template - auto build(MapView const& d_map, - KeyHasher const& d_hasher, - KeyEqual const& d_equal, - CountType* const d_output) - { - return reduce_fn{ - d_map, d_hasher, d_equal, d_output, d_partial_output}; - } -}; - -/** - * @brief Specialized functor to check for not-zero of the second component of the input. - */ -struct is_not_zero { - template - __device__ bool operator()(Pair const input) const - { - return thrust::get<1>(input) != 0; - } -}; - /** * @brief Building a histogram by gathering distinct rows from the input table and their * corresponding distinct counts. @@ -150,6 +89,16 @@ std::unique_ptr make_empty_histogram_like(column_view const& values) std::move(struct_children)); } +// TODO: replace with cuco reduction functors +struct plus_op { + __device__ void operator()( + cuda::atomic_ref payload_ref, + histogram_count_type val) + { + payload_ref.fetch_add(val, cuda::memory_order_relaxed); + } +}; + std::pair>, std::unique_ptr> compute_row_frequencies(table_view const& input, std::optional const& partial_counts, @@ -164,12 +113,6 @@ compute_row_frequencies(table_view const& input, "Nested types are not yet supported in histogram aggregation.", std::invalid_argument); - auto map = cudf::detail::hash_map_type{compute_hash_table_size(input.num_rows()), - cuco::empty_key{-1}, - cuco::empty_value{std::numeric_limits::min()}, - cudf::detail::cuco_allocator{stream}, - stream.value()}; - auto const preprocessed_input = cudf::experimental::row::hash::preprocessed_table::create(input, stream); auto const has_nulls = nullate::DYNAMIC{cudf::has_nested_nulls(input)}; @@ -180,58 +123,48 @@ compute_row_frequencies(table_view const& input, auto const pair_iter = cudf::detail::make_counting_transform_iterator( size_type{0}, - cuda::proclaim_return_type>( - [] __device__(size_type const i) { return cuco::make_pair(i, i); })); + cuda::proclaim_return_type>( + [d_partial_output = partial_counts ? partial_counts.value().begin() + : nullptr] __device__(size_type const idx) { + auto const increment = d_partial_output ? d_partial_output[idx] : histogram_count_type{1}; + return cuco::pair{idx, increment}; + })); // Always compare NaNs as equal. using nan_equal_comparator = cudf::experimental::row::equality::nan_equal_physical_equality_comparator; auto const value_comp = nan_equal_comparator{}; - - if (has_nested_columns) { - auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); - map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value()); - } else { - auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); - map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value()); - } - + auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); + + using row_hash = + cudf::experimental::row::hash::device_row_hasher; + + auto map = cuco::static_map{input.num_rows(), + 0.5, + cuco::empty_key{-1}, + cuco::empty_value{0}, + key_equal, + cuco::linear_probing<1, row_hash>{key_hasher}, + {}, + {}, + cudf::detail::cuco_allocator{stream}, + stream.value()}; + + // TODO: use `insert_or_apply` init overload for better performance + map.insert_or_apply(pair_iter, pair_iter + input.num_rows(), plus_op{}, stream.value()); + + size_type const map_size = map.size(stream.value()); // Gather the indices of distinct rows. - auto distinct_indices = std::make_unique>( - static_cast(map.get_size()), stream, mr); + auto distinct_indices = std::make_unique>(map_size, stream, mr); // Store the number of occurrences of each distinct row. - auto distinct_counts = make_numeric_column(data_type{type_to_id()}, - static_cast(map.get_size()), - mask_state::UNALLOCATED, - stream, - mr); - - // Compute frequencies (aka distinct counts) for the input rows. - // Note that we consider null and NaNs as always equal. - auto const reduction_results = cudf::detail::hash_reduce_by_row( - map, - preprocessed_input, - input.num_rows(), - has_nulls, - has_nested_columns, - null_equality::EQUAL, - nan_equality::ALL_EQUAL, - reduce_func_builder{ - partial_counts ? partial_counts.value().begin() : nullptr}, - histogram_count_type{0}, - stream, - rmm::mr::get_current_device_resource()); - - auto const input_it = thrust::make_zip_iterator( - thrust::make_tuple(thrust::make_counting_iterator(0), reduction_results.begin())); - auto const output_it = thrust::make_zip_iterator(thrust::make_tuple( - distinct_indices->begin(), distinct_counts->mutable_view().begin())); - - // Reduction results above are either group sizes of equal rows, or `0`. - // The final output is non-zero group sizes only. - thrust::copy_if( - rmm::exec_policy(stream), input_it, input_it + input.num_rows(), output_it, is_not_zero{}); + auto distinct_counts = make_numeric_column( + data_type{type_to_id()}, map_size, mask_state::UNALLOCATED, stream, mr); + + map.retrieve_all(distinct_indices->begin(), + distinct_counts->mutable_view().begin(), + stream.value()); return {std::move(distinct_indices), std::move(distinct_counts)}; } From de74ff830f1539a2abdc207d69a4be652030da90 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Tue, 6 Aug 2024 22:32:20 +0000 Subject: [PATCH 03/19] minor nits --- cpp/benchmarks/groupby/group_histogram.cpp | 22 ++++++++++++---------- cpp/benchmarks/reduction/histogram.cpp | 9 +-------- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/cpp/benchmarks/groupby/group_histogram.cpp b/cpp/benchmarks/groupby/group_histogram.cpp index 9fae1ecab39..5de5a521e4b 100644 --- a/cpp/benchmarks/groupby/group_histogram.cpp +++ b/cpp/benchmarks/groupby/group_histogram.cpp @@ -23,9 +23,9 @@ template void groupby_histogram_helper(nvbench::state& state, - cudf::size_type num_rows, - cudf::size_type cardinality, - double null_probability) + cudf::size_type num_rows, + cudf::size_type cardinality, + double null_probability) { auto const keys = [&] { data_profile const profile = @@ -59,12 +59,11 @@ void groupby_histogram_helper(nvbench::state& state, auto const mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { - auto const result = gb_obj.aggregate(requests); - }); + [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); state.add_element_count(static_cast(num_rows) / elapsed_time / 1'000'000., "Mrows/s"); - state.add_buffer_size(mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } template @@ -74,6 +73,10 @@ void bench_groupby_histogram(nvbench::state& state, nvbench::type_list) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const null_probability = state.get_float64("null_probability"); + if (cardinality > num_rows) { + state.skip("cardinality > num_rows"); + return; + } groupby_histogram_helper(state, num_rows, cardinality, null_probability); } @@ -81,6 +84,5 @@ NVBENCH_BENCH_TYPES(bench_groupby_histogram, NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("groupby_histogram") .add_float64_axis("null_probability", {0, 0.1, 0.9}) - .add_int64_axis("cardinality", {10, 20, 50, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000}) - .add_int64_power_of_two_axis("num_rows", {12, 18, 24}); - + .add_int64_axis("cardinality", {100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000}) + .add_int64_axis("num_rows", {100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000}); diff --git a/cpp/benchmarks/reduction/histogram.cpp b/cpp/benchmarks/reduction/histogram.cpp index 8e8e21dae0b..c021849ed89 100644 --- a/cpp/benchmarks/reduction/histogram.cpp +++ b/cpp/benchmarks/reduction/histogram.cpp @@ -64,11 +64,4 @@ NVBENCH_BENCH_TYPES(nvbench_reduction_histogram, NVBENCH_TYPE_AXES(data_type)) .set_name("histogram") .add_float64_axis("null_probability", {0.1}) .add_int64_axis("cardinality", {1}) - .add_int64_axis("num_rows", - { - 10000, // 10k - 100000, // 100k - 1000000, // 1M - 10000000, // 10M - 100000000, // 100M - }); + .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}); From 11464ba112aabd0428a8f026427b0e4a17e61326 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Tue, 6 Aug 2024 23:41:54 +0000 Subject: [PATCH 04/19] more minor nits --- .../cudf/reduction/detail/histogram.hpp | 3 ++ cpp/src/groupby/sort/group_histogram.cu | 9 +++--- cpp/src/reductions/histogram.cu | 28 +++++++++---------- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/cpp/include/cudf/reduction/detail/histogram.hpp b/cpp/include/cudf/reduction/detail/histogram.hpp index 6577b8ff771..b6ce35904df 100644 --- a/cpp/include/cudf/reduction/detail/histogram.hpp +++ b/cpp/include/cudf/reduction/detail/histogram.hpp @@ -31,6 +31,9 @@ namespace CUDF_EXPORT cudf { namespace reduction::detail { +// Always use 64-bit signed integer for storing count. +using histogram_count_type = int64_t; + /** * @brief Compute the frequency for each distinct row in the input table. * diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu index 4d893232532..33d9aa88017 100644 --- a/cpp/src/groupby/sort/group_histogram.cu +++ b/cpp/src/groupby/sort/group_histogram.cu @@ -59,10 +59,11 @@ std::unique_ptr build_histogram(column_view const& values, cudf::reduction::detail::compute_row_frequencies(labeled_values, partial_counts, stream, mr); // compute_row_frequencies does not provide stable ordering - thrust::sort_by_key(rmm::exec_policy(stream), - distinct_indices->begin(), - distinct_indices->end(), - distinct_counts->mutable_view().begin()); + thrust::sort_by_key( + rmm::exec_policy(stream), + distinct_indices->begin(), + distinct_indices->end(), + distinct_counts->mutable_view().begin()); // Gather the distinct rows for the output histogram. auto out_table = cudf::detail::gather(labeled_values, diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu index dd3a63ddfd8..d1531849f3b 100644 --- a/cpp/src/reductions/histogram.cu +++ b/cpp/src/reductions/histogram.cu @@ -15,9 +15,11 @@ */ #include +#include #include #include #include +#include #include #include @@ -37,9 +39,6 @@ namespace cudf::reduction::detail { namespace { -// Always use 64-bit signed integer for storing count. -using histogram_count_type = int64_t; - /** * @brief Building a histogram by gathering distinct rows from the input table and their * corresponding distinct counts. @@ -127,7 +126,7 @@ compute_row_frequencies(table_view const& input, [d_partial_output = partial_counts ? partial_counts.value().begin() : nullptr] __device__(size_type const idx) { auto const increment = d_partial_output ? d_partial_output[idx] : histogram_count_type{1}; - return cuco::pair{idx, increment}; + return cuco::pair{idx, increment}; })); // Always compare NaNs as equal. @@ -140,16 +139,17 @@ compute_row_frequencies(table_view const& input, cudf::experimental::row::hash::device_row_hasher; - auto map = cuco::static_map{input.num_rows(), - 0.5, - cuco::empty_key{-1}, - cuco::empty_value{0}, - key_equal, - cuco::linear_probing<1, row_hash>{key_hasher}, - {}, - {}, - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; + auto map = cuco::static_map{ + input.num_rows(), + cudf::detail::CUCO_DESIRED_LOAD_FACTOR, + cuco::empty_key{-1}, + cuco::empty_value{0}, + key_equal, + cuco::linear_probing<1, row_hash>{key_hasher}, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; // TODO: use `insert_or_apply` init overload for better performance map.insert_or_apply(pair_iter, pair_iter + input.num_rows(), plus_op{}, stream.value()); From 9029297b4cd734cd0127fdfd609aaed37b849034 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Tue, 6 Aug 2024 23:55:45 +0000 Subject: [PATCH 05/19] fix cmake format --- cpp/benchmarks/CMakeLists.txt | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 374c0d26316..7c39803b1c1 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -232,8 +232,13 @@ ConfigureBench( ) ConfigureNVBench( - GROUPBY_NVBENCH groupby/group_histogram.cpp groupby/group_max.cpp groupby/group_max_multithreaded.cpp - groupby/group_nunique.cpp groupby/group_rank.cpp groupby/group_struct_keys.cpp + GROUPBY_NVBENCH + groupby/group_histogram.cpp + groupby/group_max.cpp + groupby/group_max_multithreaded.cpp + groupby/group_nunique.cpp + groupby/group_rank.cpp + groupby/group_struct_keys.cpp ) # ################################################################################################## From 920fd161653248435f7b7547ded4e5955adcffbf Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 4 Oct 2024 21:23:58 +0000 Subject: [PATCH 06/19] Refactor `histogram` with `cuco::static_set` --- .../cudf/reduction/detail/histogram.hpp | 6 +- cpp/src/groupby/sort/group_histogram.cu | 8 -- cpp/src/reductions/histogram.cu | 111 ++++++++++++------ 3 files changed, 74 insertions(+), 51 deletions(-) diff --git a/cpp/include/cudf/reduction/detail/histogram.hpp b/cpp/include/cudf/reduction/detail/histogram.hpp index 040e4d27079..fb14e3e94cf 100644 --- a/cpp/include/cudf/reduction/detail/histogram.hpp +++ b/cpp/include/cudf/reduction/detail/histogram.hpp @@ -31,9 +31,6 @@ namespace CUDF_EXPORT cudf { namespace reduction::detail { -// Always use 64-bit signed integer for storing count. -using histogram_count_type = int64_t; - /** * @brief Compute the frequency for each distinct row in the input table. * @@ -41,7 +38,8 @@ using histogram_count_type = int64_t; * @param partial_counts An optional column containing count for each row * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate memory of the returned objects - * @return A pair of array contains the indices of the distinct rows in the input + * @return A pair of array contains the (stable-order) indices of the distinct rows in the input + * * table, and their corresponding distinct counts */ [[nodiscard]] std::pair>, std::unique_ptr> diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu index b757dc25873..861d801a070 100644 --- a/cpp/src/groupby/sort/group_histogram.cu +++ b/cpp/src/groupby/sort/group_histogram.cu @@ -29,7 +29,6 @@ #include #include -#include namespace cudf::groupby::detail { @@ -58,13 +57,6 @@ std::unique_ptr build_histogram(column_view const& values, auto [distinct_indices, distinct_counts] = cudf::reduction::detail::compute_row_frequencies(labeled_values, partial_counts, stream, mr); - // compute_row_frequencies does not provide stable ordering - thrust::sort_by_key( - rmm::exec_policy(stream), - distinct_indices->begin(), - distinct_indices->end(), - distinct_counts->mutable_view().begin()); - // Gather the distinct rows for the output histogram. auto out_table = cudf::detail::gather(labeled_values, *distinct_indices, diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu index e5e55d18840..8c8083df65b 100644 --- a/cpp/src/reductions/histogram.cu +++ b/cpp/src/reductions/histogram.cu @@ -19,18 +19,20 @@ #include #include #include -#include #include #include #include -#include +#include + +#include +#include #include #include #include #include -#include #include +#include #include @@ -38,6 +40,23 @@ namespace cudf::reduction::detail { namespace { +// A CUDA Cooperative Group of 1 thread for the hash set for histogram +auto constexpr DEFAULT_HISTOGRAM_CG_SIZE = 1; + +// Always use 64-bit signed integer for storing count. +using histogram_count_type = int64_t; + +/** + * @brief Specialized functor to check for not-zero of the second component of the input. + */ +struct is_not_zero { + template + __device__ bool operator()(Pair const input) const + { + return thrust::get<1>(input) != 0; + } +}; + /** * @brief Building a histogram by gathering distinct rows from the input table and their * corresponding distinct counts. @@ -87,16 +106,6 @@ std::unique_ptr make_empty_histogram_like(column_view const& values) std::move(struct_children)); } -// TODO: replace with cuco reduction functors -struct plus_op { - __device__ void operator()( - cuda::atomic_ref payload_ref, - histogram_count_type val) - { - payload_ref.fetch_add(val, cuda::memory_order_relaxed); - } -}; - std::pair>, std::unique_ptr> compute_row_frequencies(table_view const& input, std::optional const& partial_counts, @@ -119,15 +128,6 @@ compute_row_frequencies(table_view const& input, auto const key_hasher = row_hasher.device_hasher(has_nulls); auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); - auto const pair_iter = cudf::detail::make_counting_transform_iterator( - size_type{0}, - cuda::proclaim_return_type>( - [d_partial_output = partial_counts ? partial_counts.value().begin() - : nullptr] __device__(size_type const idx) { - auto const increment = d_partial_output ? d_partial_output[idx] : histogram_count_type{1}; - return cuco::pair{idx, increment}; - })); - // Always compare NaNs as equal. using nan_equal_comparator = cudf::experimental::row::equality::nan_equal_physical_equality_comparator; @@ -138,32 +138,65 @@ compute_row_frequencies(table_view const& input, cudf::experimental::row::hash::device_row_hasher; - auto map = cuco::static_map{ - input.num_rows(), + size_t const num_rows = input.num_rows(); + + // Initialize intial counts to zero + rmm::device_uvector counts(num_rows, stream, mr); + thrust::uninitialized_fill( + rmm::exec_policy_nosync(stream), counts.begin(), counts.end(), histogram_count_type{0}); + + // Construct a hash set + auto row_set = cuco::static_set{ + cuco::extent{num_rows}, cudf::detail::CUCO_DESIRED_LOAD_FACTOR, cuco::empty_key{-1}, - cuco::empty_value{0}, key_equal, - cuco::linear_probing<1, row_hash>{key_hasher}, + cuco::linear_probing{key_hasher}, {}, {}, cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, stream.value()}; - // TODO: use `insert_or_apply` init overload for better performance - map.insert_or_apply(pair_iter, pair_iter + input.num_rows(), plus_op{}, stream.value()); - - size_type const map_size = map.size(stream.value()); - // Gather the indices of distinct rows. - auto distinct_indices = std::make_unique>(map_size, stream, mr); - - // Store the number of occurrences of each distinct row. + // Device-accessible reference to the hash set with insert_and_find operatro + auto row_set_ref = row_set.ref(cuco::op::insert_and_find); + + // Compute frequencies (aka distinct counts) for the input rows. + // Note that we consider null and NaNs as always equal. + thrust::for_each( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + [row_set_ref, + increments = + partial_counts.has_value() ? partial_counts.value().begin() : nullptr, + counts = counts.begin()] __device__(auto const idx) mutable { + auto const [inserted_idx_ptr, _] = row_set_ref.insert_and_find(idx); + cuda::atomic_ref count_ref{ + counts[*inserted_idx_ptr]}; + auto increment = histogram_count_type{1}; + if (increments) { increment = increments[idx]; } + count_ref.fetch_add(increment, cuda::std::memory_order_relaxed); + }); + + // Set-size is the number of distinct (inserted) rows + auto const set_size = row_set.size(stream); + + // Vector of distinct indices + auto distinct_indices = std::make_unique>(set_size, stream, mr); + // Column of distinct counts auto distinct_counts = make_numeric_column( - data_type{type_to_id()}, map_size, mask_state::UNALLOCATED, stream, mr); - - map.retrieve_all(distinct_indices->begin(), - distinct_counts->mutable_view().begin(), - stream.value()); + data_type{type_to_id()}, set_size, mask_state::UNALLOCATED, stream, mr); + + // Copy row indices and counts to the output if counts are non-zero + auto const input_it = thrust::make_zip_iterator( + thrust::make_tuple(thrust::make_counting_iterator(0), counts.begin())); + auto const output_it = thrust::make_zip_iterator(thrust::make_tuple( + distinct_indices->begin(), distinct_counts->mutable_view().begin())); + + // Reduction results above are either group sizes of equal rows, or `0`. + // The final output is non-zero group sizes only. + thrust::copy_if( + rmm::exec_policy_nosync(stream), input_it, input_it + num_rows, output_it, is_not_zero{}); return {std::move(distinct_indices), std::move(distinct_counts)}; } From c2d815338a173526259a88a945b8108d622d4927 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 4 Oct 2024 21:26:31 +0000 Subject: [PATCH 07/19] Minor comment update --- cpp/include/cudf/reduction/detail/histogram.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/include/cudf/reduction/detail/histogram.hpp b/cpp/include/cudf/reduction/detail/histogram.hpp index fb14e3e94cf..c990db32977 100644 --- a/cpp/include/cudf/reduction/detail/histogram.hpp +++ b/cpp/include/cudf/reduction/detail/histogram.hpp @@ -39,7 +39,6 @@ namespace reduction::detail { * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate memory of the returned objects * @return A pair of array contains the (stable-order) indices of the distinct rows in the input - * * table, and their corresponding distinct counts */ [[nodiscard]] std::pair>, std::unique_ptr> From 9aca13ee96cfa5b7087fa7119b4be464a15dbd6c Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 4 Oct 2024 21:36:20 +0000 Subject: [PATCH 08/19] Minor improvements --- cpp/src/reductions/histogram.cu | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu index 8c8083df65b..27cd3b03390 100644 --- a/cpp/src/reductions/histogram.cu +++ b/cpp/src/reductions/histogram.cu @@ -140,10 +140,12 @@ compute_row_frequencies(table_view const& input, size_t const num_rows = input.num_rows(); - // Initialize intial counts to zero - rmm::device_uvector counts(num_rows, stream, mr); - thrust::uninitialized_fill( - rmm::exec_policy_nosync(stream), counts.begin(), counts.end(), histogram_count_type{0}); + // Construct a vector to store reduced counts and init to zero + rmm::device_uvector reduction_results(num_rows, stream, mr); + thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), + reduction_results.begin(), + reduction_results.end(), + histogram_count_type{0}); // Construct a hash set auto row_set = cuco::static_set{ @@ -166,15 +168,14 @@ compute_row_frequencies(table_view const& input, rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_rows), - [row_set_ref, + [set_ref = row_set_ref, increments = partial_counts.has_value() ? partial_counts.value().begin() : nullptr, - counts = counts.begin()] __device__(auto const idx) mutable { - auto const [inserted_idx_ptr, _] = row_set_ref.insert_and_find(idx); + counts = reduction_results.begin()] __device__(auto const idx) mutable { + auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx); cuda::atomic_ref count_ref{ counts[*inserted_idx_ptr]}; - auto increment = histogram_count_type{1}; - if (increments) { increment = increments[idx]; } + auto const increment = increments ? increments[idx] : histogram_count_type{1}; count_ref.fetch_add(increment, cuda::std::memory_order_relaxed); }); @@ -189,14 +190,14 @@ compute_row_frequencies(table_view const& input, // Copy row indices and counts to the output if counts are non-zero auto const input_it = thrust::make_zip_iterator( - thrust::make_tuple(thrust::make_counting_iterator(0), counts.begin())); + thrust::make_tuple(thrust::make_counting_iterator(0), reduction_results.begin())); auto const output_it = thrust::make_zip_iterator(thrust::make_tuple( distinct_indices->begin(), distinct_counts->mutable_view().begin())); // Reduction results above are either group sizes of equal rows, or `0`. // The final output is non-zero group sizes only. thrust::copy_if( - rmm::exec_policy_nosync(stream), input_it, input_it + num_rows, output_it, is_not_zero{}); + rmm::exec_policy(stream), input_it, input_it + num_rows, output_it, is_not_zero{}); return {std::move(distinct_indices), std::move(distinct_counts)}; } From 7ddf0aac8a65eb416dd9633cc1ab2cdc08295614 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Sat, 5 Oct 2024 01:27:10 +0000 Subject: [PATCH 09/19] Fix for benchmark --- cpp/benchmarks/groupby/group_histogram.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/benchmarks/groupby/group_histogram.cpp b/cpp/benchmarks/groupby/group_histogram.cpp index 5de5a521e4b..6ca59462e2b 100644 --- a/cpp/benchmarks/groupby/group_histogram.cpp +++ b/cpp/benchmarks/groupby/group_histogram.cpp @@ -36,7 +36,7 @@ void groupby_histogram_helper(nvbench::state& state, return create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); }(); - auto const vals = [&] { + auto const values = [&] { auto builder = data_profile_builder().cardinality(0).distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); if (null_probability > 0) { @@ -47,19 +47,18 @@ void groupby_histogram_helper(nvbench::state& state, return create_random_column( cudf::type_to_id(), row_count{num_rows}, data_profile{builder}); }(); - - auto keys_view = keys->view(); - auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys_view})); - - std::vector requests; - requests.emplace_back(cudf::groupby::aggregation_request()); - requests[0].values = vals->view(); - requests[0].aggregations.push_back(cudf::make_histogram_aggregation()); + std::vector requests(1); + requests.back().values = values->view(); + requests.back().aggregations.push_back( + cudf::make_histogram_aggregation()); auto const mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys->view()})); + auto const result = gb_obj.aggregate(requests); + }); + auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); state.add_element_count(static_cast(num_rows) / elapsed_time / 1'000'000., "Mrows/s"); state.add_buffer_size( @@ -77,6 +76,7 @@ void bench_groupby_histogram(nvbench::state& state, nvbench::type_list) state.skip("cardinality > num_rows"); return; } + groupby_histogram_helper(state, num_rows, cardinality, null_probability); } From 2134f511e35b8c08aabba00b73db488add8c0e7d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Sat, 5 Oct 2024 02:08:41 +0000 Subject: [PATCH 10/19] Update cardinality axis for reduction --- cpp/benchmarks/reduction/histogram.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/reduction/histogram.cpp b/cpp/benchmarks/reduction/histogram.cpp index c021849ed89..24131e1f30d 100644 --- a/cpp/benchmarks/reduction/histogram.cpp +++ b/cpp/benchmarks/reduction/histogram.cpp @@ -63,5 +63,6 @@ using data_type = nvbench::type_list; NVBENCH_BENCH_TYPES(nvbench_reduction_histogram, NVBENCH_TYPE_AXES(data_type)) .set_name("histogram") .add_float64_axis("null_probability", {0.1}) - .add_int64_axis("cardinality", {1}) - .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}); + .add_int64_axis("cardinality", + {0, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000, 50'000'000}) + .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}); \ No newline at end of file From d35b4fcb419e977baa9ce963e479465407f4e6e6 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 7 Oct 2024 18:29:27 +0000 Subject: [PATCH 11/19] Minor style fix --- cpp/benchmarks/groupby/group_histogram.cpp | 2 +- cpp/benchmarks/reduction/histogram.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/groupby/group_histogram.cpp b/cpp/benchmarks/groupby/group_histogram.cpp index 6ca59462e2b..34295931648 100644 --- a/cpp/benchmarks/groupby/group_histogram.cpp +++ b/cpp/benchmarks/groupby/group_histogram.cpp @@ -60,7 +60,7 @@ void groupby_histogram_helper(nvbench::state& state, }); auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(num_rows) / elapsed_time / 1'000'000., "Mrows/s"); + state.add_element_count(static_cast(num_rows) / elapsed_time, "rows/s"); state.add_buffer_size( mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } diff --git a/cpp/benchmarks/reduction/histogram.cpp b/cpp/benchmarks/reduction/histogram.cpp index 24131e1f30d..d0925de5c87 100644 --- a/cpp/benchmarks/reduction/histogram.cpp +++ b/cpp/benchmarks/reduction/histogram.cpp @@ -65,4 +65,4 @@ NVBENCH_BENCH_TYPES(nvbench_reduction_histogram, NVBENCH_TYPE_AXES(data_type)) .add_float64_axis("null_probability", {0.1}) .add_int64_axis("cardinality", {0, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000, 50'000'000}) - .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}); \ No newline at end of file + .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}); From aaf68377d3678d57560c499a36ea71510e757e54 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 7 Oct 2024 18:40:00 +0000 Subject: [PATCH 12/19] Minor style fix --- cpp/benchmarks/groupby/group_histogram.cpp | 2 ++ cpp/benchmarks/reduction/histogram.cpp | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/benchmarks/groupby/group_histogram.cpp b/cpp/benchmarks/groupby/group_histogram.cpp index 34295931648..cd7f9f298af 100644 --- a/cpp/benchmarks/groupby/group_histogram.cpp +++ b/cpp/benchmarks/groupby/group_histogram.cpp @@ -47,6 +47,8 @@ void groupby_histogram_helper(nvbench::state& state, return create_random_column( cudf::type_to_id(), row_count{num_rows}, data_profile{builder}); }(); + + // Vector of 1 request std::vector requests(1); requests.back().values = values->view(); requests.back().aggregations.push_back( diff --git a/cpp/benchmarks/reduction/histogram.cpp b/cpp/benchmarks/reduction/histogram.cpp index d0925de5c87..24131e1f30d 100644 --- a/cpp/benchmarks/reduction/histogram.cpp +++ b/cpp/benchmarks/reduction/histogram.cpp @@ -65,4 +65,4 @@ NVBENCH_BENCH_TYPES(nvbench_reduction_histogram, NVBENCH_TYPE_AXES(data_type)) .add_float64_axis("null_probability", {0.1}) .add_int64_axis("cardinality", {0, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000, 50'000'000}) - .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}); + .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}); \ No newline at end of file From f63fc4a7d8e543c3fe817bd43874f473b63a4c55 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 7 Oct 2024 18:40:49 +0000 Subject: [PATCH 13/19] Style fix --- cpp/benchmarks/reduction/histogram.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/reduction/histogram.cpp b/cpp/benchmarks/reduction/histogram.cpp index 24131e1f30d..d0925de5c87 100644 --- a/cpp/benchmarks/reduction/histogram.cpp +++ b/cpp/benchmarks/reduction/histogram.cpp @@ -65,4 +65,4 @@ NVBENCH_BENCH_TYPES(nvbench_reduction_histogram, NVBENCH_TYPE_AXES(data_type)) .add_float64_axis("null_probability", {0.1}) .add_int64_axis("cardinality", {0, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000, 50'000'000}) - .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}); \ No newline at end of file + .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}); From a5dcc82f1b262a8e8040cc15fc68abd16a85ecb5 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 7 Oct 2024 23:29:50 +0000 Subject: [PATCH 14/19] Remove hash_reduce_by_row and use rmm nosync policy --- .../cudf/detail/hash_reduce_by_row.cuh | 169 ------------------ cpp/src/reductions/histogram.cu | 12 +- 2 files changed, 6 insertions(+), 175 deletions(-) delete mode 100644 cpp/include/cudf/detail/hash_reduce_by_row.cuh diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh deleted file mode 100644 index 7de79b31bc7..00000000000 --- a/cpp/include/cudf/detail/hash_reduce_by_row.cuh +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -namespace cudf::detail { - -using hash_map_type = cuco::legacy:: - static_map>; - -/** - * @brief The base struct for customized reduction functor to perform reduce-by-key with keys are - * rows that compared equal. - * - * TODO: We need to switch to use `static_reduction_map` when it is ready - * (https://github.com/NVIDIA/cuCollections/pull/98). - */ -template -struct reduce_by_row_fn_base { - protected: - MapView const d_map; - KeyHasher const d_hasher; - KeyEqual const d_equal; - OutputType* const d_output; - - reduce_by_row_fn_base(MapView const& d_map, - KeyHasher const& d_hasher, - KeyEqual const& d_equal, - OutputType* const d_output) - : d_map{d_map}, d_hasher{d_hasher}, d_equal{d_equal}, d_output{d_output} - { - } - - /** - * @brief Return a pointer to the output array at the given index. - * - * @param idx The access index - * @return A pointer to the given index in the output array - */ - __device__ OutputType* get_output_ptr(size_type const idx) const - { - auto const iter = d_map.find(idx, d_hasher, d_equal); - - if (iter != d_map.end()) { - // Only one (undetermined) index value of the duplicate rows could be inserted into the map. - // As such, looking up for all indices of duplicate rows always returns the same value. - auto const inserted_idx = iter->second.load(cuda::std::memory_order_relaxed); - - // All duplicate rows will have concurrent access to this same output slot. - return &d_output[inserted_idx]; - } else { - // All input `idx` values have been inserted into the map before. - // Thus, searching for an `idx` key resulting in the `end()` iterator only happens if - // `d_equal(idx, idx) == false`. - // Such situations are due to comparing nulls or NaNs which are considered as always unequal. - // In those cases, all rows containing nulls or NaNs are distinct. Just return their direct - // output slot. - return &d_output[idx]; - } - } -}; - -/** - * @brief Perform a reduction on groups of rows that are compared equal. - * - * This is essentially a reduce-by-key operation with keys are non-contiguous rows and are compared - * equal. A hash table is used to find groups of equal rows. - * - * At the beginning of the operation, the entire output array is filled with a value given by - * the `init` parameter. Then, the reduction result for each row group is written into the output - * array at the index of an unspecified row in the group. - * - * @tparam ReduceFuncBuilder The builder class that must have a `build()` method returning a - * reduction functor derived from `reduce_by_row_fn_base` - * @tparam OutputType Type of the reduction results - * @param map The auxiliary map to perform reduction - * @param preprocessed_input The preprocessed of the input rows for computing row hashing and row - * comparisons - * @param num_rows The number of all input rows - * @param has_nulls Indicate whether the input rows has any nulls at any nested levels - * @param has_nested_columns Indicates whether the input table has any nested columns - * @param nulls_equal Flag to specify whether null elements should be considered as equal - * @param nans_equal Flag to specify whether NaN values in floating point column should be - * considered equal. - * @param init The initial value for reduction of each row group - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned vector - * @return A device_uvector containing the reduction results - */ -template -rmm::device_uvector hash_reduce_by_row( - hash_map_type const& map, - std::shared_ptr const preprocessed_input, - size_type num_rows, - cudf::nullate::DYNAMIC has_nulls, - bool has_nested_columns, - null_equality nulls_equal, - nan_equality nans_equal, - ReduceFuncBuilder func_builder, - OutputType init, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto const map_dview = map.get_device_view(); - auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); - auto const key_hasher = row_hasher.device_hasher(has_nulls); - auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); - - auto reduction_results = rmm::device_uvector(num_rows, stream, mr); - thrust::uninitialized_fill( - rmm::exec_policy(stream), reduction_results.begin(), reduction_results.end(), init); - - auto const reduce_by_row = [&](auto const value_comp) { - if (has_nested_columns) { - auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin())); - } else { - auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin())); - } - }; - - if (nans_equal == nan_equality::ALL_EQUAL) { - using nan_equal_comparator = - cudf::experimental::row::equality::nan_equal_physical_equality_comparator; - reduce_by_row(nan_equal_comparator{}); - } else { - using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator; - reduce_by_row(nan_unequal_comparator{}); - } - - return reduction_results; -} - -} // namespace cudf::detail diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu index 27cd3b03390..314796e47bd 100644 --- a/cpp/src/reductions/histogram.cu +++ b/cpp/src/reductions/histogram.cu @@ -17,10 +17,10 @@ #include #include #include -#include #include #include #include +#include #include #include @@ -154,12 +154,12 @@ compute_row_frequencies(table_view const& input, cuco::empty_key{-1}, key_equal, cuco::linear_probing{key_hasher}, - {}, - {}, + {}, // thread scope + {}, // storage cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, stream.value()}; - // Device-accessible reference to the hash set with insert_and_find operatro + // Device-accessible reference to the hash set with `insert_and_find` operator auto row_set_ref = row_set.ref(cuco::op::insert_and_find); // Compute frequencies (aka distinct counts) for the input rows. @@ -190,14 +190,14 @@ compute_row_frequencies(table_view const& input, // Copy row indices and counts to the output if counts are non-zero auto const input_it = thrust::make_zip_iterator( - thrust::make_tuple(thrust::make_counting_iterator(0), reduction_results.begin())); + thrust::make_tuple(thrust::make_counting_iterator(0), reduction_results.begin())); auto const output_it = thrust::make_zip_iterator(thrust::make_tuple( distinct_indices->begin(), distinct_counts->mutable_view().begin())); // Reduction results above are either group sizes of equal rows, or `0`. // The final output is non-zero group sizes only. thrust::copy_if( - rmm::exec_policy(stream), input_it, input_it + num_rows, output_it, is_not_zero{}); + rmm::exec_policy_nosync(stream), input_it, input_it + num_rows, output_it, is_not_zero{}); return {std::move(distinct_indices), std::move(distinct_counts)}; } From 79f07dd34841114cf920c202a478ebfa338c66b4 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 7 Oct 2024 23:42:01 +0000 Subject: [PATCH 15/19] Add minor comments --- cpp/src/reductions/histogram.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu index 314796e47bd..350959f81ce 100644 --- a/cpp/src/reductions/histogram.cu +++ b/cpp/src/reductions/histogram.cu @@ -132,7 +132,8 @@ compute_row_frequencies(table_view const& input, using nan_equal_comparator = cudf::experimental::row::equality::nan_equal_physical_equality_comparator; auto const value_comp = nan_equal_comparator{}; - auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); + // Hard set the tparam `has_nested_columns` = false for now as we don't yet support nested columns + auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); using row_hash = cudf::experimental::row::hash::device_row_hasher Date: Tue, 8 Oct 2024 23:29:17 +0000 Subject: [PATCH 16/19] Add functor to dispatch static set with provided equality comparator --- cpp/src/reductions/histogram.cu | 120 +++++++++++++++++++++----------- 1 file changed, 79 insertions(+), 41 deletions(-) diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu index 350959f81ce..31e6db0052b 100644 --- a/cpp/src/reductions/histogram.cu +++ b/cpp/src/reductions/histogram.cu @@ -106,6 +106,70 @@ std::unique_ptr make_empty_histogram_like(column_view const& values) std::move(struct_children)); } +/** + * @brief Helper functor to compute row frequencies using cuco::static_set with the specified + * row equality comarator + * + * @tparam KeyEqual Type of the row equality comparator + * + * @param[in] num_rows Number of rows in the input table + * @param[in] partial_counts An optional column containing count for each row + * @param[out] reduction_results Devic vector to store the row counts (the histogram) + * @param[in] stream CUDA stream to use + */ +struct hash_set_insert_fn { + size_t num_rows; + std::optional const& partial_counts; + rmm::device_uvector& reduction_results; + rmm::cuda_stream_view stream; + + using row_hash = + cudf::experimental::row::hash::device_row_hasher; + using device_row_hasher = + cudf::experimental::row::hash::device_row_hasher; + + template + size_t operator()(device_row_hasher const& key_hasher, KeyEqual const& key_equal) + { + // Construct a hash set + auto row_set = cuco::static_set{ + cuco::extent{num_rows}, + cudf::detail::CUCO_DESIRED_LOAD_FACTOR, + cuco::empty_key{-1}, + key_equal, + cuco::linear_probing{key_hasher}, + {}, // thread scope + {}, // storage + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; + // Device-accessible reference to the hash set with `insert_and_find` operator + auto row_set_ref = row_set.ref(cuco::op::insert_and_find); + + // Compute frequencies (aka distinct counts) for the input rows. + // Note that we consider null and NaNs as always equal. + thrust::for_each(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + [set_ref = row_set_ref, + increments = partial_counts.has_value() + ? partial_counts.value().begin() + : nullptr, + counts = reduction_results.begin()] __device__(auto const idx) mutable { + auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx); + cuda::atomic_ref count_ref{ + counts[*inserted_idx_ptr]}; + auto const increment = + increments ? increments[idx] : histogram_count_type{1}; + count_ref.fetch_add(increment, cuda::std::memory_order_relaxed); + }); + + // Set-size is the number of distinct (inserted) rows + return row_set.size(stream); + } +}; + std::pair>, std::unique_ptr> compute_row_frequencies(table_view const& input, std::optional const& partial_counts, @@ -132,56 +196,30 @@ compute_row_frequencies(table_view const& input, using nan_equal_comparator = cudf::experimental::row::equality::nan_equal_physical_equality_comparator; auto const value_comp = nan_equal_comparator{}; - // Hard set the tparam `has_nested_columns` = false for now as we don't yet support nested columns - auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); - - using row_hash = - cudf::experimental::row::hash::device_row_hasher; + // Number of rows in the input table size_t const num_rows = input.num_rows(); - // Construct a vector to store reduced counts and init to zero + // Create a vector to store the reduced row counts and init to zero rmm::device_uvector reduction_results(num_rows, stream, mr); thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), reduction_results.begin(), reduction_results.end(), histogram_count_type{0}); - // Construct a hash set - auto row_set = cuco::static_set{ - cuco::extent{num_rows}, - cudf::detail::CUCO_DESIRED_LOAD_FACTOR, - cuco::empty_key{-1}, - key_equal, - cuco::linear_probing{key_hasher}, - {}, // thread scope - {}, // storage - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - - // Device-accessible reference to the hash set with `insert_and_find` operator - auto row_set_ref = row_set.ref(cuco::op::insert_and_find); - - // Compute frequencies (aka distinct counts) for the input rows. - // Note that we consider null and NaNs as always equal. - thrust::for_each( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - [set_ref = row_set_ref, - increments = - partial_counts.has_value() ? partial_counts.value().begin() : nullptr, - counts = reduction_results.begin()] __device__(auto const idx) mutable { - auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx); - cuda::atomic_ref count_ref{ - counts[*inserted_idx_ptr]}; - auto const increment = increments ? increments[idx] : histogram_count_type{1}; - count_ref.fetch_add(increment, cuda::std::memory_order_relaxed); - }); - - // Set-size is the number of distinct (inserted) rows - auto const set_size = row_set.size(stream); + // Hash set size after insertions + size_t set_size = 0; + + // Dispatch the appropriate hash set insert functor + if (has_nested_columns) { + auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); + set_size = hash_set_insert_fn{num_rows, partial_counts, reduction_results, stream}(key_hasher, + key_equal); + } else { + auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); + set_size = hash_set_insert_fn{num_rows, partial_counts, reduction_results, stream}(key_hasher, + key_equal); + } // Vector of distinct indices auto distinct_indices = std::make_unique>(set_size, stream, mr); From 813630f4c6498e892218cfd50225d99304f8ea9a Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 8 Oct 2024 23:38:06 +0000 Subject: [PATCH 17/19] Revert the dispatcher functor --- cpp/src/reductions/histogram.cu | 120 +++++++++++++++++++++----------- 1 file changed, 79 insertions(+), 41 deletions(-) diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu index 350959f81ce..31e6db0052b 100644 --- a/cpp/src/reductions/histogram.cu +++ b/cpp/src/reductions/histogram.cu @@ -106,6 +106,70 @@ std::unique_ptr make_empty_histogram_like(column_view const& values) std::move(struct_children)); } +/** + * @brief Helper functor to compute row frequencies using cuco::static_set with the specified + * row equality comarator + * + * @tparam KeyEqual Type of the row equality comparator + * + * @param[in] num_rows Number of rows in the input table + * @param[in] partial_counts An optional column containing count for each row + * @param[out] reduction_results Devic vector to store the row counts (the histogram) + * @param[in] stream CUDA stream to use + */ +struct hash_set_insert_fn { + size_t num_rows; + std::optional const& partial_counts; + rmm::device_uvector& reduction_results; + rmm::cuda_stream_view stream; + + using row_hash = + cudf::experimental::row::hash::device_row_hasher; + using device_row_hasher = + cudf::experimental::row::hash::device_row_hasher; + + template + size_t operator()(device_row_hasher const& key_hasher, KeyEqual const& key_equal) + { + // Construct a hash set + auto row_set = cuco::static_set{ + cuco::extent{num_rows}, + cudf::detail::CUCO_DESIRED_LOAD_FACTOR, + cuco::empty_key{-1}, + key_equal, + cuco::linear_probing{key_hasher}, + {}, // thread scope + {}, // storage + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; + // Device-accessible reference to the hash set with `insert_and_find` operator + auto row_set_ref = row_set.ref(cuco::op::insert_and_find); + + // Compute frequencies (aka distinct counts) for the input rows. + // Note that we consider null and NaNs as always equal. + thrust::for_each(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + [set_ref = row_set_ref, + increments = partial_counts.has_value() + ? partial_counts.value().begin() + : nullptr, + counts = reduction_results.begin()] __device__(auto const idx) mutable { + auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx); + cuda::atomic_ref count_ref{ + counts[*inserted_idx_ptr]}; + auto const increment = + increments ? increments[idx] : histogram_count_type{1}; + count_ref.fetch_add(increment, cuda::std::memory_order_relaxed); + }); + + // Set-size is the number of distinct (inserted) rows + return row_set.size(stream); + } +}; + std::pair>, std::unique_ptr> compute_row_frequencies(table_view const& input, std::optional const& partial_counts, @@ -132,56 +196,30 @@ compute_row_frequencies(table_view const& input, using nan_equal_comparator = cudf::experimental::row::equality::nan_equal_physical_equality_comparator; auto const value_comp = nan_equal_comparator{}; - // Hard set the tparam `has_nested_columns` = false for now as we don't yet support nested columns - auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); - - using row_hash = - cudf::experimental::row::hash::device_row_hasher; + // Number of rows in the input table size_t const num_rows = input.num_rows(); - // Construct a vector to store reduced counts and init to zero + // Create a vector to store the reduced row counts and init to zero rmm::device_uvector reduction_results(num_rows, stream, mr); thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), reduction_results.begin(), reduction_results.end(), histogram_count_type{0}); - // Construct a hash set - auto row_set = cuco::static_set{ - cuco::extent{num_rows}, - cudf::detail::CUCO_DESIRED_LOAD_FACTOR, - cuco::empty_key{-1}, - key_equal, - cuco::linear_probing{key_hasher}, - {}, // thread scope - {}, // storage - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - - // Device-accessible reference to the hash set with `insert_and_find` operator - auto row_set_ref = row_set.ref(cuco::op::insert_and_find); - - // Compute frequencies (aka distinct counts) for the input rows. - // Note that we consider null and NaNs as always equal. - thrust::for_each( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - [set_ref = row_set_ref, - increments = - partial_counts.has_value() ? partial_counts.value().begin() : nullptr, - counts = reduction_results.begin()] __device__(auto const idx) mutable { - auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx); - cuda::atomic_ref count_ref{ - counts[*inserted_idx_ptr]}; - auto const increment = increments ? increments[idx] : histogram_count_type{1}; - count_ref.fetch_add(increment, cuda::std::memory_order_relaxed); - }); - - // Set-size is the number of distinct (inserted) rows - auto const set_size = row_set.size(stream); + // Hash set size after insertions + size_t set_size = 0; + + // Dispatch the appropriate hash set insert functor + if (has_nested_columns) { + auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); + set_size = hash_set_insert_fn{num_rows, partial_counts, reduction_results, stream}(key_hasher, + key_equal); + } else { + auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); + set_size = hash_set_insert_fn{num_rows, partial_counts, reduction_results, stream}(key_hasher, + key_equal); + } // Vector of distinct indices auto distinct_indices = std::make_unique>(set_size, stream, mr); From a872a9729dedc715ca5ccfbaf21736fc02949b13 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 9 Oct 2024 00:02:14 +0000 Subject: [PATCH 18/19] Revert the functor --- cpp/src/reductions/histogram.cu | 120 +++++++++++--------------------- 1 file changed, 41 insertions(+), 79 deletions(-) diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu index 31e6db0052b..350959f81ce 100644 --- a/cpp/src/reductions/histogram.cu +++ b/cpp/src/reductions/histogram.cu @@ -106,70 +106,6 @@ std::unique_ptr make_empty_histogram_like(column_view const& values) std::move(struct_children)); } -/** - * @brief Helper functor to compute row frequencies using cuco::static_set with the specified - * row equality comarator - * - * @tparam KeyEqual Type of the row equality comparator - * - * @param[in] num_rows Number of rows in the input table - * @param[in] partial_counts An optional column containing count for each row - * @param[out] reduction_results Devic vector to store the row counts (the histogram) - * @param[in] stream CUDA stream to use - */ -struct hash_set_insert_fn { - size_t num_rows; - std::optional const& partial_counts; - rmm::device_uvector& reduction_results; - rmm::cuda_stream_view stream; - - using row_hash = - cudf::experimental::row::hash::device_row_hasher; - using device_row_hasher = - cudf::experimental::row::hash::device_row_hasher; - - template - size_t operator()(device_row_hasher const& key_hasher, KeyEqual const& key_equal) - { - // Construct a hash set - auto row_set = cuco::static_set{ - cuco::extent{num_rows}, - cudf::detail::CUCO_DESIRED_LOAD_FACTOR, - cuco::empty_key{-1}, - key_equal, - cuco::linear_probing{key_hasher}, - {}, // thread scope - {}, // storage - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - // Device-accessible reference to the hash set with `insert_and_find` operator - auto row_set_ref = row_set.ref(cuco::op::insert_and_find); - - // Compute frequencies (aka distinct counts) for the input rows. - // Note that we consider null and NaNs as always equal. - thrust::for_each(rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - [set_ref = row_set_ref, - increments = partial_counts.has_value() - ? partial_counts.value().begin() - : nullptr, - counts = reduction_results.begin()] __device__(auto const idx) mutable { - auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx); - cuda::atomic_ref count_ref{ - counts[*inserted_idx_ptr]}; - auto const increment = - increments ? increments[idx] : histogram_count_type{1}; - count_ref.fetch_add(increment, cuda::std::memory_order_relaxed); - }); - - // Set-size is the number of distinct (inserted) rows - return row_set.size(stream); - } -}; - std::pair>, std::unique_ptr> compute_row_frequencies(table_view const& input, std::optional const& partial_counts, @@ -196,30 +132,56 @@ compute_row_frequencies(table_view const& input, using nan_equal_comparator = cudf::experimental::row::equality::nan_equal_physical_equality_comparator; auto const value_comp = nan_equal_comparator{}; + // Hard set the tparam `has_nested_columns` = false for now as we don't yet support nested columns + auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); + + using row_hash = + cudf::experimental::row::hash::device_row_hasher; - // Number of rows in the input table size_t const num_rows = input.num_rows(); - // Create a vector to store the reduced row counts and init to zero + // Construct a vector to store reduced counts and init to zero rmm::device_uvector reduction_results(num_rows, stream, mr); thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), reduction_results.begin(), reduction_results.end(), histogram_count_type{0}); - // Hash set size after insertions - size_t set_size = 0; - - // Dispatch the appropriate hash set insert functor - if (has_nested_columns) { - auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); - set_size = hash_set_insert_fn{num_rows, partial_counts, reduction_results, stream}(key_hasher, - key_equal); - } else { - auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); - set_size = hash_set_insert_fn{num_rows, partial_counts, reduction_results, stream}(key_hasher, - key_equal); - } + // Construct a hash set + auto row_set = cuco::static_set{ + cuco::extent{num_rows}, + cudf::detail::CUCO_DESIRED_LOAD_FACTOR, + cuco::empty_key{-1}, + key_equal, + cuco::linear_probing{key_hasher}, + {}, // thread scope + {}, // storage + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; + + // Device-accessible reference to the hash set with `insert_and_find` operator + auto row_set_ref = row_set.ref(cuco::op::insert_and_find); + + // Compute frequencies (aka distinct counts) for the input rows. + // Note that we consider null and NaNs as always equal. + thrust::for_each( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + [set_ref = row_set_ref, + increments = + partial_counts.has_value() ? partial_counts.value().begin() : nullptr, + counts = reduction_results.begin()] __device__(auto const idx) mutable { + auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx); + cuda::atomic_ref count_ref{ + counts[*inserted_idx_ptr]}; + auto const increment = increments ? increments[idx] : histogram_count_type{1}; + count_ref.fetch_add(increment, cuda::std::memory_order_relaxed); + }); + + // Set-size is the number of distinct (inserted) rows + auto const set_size = row_set.size(stream); // Vector of distinct indices auto distinct_indices = std::make_unique>(set_size, stream, mr); From ab733d5bd6869decdaa23a7c329b5265a40720ae Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 9 Oct 2024 00:13:34 +0000 Subject: [PATCH 19/19] Minor --- cpp/src/reductions/histogram.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu index 350959f81ce..b40b2b6dd2e 100644 --- a/cpp/src/reductions/histogram.cu +++ b/cpp/src/reductions/histogram.cu @@ -167,8 +167,8 @@ compute_row_frequencies(table_view const& input, // Note that we consider null and NaNs as always equal. thrust::for_each( rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), [set_ref = row_set_ref, increments = partial_counts.has_value() ? partial_counts.value().begin() : nullptr,