From 7abfd2afa5f11de1c2d74c39d628760cc402725b Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Fri, 2 Aug 2024 17:44:19 +0000
Subject: [PATCH 01/19] Add histogram bench

---
 cpp/benchmarks/CMakeLists.txt              |  5 +-
 cpp/benchmarks/groupby/group_histogram.cpp | 86 ++++++++++++++++++++++
 cpp/benchmarks/reduction/histogram.cpp     | 74 +++++++++++++++++++
 3 files changed, 163 insertions(+), 2 deletions(-)
 create mode 100644 cpp/benchmarks/groupby/group_histogram.cpp
 create mode 100644 cpp/benchmarks/reduction/histogram.cpp
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index ff431c7f260..60f80704e9a 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -211,7 +211,8 @@ ConfigureBench(
   reduction/reduce.cpp reduction/scan.cpp
 )
 ConfigureNVBench(
-  REDUCTION_NVBENCH reduction/rank.cpp reduction/scan_structs.cpp reduction/segmented_reduce.cpp
+  REDUCTION_NVBENCH reduction/histogram.cpp reduction/rank.cpp reduction/scan_structs.cpp
+  reduction/segmented_reduce.cpp
 )
 
 # ##################################################################################################
@@ -231,7 +232,7 @@ ConfigureBench(
 )
 
 ConfigureNVBench(
-  GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_max_multithreaded.cpp
+  GROUPBY_NVBENCH groupby/group_histogram.cpp groupby/group_max.cpp groupby/group_max_multithreaded.cpp
   groupby/group_nunique.cpp groupby/group_rank.cpp groupby/group_struct_keys.cpp
 )
 
diff --git a/cpp/benchmarks/groupby/group_histogram.cpp b/cpp/benchmarks/groupby/group_histogram.cpp
new file mode 100644
index 00000000000..9fae1ecab39
--- /dev/null
+++ b/cpp/benchmarks/groupby/group_histogram.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf/groupby.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+template <typename Type>
+void groupby_histogram_helper(nvbench::state& state,
+                        cudf::size_type num_rows,
+                        cudf::size_type cardinality,
+                        double null_probability)
+{
+  auto const keys = [&] {
+    data_profile const profile =
+      data_profile_builder()
+        .cardinality(cardinality)
+        .no_validity()
+        .distribution(cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, num_rows);
+    return create_random_column(cudf::type_to_id<int32_t>(), row_count{num_rows}, profile);
+  }();
+
+  auto const vals = [&] {
+    auto builder = data_profile_builder().cardinality(0).distribution(
+      cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, num_rows);
+    if (null_probability > 0) {
+      builder.null_probability(null_probability);
+    } else {
+      builder.no_validity();
+    }
+    return create_random_column(
+      cudf::type_to_id<Type>(), row_count{num_rows}, data_profile{builder});
+  }();
+
+  auto keys_view = keys->view();
+  auto gb_obj    = cudf::groupby::groupby(cudf::table_view({keys_view}));
+
+  std::vector<cudf::groupby::aggregation_request> requests;
+  requests.emplace_back(cudf::groupby::aggregation_request());
+  requests[0].values = vals->view();
+  requests[0].aggregations.push_back(cudf::make_histogram_aggregation<cudf::groupby_aggregation>());
+
+  auto const mem_stats_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) { 
+              auto const result = gb_obj.aggregate(requests); 
+        });
+  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
+  state.add_buffer_size(mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+template <typename Type>
+void bench_groupby_histogram(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const cardinality      = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  auto const num_rows         = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const null_probability = state.get_float64("null_probability");
+
+  groupby_histogram_helper<Type>(state, num_rows, cardinality, null_probability);
+}
+
+NVBENCH_BENCH_TYPES(bench_groupby_histogram,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, int64_t, float, double>))
+  .set_name("groupby_histogram")
+  .add_float64_axis("null_probability", {0, 0.1, 0.9})
+  .add_int64_axis("cardinality", {10, 20, 50, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000})
+  .add_int64_power_of_two_axis("num_rows", {12, 18, 24});
+
diff --git a/cpp/benchmarks/reduction/histogram.cpp b/cpp/benchmarks/reduction/histogram.cpp
new file mode 100644
index 00000000000..8e8e21dae0b
--- /dev/null
+++ b/cpp/benchmarks/reduction/histogram.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "cudf/aggregation.hpp"
+#include "cudf/detail/aggregation/aggregation.hpp"
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/common/nvbench_utilities.hpp>
+#include <benchmarks/common/table_utilities.hpp>
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/reduction.hpp>
+#include <cudf/reduction/detail/histogram.hpp>
+#include <cudf/types.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+template <typename type>
+static void nvbench_reduction_histogram(nvbench::state& state, nvbench::type_list<type>)
+{
+  auto const dtype = cudf::type_to_id<type>();
+
+  auto const cardinality      = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  auto const num_rows         = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const null_probability = state.get_float64("null_probability");
+
+  if (cardinality > num_rows) {
+    state.skip("cardinality > num_rows");
+    return;
+  }
+
+  data_profile const profile = data_profile_builder()
+                                 .null_probability(null_probability)
+                                 .cardinality(cardinality)
+                                 .distribution(dtype, distribution_id::UNIFORM, 0, num_rows);
+
+  auto const input = create_random_column(dtype, row_count{num_rows}, profile);
+  auto agg         = cudf::make_histogram_aggregation<cudf::reduce_aggregation>();
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    rmm::cuda_stream_view stream_view{launch.get_stream()};
+    auto result = cudf::reduce(*input, *agg, input->type(), stream_view);
+  });
+
+  state.add_element_count(input->size());
+}
+
+using data_type = nvbench::type_list<int32_t, int64_t>;
+
+NVBENCH_BENCH_TYPES(nvbench_reduction_histogram, NVBENCH_TYPE_AXES(data_type))
+  .set_name("histogram")
+  .add_float64_axis("null_probability", {0.1})
+  .add_int64_axis("cardinality", {1})
+  .add_int64_axis("num_rows",
+                  {
+                    10000,      // 10k
+                    100000,     // 100k
+                    1000000,    // 1M
+                    10000000,   // 10M
+                    100000000,  // 100M
+                  });

From e0aa78bb22447509b51a8a715dd5ccd530a49924 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Fri, 2 Aug 2024 17:45:35 +0000
Subject: [PATCH 02/19] refactor histogram using static_map insert_or_apply

---
 .../cudf/reduction/detail/histogram.hpp       |   2 +-
 cpp/src/groupby/sort/group_histogram.cu       |   7 +
 cpp/src/reductions/histogram.cu               | 159 +++++-------------
 3 files changed, 54 insertions(+), 114 deletions(-)

diff --git a/cpp/include/cudf/reduction/detail/histogram.hpp b/cpp/include/cudf/reduction/detail/histogram.hpp
index f23c5a14e33..745f8aa46cc 100644
--- a/cpp/include/cudf/reduction/detail/histogram.hpp
+++ b/cpp/include/cudf/reduction/detail/histogram.hpp
@@ -36,7 +36,7 @@ namespace cudf::reduction::detail {
  * @param partial_counts An optional column containing count for each row
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate memory of the returned objects
- * @return A pair of array contains the (stable-order) indices of the distinct rows in the input
+ * @return A pair of array contains the indices of the distinct rows in the input
  * table, and their corresponding distinct counts
  */
 [[nodiscard]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>, std::unique_ptr<column>>
diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu
index 1000ec0d470..4d893232532 100644
--- a/cpp/src/groupby/sort/group_histogram.cu
+++ b/cpp/src/groupby/sort/group_histogram.cu
@@ -29,6 +29,7 @@
 #include <rmm/resource_ref.hpp>
 
 #include <thrust/gather.h>
+#include <thrust/sort.h>
 
 namespace cudf::groupby::detail {
 
@@ -57,6 +58,12 @@ std::unique_ptr<column> build_histogram(column_view const& values,
   auto [distinct_indices, distinct_counts] =
     cudf::reduction::detail::compute_row_frequencies(labeled_values, partial_counts, stream, mr);
 
+  // compute_row_frequencies does not provide stable ordering
+  thrust::sort_by_key(rmm::exec_policy(stream),
+                      distinct_indices->begin(),
+                      distinct_indices->end(),
+                      distinct_counts->mutable_view().begin<int64_t>());
+
   // Gather the distinct rows for the output histogram.
   auto out_table = cudf::detail::gather(labeled_values,
                                         *distinct_indices,
diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index bebb9d14923..af1187a65e5 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -23,10 +23,12 @@
 
 #include <rmm/resource_ref.hpp>
 
+#include <cuco/static_map.cuh>
 #include <cuda/atomic>
 #include <cuda/functional>
 #include <thrust/copy.h>
 #include <thrust/iterator/zip_iterator.h>
+#include <thrust/sort.h>
 #include <thrust/tuple.h>
 
 #include <optional>
@@ -38,69 +40,6 @@ namespace {
 // Always use 64-bit signed integer for storing count.
 using histogram_count_type = int64_t;
 
-/**
- * @brief The functor to accumulate the frequency of each distinct rows in the input table.
- */
-template <typename MapView, typename KeyHasher, typename KeyEqual, typename CountType>
-struct reduce_fn : cudf::detail::reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, CountType> {
-  CountType const* d_partial_output;
-
-  reduce_fn(MapView const& d_map,
-            KeyHasher const& d_hasher,
-            KeyEqual const& d_equal,
-            CountType* const d_output,
-            CountType const* const d_partial_output)
-    : cudf::detail::reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, CountType>{d_map,
-                                                                                   d_hasher,
-                                                                                   d_equal,
-                                                                                   d_output},
-      d_partial_output{d_partial_output}
-  {
-  }
-
-  // Count the number of rows in each group of rows that are compared equal.
-  __device__ void operator()(size_type const idx) const
-  {
-    auto const increment = d_partial_output ? d_partial_output[idx] : CountType{1};
-    auto const count =
-      cuda::atomic_ref<CountType, cuda::thread_scope_device>(*this->get_output_ptr(idx));
-    count.fetch_add(increment, cuda::std::memory_order_relaxed);
-  }
-};
-
-/**
- * @brief The builder to construct an instance of `reduce_fn` functor.
- */
-template <typename CountType>
-struct reduce_func_builder {
-  CountType const* const d_partial_output;
-
-  reduce_func_builder(CountType const* const d_partial_output) : d_partial_output{d_partial_output}
-  {
-  }
-
-  template <typename MapView, typename KeyHasher, typename KeyEqual>
-  auto build(MapView const& d_map,
-             KeyHasher const& d_hasher,
-             KeyEqual const& d_equal,
-             CountType* const d_output)
-  {
-    return reduce_fn<MapView, KeyHasher, KeyEqual, CountType>{
-      d_map, d_hasher, d_equal, d_output, d_partial_output};
-  }
-};
-
-/**
- * @brief Specialized functor to check for not-zero of the second component of the input.
- */
-struct is_not_zero {
-  template <typename Pair>
-  __device__ bool operator()(Pair const input) const
-  {
-    return thrust::get<1>(input) != 0;
-  }
-};
-
 /**
  * @brief Building a histogram by gathering distinct rows from the input table and their
  * corresponding distinct counts.
@@ -150,6 +89,16 @@ std::unique_ptr<column> make_empty_histogram_like(column_view const& values)
                                   std::move(struct_children));
 }
 
+// TODO: replace with cuco reduction functors
+struct plus_op {
+  __device__ void operator()(
+    cuda::atomic_ref<histogram_count_type, cuda::thread_scope_device> payload_ref,
+    histogram_count_type val)
+  {
+    payload_ref.fetch_add(val, cuda::memory_order_relaxed);
+  }
+};
+
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>, std::unique_ptr<column>>
 compute_row_frequencies(table_view const& input,
                         std::optional<column_view> const& partial_counts,
@@ -164,12 +113,6 @@ compute_row_frequencies(table_view const& input,
                "Nested types are not yet supported in histogram aggregation.",
                std::invalid_argument);
 
-  auto map = cudf::detail::hash_map_type{compute_hash_table_size(input.num_rows()),
-                                         cuco::empty_key{-1},
-                                         cuco::empty_value{std::numeric_limits<size_type>::min()},
-                                         cudf::detail::cuco_allocator{stream},
-                                         stream.value()};
-
   auto const preprocessed_input =
     cudf::experimental::row::hash::preprocessed_table::create(input, stream);
   auto const has_nulls = nullate::DYNAMIC{cudf::has_nested_nulls(input)};
@@ -180,58 +123,48 @@ compute_row_frequencies(table_view const& input,
 
   auto const pair_iter = cudf::detail::make_counting_transform_iterator(
     size_type{0},
-    cuda::proclaim_return_type<cuco::pair<size_type, size_type>>(
-      [] __device__(size_type const i) { return cuco::make_pair(i, i); }));
+    cuda::proclaim_return_type<cuco::pair<size_type, histogram_count_type>>(
+      [d_partial_output = partial_counts ? partial_counts.value().begin<histogram_count_type>()
+                                         : nullptr] __device__(size_type const idx) {
+        auto const increment = d_partial_output ? d_partial_output[idx] : histogram_count_type{1};
+        return cuco::pair<size_type, histogram_count_type>{idx, increment};
+      }));
 
   // Always compare NaNs as equal.
   using nan_equal_comparator =
     cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
   auto const value_comp = nan_equal_comparator{};
-
-  if (has_nested_columns) {
-    auto const key_equal = row_comp.equal_to<true>(has_nulls, null_equality::EQUAL, value_comp);
-    map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
-  } else {
-    auto const key_equal = row_comp.equal_to<false>(has_nulls, null_equality::EQUAL, value_comp);
-    map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
-  }
-
+  auto const key_equal  = row_comp.equal_to<false>(has_nulls, null_equality::EQUAL, value_comp);
+
+  using row_hash =
+    cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
+                                                     cudf::nullate::DYNAMIC>;
+
+  auto map = cuco::static_map{input.num_rows(),
+                              0.5,
+                              cuco::empty_key<size_type>{-1},
+                              cuco::empty_value<histogram_count_type>{0},
+                              key_equal,
+                              cuco::linear_probing<1, row_hash>{key_hasher},
+                              {},
+                              {},
+                              cudf::detail::cuco_allocator{stream},
+                              stream.value()};
+
+  // TODO: use `insert_or_apply` init overload for better performance
+  map.insert_or_apply(pair_iter, pair_iter + input.num_rows(), plus_op{}, stream.value());
+
+  size_type const map_size = map.size(stream.value());
   // Gather the indices of distinct rows.
-  auto distinct_indices = std::make_unique<rmm::device_uvector<size_type>>(
-    static_cast<size_type>(map.get_size()), stream, mr);
+  auto distinct_indices = std::make_unique<rmm::device_uvector<size_type>>(map_size, stream, mr);
 
   // Store the number of occurrences of each distinct row.
-  auto distinct_counts = make_numeric_column(data_type{type_to_id<histogram_count_type>()},
-                                             static_cast<size_type>(map.get_size()),
-                                             mask_state::UNALLOCATED,
-                                             stream,
-                                             mr);
-
-  // Compute frequencies (aka distinct counts) for the input rows.
-  // Note that we consider null and NaNs as always equal.
-  auto const reduction_results = cudf::detail::hash_reduce_by_row(
-    map,
-    preprocessed_input,
-    input.num_rows(),
-    has_nulls,
-    has_nested_columns,
-    null_equality::EQUAL,
-    nan_equality::ALL_EQUAL,
-    reduce_func_builder<histogram_count_type>{
-      partial_counts ? partial_counts.value().begin<histogram_count_type>() : nullptr},
-    histogram_count_type{0},
-    stream,
-    rmm::mr::get_current_device_resource());
-
-  auto const input_it = thrust::make_zip_iterator(
-    thrust::make_tuple(thrust::make_counting_iterator(0), reduction_results.begin()));
-  auto const output_it = thrust::make_zip_iterator(thrust::make_tuple(
-    distinct_indices->begin(), distinct_counts->mutable_view().begin<histogram_count_type>()));
-
-  // Reduction results above are either group sizes of equal rows, or `0`.
-  // The final output is non-zero group sizes only.
-  thrust::copy_if(
-    rmm::exec_policy(stream), input_it, input_it + input.num_rows(), output_it, is_not_zero{});
+  auto distinct_counts = make_numeric_column(
+    data_type{type_to_id<histogram_count_type>()}, map_size, mask_state::UNALLOCATED, stream, mr);
+
+  map.retrieve_all(distinct_indices->begin(),
+                   distinct_counts->mutable_view().begin<histogram_count_type>(),
+                   stream.value());
 
   return {std::move(distinct_indices), std::move(distinct_counts)};
 }

From de74ff830f1539a2abdc207d69a4be652030da90 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Tue, 6 Aug 2024 22:32:20 +0000
Subject: [PATCH 03/19] minor nits

---
 cpp/benchmarks/groupby/group_histogram.cpp | 22 ++++++++++++----------
 cpp/benchmarks/reduction/histogram.cpp     |  9 +--------
 2 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/cpp/benchmarks/groupby/group_histogram.cpp b/cpp/benchmarks/groupby/group_histogram.cpp
index 9fae1ecab39..5de5a521e4b 100644
--- a/cpp/benchmarks/groupby/group_histogram.cpp
+++ b/cpp/benchmarks/groupby/group_histogram.cpp
@@ -23,9 +23,9 @@
 
 template <typename Type>
 void groupby_histogram_helper(nvbench::state& state,
-                        cudf::size_type num_rows,
-                        cudf::size_type cardinality,
-                        double null_probability)
+                              cudf::size_type num_rows,
+                              cudf::size_type cardinality,
+                              double null_probability)
 {
   auto const keys = [&] {
     data_profile const profile =
@@ -59,12 +59,11 @@ void groupby_histogram_helper(nvbench::state& state,
   auto const mem_stats_logger = cudf::memory_stats_logger();
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   state.exec(nvbench::exec_tag::sync,
-             [&](nvbench::launch& launch) { 
-              auto const result = gb_obj.aggregate(requests); 
-        });
+             [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); });
   auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
   state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
-  state.add_buffer_size(mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
 }
 
 template <typename Type>
@@ -74,6 +73,10 @@ void bench_groupby_histogram(nvbench::state& state, nvbench::type_list<Type>)
   auto const num_rows         = static_cast<cudf::size_type>(state.get_int64("num_rows"));
   auto const null_probability = state.get_float64("null_probability");
 
+  if (cardinality > num_rows) {
+    state.skip("cardinality > num_rows");
+    return;
+  }
   groupby_histogram_helper<Type>(state, num_rows, cardinality, null_probability);
 }
 
@@ -81,6 +84,5 @@ NVBENCH_BENCH_TYPES(bench_groupby_histogram,
                     NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, int64_t, float, double>))
   .set_name("groupby_histogram")
   .add_float64_axis("null_probability", {0, 0.1, 0.9})
-  .add_int64_axis("cardinality", {10, 20, 50, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000})
-  .add_int64_power_of_two_axis("num_rows", {12, 18, 24});
-
+  .add_int64_axis("cardinality", {100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000})
+  .add_int64_axis("num_rows", {100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000});
diff --git a/cpp/benchmarks/reduction/histogram.cpp b/cpp/benchmarks/reduction/histogram.cpp
index 8e8e21dae0b..c021849ed89 100644
--- a/cpp/benchmarks/reduction/histogram.cpp
+++ b/cpp/benchmarks/reduction/histogram.cpp
@@ -64,11 +64,4 @@ NVBENCH_BENCH_TYPES(nvbench_reduction_histogram, NVBENCH_TYPE_AXES(data_type))
   .set_name("histogram")
   .add_float64_axis("null_probability", {0.1})
   .add_int64_axis("cardinality", {1})
-  .add_int64_axis("num_rows",
-                  {
-                    10000,      // 10k
-                    100000,     // 100k
-                    1000000,    // 1M
-                    10000000,   // 10M
-                    100000000,  // 100M
-                  });
+  .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000});

From 11464ba112aabd0428a8f026427b0e4a17e61326 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Tue, 6 Aug 2024 23:41:54 +0000
Subject: [PATCH 04/19] more minor nits

---
 .../cudf/reduction/detail/histogram.hpp       |  3 ++
 cpp/src/groupby/sort/group_histogram.cu       |  9 +++---
 cpp/src/reductions/histogram.cu               | 28 +++++++++----------
 3 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/cpp/include/cudf/reduction/detail/histogram.hpp b/cpp/include/cudf/reduction/detail/histogram.hpp
index 6577b8ff771..b6ce35904df 100644
--- a/cpp/include/cudf/reduction/detail/histogram.hpp
+++ b/cpp/include/cudf/reduction/detail/histogram.hpp
@@ -31,6 +31,9 @@
 namespace CUDF_EXPORT cudf {
 namespace reduction::detail {
 
+// Always use 64-bit signed integer for storing count.
+using histogram_count_type = int64_t;
+
 /**
  * @brief Compute the frequency for each distinct row in the input table.
  *
diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu
index 4d893232532..33d9aa88017 100644
--- a/cpp/src/groupby/sort/group_histogram.cu
+++ b/cpp/src/groupby/sort/group_histogram.cu
@@ -59,10 +59,11 @@ std::unique_ptr<column> build_histogram(column_view const& values,
     cudf::reduction::detail::compute_row_frequencies(labeled_values, partial_counts, stream, mr);
 
   // compute_row_frequencies does not provide stable ordering
-  thrust::sort_by_key(rmm::exec_policy(stream),
-                      distinct_indices->begin(),
-                      distinct_indices->end(),
-                      distinct_counts->mutable_view().begin<int64_t>());
+  thrust::sort_by_key(
+    rmm::exec_policy(stream),
+    distinct_indices->begin(),
+    distinct_indices->end(),
+    distinct_counts->mutable_view().begin<cudf::reduction::detail::histogram_count_type>());
 
   // Gather the distinct rows for the output histogram.
   auto out_table = cudf::detail::gather(labeled_values,
diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index dd3a63ddfd8..d1531849f3b 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -15,9 +15,11 @@
  */
 
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/cuco_helpers.hpp>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/hash_reduce_by_row.cuh>
 #include <cudf/detail/iterator.cuh>
+#include <cudf/reduction/detail/histogram.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 
@@ -37,9 +39,6 @@ namespace cudf::reduction::detail {
 
 namespace {
 
-// Always use 64-bit signed integer for storing count.
-using histogram_count_type = int64_t;
-
 /**
  * @brief Building a histogram by gathering distinct rows from the input table and their
  * corresponding distinct counts.
@@ -127,7 +126,7 @@ compute_row_frequencies(table_view const& input,
       [d_partial_output = partial_counts ? partial_counts.value().begin<histogram_count_type>()
                                          : nullptr] __device__(size_type const idx) {
         auto const increment = d_partial_output ? d_partial_output[idx] : histogram_count_type{1};
-        return cuco::pair<size_type, histogram_count_type>{idx, increment};
+        return cuco::pair{idx, increment};
       }));
 
   // Always compare NaNs as equal.
@@ -140,16 +139,17 @@ compute_row_frequencies(table_view const& input,
     cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
                                                      cudf::nullate::DYNAMIC>;
 
-  auto map = cuco::static_map{input.num_rows(),
-                              0.5,
-                              cuco::empty_key<size_type>{-1},
-                              cuco::empty_value<histogram_count_type>{0},
-                              key_equal,
-                              cuco::linear_probing<1, row_hash>{key_hasher},
-                              {},
-                              {},
-                              cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
-                              stream.value()};
+  auto map = cuco::static_map{
+    input.num_rows(),
+    cudf::detail::CUCO_DESIRED_LOAD_FACTOR,
+    cuco::empty_key<size_type>{-1},
+    cuco::empty_value<histogram_count_type>{0},
+    key_equal,
+    cuco::linear_probing<1, row_hash>{key_hasher},
+    {},
+    {},
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+    stream.value()};
 
   // TODO: use `insert_or_apply` init overload for better performance
   map.insert_or_apply(pair_iter, pair_iter + input.num_rows(), plus_op{}, stream.value());

From 9029297b4cd734cd0127fdfd609aaed37b849034 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Tue, 6 Aug 2024 23:55:45 +0000
Subject: [PATCH 05/19] fix cmake format

---
 cpp/benchmarks/CMakeLists.txt | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 374c0d26316..7c39803b1c1 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -232,8 +232,13 @@ ConfigureBench(
 )
 
 ConfigureNVBench(
-  GROUPBY_NVBENCH groupby/group_histogram.cpp groupby/group_max.cpp groupby/group_max_multithreaded.cpp
-  groupby/group_nunique.cpp groupby/group_rank.cpp groupby/group_struct_keys.cpp
+  GROUPBY_NVBENCH
+  groupby/group_histogram.cpp
+  groupby/group_max.cpp
+  groupby/group_max_multithreaded.cpp
+  groupby/group_nunique.cpp
+  groupby/group_rank.cpp
+  groupby/group_struct_keys.cpp
 )
 
 # ##################################################################################################

From 920fd161653248435f7b7547ded4e5955adcffbf Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Fri, 4 Oct 2024 21:23:58 +0000
Subject: [PATCH 06/19] Refactor `histogram` with `cuco::static_set`

---
 .../cudf/reduction/detail/histogram.hpp       |   6 +-
 cpp/src/groupby/sort/group_histogram.cu       |   8 --
 cpp/src/reductions/histogram.cu               | 111 ++++++++++++------
 3 files changed, 74 insertions(+), 51 deletions(-)

diff --git a/cpp/include/cudf/reduction/detail/histogram.hpp b/cpp/include/cudf/reduction/detail/histogram.hpp
index 040e4d27079..fb14e3e94cf 100644
--- a/cpp/include/cudf/reduction/detail/histogram.hpp
+++ b/cpp/include/cudf/reduction/detail/histogram.hpp
@@ -31,9 +31,6 @@
 namespace CUDF_EXPORT cudf {
 namespace reduction::detail {
 
-// Always use 64-bit signed integer for storing count.
-using histogram_count_type = int64_t;
-
 /**
  * @brief Compute the frequency for each distinct row in the input table.
  *
@@ -41,7 +38,8 @@ using histogram_count_type = int64_t;
  * @param partial_counts An optional column containing count for each row
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate memory of the returned objects
- * @return A pair of array contains the indices of the distinct rows in the input
+ * @return A pair of array contains the (stable-order) indices of the distinct rows in the input
+ *
  * table, and their corresponding distinct counts
  */
 [[nodiscard]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>, std::unique_ptr<column>>
diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu
index b757dc25873..861d801a070 100644
--- a/cpp/src/groupby/sort/group_histogram.cu
+++ b/cpp/src/groupby/sort/group_histogram.cu
@@ -29,7 +29,6 @@
 #include <rmm/device_buffer.hpp>
 
 #include <thrust/gather.h>
-#include <thrust/sort.h>
 
 namespace cudf::groupby::detail {
 
@@ -58,13 +57,6 @@ std::unique_ptr<column> build_histogram(column_view const& values,
   auto [distinct_indices, distinct_counts] =
     cudf::reduction::detail::compute_row_frequencies(labeled_values, partial_counts, stream, mr);
 
-  // compute_row_frequencies does not provide stable ordering
-  thrust::sort_by_key(
-    rmm::exec_policy(stream),
-    distinct_indices->begin(),
-    distinct_indices->end(),
-    distinct_counts->mutable_view().begin<cudf::reduction::detail::histogram_count_type>());
-
   // Gather the distinct rows for the output histogram.
   auto out_table = cudf::detail::gather(labeled_values,
                                         *distinct_indices,
diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index e5e55d18840..8c8083df65b 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -19,18 +19,20 @@
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/hash_reduce_by_row.cuh>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/reduction/detail/histogram.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
-#include <cuco/static_map.cuh>
+#include <rmm/exec_policy.hpp>
+
+#include <cuco/operator.hpp>
+#include <cuco/static_set.cuh>
 #include <cuda/atomic>
 #include <cuda/functional>
 #include <thrust/copy.h>
 #include <thrust/iterator/zip_iterator.h>
-#include <thrust/sort.h>
 #include <thrust/tuple.h>
+#include <thrust/uninitialized_fill.h>
 
 #include <optional>
 
@@ -38,6 +40,23 @@ namespace cudf::reduction::detail {
 
 namespace {
 
+// A CUDA Cooperative Group of 1 thread for the hash set for histogram
+auto constexpr DEFAULT_HISTOGRAM_CG_SIZE = 1;
+
+// Always use 64-bit signed integer for storing count.
+using histogram_count_type = int64_t;
+
+/**
+ * @brief Specialized functor to check for not-zero of the second component of the input.
+ */
+struct is_not_zero {
+  template <typename Pair>
+  __device__ bool operator()(Pair const input) const
+  {
+    return thrust::get<1>(input) != 0;
+  }
+};
+
 /**
  * @brief Building a histogram by gathering distinct rows from the input table and their
  * corresponding distinct counts.
@@ -87,16 +106,6 @@ std::unique_ptr<column> make_empty_histogram_like(column_view const& values)
                                   std::move(struct_children));
 }
 
-// TODO: replace with cuco reduction functors
-struct plus_op {
-  __device__ void operator()(
-    cuda::atomic_ref<histogram_count_type, cuda::thread_scope_device> payload_ref,
-    histogram_count_type val)
-  {
-    payload_ref.fetch_add(val, cuda::memory_order_relaxed);
-  }
-};
-
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>, std::unique_ptr<column>>
 compute_row_frequencies(table_view const& input,
                         std::optional<column_view> const& partial_counts,
@@ -119,15 +128,6 @@ compute_row_frequencies(table_view const& input,
   auto const key_hasher = row_hasher.device_hasher(has_nulls);
   auto const row_comp   = cudf::experimental::row::equality::self_comparator(preprocessed_input);
 
-  auto const pair_iter = cudf::detail::make_counting_transform_iterator(
-    size_type{0},
-    cuda::proclaim_return_type<cuco::pair<size_type, histogram_count_type>>(
-      [d_partial_output = partial_counts ? partial_counts.value().begin<histogram_count_type>()
-                                         : nullptr] __device__(size_type const idx) {
-        auto const increment = d_partial_output ? d_partial_output[idx] : histogram_count_type{1};
-        return cuco::pair{idx, increment};
-      }));
-
   // Always compare NaNs as equal.
   using nan_equal_comparator =
     cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
@@ -138,32 +138,65 @@ compute_row_frequencies(table_view const& input,
     cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
                                                      cudf::nullate::DYNAMIC>;
 
-  auto map = cuco::static_map{
-    input.num_rows(),
+  size_t const num_rows = input.num_rows();
+
+  // Initialize intial counts to zero
+  rmm::device_uvector<histogram_count_type> counts(num_rows, stream, mr);
+  thrust::uninitialized_fill(
+    rmm::exec_policy_nosync(stream), counts.begin(), counts.end(), histogram_count_type{0});
+
+  // Construct a hash set
+  auto row_set = cuco::static_set{
+    cuco::extent{num_rows},
     cudf::detail::CUCO_DESIRED_LOAD_FACTOR,
     cuco::empty_key<size_type>{-1},
-    cuco::empty_value<histogram_count_type>{0},
     key_equal,
-    cuco::linear_probing<1, row_hash>{key_hasher},
+    cuco::linear_probing<DEFAULT_HISTOGRAM_CG_SIZE, row_hash>{key_hasher},
     {},
     {},
     cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
     stream.value()};
 
-  // TODO: use `insert_or_apply` init overload for better performance
-  map.insert_or_apply(pair_iter, pair_iter + input.num_rows(), plus_op{}, stream.value());
-
-  size_type const map_size = map.size(stream.value());
-  // Gather the indices of distinct rows.
-  auto distinct_indices = std::make_unique<rmm::device_uvector<size_type>>(map_size, stream, mr);
-
-  // Store the number of occurrences of each distinct row.
+  // Device-accessible reference to the hash set with insert_and_find operatro
+  auto row_set_ref = row_set.ref(cuco::op::insert_and_find);
+
+  // Compute frequencies (aka distinct counts) for the input rows.
+  // Note that we consider null and NaNs as always equal.
+  thrust::for_each(
+    rmm::exec_policy_nosync(stream),
+    thrust::make_counting_iterator<size_type>(0),
+    thrust::make_counting_iterator<size_type>(num_rows),
+    [row_set_ref,
+     increments =
+       partial_counts.has_value() ? partial_counts.value().begin<histogram_count_type>() : nullptr,
+     counts = counts.begin()] __device__(auto const idx) mutable {
+      auto const [inserted_idx_ptr, _] = row_set_ref.insert_and_find(idx);
+      cuda::atomic_ref<histogram_count_type, cuda::thread_scope_device> count_ref{
+        counts[*inserted_idx_ptr]};
+      auto increment = histogram_count_type{1};
+      if (increments) { increment = increments[idx]; }
+      count_ref.fetch_add(increment, cuda::std::memory_order_relaxed);
+    });
+
+  // Set-size is the number of distinct (inserted) rows
+  auto const set_size = row_set.size(stream);
+
+  // Vector of distinct indices
+  auto distinct_indices = std::make_unique<rmm::device_uvector<size_type>>(set_size, stream, mr);
+  // Column of distinct counts
   auto distinct_counts = make_numeric_column(
-    data_type{type_to_id<histogram_count_type>()}, map_size, mask_state::UNALLOCATED, stream, mr);
-
-  map.retrieve_all(distinct_indices->begin(),
-                   distinct_counts->mutable_view().begin<histogram_count_type>(),
-                   stream.value());
+    data_type{type_to_id<histogram_count_type>()}, set_size, mask_state::UNALLOCATED, stream, mr);
+
+  // Copy row indices and counts to the output if counts are non-zero
+  auto const input_it = thrust::make_zip_iterator(
+    thrust::make_tuple(thrust::make_counting_iterator<size_type>(0), counts.begin()));
+  auto const output_it = thrust::make_zip_iterator(thrust::make_tuple(
+    distinct_indices->begin(), distinct_counts->mutable_view().begin<histogram_count_type>()));
+
+  // Reduction results above are either group sizes of equal rows, or `0`.
+  // The final output is non-zero group sizes only.
+  thrust::copy_if(
+    rmm::exec_policy_nosync(stream), input_it, input_it + num_rows, output_it, is_not_zero{});
 
   return {std::move(distinct_indices), std::move(distinct_counts)};
 }

From c2d815338a173526259a88a945b8108d622d4927 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Fri, 4 Oct 2024 21:26:31 +0000
Subject: [PATCH 07/19] Minor comment update

---
 cpp/include/cudf/reduction/detail/histogram.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/include/cudf/reduction/detail/histogram.hpp b/cpp/include/cudf/reduction/detail/histogram.hpp
index fb14e3e94cf..c990db32977 100644
--- a/cpp/include/cudf/reduction/detail/histogram.hpp
+++ b/cpp/include/cudf/reduction/detail/histogram.hpp
@@ -39,7 +39,6 @@ namespace reduction::detail {
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate memory of the returned objects
  * @return A pair of array contains the (stable-order) indices of the distinct rows in the input
- *
  * table, and their corresponding distinct counts
  */
 [[nodiscard]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>, std::unique_ptr<column>>

From 9aca13ee96cfa5b7087fa7119b4be464a15dbd6c Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Fri, 4 Oct 2024 21:36:20 +0000
Subject: [PATCH 08/19] Minor improvements

---
 cpp/src/reductions/histogram.cu | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 8c8083df65b..27cd3b03390 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -140,10 +140,12 @@ compute_row_frequencies(table_view const& input,
 
   size_t const num_rows = input.num_rows();
 
-  // Initialize intial counts to zero
-  rmm::device_uvector<histogram_count_type> counts(num_rows, stream, mr);
-  thrust::uninitialized_fill(
-    rmm::exec_policy_nosync(stream), counts.begin(), counts.end(), histogram_count_type{0});
+  // Construct a vector to store reduced counts and init to zero
+  rmm::device_uvector<histogram_count_type> reduction_results(num_rows, stream, mr);
+  thrust::uninitialized_fill(rmm::exec_policy_nosync(stream),
+                             reduction_results.begin(),
+                             reduction_results.end(),
+                             histogram_count_type{0});
 
   // Construct a hash set
   auto row_set = cuco::static_set{
@@ -166,15 +168,14 @@ compute_row_frequencies(table_view const& input,
     rmm::exec_policy_nosync(stream),
     thrust::make_counting_iterator<size_type>(0),
     thrust::make_counting_iterator<size_type>(num_rows),
-    [row_set_ref,
+    [set_ref = row_set_ref,
      increments =
        partial_counts.has_value() ? partial_counts.value().begin<histogram_count_type>() : nullptr,
-     counts = counts.begin()] __device__(auto const idx) mutable {
-      auto const [inserted_idx_ptr, _] = row_set_ref.insert_and_find(idx);
+     counts = reduction_results.begin()] __device__(auto const idx) mutable {
+      auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx);
       cuda::atomic_ref<histogram_count_type, cuda::thread_scope_device> count_ref{
         counts[*inserted_idx_ptr]};
-      auto increment = histogram_count_type{1};
-      if (increments) { increment = increments[idx]; }
+      auto const increment = increments ? increments[idx] : histogram_count_type{1};
       count_ref.fetch_add(increment, cuda::std::memory_order_relaxed);
     });
 
@@ -189,14 +190,14 @@ compute_row_frequencies(table_view const& input,
 
   // Copy row indices and counts to the output if counts are non-zero
   auto const input_it = thrust::make_zip_iterator(
-    thrust::make_tuple(thrust::make_counting_iterator<size_type>(0), counts.begin()));
+    thrust::make_tuple(thrust::make_counting_iterator<size_type>(0), reduction_results.begin()));
   auto const output_it = thrust::make_zip_iterator(thrust::make_tuple(
     distinct_indices->begin(), distinct_counts->mutable_view().begin<histogram_count_type>()));
 
   // Reduction results above are either group sizes of equal rows, or `0`.
   // The final output is non-zero group sizes only.
   thrust::copy_if(
-    rmm::exec_policy_nosync(stream), input_it, input_it + num_rows, output_it, is_not_zero{});
+    rmm::exec_policy(stream), input_it, input_it + num_rows, output_it, is_not_zero{});
 
   return {std::move(distinct_indices), std::move(distinct_counts)};
 }

From 7ddf0aac8a65eb416dd9633cc1ab2cdc08295614 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Sat, 5 Oct 2024 01:27:10 +0000
Subject: [PATCH 09/19] Fix for benchmark

---
 cpp/benchmarks/groupby/group_histogram.cpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/cpp/benchmarks/groupby/group_histogram.cpp b/cpp/benchmarks/groupby/group_histogram.cpp
index 5de5a521e4b..6ca59462e2b 100644
--- a/cpp/benchmarks/groupby/group_histogram.cpp
+++ b/cpp/benchmarks/groupby/group_histogram.cpp
@@ -36,7 +36,7 @@ void groupby_histogram_helper(nvbench::state& state,
     return create_random_column(cudf::type_to_id<int32_t>(), row_count{num_rows}, profile);
   }();
 
-  auto const vals = [&] {
+  auto const values = [&] {
     auto builder = data_profile_builder().cardinality(0).distribution(
       cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, num_rows);
     if (null_probability > 0) {
@@ -47,19 +47,18 @@ void groupby_histogram_helper(nvbench::state& state,
     return create_random_column(
       cudf::type_to_id<Type>(), row_count{num_rows}, data_profile{builder});
   }();
-
-  auto keys_view = keys->view();
-  auto gb_obj    = cudf::groupby::groupby(cudf::table_view({keys_view}));
-
-  std::vector<cudf::groupby::aggregation_request> requests;
-  requests.emplace_back(cudf::groupby::aggregation_request());
-  requests[0].values = vals->view();
-  requests[0].aggregations.push_back(cudf::make_histogram_aggregation<cudf::groupby_aggregation>());
+  std::vector<cudf::groupby::aggregation_request> requests(1);
+  requests.back().values = values->view();
+  requests.back().aggregations.push_back(
+    cudf::make_histogram_aggregation<cudf::groupby_aggregation>());
 
   auto const mem_stats_logger = cudf::memory_stats_logger();
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
-  state.exec(nvbench::exec_tag::sync,
-             [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); });
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto gb_obj       = cudf::groupby::groupby(cudf::table_view({keys->view()}));
+    auto const result = gb_obj.aggregate(requests);
+  });
+
   auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
   state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
   state.add_buffer_size(
@@ -77,6 +76,7 @@ void bench_groupby_histogram(nvbench::state& state, nvbench::type_list<Type>)
     state.skip("cardinality > num_rows");
     return;
   }
+
   groupby_histogram_helper<Type>(state, num_rows, cardinality, null_probability);
 }
 

From 2134f511e35b8c08aabba00b73db488add8c0e7d Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Sat, 5 Oct 2024 02:08:41 +0000
Subject: [PATCH 10/19] Update cardinality axis for reduction

---
 cpp/benchmarks/reduction/histogram.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cpp/benchmarks/reduction/histogram.cpp b/cpp/benchmarks/reduction/histogram.cpp
index c021849ed89..24131e1f30d 100644
--- a/cpp/benchmarks/reduction/histogram.cpp
+++ b/cpp/benchmarks/reduction/histogram.cpp
@@ -63,5 +63,6 @@ using data_type = nvbench::type_list<int32_t, int64_t>;
 NVBENCH_BENCH_TYPES(nvbench_reduction_histogram, NVBENCH_TYPE_AXES(data_type))
   .set_name("histogram")
   .add_float64_axis("null_probability", {0.1})
-  .add_int64_axis("cardinality", {1})
-  .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000});
+  .add_int64_axis("cardinality",
+                  {0, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000, 50'000'000})
+  .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000});
\ No newline at end of file

From d35b4fcb419e977baa9ce963e479465407f4e6e6 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Mon, 7 Oct 2024 18:29:27 +0000
Subject: [PATCH 11/19] Minor style fix

---
 cpp/benchmarks/groupby/group_histogram.cpp | 2 +-
 cpp/benchmarks/reduction/histogram.cpp     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/benchmarks/groupby/group_histogram.cpp b/cpp/benchmarks/groupby/group_histogram.cpp
index 6ca59462e2b..34295931648 100644
--- a/cpp/benchmarks/groupby/group_histogram.cpp
+++ b/cpp/benchmarks/groupby/group_histogram.cpp
@@ -60,7 +60,7 @@ void groupby_histogram_helper(nvbench::state& state,
   });
 
   auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
-  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
+  state.add_element_count(static_cast<double>(num_rows) / elapsed_time, "rows/s");
   state.add_buffer_size(
     mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
 }
diff --git a/cpp/benchmarks/reduction/histogram.cpp b/cpp/benchmarks/reduction/histogram.cpp
index 24131e1f30d..d0925de5c87 100644
--- a/cpp/benchmarks/reduction/histogram.cpp
+++ b/cpp/benchmarks/reduction/histogram.cpp
@@ -65,4 +65,4 @@ NVBENCH_BENCH_TYPES(nvbench_reduction_histogram, NVBENCH_TYPE_AXES(data_type))
   .add_float64_axis("null_probability", {0.1})
   .add_int64_axis("cardinality",
                   {0, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000, 50'000'000})
-  .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000});
\ No newline at end of file
+  .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000});

From aaf68377d3678d57560c499a36ea71510e757e54 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Mon, 7 Oct 2024 18:40:00 +0000
Subject: [PATCH 12/19] Minor style fix

---
 cpp/benchmarks/groupby/group_histogram.cpp | 2 ++
 cpp/benchmarks/reduction/histogram.cpp     | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/cpp/benchmarks/groupby/group_histogram.cpp b/cpp/benchmarks/groupby/group_histogram.cpp
index 34295931648..cd7f9f298af 100644
--- a/cpp/benchmarks/groupby/group_histogram.cpp
+++ b/cpp/benchmarks/groupby/group_histogram.cpp
@@ -47,6 +47,8 @@ void groupby_histogram_helper(nvbench::state& state,
     return create_random_column(
       cudf::type_to_id<Type>(), row_count{num_rows}, data_profile{builder});
   }();
+
+  // Vector of 1 request
   std::vector<cudf::groupby::aggregation_request> requests(1);
   requests.back().values = values->view();
   requests.back().aggregations.push_back(
diff --git a/cpp/benchmarks/reduction/histogram.cpp b/cpp/benchmarks/reduction/histogram.cpp
index d0925de5c87..24131e1f30d 100644
--- a/cpp/benchmarks/reduction/histogram.cpp
+++ b/cpp/benchmarks/reduction/histogram.cpp
@@ -65,4 +65,4 @@ NVBENCH_BENCH_TYPES(nvbench_reduction_histogram, NVBENCH_TYPE_AXES(data_type))
   .add_float64_axis("null_probability", {0.1})
   .add_int64_axis("cardinality",
                   {0, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000, 50'000'000})
-  .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000});
+  .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000});
\ No newline at end of file

From f63fc4a7d8e543c3fe817bd43874f473b63a4c55 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Mon, 7 Oct 2024 18:40:49 +0000
Subject: [PATCH 13/19] Style fix

---
 cpp/benchmarks/reduction/histogram.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/benchmarks/reduction/histogram.cpp b/cpp/benchmarks/reduction/histogram.cpp
index 24131e1f30d..d0925de5c87 100644
--- a/cpp/benchmarks/reduction/histogram.cpp
+++ b/cpp/benchmarks/reduction/histogram.cpp
@@ -65,4 +65,4 @@ NVBENCH_BENCH_TYPES(nvbench_reduction_histogram, NVBENCH_TYPE_AXES(data_type))
   .add_float64_axis("null_probability", {0.1})
   .add_int64_axis("cardinality",
                   {0, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000, 50'000'000})
-  .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000});
\ No newline at end of file
+  .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000});

From a5dcc82f1b262a8e8040cc15fc68abd16a85ecb5 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Mon, 7 Oct 2024 23:29:50 +0000
Subject: [PATCH 14/19] Remove hash_reduce_by_row and use rmm nosync policy

---
 .../cudf/detail/hash_reduce_by_row.cuh        | 169 ------------------
 cpp/src/reductions/histogram.cu               |  12 +-
 2 files changed, 6 insertions(+), 175 deletions(-)
 delete mode 100644 cpp/include/cudf/detail/hash_reduce_by_row.cuh

diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh
deleted file mode 100644
index 7de79b31bc7..00000000000
--- a/cpp/include/cudf/detail/hash_reduce_by_row.cuh
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cudf/detail/cuco_helpers.hpp>
-#include <cudf/hashing/detail/helper_functions.cuh>
-#include <cudf/table/experimental/row_operators.cuh>
-#include <cudf/types.hpp>
-#include <cudf/utilities/memory_resource.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_uvector.hpp>
-#include <rmm/exec_policy.hpp>
-
-#include <cuco/static_map.cuh>
-#include <thrust/for_each.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/uninitialized_fill.h>
-
-namespace cudf::detail {
-
-using hash_map_type = cuco::legacy::
-  static_map<size_type, size_type, cuda::thread_scope_device, cudf::detail::cuco_allocator<char>>;
-
-/**
- * @brief The base struct for customized reduction functor to perform reduce-by-key with keys are
- * rows that compared equal.
- *
- * TODO: We need to switch to use `static_reduction_map` when it is ready
- * (https://github.com/NVIDIA/cuCollections/pull/98).
- */
-template <typename MapView, typename KeyHasher, typename KeyEqual, typename OutputType>
-struct reduce_by_row_fn_base {
- protected:
-  MapView const d_map;
-  KeyHasher const d_hasher;
-  KeyEqual const d_equal;
-  OutputType* const d_output;
-
-  reduce_by_row_fn_base(MapView const& d_map,
-                        KeyHasher const& d_hasher,
-                        KeyEqual const& d_equal,
-                        OutputType* const d_output)
-    : d_map{d_map}, d_hasher{d_hasher}, d_equal{d_equal}, d_output{d_output}
-  {
-  }
-
-  /**
-   * @brief Return a pointer to the output array at the given index.
-   *
-   * @param idx The access index
-   * @return A pointer to the given index in the output array
-   */
-  __device__ OutputType* get_output_ptr(size_type const idx) const
-  {
-    auto const iter = d_map.find(idx, d_hasher, d_equal);
-
-    if (iter != d_map.end()) {
-      // Only one (undetermined) index value of the duplicate rows could be inserted into the map.
-      // As such, looking up for all indices of duplicate rows always returns the same value.
-      auto const inserted_idx = iter->second.load(cuda::std::memory_order_relaxed);
-
-      // All duplicate rows will have concurrent access to this same output slot.
-      return &d_output[inserted_idx];
-    } else {
-      // All input `idx` values have been inserted into the map before.
-      // Thus, searching for an `idx` key resulting in the `end()` iterator only happens if
-      // `d_equal(idx, idx) == false`.
-      // Such situations are due to comparing nulls or NaNs which are considered as always unequal.
-      // In those cases, all rows containing nulls or NaNs are distinct. Just return their direct
-      // output slot.
-      return &d_output[idx];
-    }
-  }
-};
-
-/**
- * @brief Perform a reduction on groups of rows that are compared equal.
- *
- * This is essentially a reduce-by-key operation with keys are non-contiguous rows and are compared
- * equal. A hash table is used to find groups of equal rows.
- *
- * At the beginning of the operation, the entire output array is filled with a value given by
- * the `init` parameter. Then, the reduction result for each row group is written into the output
- * array at the index of an unspecified row in the group.
- *
- * @tparam ReduceFuncBuilder The builder class that must have a `build()` method returning a
- *         reduction functor derived from `reduce_by_row_fn_base`
- * @tparam OutputType Type of the reduction results
- * @param map The auxiliary map to perform reduction
- * @param preprocessed_input The preprocessed of the input rows for computing row hashing and row
- *        comparisons
- * @param num_rows The number of all input rows
- * @param has_nulls Indicate whether the input rows has any nulls at any nested levels
- * @param has_nested_columns Indicates whether the input table has any nested columns
- * @param nulls_equal Flag to specify whether null elements should be considered as equal
- * @param nans_equal Flag to specify whether NaN values in floating point column should be
- *        considered equal.
- * @param init The initial value for reduction of each row group
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned vector
- * @return A device_uvector containing the reduction results
- */
-template <typename ReduceFuncBuilder, typename OutputType>
-rmm::device_uvector<OutputType> hash_reduce_by_row(
-  hash_map_type const& map,
-  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
-  size_type num_rows,
-  cudf::nullate::DYNAMIC has_nulls,
-  bool has_nested_columns,
-  null_equality nulls_equal,
-  nan_equality nans_equal,
-  ReduceFuncBuilder func_builder,
-  OutputType init,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr)
-{
-  auto const map_dview  = map.get_device_view();
-  auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
-  auto const key_hasher = row_hasher.device_hasher(has_nulls);
-  auto const row_comp   = cudf::experimental::row::equality::self_comparator(preprocessed_input);
-
-  auto reduction_results = rmm::device_uvector<OutputType>(num_rows, stream, mr);
-  thrust::uninitialized_fill(
-    rmm::exec_policy(stream), reduction_results.begin(), reduction_results.end(), init);
-
-  auto const reduce_by_row = [&](auto const value_comp) {
-    if (has_nested_columns) {
-      auto const key_equal = row_comp.equal_to<true>(has_nulls, nulls_equal, value_comp);
-      thrust::for_each(
-        rmm::exec_policy(stream),
-        thrust::make_counting_iterator(0),
-        thrust::make_counting_iterator(num_rows),
-        func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin()));
-    } else {
-      auto const key_equal = row_comp.equal_to<false>(has_nulls, nulls_equal, value_comp);
-      thrust::for_each(
-        rmm::exec_policy(stream),
-        thrust::make_counting_iterator(0),
-        thrust::make_counting_iterator(num_rows),
-        func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin()));
-    }
-  };
-
-  if (nans_equal == nan_equality::ALL_EQUAL) {
-    using nan_equal_comparator =
-      cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
-    reduce_by_row(nan_equal_comparator{});
-  } else {
-    using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator;
-    reduce_by_row(nan_unequal_comparator{});
-  }
-
-  return reduction_results;
-}
-
-}  // namespace cudf::detail
diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 27cd3b03390..314796e47bd 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -17,10 +17,10 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/cuco_helpers.hpp>
 #include <cudf/detail/gather.hpp>
-#include <cudf/detail/hash_reduce_by_row.cuh>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/structs/structs_column_view.hpp>
+#include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/exec_policy.hpp>
@@ -154,12 +154,12 @@ compute_row_frequencies(table_view const& input,
     cuco::empty_key<size_type>{-1},
     key_equal,
     cuco::linear_probing<DEFAULT_HISTOGRAM_CG_SIZE, row_hash>{key_hasher},
-    {},
-    {},
+    {},  // thread scope
+    {},  // storage
     cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
     stream.value()};
 
-  // Device-accessible reference to the hash set with insert_and_find operatro
+  // Device-accessible reference to the hash set with `insert_and_find` operator
   auto row_set_ref = row_set.ref(cuco::op::insert_and_find);
 
   // Compute frequencies (aka distinct counts) for the input rows.
@@ -190,14 +190,14 @@ compute_row_frequencies(table_view const& input,
 
   // Copy row indices and counts to the output if counts are non-zero
   auto const input_it = thrust::make_zip_iterator(
-    thrust::make_tuple(thrust::make_counting_iterator<size_type>(0), reduction_results.begin()));
+    thrust::make_tuple(thrust::make_counting_iterator(0), reduction_results.begin()));
   auto const output_it = thrust::make_zip_iterator(thrust::make_tuple(
     distinct_indices->begin(), distinct_counts->mutable_view().begin<histogram_count_type>()));
 
   // Reduction results above are either group sizes of equal rows, or `0`.
   // The final output is non-zero group sizes only.
   thrust::copy_if(
-    rmm::exec_policy(stream), input_it, input_it + num_rows, output_it, is_not_zero{});
+    rmm::exec_policy_nosync(stream), input_it, input_it + num_rows, output_it, is_not_zero{});
 
   return {std::move(distinct_indices), std::move(distinct_counts)};
 }

From 79f07dd34841114cf920c202a478ebfa338c66b4 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Mon, 7 Oct 2024 23:42:01 +0000
Subject: [PATCH 15/19] Add minor comments

---
 cpp/src/reductions/histogram.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 314796e47bd..350959f81ce 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -132,7 +132,8 @@ compute_row_frequencies(table_view const& input,
   using nan_equal_comparator =
     cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
   auto const value_comp = nan_equal_comparator{};
-  auto const key_equal  = row_comp.equal_to<false>(has_nulls, null_equality::EQUAL, value_comp);
+  // Hard set the tparam `has_nested_columns` = false for now as we don't yet support nested columns
+  auto const key_equal = row_comp.equal_to<false>(has_nulls, null_equality::EQUAL, value_comp);
 
   using row_hash =
     cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,

From b48b76bda8a9cb1a93649302f3451c68d488daf2 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Tue, 8 Oct 2024 23:29:17 +0000
Subject: [PATCH 16/19] Add functor to dispatch static set with provided
 equality comparator

---
 cpp/src/reductions/histogram.cu | 120 +++++++++++++++++++++-----------
 1 file changed, 79 insertions(+), 41 deletions(-)

diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 350959f81ce..31e6db0052b 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -106,6 +106,70 @@ std::unique_ptr<column> make_empty_histogram_like(column_view const& values)
                                   std::move(struct_children));
 }
 
+/**
+ * @brief Helper functor to compute row frequencies using cuco::static_set with the specified
+ *        row equality comarator
+ *
+ * @tparam KeyEqual Type of the row equality comparator
+ *
+ * @param[in]  num_rows Number of rows in the input table
+ * @param[in]  partial_counts An optional column containing count for each row
+ * @param[out] reduction_results Devic vector to store the row counts (the histogram)
+ * @param[in]  stream CUDA stream to use
+ */
+struct hash_set_insert_fn {
+  size_t num_rows;
+  std::optional<column_view> const& partial_counts;
+  rmm::device_uvector<histogram_count_type>& reduction_results;
+  rmm::cuda_stream_view stream;
+
+  using row_hash =
+    cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
+                                                     cudf::nullate::DYNAMIC>;
+  using device_row_hasher =
+    cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
+                                                     nullate::DYNAMIC>;
+
+  template <typename KeyEqual>
+  size_t operator()(device_row_hasher const& key_hasher, KeyEqual const& key_equal)
+  {
+    // Construct a hash set
+    auto row_set = cuco::static_set{
+      cuco::extent{num_rows},
+      cudf::detail::CUCO_DESIRED_LOAD_FACTOR,
+      cuco::empty_key<size_type>{-1},
+      key_equal,
+      cuco::linear_probing<DEFAULT_HISTOGRAM_CG_SIZE, row_hash>{key_hasher},
+      {},  // thread scope
+      {},  // storage
+      cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+      stream.value()};
+    // Device-accessible reference to the hash set with `insert_and_find` operator
+    auto row_set_ref = row_set.ref(cuco::op::insert_and_find);
+
+    // Compute frequencies (aka distinct counts) for the input rows.
+    // Note that we consider null and NaNs as always equal.
+    thrust::for_each(rmm::exec_policy_nosync(stream),
+                     thrust::make_counting_iterator<size_type>(0),
+                     thrust::make_counting_iterator<size_type>(num_rows),
+                     [set_ref    = row_set_ref,
+                      increments = partial_counts.has_value()
+                                     ? partial_counts.value().begin<histogram_count_type>()
+                                     : nullptr,
+                      counts     = reduction_results.begin()] __device__(auto const idx) mutable {
+                       auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx);
+                       cuda::atomic_ref<histogram_count_type, cuda::thread_scope_device> count_ref{
+                         counts[*inserted_idx_ptr]};
+                       auto const increment =
+                         increments ? increments[idx] : histogram_count_type{1};
+                       count_ref.fetch_add(increment, cuda::std::memory_order_relaxed);
+                     });
+
+    // Set-size is the number of distinct (inserted) rows
+    return row_set.size(stream);
+  }
+};
+
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>, std::unique_ptr<column>>
 compute_row_frequencies(table_view const& input,
                         std::optional<column_view> const& partial_counts,
@@ -132,56 +196,30 @@ compute_row_frequencies(table_view const& input,
   using nan_equal_comparator =
     cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
   auto const value_comp = nan_equal_comparator{};
-  // Hard set the tparam `has_nested_columns` = false for now as we don't yet support nested columns
-  auto const key_equal = row_comp.equal_to<false>(has_nulls, null_equality::EQUAL, value_comp);
-
-  using row_hash =
-    cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
-                                                     cudf::nullate::DYNAMIC>;
 
+  // Number of rows in the input table
   size_t const num_rows = input.num_rows();
 
-  // Construct a vector to store reduced counts and init to zero
+  // Create a vector to store the reduced row counts and init to zero
   rmm::device_uvector<histogram_count_type> reduction_results(num_rows, stream, mr);
   thrust::uninitialized_fill(rmm::exec_policy_nosync(stream),
                              reduction_results.begin(),
                              reduction_results.end(),
                              histogram_count_type{0});
 
-  // Construct a hash set
-  auto row_set = cuco::static_set{
-    cuco::extent{num_rows},
-    cudf::detail::CUCO_DESIRED_LOAD_FACTOR,
-    cuco::empty_key<size_type>{-1},
-    key_equal,
-    cuco::linear_probing<DEFAULT_HISTOGRAM_CG_SIZE, row_hash>{key_hasher},
-    {},  // thread scope
-    {},  // storage
-    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
-    stream.value()};
-
-  // Device-accessible reference to the hash set with `insert_and_find` operator
-  auto row_set_ref = row_set.ref(cuco::op::insert_and_find);
-
-  // Compute frequencies (aka distinct counts) for the input rows.
-  // Note that we consider null and NaNs as always equal.
-  thrust::for_each(
-    rmm::exec_policy_nosync(stream),
-    thrust::make_counting_iterator<size_type>(0),
-    thrust::make_counting_iterator<size_type>(num_rows),
-    [set_ref = row_set_ref,
-     increments =
-       partial_counts.has_value() ? partial_counts.value().begin<histogram_count_type>() : nullptr,
-     counts = reduction_results.begin()] __device__(auto const idx) mutable {
-      auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx);
-      cuda::atomic_ref<histogram_count_type, cuda::thread_scope_device> count_ref{
-        counts[*inserted_idx_ptr]};
-      auto const increment = increments ? increments[idx] : histogram_count_type{1};
-      count_ref.fetch_add(increment, cuda::std::memory_order_relaxed);
-    });
-
-  // Set-size is the number of distinct (inserted) rows
-  auto const set_size = row_set.size(stream);
+  // Hash set size after insertions
+  size_t set_size = 0;
+
+  // Dispatch the appropriate hash set insert functor
+  if (has_nested_columns) {
+    auto const key_equal = row_comp.equal_to<true>(has_nulls, null_equality::EQUAL, value_comp);
+    set_size = hash_set_insert_fn{num_rows, partial_counts, reduction_results, stream}(key_hasher,
+                                                                                       key_equal);
+  } else {
+    auto const key_equal = row_comp.equal_to<false>(has_nulls, null_equality::EQUAL, value_comp);
+    set_size = hash_set_insert_fn{num_rows, partial_counts, reduction_results, stream}(key_hasher,
+                                                                                       key_equal);
+  }
 
   // Vector of distinct indices
   auto distinct_indices = std::make_unique<rmm::device_uvector<size_type>>(set_size, stream, mr);

From 813630f4c6498e892218cfd50225d99304f8ea9a Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Tue, 8 Oct 2024 23:38:06 +0000
Subject: [PATCH 17/19] Revert the dispatcher functor

---
 cpp/src/reductions/histogram.cu | 120 +++++++++++++++++++++-----------
 1 file changed, 79 insertions(+), 41 deletions(-)

diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 350959f81ce..31e6db0052b 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -106,6 +106,70 @@ std::unique_ptr<column> make_empty_histogram_like(column_view const& values)
                                   std::move(struct_children));
 }
 
+/**
+ * @brief Helper functor to compute row frequencies using cuco::static_set with the specified
+ *        row equality comarator
+ *
+ * @tparam KeyEqual Type of the row equality comparator
+ *
+ * @param[in]  num_rows Number of rows in the input table
+ * @param[in]  partial_counts An optional column containing count for each row
+ * @param[out] reduction_results Devic vector to store the row counts (the histogram)
+ * @param[in]  stream CUDA stream to use
+ */
+struct hash_set_insert_fn {
+  size_t num_rows;
+  std::optional<column_view> const& partial_counts;
+  rmm::device_uvector<histogram_count_type>& reduction_results;
+  rmm::cuda_stream_view stream;
+
+  using row_hash =
+    cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
+                                                     cudf::nullate::DYNAMIC>;
+  using device_row_hasher =
+    cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
+                                                     nullate::DYNAMIC>;
+
+  template <typename KeyEqual>
+  size_t operator()(device_row_hasher const& key_hasher, KeyEqual const& key_equal)
+  {
+    // Construct a hash set
+    auto row_set = cuco::static_set{
+      cuco::extent{num_rows},
+      cudf::detail::CUCO_DESIRED_LOAD_FACTOR,
+      cuco::empty_key<size_type>{-1},
+      key_equal,
+      cuco::linear_probing<DEFAULT_HISTOGRAM_CG_SIZE, row_hash>{key_hasher},
+      {},  // thread scope
+      {},  // storage
+      cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+      stream.value()};
+    // Device-accessible reference to the hash set with `insert_and_find` operator
+    auto row_set_ref = row_set.ref(cuco::op::insert_and_find);
+
+    // Compute frequencies (aka distinct counts) for the input rows.
+    // Note that we consider null and NaNs as always equal.
+    thrust::for_each(rmm::exec_policy_nosync(stream),
+                     thrust::make_counting_iterator<size_type>(0),
+                     thrust::make_counting_iterator<size_type>(num_rows),
+                     [set_ref    = row_set_ref,
+                      increments = partial_counts.has_value()
+                                     ? partial_counts.value().begin<histogram_count_type>()
+                                     : nullptr,
+                      counts     = reduction_results.begin()] __device__(auto const idx) mutable {
+                       auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx);
+                       cuda::atomic_ref<histogram_count_type, cuda::thread_scope_device> count_ref{
+                         counts[*inserted_idx_ptr]};
+                       auto const increment =
+                         increments ? increments[idx] : histogram_count_type{1};
+                       count_ref.fetch_add(increment, cuda::std::memory_order_relaxed);
+                     });
+
+    // Set-size is the number of distinct (inserted) rows
+    return row_set.size(stream);
+  }
+};
+
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>, std::unique_ptr<column>>
 compute_row_frequencies(table_view const& input,
                         std::optional<column_view> const& partial_counts,
@@ -132,56 +196,30 @@ compute_row_frequencies(table_view const& input,
   using nan_equal_comparator =
     cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
   auto const value_comp = nan_equal_comparator{};
-  // Hard set the tparam `has_nested_columns` = false for now as we don't yet support nested columns
-  auto const key_equal = row_comp.equal_to<false>(has_nulls, null_equality::EQUAL, value_comp);
-
-  using row_hash =
-    cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
-                                                     cudf::nullate::DYNAMIC>;
 
+  // Number of rows in the input table
   size_t const num_rows = input.num_rows();
 
-  // Construct a vector to store reduced counts and init to zero
+  // Create a vector to store the reduced row counts and init to zero
   rmm::device_uvector<histogram_count_type> reduction_results(num_rows, stream, mr);
   thrust::uninitialized_fill(rmm::exec_policy_nosync(stream),
                              reduction_results.begin(),
                              reduction_results.end(),
                              histogram_count_type{0});
 
-  // Construct a hash set
-  auto row_set = cuco::static_set{
-    cuco::extent{num_rows},
-    cudf::detail::CUCO_DESIRED_LOAD_FACTOR,
-    cuco::empty_key<size_type>{-1},
-    key_equal,
-    cuco::linear_probing<DEFAULT_HISTOGRAM_CG_SIZE, row_hash>{key_hasher},
-    {},  // thread scope
-    {},  // storage
-    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
-    stream.value()};
-
-  // Device-accessible reference to the hash set with `insert_and_find` operator
-  auto row_set_ref = row_set.ref(cuco::op::insert_and_find);
-
-  // Compute frequencies (aka distinct counts) for the input rows.
-  // Note that we consider null and NaNs as always equal.
-  thrust::for_each(
-    rmm::exec_policy_nosync(stream),
-    thrust::make_counting_iterator<size_type>(0),
-    thrust::make_counting_iterator<size_type>(num_rows),
-    [set_ref = row_set_ref,
-     increments =
-       partial_counts.has_value() ? partial_counts.value().begin<histogram_count_type>() : nullptr,
-     counts = reduction_results.begin()] __device__(auto const idx) mutable {
-      auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx);
-      cuda::atomic_ref<histogram_count_type, cuda::thread_scope_device> count_ref{
-        counts[*inserted_idx_ptr]};
-      auto const increment = increments ? increments[idx] : histogram_count_type{1};
-      count_ref.fetch_add(increment, cuda::std::memory_order_relaxed);
-    });
-
-  // Set-size is the number of distinct (inserted) rows
-  auto const set_size = row_set.size(stream);
+  // Hash set size after insertions
+  size_t set_size = 0;
+
+  // Dispatch the appropriate hash set insert functor
+  if (has_nested_columns) {
+    auto const key_equal = row_comp.equal_to<true>(has_nulls, null_equality::EQUAL, value_comp);
+    set_size = hash_set_insert_fn{num_rows, partial_counts, reduction_results, stream}(key_hasher,
+                                                                                       key_equal);
+  } else {
+    auto const key_equal = row_comp.equal_to<false>(has_nulls, null_equality::EQUAL, value_comp);
+    set_size = hash_set_insert_fn{num_rows, partial_counts, reduction_results, stream}(key_hasher,
+                                                                                       key_equal);
+  }
 
   // Vector of distinct indices
   auto distinct_indices = std::make_unique<rmm::device_uvector<size_type>>(set_size, stream, mr);

From a872a9729dedc715ca5ccfbaf21736fc02949b13 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 9 Oct 2024 00:02:14 +0000
Subject: [PATCH 18/19] Revert the functor

---
 cpp/src/reductions/histogram.cu | 120 +++++++++++---------------------
 1 file changed, 41 insertions(+), 79 deletions(-)

diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 31e6db0052b..350959f81ce 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -106,70 +106,6 @@ std::unique_ptr<column> make_empty_histogram_like(column_view const& values)
                                   std::move(struct_children));
 }
 
-/**
- * @brief Helper functor to compute row frequencies using cuco::static_set with the specified
- *        row equality comarator
- *
- * @tparam KeyEqual Type of the row equality comparator
- *
- * @param[in]  num_rows Number of rows in the input table
- * @param[in]  partial_counts An optional column containing count for each row
- * @param[out] reduction_results Devic vector to store the row counts (the histogram)
- * @param[in]  stream CUDA stream to use
- */
-struct hash_set_insert_fn {
-  size_t num_rows;
-  std::optional<column_view> const& partial_counts;
-  rmm::device_uvector<histogram_count_type>& reduction_results;
-  rmm::cuda_stream_view stream;
-
-  using row_hash =
-    cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
-                                                     cudf::nullate::DYNAMIC>;
-  using device_row_hasher =
-    cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
-                                                     nullate::DYNAMIC>;
-
-  template <typename KeyEqual>
-  size_t operator()(device_row_hasher const& key_hasher, KeyEqual const& key_equal)
-  {
-    // Construct a hash set
-    auto row_set = cuco::static_set{
-      cuco::extent{num_rows},
-      cudf::detail::CUCO_DESIRED_LOAD_FACTOR,
-      cuco::empty_key<size_type>{-1},
-      key_equal,
-      cuco::linear_probing<DEFAULT_HISTOGRAM_CG_SIZE, row_hash>{key_hasher},
-      {},  // thread scope
-      {},  // storage
-      cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
-      stream.value()};
-    // Device-accessible reference to the hash set with `insert_and_find` operator
-    auto row_set_ref = row_set.ref(cuco::op::insert_and_find);
-
-    // Compute frequencies (aka distinct counts) for the input rows.
-    // Note that we consider null and NaNs as always equal.
-    thrust::for_each(rmm::exec_policy_nosync(stream),
-                     thrust::make_counting_iterator<size_type>(0),
-                     thrust::make_counting_iterator<size_type>(num_rows),
-                     [set_ref    = row_set_ref,
-                      increments = partial_counts.has_value()
-                                     ? partial_counts.value().begin<histogram_count_type>()
-                                     : nullptr,
-                      counts     = reduction_results.begin()] __device__(auto const idx) mutable {
-                       auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx);
-                       cuda::atomic_ref<histogram_count_type, cuda::thread_scope_device> count_ref{
-                         counts[*inserted_idx_ptr]};
-                       auto const increment =
-                         increments ? increments[idx] : histogram_count_type{1};
-                       count_ref.fetch_add(increment, cuda::std::memory_order_relaxed);
-                     });
-
-    // Set-size is the number of distinct (inserted) rows
-    return row_set.size(stream);
-  }
-};
-
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>, std::unique_ptr<column>>
 compute_row_frequencies(table_view const& input,
                         std::optional<column_view> const& partial_counts,
@@ -196,30 +132,56 @@ compute_row_frequencies(table_view const& input,
   using nan_equal_comparator =
     cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
   auto const value_comp = nan_equal_comparator{};
+  // Hard set the tparam `has_nested_columns` = false for now as we don't yet support nested columns
+  auto const key_equal = row_comp.equal_to<false>(has_nulls, null_equality::EQUAL, value_comp);
+
+  using row_hash =
+    cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
+                                                     cudf::nullate::DYNAMIC>;
 
-  // Number of rows in the input table
   size_t const num_rows = input.num_rows();
 
-  // Create a vector to store the reduced row counts and init to zero
+  // Construct a vector to store reduced counts and init to zero
   rmm::device_uvector<histogram_count_type> reduction_results(num_rows, stream, mr);
   thrust::uninitialized_fill(rmm::exec_policy_nosync(stream),
                              reduction_results.begin(),
                              reduction_results.end(),
                              histogram_count_type{0});
 
-  // Hash set size after insertions
-  size_t set_size = 0;
-
-  // Dispatch the appropriate hash set insert functor
-  if (has_nested_columns) {
-    auto const key_equal = row_comp.equal_to<true>(has_nulls, null_equality::EQUAL, value_comp);
-    set_size = hash_set_insert_fn{num_rows, partial_counts, reduction_results, stream}(key_hasher,
-                                                                                       key_equal);
-  } else {
-    auto const key_equal = row_comp.equal_to<false>(has_nulls, null_equality::EQUAL, value_comp);
-    set_size = hash_set_insert_fn{num_rows, partial_counts, reduction_results, stream}(key_hasher,
-                                                                                       key_equal);
-  }
+  // Construct a hash set
+  auto row_set = cuco::static_set{
+    cuco::extent{num_rows},
+    cudf::detail::CUCO_DESIRED_LOAD_FACTOR,
+    cuco::empty_key<size_type>{-1},
+    key_equal,
+    cuco::linear_probing<DEFAULT_HISTOGRAM_CG_SIZE, row_hash>{key_hasher},
+    {},  // thread scope
+    {},  // storage
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+    stream.value()};
+
+  // Device-accessible reference to the hash set with `insert_and_find` operator
+  auto row_set_ref = row_set.ref(cuco::op::insert_and_find);
+
+  // Compute frequencies (aka distinct counts) for the input rows.
+  // Note that we consider null and NaNs as always equal.
+  thrust::for_each(
+    rmm::exec_policy_nosync(stream),
+    thrust::make_counting_iterator<size_type>(0),
+    thrust::make_counting_iterator<size_type>(num_rows),
+    [set_ref = row_set_ref,
+     increments =
+       partial_counts.has_value() ? partial_counts.value().begin<histogram_count_type>() : nullptr,
+     counts = reduction_results.begin()] __device__(auto const idx) mutable {
+      auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx);
+      cuda::atomic_ref<histogram_count_type, cuda::thread_scope_device> count_ref{
+        counts[*inserted_idx_ptr]};
+      auto const increment = increments ? increments[idx] : histogram_count_type{1};
+      count_ref.fetch_add(increment, cuda::std::memory_order_relaxed);
+    });
+
+  // Set-size is the number of distinct (inserted) rows
+  auto const set_size = row_set.size(stream);
 
   // Vector of distinct indices
   auto distinct_indices = std::make_unique<rmm::device_uvector<size_type>>(set_size, stream, mr);

From ab733d5bd6869decdaa23a7c329b5265a40720ae Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 9 Oct 2024 00:13:34 +0000
Subject: [PATCH 19/19] Minor

---
 cpp/src/reductions/histogram.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 350959f81ce..b40b2b6dd2e 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -167,8 +167,8 @@ compute_row_frequencies(table_view const& input,
   // Note that we consider null and NaNs as always equal.
   thrust::for_each(
     rmm::exec_policy_nosync(stream),
-    thrust::make_counting_iterator<size_type>(0),
-    thrust::make_counting_iterator<size_type>(num_rows),
+    thrust::make_counting_iterator<size_t>(0),
+    thrust::make_counting_iterator<size_t>(num_rows),
     [set_ref = row_set_ref,
      increments =
        partial_counts.has_value() ? partial_counts.value().begin<histogram_count_type>() : nullptr,