From 45573e0b1f69c8c497407e98ad4f9dbe3baf36c4 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Thu, 26 Sep 2024 12:37:39 -0700
Subject: [PATCH] Cleanups

---
 cpp/CMakeLists.txt                            |   3 -
 cpp/src/groupby/hash/compute_groupby.cu       | 111 +++++-
 cpp/src/groupby/hash/compute_groupby.cuh      | 111 ------
 cpp/src/groupby/hash/compute_groupby.hpp      |  68 ++++
 cpp/src/groupby/hash/compute_groupby_null.cu  |  31 --
 .../groupby/hash/compute_single_pass_aggs.cu  | 375 ++++++++++++++++--
 .../groupby/hash/compute_single_pass_aggs.cuh | 355 -----------------
 .../groupby/hash/compute_single_pass_aggs.hpp |  15 +-
 .../hash/compute_single_pass_aggs_null.cu     |  58 ---
 cpp/src/groupby/hash/groupby.cu               |   2 +-
 .../groupby/hash/sparse_to_dense_results.cu   |  61 ++-
 ...esults.cuh => sparse_to_dense_results.hpp} |  38 +-
 .../hash/sparse_to_dense_results_null.cu      |  33 --
 13 files changed, 599 insertions(+), 662 deletions(-)
 delete mode 100644 cpp/src/groupby/hash/compute_groupby.cuh
 create mode 100644 cpp/src/groupby/hash/compute_groupby.hpp
 delete mode 100644 cpp/src/groupby/hash/compute_groupby_null.cu
 delete mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs.cuh
 delete mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs_null.cu
 rename cpp/src/groupby/hash/{sparse_to_dense_results.cuh => sparse_to_dense_results.hpp} (57%)
 delete mode 100644 cpp/src/groupby/hash/sparse_to_dense_results_null.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 0d9529ef58d..663f2210ef4 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -316,14 +316,11 @@ add_library(
   src/groupby/groupby.cu
   src/groupby/hash/compute_aggregations.cu
   src/groupby/hash/compute_groupby.cu
-  src/groupby/hash/compute_groupby_null.cu
-  src/groupby/hash/compute_single_pass_aggs_null.cu
   src/groupby/hash/compute_single_pass_aggs.cu
   src/groupby/hash/flatten_single_pass_aggs.cpp
   src/groupby/hash/groupby.cu
   src/groupby/hash/hash_compound_agg_finalizer.cu
   src/groupby/hash/sparse_to_dense_results.cu
-  src/groupby/hash/sparse_to_dense_results_null.cu
   src/groupby/hash/var_hash_functor.cu
   src/groupby/sort/aggregate.cpp
   src/groupby/sort/group_argmax.cu
diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu
index 7965d0891a7..9643567a825 100644
--- a/cpp/src/groupby/hash/compute_groupby.cu
+++ b/cpp/src/groupby/hash/compute_groupby.cu
@@ -13,11 +13,110 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-#include "compute_groupby.cuh"
+#include "compute_groupby.hpp"
+#include "compute_single_pass_aggs.hpp"
+#include "helpers.cuh"
+#include "sparse_to_dense_results.hpp"
+#include "var_hash_functor.cuh"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include
 
 namespace cudf::groupby::detail::hash {
+/**
+ * @brief Computes groupby using a hash table.
+ *
+ * First, we create a hash table that stores the indices of unique rows in
+ * `keys`. The upper limit on the number of values in this map is the number
+ * of rows in `keys`.
+ *
+ * To store the results of aggregations, we create temporary sparse columns
+ * which have the same size as input value columns. Using the hash map, we
+ * determine the location within the sparse column to write the result of the
+ * aggregation into.
+ *
+ * The sparse column results of all aggregations are stored into the cache
+ * `sparse_results`. This enables the use of previously calculated results in
+ * other aggregations.
+ *
+ * All the aggregations that can be computed in a single pass are computed
+ * first, in a combined kernel. Then, using these results, aggregations that
+ * require multiple passes are computed.
+ *
+ * Finally, using the hash map, we generate a vector of indices of populated
+ * values in sparse result columns. Then, for each aggregation originally
+ * requested in `requests`, we gather sparse results into a column of dense
+ * results using the aforementioned index vector. Dense results are stored into
+ * the in/out parameter `cache`.
+ */
+template <typename Equal>
+std::unique_ptr<table> compute_groupby(table_view const& keys,
+                                       host_span<aggregation_request const> requests,
+                                       cudf::detail::result_cache* cache,
+                                       bool skip_key_rows_with_nulls,
+                                       Equal const& d_row_equal,
+                                       row_hash_t const& d_row_hash,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::device_async_resource_ref mr)
+{
+  // convert to int64_t to avoid potential overflow with large `keys`
+  auto const num_keys = static_cast<int64_t>(keys.num_rows());
+
+  // Cache of sparse results where the location of aggregate value in each
+  // column is indexed by the hash set
+  cudf::detail::result_cache sparse_results(requests.size());
+
+  auto const set = cuco::static_set{
+    cuco::extent<int64_t>{num_keys},
+    cudf::detail::CUCO_DESIRED_LOAD_FACTOR,  // 50% occupancy
+    cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
+    d_row_equal,
+    probing_scheme_t{d_row_hash},
+    cuco::thread_scope_device,
+    cuco::storage<GROUPBY_WINDOW_SIZE>{},
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+    stream.value()};
+
+  // Compute all single pass aggs first
+  auto gather_map = compute_single_pass_aggs(
+    keys, requests, &sparse_results, set, skip_key_rows_with_nulls, stream);
+
+  // Compact all results from sparse_results and insert into cache
+  sparse_to_dense_results(keys,
+                          requests,
+                          &sparse_results,
+                          cache,
+                          gather_map,
+                          set.ref(cuco::find),
+                          skip_key_rows_with_nulls,
+                          stream,
+                          mr);
+
+  return cudf::detail::gather(keys,
+                              gather_map,
+                              out_of_bounds_policy::DONT_CHECK,
+                              cudf::detail::negative_index_policy::NOT_ALLOWED,
+                              stream,
+                              mr);
+}
+
 template std::unique_ptr<table> compute_groupby(
   table_view const& keys,
   host_span<aggregation_request const> requests,
   cudf::detail::result_cache* cache,
   bool skip_key_rows_with_nulls,
@@ -28,4 +127,14 @@ template std::unique_ptr<table> compute_groupby(
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr);
 
+template std::unique_ptr<table> compute_groupby(
+  table_view const& keys,
+  host_span<aggregation_request const> requests,
+  cudf::detail::result_cache* cache,
+  bool skip_key_rows_with_nulls,
+  nullable_row_comparator_t const& d_row_equal,
+  row_hash_t const& d_row_hash,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+
 }  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_groupby.cuh b/cpp/src/groupby/hash/compute_groupby.cuh
deleted file mode 100644
index 146900f61eb..00000000000
--- a/cpp/src/groupby/hash/compute_groupby.cuh
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include "compute_single_pass_aggs.cuh"
-// #include "compute_single_pass_aggs.hpp"
-#include "sparse_to_dense_results.cuh"
-#include "var_hash_functor.cuh"
-
-#include
-#include
-#include
-#include
-
-#include
-
-namespace cudf::groupby::detail::hash {
-
-/**
- * @brief Computes groupby using hash table.
- *
- * First, we create a hash table that stores the indices of unique rows in
- * `keys`. The upper limit on the number of values in this map is the number
- * of rows in `keys`.
- *
- * To store the results of aggregations, we create temporary sparse columns
- * which have the same size as input value columns. Using the hash map, we
- * determine the location within the sparse column to write the result of the
- * aggregation into.
- *
- * The sparse column results of all aggregations are stored into the cache
- * `sparse_results`. This enables the use of previously calculated results in
- * other aggregations.
- *
- * All the aggregations which can be computed in a single pass are computed
- * first, in a combined kernel. Then using these results, aggregations that
- * require multiple passes, will be computed.
- *
- * Finally, using the hash map, we generate a vector of indices of populated
- * values in sparse result columns. Then, for each aggregation originally
- * requested in `requests`, we gather sparse results into a column of dense
- * results using the aforementioned index vector. Dense results are stored into
- * the in/out parameter `cache`.
- */
-template <typename Equal>
-std::unique_ptr<table> compute_groupby(
-  table_view const& keys,
-  host_span<aggregation_request const> requests,
-  cudf::detail::result_cache* cache,
-  bool skip_key_rows_with_nulls,
-  Equal const& d_row_equal,
-  cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
-                                                   cudf::nullate::DYNAMIC> const& d_row_hash,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr)
-{
-  // convert to int64_t to avoid potential overflow with large `keys`
-  auto const num_keys = static_cast<int64_t>(keys.num_rows());
-
-  // Cache of sparse results where the location of aggregate value in each
-  // column is indexed by the hash set
-  cudf::detail::result_cache sparse_results(requests.size());
-
-  auto const set = cuco::static_set{
-    cuco::extent<int64_t>{num_keys},
-    cudf::detail::CUCO_DESIRED_LOAD_FACTOR,  // 50% occupancy
-    cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
-    d_row_equal,
-    probing_scheme_t{d_row_hash},
-    cuco::thread_scope_device,
-    cuco::storage<GROUPBY_WINDOW_SIZE>{},
-    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
-    stream.value()};
-
-  // Compute all single pass aggs first
-  auto gather_map = compute_single_pass_aggs(
-    keys, requests, &sparse_results, set, skip_key_rows_with_nulls, stream);
-
-  // Compact all results from sparse_results and insert into cache
-  sparse_to_dense_results(keys,
-                          requests,
-                          &sparse_results,
-                          cache,
-                          gather_map,
-                          set.ref(cuco::find),
-                          skip_key_rows_with_nulls,
-                          stream,
-                          mr);
-
-  return cudf::detail::gather(keys,
-                              gather_map,
-                              out_of_bounds_policy::DONT_CHECK,
-                              cudf::detail::negative_index_policy::NOT_ALLOWED,
-                              stream,
-                              mr);
-}
-
-}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_groupby.hpp b/cpp/src/groupby/hash/compute_groupby.hpp
new file mode 100644
index 00000000000..358c81365a0
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_groupby.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "helpers.cuh"
+
+#include <cudf/detail/aggregation/result_cache.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
+
+#include <memory>
+
+namespace cudf::groupby::detail::hash {
+/**
+ * @brief Computes groupby using a hash table.
+ *
+ * First, we create a hash table that stores the indices of unique rows in
+ * `keys`. The upper limit on the number of values in this map is the number
+ * of rows in `keys`.
+ *
+ * To store the results of aggregations, we create temporary sparse columns
+ * which have the same size as input value columns. Using the hash map, we
+ * determine the location within the sparse column to write the result of the
+ * aggregation into.
+ *
+ * The sparse column results of all aggregations are stored into the cache
+ * `sparse_results`. This enables the use of previously calculated results in
+ * other aggregations.
+ *
+ * All the aggregations that can be computed in a single pass are computed
+ * first, in a combined kernel. Then, using these results, aggregations that
+ * require multiple passes are computed.
+ *
+ * Finally, using the hash map, we generate a vector of indices of populated
+ * values in sparse result columns. Then, for each aggregation originally
+ * requested in `requests`, we gather sparse results into a column of dense
+ * results using the aforementioned index vector. Dense results are stored into
+ * the in/out parameter `cache`.
+ */
+template <typename Equal>
+std::unique_ptr<table> compute_groupby(table_view const& keys,
+                                       host_span<aggregation_request const> requests,
+                                       cudf::detail::result_cache* cache,
+                                       bool skip_key_rows_with_nulls,
+                                       Equal const& d_row_equal,
+                                       row_hash_t const& d_row_hash,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::device_async_resource_ref mr);
+
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_groupby_null.cu b/cpp/src/groupby/hash/compute_groupby_null.cu
deleted file mode 100644
index 1f9707902cc..00000000000
--- a/cpp/src/groupby/hash/compute_groupby_null.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "compute_groupby.cuh"
-
-namespace cudf::groupby::detail::hash {
-
-template std::unique_ptr<table> compute_groupby(
-  table_view const& keys,
-  host_span<aggregation_request const> requests,
-  cudf::detail::result_cache* cache,
-  bool skip_key_rows_with_nulls,
-  nullable_row_comparator_t const& d_row_equal,
-  row_hash_t const& d_row_hash,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr);
-
-}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu
index 2770dc2a84c..b5c68ea639a 100644
--- a/cpp/src/groupby/hash/compute_single_pass_aggs.cu
+++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,13 +14,342 @@
  * limitations under the License.
  */
 
-#include "compute_single_pass_aggs.cuh"
+#include "compute_aggregations.hpp"
+#include "compute_single_pass_aggs.hpp"
+#include "flatten_single_pass_aggs.hpp"
 #include "helpers.cuh"
+#include "single_pass_functors.cuh"
 
-namespace cudf {
-namespace groupby {
-namespace detail {
-namespace hash {
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+
+namespace cudf::groupby::detail::hash {
+namespace {
+template <typename SetType>
+// TODO pass block
+__device__ void find_local_mapping(cudf::size_type cur_idx,
+                                   cudf::size_type num_input_rows,
+                                   SetType shared_set,
+                                   bitmask_type const* row_bitmask,
+                                   bool skip_rows_with_nulls,
+                                   cudf::size_type* cardinality,
+                                   cudf::size_type* local_mapping_index,
+                                   cudf::size_type* shared_set_indices)
+{
+  cudf::size_type result_idx;
+  // TODO: un-init
+  bool inserted;
+  if (cur_idx < num_input_rows and
+      (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) {
+    auto const result = shared_set.insert_and_find(cur_idx);
+    result_idx        = *result.first;
+    inserted          = result.second;
+    // inserted a new element
+    if (result.second) {
+      auto const shared_set_index          = atomicAdd(cardinality, 1);
+      shared_set_indices[shared_set_index] = cur_idx;
+      local_mapping_index[cur_idx]         = shared_set_index;
+    }
+  }
+  // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all
+  // threads in the thread block.
+  __syncthreads();
+  if (cur_idx < num_input_rows and
+      (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) {
+    // element was already in set
+    if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; }
+  }
+}
+
+template <typename SetType>
+__device__ void find_global_mapping(cudf::size_type cur_idx,
+                                    SetType global_set,
+                                    cudf::size_type* shared_set_indices,
+                                    cudf::size_type* global_mapping_index)
+{
+  auto const input_idx = shared_set_indices[cur_idx];
+  global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] =
+    *global_set.insert_and_find(input_idx).first;
+}
+
+/*
+ * Inserts keys into the shared memory hash set, and stores the row index of the local
+ * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a
+ * threadblock exceeds `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without
+ * updating `global_set` or setting `global_mapping_index`. Otherwise, we insert the unique keys
+ * found into the global hash set, and save the row index of the global sparse table in
+ * `global_mapping_index`.
+ */
+template <typename SetRef, typename GlobalSetType, typename WindowExtent>
+CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set,
+                                         cudf::size_type num_input_rows,
+                                         WindowExtent window_extent,
+                                         bitmask_type const* row_bitmask,
+                                         bool skip_rows_with_nulls,
+                                         cudf::size_type* local_mapping_index,
+                                         cudf::size_type* global_mapping_index,
+                                         cudf::size_type* block_cardinality,
+                                         bool* direct_aggregations)
+{
+  // TODO: indices inserted in each shared memory set
+  __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS];
+
+  // Shared set initialization
+  __shared__ typename SetRef::window_type windows[window_extent.value()];
+  auto storage     = SetRef::storage_ref_type(window_extent, windows);
+  auto shared_set  = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
+                            global_set.key_eq(),
+                            probing_scheme_t{global_set.hash_function()},
+                            {},
+                            storage);
+  auto const block = cooperative_groups::this_thread_block();
+  shared_set.initialize(block);
+
+  auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find);
+
+  __shared__ cudf::size_type cardinality;
+  if (block.thread_rank() == 0) { cardinality = 0; }
+  block.sync();
+
+  auto const stride = cudf::detail::grid_1d::grid_stride();
+
+  for (auto cur_idx = cudf::detail::grid_1d::global_thread_id();
+       cur_idx - block.thread_rank() < num_input_rows;
+       cur_idx += stride) {
+    find_local_mapping(cur_idx,
+                       num_input_rows,
+                       shared_insert_ref,
+                       row_bitmask,
+                       skip_rows_with_nulls,
+                       &cardinality,
+                       local_mapping_index,
+                       shared_set_indices);
+
+    block.sync();
+
+    if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) {
+      if (block.thread_rank() == 0) { *direct_aggregations = true; }
+      break;
+    }
+
+    block.sync();
+  }
+
+  // Insert unique keys from shared to global hash set
+  if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) {
+    for (auto cur_idx = block.thread_rank(); cur_idx < cardinality;
+         cur_idx += block.num_threads()) {
+      find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index);
+    }
+  }
+
+  if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; }
+}
+
+template <typename Kernel>
+int max_occupancy_grid_size(Kernel kernel, cudf::size_type n)
+{
+  int max_active_blocks{-1};
+  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    &max_active_blocks, kernel, GROUPBY_BLOCK_SIZE, 0));
+  auto const grid_size  = max_active_blocks * cudf::detail::num_multiprocessors();
+  auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE);
+  return std::min(grid_size, num_blocks);
+}
+
+template <typename SetType>
+void extract_populated_keys(SetType const& key_set,
+                            rmm::device_uvector<cudf::size_type>& populated_keys,
+                            rmm::cuda_stream_view stream)
+{
+  auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value());
+
+  populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream);
+}
+
+// make table that will hold sparse results
+template <typename GlobalSetType>
+auto create_sparse_results_table(cudf::table_view const& flattened_values,
+                                 cudf::aggregation::Kind const* d_agg_kinds,
+                                 std::vector<cudf::aggregation::Kind> aggs,
+                                 bool direct_aggregations,
+                                 GlobalSetType const& global_set,
+                                 rmm::device_uvector<cudf::size_type>& populated_keys,
+                                 rmm::cuda_stream_view stream)
+{
+  // TODO single allocation - room for performance improvement
+  std::vector<std::unique_ptr<cudf::column>> sparse_columns;
+  std::transform(flattened_values.begin(),
+                 flattened_values.end(),
+                 aggs.begin(),
+                 std::back_inserter(sparse_columns),
+                 [stream](auto const& col, auto const& agg) {
+                   auto const nullable =
+                     (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL)
+                       ? false
+                       : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or
+                          agg == cudf::aggregation::STD);
+                   auto mask_flag =
+                     (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED;
+                   auto const col_type = cudf::is_dictionary(col.type())
+                                           ? cudf::dictionary_column_view(col).keys().type()
+                                           : col.type();
+                   return make_fixed_width_column(
+                     cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream);
+                 });
+  cudf::table sparse_table(std::move(sparse_columns));
+  // If no direct aggregations, initialize the sparse table
+  // only for the keys inserted in global hash set
+  if (!direct_aggregations) {
+    auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream);
+    extract_populated_keys(global_set, populated_keys, stream);
+    thrust::for_each_n(
+      rmm::exec_policy(stream),
+      thrust::make_counting_iterator(0),
+      populated_keys.size(),
+      initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds});
+  }
+  // Else initialize the whole table
+  else {
+    cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view();
+    cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream);
+  }
+  return sparse_table;
+}
+}  // namespace
+
+/**
+ * @brief Computes all aggregations from `requests` that require a single pass
+ * over the data and stores the results in `sparse_results`
+ */
+template <typename SetType>
+rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
+  cudf::table_view const& keys,
+  cudf::host_span<aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  SetType& global_set,
+  bool skip_rows_with_nulls,
+  rmm::cuda_stream_view stream)
+{
+  // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy
+  auto constexpr shared_set_capacity =
+    static_cast<cudf::size_type>(static_cast<double>(GROUPBY_SHM_MAX_ELEMENTS) * 1.43);
+  using extent_type         = cuco::extent<cudf::size_type, shared_set_capacity>;
+  using shared_set_type     = cuco::static_set<cudf::size_type,
+                                               extent_type,
+                                               cuda::thread_scope_block,
+                                               typename SetType::key_equal,
+                                               probing_scheme_t,
+                                               cuco::cuda_allocator<char>,
+                                               cuco::storage<GROUPBY_WINDOW_SIZE>>;
+  using shared_set_ref_type = typename shared_set_type::ref_type<>;
+  auto constexpr window_extent = cuco::make_window_extent<shared_set_ref_type>(extent_type{});
+
+  auto const num_input_rows = keys.num_rows();
+
+  auto row_bitmask =
+    skip_rows_with_nulls
+      ? cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first
+      : rmm::device_buffer{};
+
+  auto global_set_ref  = global_set.ref(cuco::op::insert_and_find);
+  auto const grid_size = max_occupancy_grid_size(
+    compute_mapping_indices<shared_set_ref_type, decltype(global_set_ref), decltype(window_extent)>,
+    num_input_rows);
+  // 'local_mapping_index' maps from the global row index of the input table to the row index of
+  // the local pre-aggregate table
+  rmm::device_uvector<cudf::size_type> local_mapping_index(num_input_rows, stream);
+  // 'global_mapping_index' maps from the local pre-aggregate table to the row index of
+  // global aggregate table
+  rmm::device_uvector<cudf::size_type> global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS,
+                                                            stream);
+  rmm::device_uvector<cudf::size_type> block_cardinality(grid_size, stream);
+  rmm::device_scalar<bool> direct_aggregations(false, stream);
+  compute_mapping_indices<shared_set_ref_type>
+    <<<grid_size, GROUPBY_BLOCK_SIZE, 0, stream.value()>>>(global_set_ref,
+                                                           num_input_rows,
+                                                           window_extent,
+                                                           static_cast<bitmask_type*>(row_bitmask.data()),
+                                                           skip_rows_with_nulls,
+                                                           local_mapping_index.data(),
+                                                           global_mapping_index.data(),
+                                                           block_cardinality.data(),
+                                                           direct_aggregations.data());
+  stream.synchronize();
+
+  // 'populated_keys' contains inserted row_indices (keys) of global hash set
+  rmm::device_uvector<cudf::size_type> populated_keys(keys.num_rows(), stream);
+
+  // flatten the aggs to a table that can be operated on by aggregate_row
+  auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests);
+  auto const d_agg_kinds                         = cudf::detail::make_device_uvector_async(
+    agg_kinds, stream, rmm::mr::get_current_device_resource());
+  // make table that will hold sparse results
+  cudf::table sparse_table = create_sparse_results_table(flattened_values,
+                                                         d_agg_kinds.data(),
+                                                         agg_kinds,
+                                                         direct_aggregations.value(stream),
+                                                         global_set,
+                                                         populated_keys,
+                                                         stream);
+  // prepare to launch kernel to do the actual aggregation
+  auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream);
+  auto d_values       = table_device_view::create(flattened_values, stream);
+
+  compute_aggregations(grid_size,
+                       num_input_rows,
+                       static_cast<bitmask_type*>(row_bitmask.data()),
+                       skip_rows_with_nulls,
+                       local_mapping_index.data(),
+                       global_mapping_index.data(),
+                       block_cardinality.data(),
+                       *d_values,
+                       *d_sparse_table,
+                       d_agg_kinds.data(),
+                       stream);
+
+  if (direct_aggregations.value(stream)) {
+    auto const stride = GROUPBY_BLOCK_SIZE * grid_size;
+    thrust::for_each_n(rmm::exec_policy(stream),
+                       thrust::make_counting_iterator(0),
+                       keys.num_rows(),
+                       compute_direct_aggregates{global_set_ref,
+                                                 *d_values,
+                                                 *d_sparse_table,
+                                                 d_agg_kinds.data(),
+                                                 block_cardinality.data(),
+                                                 stride,
+                                                 static_cast<bitmask_type*>(row_bitmask.data()),
+                                                 skip_rows_with_nulls});
+    extract_populated_keys(global_set, populated_keys, stream);
+  }
+
+  // Add results back to sparse_results cache
+  auto sparse_result_cols = sparse_table.release();
+  for (size_t i = 0; i < aggs.size(); i++) {
+    // Note that the cache will make a copy of this temporary aggregation
+    sparse_results->add_result(
+      flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i]));
+  }
+
+  return populated_keys;
+}
 
 using global_set_t = cuco::static_set<cudf::size_type,
@@ -30,20 +359,6 @@ using global_set_t = cuco::static_set<cudf::size_type,
                                       cuco::storage<GROUPBY_WINDOW_SIZE>>;
 
-template void extract_populated_keys(
-  global_set_t const& key_set,
-  rmm::device_uvector<cudf::size_type>& populated_keys,
-  rmm::cuda_stream_view stream);
-
-template auto create_sparse_results_table(
-  cudf::table_view const& flattened_values,
-  cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> aggs,
-  bool direct_aggregations,
-  global_set_t const& global_set,
-  rmm::device_uvector<cudf::size_type>& populated_keys,
-  rmm::cuda_stream_view stream);
-
 template rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
   cudf::table_view const& keys,
   cudf::host_span<aggregation_request const> requests,
@@ -52,7 +367,19 @@ template rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
   global_set_t& global_set,
   bool skip_rows_with_nulls,
   rmm::cuda_stream_view stream);
-
-}  // namespace hash
-}  // namespace detail
-}  // namespace groupby
-}  // namespace cudf
+
+using nullable_global_set_t = cuco::static_set<cudf::size_type,
+                                               cuco::extent<int64_t>,
+                                               cuda::thread_scope_device,
+                                               nullable_row_comparator_t,
+                                               probing_scheme_t,
+                                               cudf::detail::cuco_allocator<char>,
+                                               cuco::storage<GROUPBY_WINDOW_SIZE>>;
+
+template rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
+  cudf::table_view const& keys,
+  cudf::host_span<aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  nullable_global_set_t& global_set,
+  bool skip_rows_with_nulls,
+  rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh
deleted file mode 100644
index 051259bf9f4..00000000000
--- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh
+++ /dev/null
@@ -1,355 +0,0 @@
-/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "compute_aggregations.hpp"
-// #include "compute_single_pass_aggs.hpp"
-#include "flatten_single_pass_aggs.hpp"
-#include "helpers.cuh"
-#include "single_pass_functors.cuh"
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-
-#include
-#include
-
-#include
-
-namespace cudf::groupby::detail::hash {
-
-template <typename SetType>
-// TODO pass block
-__device__ void find_local_mapping(cudf::size_type cur_idx,
-                                   cudf::size_type num_input_rows,
-                                   SetType shared_set,
-                                   bitmask_type const* row_bitmask,
-                                   bool skip_rows_with_nulls,
-                                   cudf::size_type* cardinality,
-                                   cudf::size_type* local_mapping_index,
-                                   cudf::size_type* shared_set_indices)
-{
-  cudf::size_type result_idx;
-  // TODO: un-init
-  bool inserted;
-  if (cur_idx < num_input_rows and
-      (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) {
-    auto const result = shared_set.insert_and_find(cur_idx);
-    result_idx        = *result.first;
-    inserted          = result.second;
-    // inserted a new element
-    if (result.second) {
-      auto const shared_set_index          = atomicAdd(cardinality, 1);
-      shared_set_indices[shared_set_index] = cur_idx;
-      local_mapping_index[cur_idx]         = shared_set_index;
-    }
-  }
-  // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all
-  // threads in the thread block.
-  __syncthreads();
-  if (cur_idx < num_input_rows and
-      (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) {
-    // element was already in set
-    if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; }
-  }
-}
-
-template <typename SetType>
-__device__ void find_global_mapping(cudf::size_type cur_idx,
-                                    SetType global_set,
-                                    cudf::size_type* shared_set_indices,
-                                    cudf::size_type* global_mapping_index)
-{
-  auto const input_idx = shared_set_indices[cur_idx];
-  global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] =
-    *global_set.insert_and_find(input_idx).first;
-}
-
-/*
- * Inserts keys into the shared memory hash set, and stores the row index of the local
- * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a
- * threadblock exceeds `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without
- * updating `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to
- * the global hash set, and save the row index of the global sparse table in `global_mapping_index`.
- */
-template <typename SetRef, typename GlobalSetType, typename WindowExtent>
-CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set,
-                                         cudf::size_type num_input_rows,
-                                         WindowExtent window_extent,
-                                         bitmask_type const* row_bitmask,
-                                         bool skip_rows_with_nulls,
-                                         cudf::size_type* local_mapping_index,
-                                         cudf::size_type* global_mapping_index,
-                                         cudf::size_type* block_cardinality,
-                                         bool* direct_aggregations)
-{
-  // TODO: indices inserted in each shared memory set
-  __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS];
-
-  // Shared set initialization
-  __shared__ typename SetRef::window_type windows[window_extent.value()];
-  auto storage     = SetRef::storage_ref_type(window_extent, windows);
-  auto shared_set  = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
-                            global_set.key_eq(),
-                            probing_scheme_t{global_set.hash_function()},
-                            {},
-                            storage);
-  auto const block = cooperative_groups::this_thread_block();
-  shared_set.initialize(block);
-
-  auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find);
-
-  __shared__ cudf::size_type cardinality;
-  if (block.thread_rank() == 0) { cardinality = 0; }
-  block.sync();
-
-  auto const stride = cudf::detail::grid_1d::grid_stride();
-
-  for (auto cur_idx = cudf::detail::grid_1d::global_thread_id();
-       cur_idx - block.thread_rank() < num_input_rows;
-       cur_idx += stride) {
-    find_local_mapping(cur_idx,
-                       num_input_rows,
-                       shared_insert_ref,
-                       row_bitmask,
-                       skip_rows_with_nulls,
-                       &cardinality,
-                       local_mapping_index,
-                       shared_set_indices);
-
-    block.sync();
-
-    if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) {
-      if (block.thread_rank() == 0) { *direct_aggregations = true; }
-      break;
-    }
-
-    block.sync();
-  }
-
-  // Insert unique keys from shared to global hash set
-  if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) {
-    for (auto cur_idx = block.thread_rank(); cur_idx < cardinality;
-         cur_idx += block.num_threads()) {
-      find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index);
-    }
-  }
-
-  if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; }
-}
-
-template <typename Kernel>
-int max_occupancy_grid_size(Kernel kernel, cudf::size_type n)
-{
-  int max_active_blocks{-1};
-  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &max_active_blocks, kernel, GROUPBY_BLOCK_SIZE, 0));
-  auto const grid_size  = max_active_blocks * cudf::detail::num_multiprocessors();
-  auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE);
-  return std::min(grid_size, num_blocks);
-}
-
-template <typename SetType>
-void extract_populated_keys(SetType const& key_set,
-                            rmm::device_uvector<cudf::size_type>& populated_keys,
-                            rmm::cuda_stream_view stream)
-{
-  auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value());
-
-  populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream);
-}
-
-// make table that will hold sparse results
-template <typename GlobalSetType>
-auto create_sparse_results_table(cudf::table_view const& flattened_values,
-                                 cudf::aggregation::Kind const* d_agg_kinds,
-                                 std::vector<cudf::aggregation::Kind> aggs,
-                                 bool direct_aggregations,
-                                 GlobalSetType const& global_set,
-                                 rmm::device_uvector<cudf::size_type>& populated_keys,
-                                 rmm::cuda_stream_view stream)
-{
-  // TODO single allocation - room for performance improvement
-  std::vector<std::unique_ptr<cudf::column>> sparse_columns;
-  std::transform(flattened_values.begin(),
-                 flattened_values.end(),
-                 aggs.begin(),
-                 std::back_inserter(sparse_columns),
-                 [stream](auto const& col, auto const& agg) {
-                   auto const nullable =
-                     (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL)
-                       ? false
-                       : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or
-                          agg == cudf::aggregation::STD);
-                   auto mask_flag =
-                     (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED;
-                   auto const col_type = cudf::is_dictionary(col.type())
-                                           ? cudf::dictionary_column_view(col).keys().type()
-                                           : col.type();
-                   return make_fixed_width_column(
-                     cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream);
-                 });
-  cudf::table sparse_table(std::move(sparse_columns));
-  // If no direct aggregations, initialize the sparse table
-  // only for the keys inserted in global hash set
-  if (!direct_aggregations) {
-    auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream);
-    extract_populated_keys(global_set, populated_keys, stream);
-    thrust::for_each_n(
-      rmm::exec_policy(stream),
-      thrust::make_counting_iterator(0),
-      populated_keys.size(),
-      initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds});
-  }
-  // Else initialize the whole table
-  else {
-    cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view();
-    cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream);
-  }
-  return sparse_table;
-}
-
-/**
- * @brief Computes all aggregations from `requests` that require a single pass
- * over the data and stores the results in `sparse_results`
- */
-template <typename SetType>
-rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
-  cudf::table_view const& keys,
-  cudf::host_span<aggregation_request const> requests,
-  cudf::detail::result_cache* sparse_results,
-  SetType& global_set,
-  bool skip_rows_with_nulls,
-  rmm::cuda_stream_view stream)
-{
-  // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy
-  auto constexpr shared_set_capacity =
-    static_cast<cudf::size_type>(static_cast<double>(GROUPBY_SHM_MAX_ELEMENTS) * 1.43);
-  using extent_type         = cuco::extent<cudf::size_type, shared_set_capacity>;
-  using shared_set_type     = cuco::static_set<cudf::size_type,
-                                               extent_type,
-                                               cuda::thread_scope_block,
-                                               typename SetType::key_equal,
-                                               probing_scheme_t,
-                                               cuco::cuda_allocator<char>,
-                                               cuco::storage<GROUPBY_WINDOW_SIZE>>;
-  using shared_set_ref_type = typename shared_set_type::ref_type<>;
-  auto constexpr window_extent = cuco::make_window_extent<shared_set_ref_type>(extent_type{});
-
-  auto const num_input_rows = keys.num_rows();
-
-  auto row_bitmask =
-    skip_rows_with_nulls
-      ? cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first
-      : rmm::device_buffer{};
-
-  auto global_set_ref  = global_set.ref(cuco::op::insert_and_find);
-  auto const grid_size = max_occupancy_grid_size(
-    compute_mapping_indices<shared_set_ref_type, decltype(global_set_ref), decltype(window_extent)>,
-    num_input_rows);
-  // 'local_mapping_index' maps from the global row index of the input table to the row index of
-  // the local pre-aggregate table
-  rmm::device_uvector<cudf::size_type> local_mapping_index(num_input_rows, stream);
-  // 'global_mapping_index' maps from the local pre-aggregate table to the row index of
-  // global aggregate table
-  rmm::device_uvector<cudf::size_type> global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS,
-                                                            stream);
-  rmm::device_uvector<cudf::size_type> block_cardinality(grid_size, stream);
-  rmm::device_scalar<bool> direct_aggregations(false, stream);
-  compute_mapping_indices<shared_set_ref_type>
-    <<<grid_size, GROUPBY_BLOCK_SIZE, 0, stream.value()>>>(global_set_ref,
-                                                           num_input_rows,
-                                                           window_extent,
-                                                           static_cast<bitmask_type*>(row_bitmask.data()),
-                                                           skip_rows_with_nulls,
-                                                           local_mapping_index.data(),
-                                                           global_mapping_index.data(),
-                                                           block_cardinality.data(),
-                                                           direct_aggregations.data());
-  stream.synchronize();
-
-  // 'populated_keys' contains inserted row_indices (keys) of global hash set
-  rmm::device_uvector<cudf::size_type> populated_keys(keys.num_rows(), stream);
-
-  // flatten the aggs to a table that can be operated on by aggregate_row
-  auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests);
-  auto const d_agg_kinds                         = cudf::detail::make_device_uvector_async(
-    agg_kinds, stream, rmm::mr::get_current_device_resource());
-  // make table that will hold sparse results
-  cudf::table sparse_table = create_sparse_results_table(flattened_values,
-                                                         d_agg_kinds.data(),
-                                                         agg_kinds,
-                                                         direct_aggregations.value(stream),
-                                                         global_set,
-                                                         populated_keys,
-                                                         stream);
-  // prepare to launch kernel to do the actual aggregation
-  auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream);
-  auto d_values       = table_device_view::create(flattened_values, stream);
-
-  compute_aggregations(grid_size,
-                       num_input_rows,
-                       static_cast<bitmask_type*>(row_bitmask.data()),
-                       skip_rows_with_nulls,
-                       local_mapping_index.data(),
-                       global_mapping_index.data(),
-                       block_cardinality.data(),
-                       *d_values,
-                       *d_sparse_table,
-                       d_agg_kinds.data(),
-                       stream);
-
-  if (direct_aggregations.value(stream)) {
-    auto const stride = GROUPBY_BLOCK_SIZE * grid_size;
-    thrust::for_each_n(rmm::exec_policy(stream),
-                       thrust::make_counting_iterator(0),
-                       keys.num_rows(),
-                       compute_direct_aggregates{global_set_ref,
-                                                 *d_values,
-                                                 *d_sparse_table,
-                                                 d_agg_kinds.data(),
-                                                 block_cardinality.data(),
-                                                 stride,
-                                                 static_cast<bitmask_type*>(row_bitmask.data()),
-                                                 skip_rows_with_nulls});
-    extract_populated_keys(global_set, populated_keys, stream);
-  }
-
-  // Add results back to sparse_results cache
-  auto sparse_result_cols = sparse_table.release();
-  for (size_t i = 0; i < aggs.size(); i++) {
-    // Note that the cache will make a copy of this temporary aggregation
-    sparse_results->add_result(
-      flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i]));
-  }
-
-  return populated_keys;
-}
-
-}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp
index 848ace94ff9..6cbea9fcd3c 100644
--- a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp
+++ b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp
@@ -13,7 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #pragma once
 
 #include
@@ -25,10 +24,7 @@
 #include
 #include
 
-namespace cudf {
-namespace groupby {
-namespace detail {
-namespace hash {
+namespace cudf::groupby::detail::hash {
 /**
  * @brief Computes all aggregations from `requests` that require a single pass
  * over the data and stores the results in `sparse_results`
  */
@@ -39,11 +35,6 @@ rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
   cudf::host_span<aggregation_request const> requests,
   cudf::detail::result_cache* sparse_results,
   SetType& global_set,
-  bool keys_have_nulls,
-  null_policy include_null_keys,
+  bool skip_rows_with_nulls,
   rmm::cuda_stream_view stream);
-
-}  // namespace hash
-}  // namespace detail
-}  // namespace groupby
-}  // namespace cudf
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu b/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu
deleted file mode 100644
index e566c2c5d27..00000000000
--- a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "compute_single_pass_aggs.cuh"
-#include "helpers.cuh"
-
-namespace cudf {
-namespace groupby {
-namespace detail {
-namespace hash {
-
-using global_set_t = cuco::static_set<cudf::size_type,
-                                      cuco::extent<int64_t>,
-                                      cuda::thread_scope_device,
-                                      nullable_row_comparator_t,
-                                      probing_scheme_t,
-                                      cudf::detail::cuco_allocator<char>,
-                                      cuco::storage<GROUPBY_WINDOW_SIZE>>;
-
-template void extract_populated_keys(
-  global_set_t const& key_set,
-  rmm::device_uvector<cudf::size_type>& populated_keys,
-  rmm::cuda_stream_view stream);
-
-template auto create_sparse_results_table(
-  cudf::table_view const& flattened_values,
-  cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> aggs,
-  bool direct_aggregations,
-  global_set_t const& global_set,
-  rmm::device_uvector<cudf::size_type>& populated_keys,
-  rmm::cuda_stream_view stream);
-
-template rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
-  cudf::table_view const& keys,
-  cudf::host_span<aggregation_request const> requests,
-  cudf::detail::result_cache* sparse_results,
-  global_set_t& global_set,
-  bool skip_rows_with_nulls,
-  rmm::cuda_stream_view stream);
-
-}  // namespace hash
-}  // namespace detail
-}  // namespace groupby
-}  // namespace cudf
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index 62434bf5fd2..b307b8a8d1f 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "compute_groupby.cuh"
+#include "compute_groupby.hpp"
 #include "groupby/common/utils.hpp"
 #include "helpers.cuh"
 
diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu
index 760926afa13..a416e2124ce 100644
--- a/cpp/src/groupby/hash/sparse_to_dense_results.cu
+++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,11 +13,55 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-#include "helpers.cuh"
-#include "sparse_to_dense_results.cuh"
+#include "hash_compound_agg_finalizer.cuh"
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
 
 namespace cudf::groupby::detail::hash {
+/**
+ * @brief Gather sparse results into dense using `gather_map` and add to
+ * `dense_cache`
+ *
+ * @see groupby_null_templated()
+ */
+template <typename SetType>
+void sparse_to_dense_results(table_view const& keys,
+                             host_span<aggregation_request const> requests,
+                             cudf::detail::result_cache* sparse_results,
+                             cudf::detail::result_cache* dense_results,
+                             device_span<size_type const> gather_map,
+                             SetType set,
+                             bool skip_key_rows_with_nulls,
+                             rmm::cuda_stream_view stream,
+                             rmm::device_async_resource_ref mr)
+{
+  auto row_bitmask =
+    cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first;
+  bitmask_type const* row_bitmask_ptr =
+    skip_key_rows_with_nulls ? static_cast<bitmask_type*>(row_bitmask.data()) : nullptr;
+
+  for (auto const& request : requests) {
+    auto const& agg_v = request.aggregations;
+    auto const& col   = request.values;
+
+    // Given an aggregation, this will get the result from sparse_results and
+    // convert and return dense, compacted result
+    auto finalizer = hash_compound_agg_finalizer(
+      col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr);
+    for (auto&& agg : agg_v) {
+      agg->finalize(finalizer);
+    }
+  }
+}
 
 template void sparse_to_dense_results(table_view const& keys,
                                       host_span<aggregation_request const> requests,
@@ -29,4 +73,15 @@ template void sparse_to_dense_results(table_view const& keys,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr);
 
+template void sparse_to_dense_results(
+  table_view const& keys,
+  host_span<aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  cudf::detail::result_cache* dense_results,
+  device_span<size_type const> gather_map,
+  nullable_hash_set_ref_t set,
+  bool skip_key_rows_with_nulls,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+
 }  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cuh b/cpp/src/groupby/hash/sparse_to_dense_results.hpp
similarity index 57%
rename from cpp/src/groupby/hash/sparse_to_dense_results.cuh
rename to cpp/src/groupby/hash/sparse_to_dense_results.hpp
index b89fc308e6e..bfdc42953ad 100644
--- a/cpp/src/groupby/hash/sparse_to_dense_results.cuh
+++ b/cpp/src/groupby/hash/sparse_to_dense_results.hpp
@@ -15,18 +15,16 @@
  */
 #pragma once
 
-#include "compute_single_pass_aggs.cuh"
-#include "hash_compound_agg_finalizer.cuh"
-#include "var_hash_functor.cuh"
-
-#include
 #include
-#include
-#include
-#include
+#include
+#include
+#include
+#include
 
-namespace cudf::groupby::detail::hash {
+#include
+#include
 
+namespace cudf::groupby::detail::hash {
 /**
  * @brief Gather sparse results into dense using `gather_map` and add to
  * `dense_cache`
  *
  * @see groupby_null_templated()
  */
 template <typename SetType>
 void sparse_to_dense_results(table_view const& keys,
                              host_span<aggregation_request const> requests,
                              cudf::detail::result_cache* sparse_results,
                              cudf::detail::result_cache* dense_results,
                              device_span<size_type const> gather_map,
                              SetType set,
                              bool skip_key_rows_with_nulls,
                              rmm::cuda_stream_view stream,
-                             rmm::device_async_resource_ref mr)
-{
-  auto row_bitmask =
-    cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first;
-  bitmask_type const* row_bitmask_ptr =
-    skip_key_rows_with_nulls ? static_cast<bitmask_type*>(row_bitmask.data()) : nullptr;
-
-  for (auto const& request : requests) {
-    auto const& agg_v = request.aggregations;
-    auto const& col   = request.values;
-
-    // Given an aggregation, this will get the result from sparse_results and
-    // convert and return dense, compacted result
-    auto finalizer = hash_compound_agg_finalizer(
-      col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr);
-    for (auto&& agg : agg_v) {
-      agg->finalize(finalizer);
-    }
-  }
-}
-
+                             rmm::device_async_resource_ref mr);
 }  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/sparse_to_dense_results_null.cu b/cpp/src/groupby/hash/sparse_to_dense_results_null.cu
deleted file mode 100644
index b6820f7f6db..00000000000
--- a/cpp/src/groupby/hash/sparse_to_dense_results_null.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "helpers.cuh"
-#include "sparse_to_dense_results.cuh"
-
-namespace cudf::groupby::detail::hash {
-
-template void sparse_to_dense_results(
-  table_view const& keys,
-  host_span<aggregation_request const> requests,
-  cudf::detail::result_cache* sparse_results,
-  cudf::detail::result_cache* dense_results,
-  device_span<size_type const> gather_map,
-  nullable_hash_set_ref_t set,
-  bool skip_key_rows_with_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr);
-
-}  // namespace cudf::groupby::detail::hash
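
The refactor above relies on one C++ mechanism throughout: each function template is
defined in exactly one translation unit and explicitly instantiated there for every
supported type, so the per-variant *_null.cu files can be deleted and callers only
see a declaration header. The standalone sketch below illustrates that pattern; it
is not cudf code, and every name in it (count_distinct, exact_eq, mod10_eq) is
hypothetical.

// count_distinct.hpp -- declaration only; this is all that callers include.
template <typename Comparator>
int count_distinct(int const* begin, int const* end, Comparator eq);

// count_distinct.cpp -- the definition lives here, followed by explicit
// instantiations for each comparator the library supports.
template <typename Comparator>
int count_distinct(int const* begin, int const* end, Comparator eq)
{
  int n = 0;
  for (auto it = begin; it != end; ++it) {
    // Count *it only if no earlier element compares equal to it.
    bool seen = false;
    for (auto jt = begin; jt != it; ++jt) {
      if (eq(*jt, *it)) {
        seen = true;
        break;
      }
    }
    if (!seen) { ++n; }
  }
  return n;
}

struct exact_eq {
  bool operator()(int a, int b) const { return a == b; }
};
struct mod10_eq {
  bool operator()(int a, int b) const { return a % 10 == b % 10; }
};

// One explicit instantiation per comparator, mirroring how this patch
// instantiates compute_groupby, compute_single_pass_aggs, and
// sparse_to_dense_results for both row_comparator_t and
// nullable_row_comparator_t at the bottom of the corresponding .cu file.
template int count_distinct<exact_eq>(int const*, int const*, exact_eq);
template int count_distinct<mod10_eq>(int const*, int const*, mod10_eq);

Because the instantiations sit next to the definition, a translation unit that
includes only count_distinct.hpp still links against both variants -- which is why
the CMakeLists.txt hunk can drop compute_groupby_null.cu,
compute_single_pass_aggs_null.cu, and sparse_to_dense_results_null.cu.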