From 45573e0b1f69c8c497407e98ad4f9dbe3baf36c4 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Thu, 26 Sep 2024 12:37:39 -0700
Subject: [PATCH] Cleanups

---
 cpp/CMakeLists.txt                            |   3 -
 cpp/src/groupby/hash/compute_groupby.cu       | 111 +++++-
 cpp/src/groupby/hash/compute_groupby.cuh      | 111 ------
 cpp/src/groupby/hash/compute_groupby.hpp      |  68 ++++
 cpp/src/groupby/hash/compute_groupby_null.cu  |  31 --
 .../groupby/hash/compute_single_pass_aggs.cu  | 375 ++++++++++++++++--
 .../groupby/hash/compute_single_pass_aggs.cuh | 355 -----------------
 .../groupby/hash/compute_single_pass_aggs.hpp |  15 +-
 .../hash/compute_single_pass_aggs_null.cu     |  58 ---
 cpp/src/groupby/hash/groupby.cu               |   2 +-
 .../groupby/hash/sparse_to_dense_results.cu   |  61 ++-
 ...esults.cuh => sparse_to_dense_results.hpp} |  38 +-
 .../hash/sparse_to_dense_results_null.cu      |  33 --
 13 files changed, 599 insertions(+), 662 deletions(-)
 delete mode 100644 cpp/src/groupby/hash/compute_groupby.cuh
 create mode 100644 cpp/src/groupby/hash/compute_groupby.hpp
 delete mode 100644 cpp/src/groupby/hash/compute_groupby_null.cu
 delete mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs.cuh
 delete mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs_null.cu
 rename cpp/src/groupby/hash/{sparse_to_dense_results.cuh => sparse_to_dense_results.hpp} (57%)
 delete mode 100644 cpp/src/groupby/hash/sparse_to_dense_results_null.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 0d9529ef58d..663f2210ef4 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -316,14 +316,11 @@ add_library(
   src/groupby/groupby.cu
   src/groupby/hash/compute_aggregations.cu
   src/groupby/hash/compute_groupby.cu
-  src/groupby/hash/compute_groupby_null.cu
-  src/groupby/hash/compute_single_pass_aggs_null.cu
   src/groupby/hash/compute_single_pass_aggs.cu
   src/groupby/hash/flatten_single_pass_aggs.cpp
   src/groupby/hash/groupby.cu
   src/groupby/hash/hash_compound_agg_finalizer.cu
   src/groupby/hash/sparse_to_dense_results.cu
-  src/groupby/hash/sparse_to_dense_results_null.cu
   src/groupby/hash/var_hash_functor.cu
   src/groupby/sort/aggregate.cpp
   src/groupby/sort/group_argmax.cu
diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu
index 7965d0891a7..9643567a825 100644
--- a/cpp/src/groupby/hash/compute_groupby.cu
+++ b/cpp/src/groupby/hash/compute_groupby.cu
@@ -13,11 +13,110 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-#include "compute_groupby.cuh"
+#include "compute_groupby.hpp"
+#include "compute_single_pass_aggs.hpp"
+#include "helpers.cuh"
+#include "sparse_to_dense_results.hpp"
+#include "var_hash_functor.cuh"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include
 
 namespace cudf::groupby::detail::hash {
+/**
+ * @brief Computes groupby using a hash table.
+ *
+ * First, we create a hash table that stores the indices of unique rows in
+ * `keys`. The upper limit on the number of values in this map is the number
+ * of rows in `keys`.
+ *
+ * To store the results of aggregations, we create temporary sparse columns
+ * which have the same size as input value columns. Using the hash map, we
+ * determine the location within the sparse column to write the result of the
+ * aggregation into.
+ *
+ * The sparse column results of all aggregations are stored into the cache
+ * `sparse_results`. This enables the use of previously calculated results in
+ * other aggregations.
+ *
+ * All the aggregations that can be computed in a single pass are computed
+ * first, in a combined kernel. Then, using these results, aggregations that
+ * require multiple passes are computed.
+ *
+ * Finally, using the hash map, we generate a vector of indices of populated
+ * values in sparse result columns. Then, for each aggregation originally
+ * requested in `requests`, we gather sparse results into a column of dense
+ * results using the aforementioned index vector. Dense results are stored into
+ * the in/out parameter `cache`.
+ */
+template <typename Equal>
+std::unique_ptr<table> compute_groupby(table_view const& keys,
+                                       host_span<aggregation_request const> requests,
+                                       cudf::detail::result_cache* cache,
+                                       bool skip_key_rows_with_nulls,
+                                       Equal const& d_row_equal,
+                                       row_hash_t const& d_row_hash,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::device_async_resource_ref mr)
+{
+  // convert to int64_t to avoid potential overflow with large `keys`
+  auto const num_keys = static_cast<int64_t>(keys.num_rows());
+
+  // Cache of sparse results where the location of aggregate value in each
+  // column is indexed by the hash set
+  cudf::detail::result_cache sparse_results(requests.size());
+
+  auto const set = cuco::static_set{
+    cuco::extent<int64_t>{num_keys},
+    cudf::detail::CUCO_DESIRED_LOAD_FACTOR,  // 50% occupancy
+    cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
+    d_row_equal,
+    probing_scheme_t{d_row_hash},
+    cuco::thread_scope_device,
+    cuco::storage<GROUPBY_WINDOW_SIZE>{},
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+    stream.value()};
+
+  // Compute all single pass aggs first
+  auto gather_map = compute_single_pass_aggs(
+    keys, requests, &sparse_results, set, skip_key_rows_with_nulls, stream);
+
+  // Compact all results from sparse_results and insert into cache
+  sparse_to_dense_results(keys,
+                          requests,
+                          &sparse_results,
+                          cache,
+                          gather_map,
+                          set.ref(cuco::find),
+                          skip_key_rows_with_nulls,
+                          stream,
+                          mr);
+
+  return cudf::detail::gather(keys,
+                              gather_map,
+                              out_of_bounds_policy::DONT_CHECK,
+                              cudf::detail::negative_index_policy::NOT_ALLOWED,
+                              stream,
+                              mr);
+}
+
 template std::unique_ptr<table> compute_groupby(
   table_view const& keys,
   host_span<aggregation_request const> requests,
   cudf::detail::result_cache* cache,
   bool skip_key_rows_with_nulls,
@@ -28,4 +127,14 @@ template std::unique_ptr<table> compute_groupby(
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr);
 
+template std::unique_ptr<table> compute_groupby(
+  table_view const& keys,
+  host_span<aggregation_request const> requests,
+  cudf::detail::result_cache* cache,
+  bool skip_key_rows_with_nulls,
+  nullable_row_comparator_t const& d_row_equal,
+  row_hash_t const& d_row_hash,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+
 }  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_groupby.cuh b/cpp/src/groupby/hash/compute_groupby.cuh
deleted file mode 100644
index 146900f61eb..00000000000
--- a/cpp/src/groupby/hash/compute_groupby.cuh
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include "compute_single_pass_aggs.cuh"
-// #include "compute_single_pass_aggs.hpp"
-#include "sparse_to_dense_results.cuh"
-#include "var_hash_functor.cuh"
-
-#include
-#include
-#include
-#include
-
-#include
-
-namespace cudf::groupby::detail::hash {
-
-/**
- * @brief Computes groupby using hash table.
- *
- * First, we create a hash table that stores the indices of unique rows in
- * `keys`. The upper limit on the number of values in this map is the number
- * of rows in `keys`.
- *
- * To store the results of aggregations, we create temporary sparse columns
- * which have the same size as input value columns. Using the hash map, we
- * determine the location within the sparse column to write the result of the
- * aggregation into.
- *
- * The sparse column results of all aggregations are stored into the cache
- * `sparse_results`. This enables the use of previously calculated results in
- * other aggregations.
- *
- * All the aggregations which can be computed in a single pass are computed
- * first, in a combined kernel. Then using these results, aggregations that
- * require multiple passes, will be computed.
- *
- * Finally, using the hash map, we generate a vector of indices of populated
- * values in sparse result columns. Then, for each aggregation originally
- * requested in `requests`, we gather sparse results into a column of dense
- * results using the aforementioned index vector. Dense results are stored into
- * the in/out parameter `cache`.
- */
-template <typename Equal>
-std::unique_ptr<table> compute_groupby(
-  table_view const& keys,
-  host_span<aggregation_request const> requests,
-  cudf::detail::result_cache* cache,
-  bool skip_key_rows_with_nulls,
-  Equal const& d_row_equal,
-  cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
-                                                   cudf::nullate::DYNAMIC> const& d_row_hash,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr)
-{
-  // convert to int64_t to avoid potential overflow with large `keys`
-  auto const num_keys = static_cast<int64_t>(keys.num_rows());
-
-  // Cache of sparse results where the location of aggregate value in each
-  // column is indexed by the hash set
-  cudf::detail::result_cache sparse_results(requests.size());
-
-  auto const set = cuco::static_set{
-    cuco::extent<int64_t>{num_keys},
-    cudf::detail::CUCO_DESIRED_LOAD_FACTOR,  // 50% occupancy
-    cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
-    d_row_equal,
-    probing_scheme_t{d_row_hash},
-    cuco::thread_scope_device,
-    cuco::storage<GROUPBY_WINDOW_SIZE>{},
-    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
-    stream.value()};
-
-  // Compute all single pass aggs first
-  auto gather_map = compute_single_pass_aggs(
-    keys, requests, &sparse_results, set, skip_key_rows_with_nulls, stream);
-
-  // Compact all results from sparse_results and insert into cache
-  sparse_to_dense_results(keys,
-                          requests,
-                          &sparse_results,
-                          cache,
-                          gather_map,
-                          set.ref(cuco::find),
-                          skip_key_rows_with_nulls,
-                          stream,
-                          mr);
-
-  return cudf::detail::gather(keys,
-                              gather_map,
-                              out_of_bounds_policy::DONT_CHECK,
-                              cudf::detail::negative_index_policy::NOT_ALLOWED,
-                              stream,
-                              mr);
-}
-
-}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_groupby.hpp b/cpp/src/groupby/hash/compute_groupby.hpp
new file mode 100644
index 00000000000..358c81365a0
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_groupby.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "helpers.cuh"
+
+#include <cudf/detail/aggregation/result_cache.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
+
+#include <memory>
+
+namespace cudf::groupby::detail::hash {
+/**
+ * @brief Computes groupby using a hash table.
+ *
+ * First, we create a hash table that stores the indices of unique rows in
+ * `keys`. The upper limit on the number of values in this map is the number
+ * of rows in `keys`.
+ *
+ * To store the results of aggregations, we create temporary sparse columns
+ * which have the same size as input value columns. Using the hash map, we
+ * determine the location within the sparse column to write the result of the
+ * aggregation into.
+ *
+ * The sparse column results of all aggregations are stored into the cache
+ * `sparse_results`. This enables the use of previously calculated results in
+ * other aggregations.
+ *
+ * All the aggregations that can be computed in a single pass are computed
+ * first, in a combined kernel. Then, using these results, aggregations that
+ * require multiple passes are computed.
+ *
+ * Finally, using the hash map, we generate a vector of indices of populated
+ * values in sparse result columns. Then, for each aggregation originally
+ * requested in `requests`, we gather sparse results into a column of dense
+ * results using the aforementioned index vector. Dense results are stored into
+ * the in/out parameter `cache`.
+ */
+template <typename Equal>
+std::unique_ptr<table> compute_groupby(table_view const& keys,
+                                       host_span<aggregation_request const> requests,
+                                       cudf::detail::result_cache* cache,
+                                       bool skip_key_rows_with_nulls,
+                                       Equal const& d_row_equal,
+                                       row_hash_t const& d_row_hash,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::device_async_resource_ref mr);
+
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_groupby_null.cu b/cpp/src/groupby/hash/compute_groupby_null.cu
deleted file mode 100644
index 1f9707902cc..00000000000
--- a/cpp/src/groupby/hash/compute_groupby_null.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "compute_groupby.cuh"
-
-namespace cudf::groupby::detail::hash {
-
-template std::unique_ptr<table> compute_groupby(
-  table_view const& keys,
-  host_span<aggregation_request const> requests,
-  cudf::detail::result_cache* cache,
-  bool skip_key_rows_with_nulls,
-  nullable_row_comparator_t const& d_row_equal,
-  row_hash_t const& d_row_hash,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr);
-
-}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu
index 2770dc2a84c..b5c68ea639a 100644
--- a/cpp/src/groupby/hash/compute_single_pass_aggs.cu
+++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,13 +14,342 @@
  * limitations under the License.
  */
 
-#include "compute_single_pass_aggs.cuh"
+#include "compute_aggregations.hpp"
+#include "compute_single_pass_aggs.hpp"
+#include "flatten_single_pass_aggs.hpp"
 #include "helpers.cuh"
+#include "single_pass_functors.cuh"
 
-namespace cudf {
-namespace groupby {
-namespace detail {
-namespace hash {
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+
+namespace cudf::groupby::detail::hash {
+namespace {
+template <typename SetType>
+// TODO pass block
+__device__ void find_local_mapping(cudf::size_type cur_idx,
+                                   cudf::size_type num_input_rows,
+                                   SetType shared_set,
+                                   bitmask_type const* row_bitmask,
+                                   bool skip_rows_with_nulls,
+                                   cudf::size_type* cardinality,
+                                   cudf::size_type* local_mapping_index,
+                                   cudf::size_type* shared_set_indices)
+{
+  cudf::size_type result_idx;
+  // TODO: un-init
+  bool inserted;
+  if (cur_idx < num_input_rows and
+      (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) {
+    auto const result = shared_set.insert_and_find(cur_idx);
+    result_idx        = *result.first;
+    inserted          = result.second;
+    // inserted a new element
+    if (result.second) {
+      auto const shared_set_index          = atomicAdd(cardinality, 1);
+      shared_set_indices[shared_set_index] = cur_idx;
+      local_mapping_index[cur_idx]         = shared_set_index;
+    }
+  }
+  // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all
+  // threads in the thread block.
+  __syncthreads();
+  if (cur_idx < num_input_rows and
+      (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) {
+    // element was already in set
+    if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; }
+  }
+}
+
+template <typename SetType>
+__device__ void find_global_mapping(cudf::size_type cur_idx,
+                                    SetType global_set,
+                                    cudf::size_type* shared_set_indices,
+                                    cudf::size_type* global_mapping_index)
+{
+  auto const input_idx = shared_set_indices[cur_idx];
+  global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] =
+    *global_set.insert_and_find(input_idx).first;
+}
+
+/*
+ * Inserts keys into the shared memory hash set, and stores the row index of the local
+ * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a
+ * threadblock exceeds `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without
+ * updating `global_set` or setting `global_mapping_index`. Otherwise, we insert the unique keys
+ * found into the global hash set, and save the row index of the global sparse table in
+ * `global_mapping_index`.
+ */
+template <typename SetRef, typename GlobalSetType, typename WindowExtent>
+CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set,
+                                         cudf::size_type num_input_rows,
+                                         WindowExtent window_extent,
+                                         bitmask_type const* row_bitmask,
+                                         bool skip_rows_with_nulls,
+                                         cudf::size_type* local_mapping_index,
+                                         cudf::size_type* global_mapping_index,
+                                         cudf::size_type* block_cardinality,
+                                         bool* direct_aggregations)
+{
+  // TODO: indices inserted in each shared memory set
+  __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS];
+
+  // Shared set initialization
+  __shared__ typename SetRef::window_type windows[window_extent.value()];
+  auto storage     = SetRef::storage_ref_type(window_extent, windows);
+  auto shared_set  = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
+                            global_set.key_eq(),
+                            probing_scheme_t{global_set.hash_function()},
+                            {},
+                            storage);
+  auto const block = cooperative_groups::this_thread_block();
+  shared_set.initialize(block);
+
+  auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find);
+
+  __shared__ cudf::size_type cardinality;
+  if (block.thread_rank() == 0) { cardinality = 0; }
+  block.sync();
+
+  auto const stride = cudf::detail::grid_1d::grid_stride();
+
+  for (auto cur_idx = cudf::detail::grid_1d::global_thread_id();
+       cur_idx - block.thread_rank() < num_input_rows;
+       cur_idx += stride) {
+    find_local_mapping(cur_idx,
+                       num_input_rows,
+                       shared_insert_ref,
+                       row_bitmask,
+                       skip_rows_with_nulls,
+                       &cardinality,
+                       local_mapping_index,
+                       shared_set_indices);
+
+    block.sync();
+
+    if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) {
+      if (block.thread_rank() == 0) { *direct_aggregations = true; }
+      break;
+    }
+
+    block.sync();
+  }
+
+  // Insert unique keys from shared to global hash set
+  if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) {
+    for (auto cur_idx = block.thread_rank(); cur_idx < cardinality;
+         cur_idx += block.num_threads()) {
+      find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index);
+    }
+  }
+
+  if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; }
+}
+
+template <typename Kernel>
+int max_occupancy_grid_size(Kernel kernel, cudf::size_type n)
+{
+  int max_active_blocks{-1};
+  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    &max_active_blocks, kernel, GROUPBY_BLOCK_SIZE, 0));
+  auto const grid_size  = max_active_blocks * cudf::detail::num_multiprocessors();
+  auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE);
+  return std::min(grid_size, num_blocks);
+}
+
+template <typename SetType>
+void extract_populated_keys(SetType const& key_set,
+                            rmm::device_uvector<cudf::size_type>& populated_keys,
+                            rmm::cuda_stream_view stream)
+{
+  auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value());
+
+  populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream);
+}
+
+// make table that will hold sparse results
+template <typename GlobalSetType>
+auto create_sparse_results_table(cudf::table_view const& flattened_values,
+                                 cudf::aggregation::Kind const* d_agg_kinds,
+                                 std::vector<cudf::aggregation::Kind> aggs,
+                                 bool direct_aggregations,
+                                 GlobalSetType const& global_set,
+                                 rmm::device_uvector<cudf::size_type>& populated_keys,
+                                 rmm::cuda_stream_view stream)
+{
+  // TODO single allocation - room for performance improvement
+  std::vector<std::unique_ptr<cudf::column>> sparse_columns;
+  std::transform(flattened_values.begin(),
+                 flattened_values.end(),
+                 aggs.begin(),
+                 std::back_inserter(sparse_columns),
+                 [stream](auto const& col, auto const& agg) {
+                   auto const nullable =
+                     (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL)
+                       ? false
+                       : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or
+                          agg == cudf::aggregation::STD);
+                   auto mask_flag =
+                     (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED;
+                   auto const col_type = cudf::is_dictionary(col.type())
+                                           ? cudf::dictionary_column_view(col).keys().type()
+                                           : col.type();
+                   return make_fixed_width_column(
+                     cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream);
+                 });
+  cudf::table sparse_table(std::move(sparse_columns));
+  // If no direct aggregations, initialize the sparse table
+  // only for the keys inserted in global hash set
+  if (!direct_aggregations) {
+    auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream);
+    extract_populated_keys(global_set, populated_keys, stream);
+    thrust::for_each_n(
+      rmm::exec_policy(stream),
+      thrust::make_counting_iterator(0),
+      populated_keys.size(),
+      initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds});
+  }
+  // Else initialize the whole table
+  else {
+    cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view();
+    cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream);
+  }
+  return sparse_table;
+}
+}  // namespace
+
+/**
+ * @brief Computes all aggregations from `requests` that require a single pass
+ * over the data and stores the results in `sparse_results`
+ */
+template <typename SetType>
+rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
+  cudf::table_view const& keys,
+  cudf::host_span<aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  SetType& global_set,
+  bool skip_rows_with_nulls,
+  rmm::cuda_stream_view stream)
+{
+  // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy
+  auto constexpr shared_set_capacity =
+    static_cast<cudf::size_type>(static_cast<double>(GROUPBY_SHM_MAX_ELEMENTS) * 1.43);
+  using extent_type         = cuco::extent<cudf::size_type, shared_set_capacity>;
+  using shared_set_type     = cuco::static_set<cudf::size_type,
+                                               extent_type,
+                                               cuda::thread_scope_block,
+                                               typename SetType::key_equal,
+                                               probing_scheme_t,
+                                               cuco::cuda_allocator<char>,
+                                               cuco::storage<GROUPBY_WINDOW_SIZE>>;
+  using shared_set_ref_type = typename shared_set_type::ref_type<>;
+  auto constexpr window_extent = cuco::make_window_extent<shared_set_ref_type>(extent_type{});
+
+  auto const num_input_rows = keys.num_rows();
+
+  auto row_bitmask =
+    skip_rows_with_nulls
+      ? cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first
+      : rmm::device_buffer{};
+
+  auto global_set_ref  = global_set.ref(cuco::op::insert_and_find);
+  auto const grid_size = max_occupancy_grid_size(
+    compute_mapping_indices<shared_set_ref_type, decltype(global_set_ref), decltype(window_extent)>,
+    num_input_rows);
+  // 'local_mapping_index' maps from the global row index of the input table to the row index of
+  // the local pre-aggregate table
+  rmm::device_uvector<cudf::size_type> local_mapping_index(num_input_rows, stream);
+  // 'global_mapping_index' maps from the local pre-aggregate table to the row index of
+  // global aggregate table
+  rmm::device_uvector<cudf::size_type> global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS,
+                                                            stream);
+  rmm::device_uvector<cudf::size_type> block_cardinality(grid_size, stream);
+  rmm::device_scalar<bool> direct_aggregations(false, stream);
+  compute_mapping_indices<shared_set_ref_type>
+    <<<grid_size, GROUPBY_BLOCK_SIZE, 0, stream.value()>>>(global_set_ref,
+                                                           num_input_rows,
+                                                           window_extent,
+                                                           static_cast<bitmask_type*>(row_bitmask.data()),
+                                                           skip_rows_with_nulls,
+                                                           local_mapping_index.data(),
+                                                           global_mapping_index.data(),
+                                                           block_cardinality.data(),
+                                                           direct_aggregations.data());
+  stream.synchronize();
+
+  // 'populated_keys' contains inserted row_indices (keys) of global hash set
+  rmm::device_uvector<cudf::size_type> populated_keys(keys.num_rows(), stream);
+
+  // flatten the aggs to a table that can be operated on by aggregate_row
+  auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests);
+  auto const d_agg_kinds                         = cudf::detail::make_device_uvector_async(
+    agg_kinds, stream, rmm::mr::get_current_device_resource());
+  // make table that will hold sparse results
+  cudf::table sparse_table = create_sparse_results_table(flattened_values,
+                                                         d_agg_kinds.data(),
+                                                         agg_kinds,
+                                                         direct_aggregations.value(stream),
+                                                         global_set,
+                                                         populated_keys,
+                                                         stream);
+  // prepare to launch kernel to do the actual aggregation
+  auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream);
+  auto d_values       = table_device_view::create(flattened_values, stream);
+
+  compute_aggregations(grid_size,
+                       num_input_rows,
+                       static_cast<bitmask_type*>(row_bitmask.data()),
+                       skip_rows_with_nulls,
+                       local_mapping_index.data(),
+                       global_mapping_index.data(),
+                       block_cardinality.data(),
+                       *d_values,
+                       *d_sparse_table,
+                       d_agg_kinds.data(),
+                       stream);
+
+  if (direct_aggregations.value(stream)) {
+    auto const stride = GROUPBY_BLOCK_SIZE * grid_size;
+    thrust::for_each_n(rmm::exec_policy(stream),
+                       thrust::make_counting_iterator(0),
+                       keys.num_rows(),
+                       compute_direct_aggregates{global_set_ref,
+                                                 *d_values,
+                                                 *d_sparse_table,
+                                                 d_agg_kinds.data(),
+                                                 block_cardinality.data(),
+                                                 stride,
+                                                 static_cast<bitmask_type*>(row_bitmask.data()),
+                                                 skip_rows_with_nulls});
+    extract_populated_keys(global_set, populated_keys, stream);
+  }
+
+  // Add results back to sparse_results cache
+  auto sparse_result_cols = sparse_table.release();
+  for (size_t i = 0; i < aggs.size(); i++) {
+    // Note that the cache will make a copy of this temporary aggregation
+    sparse_results->add_result(
+      flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i]));
+  }
+
+  return populated_keys;
+}
 
 using global_set_t = cuco::static_set<cudf::size_type,
@@ -30,20 +359,6 @@ using global_set_t = cuco::static_set<cudf::size_type,
                                       cuco::storage<GROUPBY_WINDOW_SIZE>>;
 
-template void extract_populated_keys(
-  global_set_t const& key_set,
-  rmm::device_uvector<cudf::size_type>& populated_keys,
-  rmm::cuda_stream_view stream);
-
-template auto create_sparse_results_table(
-  cudf::table_view const& flattened_values,
-  cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> aggs,
-  bool direct_aggregations,
-  global_set_t const& global_set,
-  rmm::device_uvector<cudf::size_type>& populated_keys,
-  rmm::cuda_stream_view stream);
-
 template rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
   cudf::table_view const& keys,
   cudf::host_span<aggregation_request const> requests,
@@ -52,7 +367,19 @@ template rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
   global_set_t& global_set,
   bool skip_rows_with_nulls,
   rmm::cuda_stream_view stream);
-
-}  // namespace hash
-}  // namespace detail
-}  // namespace groupby
-}  // namespace cudf
+
+using nullable_global_set_t = cuco::static_set<cudf::size_type,
+                                               cuco::extent<int64_t>,
+                                               cuda::thread_scope_device,
+                                               nullable_row_comparator_t,
+                                               probing_scheme_t,
+                                               cudf::detail::cuco_allocator<char>,
+                                               cuco::storage<GROUPBY_WINDOW_SIZE>>;
+
+template rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
+  cudf::table_view const& keys,
+  cudf::host_span<aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  nullable_global_set_t& global_set,
+  bool skip_rows_with_nulls,
+  rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh b/cpp/src/groupby/hash/compute_single_pass_aggs.cuh
deleted file mode 100644
index 051259bf9f4..00000000000
--- a/cpp/src/groupby/hash/compute_single_pass_aggs.cuh
+++ /dev/null
@@ -1,355 +0,0 @@
-/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "compute_aggregations.hpp"
-// #include "compute_single_pass_aggs.hpp"
-#include "flatten_single_pass_aggs.hpp"
-#include "helpers.cuh"
-#include "single_pass_functors.cuh"
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-
-#include
-#include
-
-#include
-
-namespace cudf::groupby::detail::hash {
-
-template <typename SetType>
-// TODO pass block
-__device__ void find_local_mapping(cudf::size_type cur_idx,
-                                   cudf::size_type num_input_rows,
-                                   SetType shared_set,
-                                   bitmask_type const* row_bitmask,
-                                   bool skip_rows_with_nulls,
-                                   cudf::size_type* cardinality,
-                                   cudf::size_type* local_mapping_index,
-                                   cudf::size_type* shared_set_indices)
-{
-  cudf::size_type result_idx;
-  // TODO: un-init
-  bool inserted;
-  if (cur_idx < num_input_rows and
-      (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) {
-    auto const result = shared_set.insert_and_find(cur_idx);
-    result_idx        = *result.first;
-    inserted          = result.second;
-    // inserted a new element
-    if (result.second) {
-      auto const shared_set_index          = atomicAdd(cardinality, 1);
-      shared_set_indices[shared_set_index] = cur_idx;
-      local_mapping_index[cur_idx]         = shared_set_index;
-    }
-  }
-  // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all
-  // threads in the thread block.
-  __syncthreads();
-  if (cur_idx < num_input_rows and
-      (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, cur_idx))) {
-    // element was already in set
-    if (!inserted) { local_mapping_index[cur_idx] = local_mapping_index[result_idx]; }
-  }
-}
-
-template <typename SetType>
-__device__ void find_global_mapping(cudf::size_type cur_idx,
-                                    SetType global_set,
-                                    cudf::size_type* shared_set_indices,
-                                    cudf::size_type* global_mapping_index)
-{
-  auto const input_idx = shared_set_indices[cur_idx];
-  global_mapping_index[blockIdx.x * GROUPBY_SHM_MAX_ELEMENTS + cur_idx] =
-    *global_set.insert_and_find(input_idx).first;
-}
-
-/*
- * Inserts keys into the shared memory hash set, and stores the row index of the local
- * pre-aggregate table in `local_mapping_index`. If the number of unique keys found in a
- * threadblock exceeds `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without
- * updating `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to
- * the global hash set, and save the row index of the global sparse table in `global_mapping_index`.
- */
-template <typename SetRef, typename GlobalSetType, typename WindowExtent>
-CUDF_KERNEL void compute_mapping_indices(GlobalSetType global_set,
-                                         cudf::size_type num_input_rows,
-                                         WindowExtent window_extent,
-                                         bitmask_type const* row_bitmask,
-                                         bool skip_rows_with_nulls,
-                                         cudf::size_type* local_mapping_index,
-                                         cudf::size_type* global_mapping_index,
-                                         cudf::size_type* block_cardinality,
-                                         bool* direct_aggregations)
-{
-  // TODO: indices inserted in each shared memory set
-  __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS];
-
-  // Shared set initialization
-  __shared__ typename SetRef::window_type windows[window_extent.value()];
-  auto storage     = SetRef::storage_ref_type(window_extent, windows);
-  auto shared_set  = SetRef(cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
-                            global_set.key_eq(),
-                            probing_scheme_t{global_set.hash_function()},
-                            {},
-                            storage);
-  auto const block = cooperative_groups::this_thread_block();
-  shared_set.initialize(block);
-
-  auto shared_insert_ref = std::move(shared_set).with(cuco::insert_and_find);
-
-  __shared__ cudf::size_type cardinality;
-  if (block.thread_rank() == 0) { cardinality = 0; }
-  block.sync();
-
-  auto const stride = cudf::detail::grid_1d::grid_stride();
-
-  for (auto cur_idx = cudf::detail::grid_1d::global_thread_id();
-       cur_idx - block.thread_rank() < num_input_rows;
-       cur_idx += stride) {
-    find_local_mapping(cur_idx,
-                       num_input_rows,
-                       shared_insert_ref,
-                       row_bitmask,
-                       skip_rows_with_nulls,
-                       &cardinality,
-                       local_mapping_index,
-                       shared_set_indices);
-
-    block.sync();
-
-    if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) {
-      if (block.thread_rank() == 0) { *direct_aggregations = true; }
-      break;
-    }
-
-    block.sync();
-  }
-
-  // Insert unique keys from shared to global hash set
-  if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) {
-    for (auto cur_idx = block.thread_rank(); cur_idx < cardinality;
-         cur_idx += block.num_threads()) {
-      find_global_mapping(cur_idx, global_set, shared_set_indices, global_mapping_index);
-    }
-  }
-
-  if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; }
-}
-
-template <typename Kernel>
-int max_occupancy_grid_size(Kernel kernel, cudf::size_type n)
-{
-  int max_active_blocks{-1};
-  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &max_active_blocks, kernel, GROUPBY_BLOCK_SIZE, 0));
-  auto const grid_size  = max_active_blocks * cudf::detail::num_multiprocessors();
-  auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE);
-  return std::min(grid_size, num_blocks);
-}
-
-template <typename SetType>
-void extract_populated_keys(SetType const& key_set,
-                            rmm::device_uvector<cudf::size_type>& populated_keys,
-                            rmm::cuda_stream_view stream)
-{
-  auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value());
-
-  populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream);
-}
-
-// make table that will hold sparse results
-template <typename GlobalSetType>
-auto create_sparse_results_table(cudf::table_view const& flattened_values,
-                                 cudf::aggregation::Kind const* d_agg_kinds,
-                                 std::vector<cudf::aggregation::Kind> aggs,
-                                 bool direct_aggregations,
-                                 GlobalSetType const& global_set,
-                                 rmm::device_uvector<cudf::size_type>& populated_keys,
-                                 rmm::cuda_stream_view stream)
-{
-  // TODO single allocation - room for performance improvement
-  std::vector<std::unique_ptr<cudf::column>> sparse_columns;
-  std::transform(flattened_values.begin(),
-                 flattened_values.end(),
-                 aggs.begin(),
-                 std::back_inserter(sparse_columns),
-                 [stream](auto const& col, auto const& agg) {
-                   auto const nullable =
-                     (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL)
-                       ? false
-                       : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or
-                          agg == cudf::aggregation::STD);
-                   auto mask_flag =
-                     (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED;
-                   auto const col_type = cudf::is_dictionary(col.type())
-                                           ? cudf::dictionary_column_view(col).keys().type()
-                                           : col.type();
-                   return make_fixed_width_column(
-                     cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream);
-                 });
-  cudf::table sparse_table(std::move(sparse_columns));
-  // If no direct aggregations, initialize the sparse table
-  // only for the keys inserted in global hash set
-  if (!direct_aggregations) {
-    auto d_sparse_table = cudf::mutable_table_device_view::create(sparse_table, stream);
-    extract_populated_keys(global_set, populated_keys, stream);
-    thrust::for_each_n(
-      rmm::exec_policy(stream),
-      thrust::make_counting_iterator(0),
-      populated_keys.size(),
-      initialize_sparse_table{populated_keys.data(), *d_sparse_table, d_agg_kinds});
-  }
-  // Else initialize the whole table
-  else {
-    cudf::mutable_table_view sparse_table_view = sparse_table.mutable_view();
-    cudf::detail::initialize_with_identity(sparse_table_view, aggs, stream);
-  }
-  return sparse_table;
-}
-
-/**
- * @brief Computes all aggregations from `requests` that require a single pass
- * over the data and stores the results in `sparse_results`
- */
-template <typename SetType>
-rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
-  cudf::table_view const& keys,
-  cudf::host_span<aggregation_request const> requests,
-  cudf::detail::result_cache* sparse_results,
-  SetType& global_set,
-  bool skip_rows_with_nulls,
-  rmm::cuda_stream_view stream)
-{
-  // GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy
-  auto constexpr shared_set_capacity =
-    static_cast<cudf::size_type>(static_cast<double>(GROUPBY_SHM_MAX_ELEMENTS) * 1.43);
-  using extent_type         = cuco::extent<cudf::size_type, shared_set_capacity>;
-  using shared_set_type     = cuco::static_set<cudf::size_type,
-                                               extent_type,
-                                               cuda::thread_scope_block,
-                                               typename SetType::key_equal,
-                                               probing_scheme_t,
-                                               cuco::cuda_allocator<char>,
-                                               cuco::storage<GROUPBY_WINDOW_SIZE>>;
-  using shared_set_ref_type = typename shared_set_type::ref_type<>;
-  auto constexpr window_extent = cuco::make_window_extent<shared_set_ref_type>(extent_type{});
-
-  auto const num_input_rows = keys.num_rows();
-
-  auto row_bitmask =
-    skip_rows_with_nulls
-      ? cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first
-      : rmm::device_buffer{};
-
-  auto global_set_ref  = global_set.ref(cuco::op::insert_and_find);
-  auto const grid_size = max_occupancy_grid_size(
-    compute_mapping_indices<shared_set_ref_type, decltype(global_set_ref), decltype(window_extent)>,
-    num_input_rows);
-  // 'local_mapping_index' maps from the global row index of the input table to the row index of
-  // the local pre-aggregate table
-  rmm::device_uvector<cudf::size_type> local_mapping_index(num_input_rows, stream);
-  // 'global_mapping_index' maps from the local pre-aggregate table to the row index of
-  // global aggregate table
-  rmm::device_uvector<cudf::size_type> global_mapping_index(grid_size * GROUPBY_SHM_MAX_ELEMENTS,
-                                                            stream);
-  rmm::device_uvector<cudf::size_type> block_cardinality(grid_size, stream);
-  rmm::device_scalar<bool> direct_aggregations(false, stream);
-  compute_mapping_indices<shared_set_ref_type>
-    <<<grid_size, GROUPBY_BLOCK_SIZE, 0, stream.value()>>>(global_set_ref,
-                                                           num_input_rows,
-                                                           window_extent,
-                                                           static_cast<bitmask_type*>(row_bitmask.data()),
-                                                           skip_rows_with_nulls,
-                                                           local_mapping_index.data(),
-                                                           global_mapping_index.data(),
-                                                           block_cardinality.data(),
-                                                           direct_aggregations.data());
-  stream.synchronize();
-
-  // 'populated_keys' contains inserted row_indices (keys) of global hash set
-  rmm::device_uvector<cudf::size_type> populated_keys(keys.num_rows(), stream);
-
-  // flatten the aggs to a table that can be operated on by aggregate_row
-  auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests);
-  auto const d_agg_kinds                         = cudf::detail::make_device_uvector_async(
-    agg_kinds, stream, rmm::mr::get_current_device_resource());
-  // make table that will hold sparse results
-  cudf::table sparse_table = create_sparse_results_table(flattened_values,
-                                                         d_agg_kinds.data(),
-                                                         agg_kinds,
-                                                         direct_aggregations.value(stream),
-                                                         global_set,
-                                                         populated_keys,
-                                                         stream);
-  // prepare to launch kernel to do the actual aggregation
-  auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream);
-  auto d_values       = table_device_view::create(flattened_values, stream);
-
-  compute_aggregations(grid_size,
-                       num_input_rows,
-                       static_cast<bitmask_type*>(row_bitmask.data()),
-                       skip_rows_with_nulls,
-                       local_mapping_index.data(),
-                       global_mapping_index.data(),
-                       block_cardinality.data(),
-                       *d_values,
-                       *d_sparse_table,
-                       d_agg_kinds.data(),
-                       stream);
-
-  if (direct_aggregations.value(stream)) {
-    auto const stride = GROUPBY_BLOCK_SIZE * grid_size;
-    thrust::for_each_n(rmm::exec_policy(stream),
-                       thrust::make_counting_iterator(0),
-                       keys.num_rows(),
-                       compute_direct_aggregates{global_set_ref,
-                                                 *d_values,
-                                                 *d_sparse_table,
-                                                 d_agg_kinds.data(),
-                                                 block_cardinality.data(),
-                                                 stride,
-                                                 static_cast<bitmask_type*>(row_bitmask.data()),
-                                                 skip_rows_with_nulls});
-    extract_populated_keys(global_set, populated_keys, stream);
-  }
-
-  // Add results back to sparse_results cache
-  auto sparse_result_cols = sparse_table.release();
-  for (size_t i = 0; i < aggs.size(); i++) {
-    // Note that the cache will make a copy of this temporary aggregation
-    sparse_results->add_result(
-      flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i]));
-  }
-
-  return populated_keys;
-}
-
-}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp
index 848ace94ff9..6cbea9fcd3c 100644
--- a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp
+++ b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp
@@ -13,7 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #pragma once
 
 #include
@@ -25,10 +24,7 @@
 #include
 #include
 
-namespace cudf {
-namespace groupby {
-namespace detail {
-namespace hash {
+namespace cudf::groupby::detail::hash {
 /**
  * @brief Computes all aggregations from `requests` that require a single pass
  * over the data and stores the results in `sparse_results`
  */
@@ -39,11 +35,6 @@ rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
   cudf::host_span<aggregation_request const> requests,
   cudf::detail::result_cache* sparse_results,
   SetType& global_set,
-  bool keys_have_nulls,
-  null_policy include_null_keys,
+  bool skip_rows_with_nulls,
   rmm::cuda_stream_view stream);
-
-}  // namespace hash
-}  // namespace detail
-}  // namespace groupby
-}  // namespace cudf
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu b/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu
deleted file mode 100644
index e566c2c5d27..00000000000
--- a/cpp/src/groupby/hash/compute_single_pass_aggs_null.cu
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "compute_single_pass_aggs.cuh"
-#include "helpers.cuh"
-
-namespace cudf {
-namespace groupby {
-namespace detail {
-namespace hash {
-
-using global_set_t = cuco::static_set<cudf::size_type,
-                                      cuco::extent<int64_t>,
-                                      cuda::thread_scope_device,
-                                      nullable_row_comparator_t,
-                                      probing_scheme_t,
-                                      cudf::detail::cuco_allocator<char>,
-                                      cuco::storage<GROUPBY_WINDOW_SIZE>>;
-
-template void extract_populated_keys(
-  global_set_t const& key_set,
-  rmm::device_uvector<cudf::size_type>& populated_keys,
-  rmm::cuda_stream_view stream);
-
-template auto create_sparse_results_table(
-  cudf::table_view const& flattened_values,
-  cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> aggs,
-  bool direct_aggregations,
-  global_set_t const& global_set,
-  rmm::device_uvector<cudf::size_type>& populated_keys,
-  rmm::cuda_stream_view stream);
-
-template rmm::device_uvector<cudf::size_type> compute_single_pass_aggs(
-  cudf::table_view const& keys,
-  cudf::host_span<aggregation_request const> requests,
-  cudf::detail::result_cache* sparse_results,
-  global_set_t& global_set,
-  bool skip_rows_with_nulls,
-  rmm::cuda_stream_view stream);
-
-}  // namespace hash
-}  // namespace detail
-}  // namespace groupby
-}  // namespace cudf
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index 62434bf5fd2..b307b8a8d1f 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "compute_groupby.cuh"
+#include "compute_groupby.hpp"
 #include "groupby/common/utils.hpp"
 #include "helpers.cuh"
 
diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu
index 760926afa13..a416e2124ce 100644
--- a/cpp/src/groupby/hash/sparse_to_dense_results.cu
+++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,11 +13,55 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-#include "helpers.cuh"
-#include "sparse_to_dense_results.cuh"
+#include "hash_compound_agg_finalizer.cuh"
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
 
 namespace cudf::groupby::detail::hash {
+/**
+ * @brief Gather sparse results into dense using `gather_map` and add to
+ * `dense_cache`
+ *
+ * @see groupby_null_templated()
+ */
+template <typename SetType>
+void sparse_to_dense_results(table_view const& keys,
+                             host_span<aggregation_request const> requests,
+                             cudf::detail::result_cache* sparse_results,
+                             cudf::detail::result_cache* dense_results,
+                             device_span<size_type const> gather_map,
+                             SetType set,
+                             bool skip_key_rows_with_nulls,
+                             rmm::cuda_stream_view stream,
+                             rmm::device_async_resource_ref mr)
+{
+  auto row_bitmask =
+    cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first;
+  bitmask_type const* row_bitmask_ptr =
+    skip_key_rows_with_nulls ? static_cast<bitmask_type*>(row_bitmask.data()) : nullptr;
+
+  for (auto const& request : requests) {
+    auto const& agg_v = request.aggregations;
+    auto const& col   = request.values;
+
+    // Given an aggregation, this will get the result from sparse_results and
+    // convert and return dense, compacted result
+    auto finalizer = hash_compound_agg_finalizer(
+      col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr);
+    for (auto&& agg : agg_v) {
+      agg->finalize(finalizer);
+    }
+  }
+}
 
 template void sparse_to_dense_results(table_view const& keys,
                                       host_span<aggregation_request const> requests,
@@ -29,4 +73,15 @@ template void sparse_to_dense_results(table_view const& keys,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr);
 
+template void sparse_to_dense_results(
+  table_view const& keys,
+  host_span<aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  cudf::detail::result_cache* dense_results,
+  device_span<size_type const> gather_map,
+  nullable_hash_set_ref_t set,
+  bool skip_key_rows_with_nulls,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+
 }  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cuh b/cpp/src/groupby/hash/sparse_to_dense_results.hpp
similarity index 57%
rename from cpp/src/groupby/hash/sparse_to_dense_results.cuh
rename to cpp/src/groupby/hash/sparse_to_dense_results.hpp
index b89fc308e6e..bfdc42953ad 100644
--- a/cpp/src/groupby/hash/sparse_to_dense_results.cuh
+++ b/cpp/src/groupby/hash/sparse_to_dense_results.hpp
@@ -15,18 +15,16 @@
  */
 #pragma once
 
-#include "compute_single_pass_aggs.cuh"
-#include "hash_compound_agg_finalizer.cuh"
-#include "var_hash_functor.cuh"
-
-#include
 #include
-#include
-#include
-#include
+#include
+#include
+#include
+#include
 
-namespace cudf::groupby::detail::hash {
+#include
+#include
 
+namespace cudf::groupby::detail::hash {
 /**
  * @brief Gather sparse results into dense using `gather_map` and add to
  * `dense_cache`
  *
  * @see groupby_null_templated()
  */
 template <typename SetType>
 void sparse_to_dense_results(table_view const& keys,
                              host_span<aggregation_request const> requests,
                              cudf::detail::result_cache* sparse_results,
                              cudf::detail::result_cache* dense_results,
                              device_span<size_type const> gather_map,
                              SetType set,
                              bool skip_key_rows_with_nulls,
                              rmm::cuda_stream_view stream,
-                             rmm::device_async_resource_ref mr)
-{
-  auto row_bitmask =
-    cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first;
-  bitmask_type const* row_bitmask_ptr =
-    skip_key_rows_with_nulls ? static_cast<bitmask_type*>(row_bitmask.data()) : nullptr;
-
-  for (auto const& request : requests) {
-    auto const& agg_v = request.aggregations;
-    auto const& col   = request.values;
-
-    // Given an aggregation, this will get the result from sparse_results and
-    // convert and return dense, compacted result
-    auto finalizer = hash_compound_agg_finalizer(
-      col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr);
-    for (auto&& agg : agg_v) {
-      agg->finalize(finalizer);
-    }
-  }
-}
-
+                             rmm::device_async_resource_ref mr);
 }  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/sparse_to_dense_results_null.cu b/cpp/src/groupby/hash/sparse_to_dense_results_null.cu
deleted file mode 100644
index b6820f7f6db..00000000000
--- a/cpp/src/groupby/hash/sparse_to_dense_results_null.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "helpers.cuh"
-#include "sparse_to_dense_results.cuh"
-
-namespace cudf::groupby::detail::hash {
-
-template void sparse_to_dense_results(
-  table_view const& keys,
-  host_span<aggregation_request const> requests,
-  cudf::detail::result_cache* sparse_results,
-  cudf::detail::result_cache* dense_results,
-  device_span<size_type const> gather_map,
-  nullable_hash_set_ref_t set,
-  bool skip_key_rows_with_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr);
-
-}  // namespace cudf::groupby::detail::hash
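
The refactor above relies on one C++ mechanism throughout: each function template is
defined in exactly one translation unit and explicitly instantiated there for every
supported type, so the per-variant *_null.cu files can be deleted and callers only
see a declaration header. The standalone sketch below illustrates that pattern; it
is not cudf code, and every name in it (count_distinct, exact_eq, mod10_eq) is
hypothetical.

// count_distinct.hpp -- declaration only; this is all that callers include.
template <typename Comparator>
int count_distinct(int const* begin, int const* end, Comparator eq);

// count_distinct.cpp -- the definition lives here, followed by explicit
// instantiations for each comparator the library supports.
template <typename Comparator>
int count_distinct(int const* begin, int const* end, Comparator eq)
{
  int n = 0;
  for (auto it = begin; it != end; ++it) {
    // Count *it only if no earlier element compares equal to it.
    bool seen = false;
    for (auto jt = begin; jt != it; ++jt) {
      if (eq(*jt, *it)) {
        seen = true;
        break;
      }
    }
    if (!seen) { ++n; }
  }
  return n;
}

struct exact_eq {
  bool operator()(int a, int b) const { return a == b; }
};
struct mod10_eq {
  bool operator()(int a, int b) const { return a % 10 == b % 10; }
};

// One explicit instantiation per comparator, mirroring how this patch
// instantiates compute_groupby, compute_single_pass_aggs, and
// sparse_to_dense_results for both row_comparator_t and
// nullable_row_comparator_t at the bottom of the corresponding .cu file.
template int count_distinct<exact_eq>(int const*, int const*, exact_eq);
template int count_distinct<mod10_eq>(int const*, int const*, mod10_eq);

Because the instantiations sit next to the definition, a translation unit that
includes only count_distinct.hpp still links against both variants -- which is why
the CMakeLists.txt hunk can drop compute_groupby_null.cu,
compute_single_pass_aggs_null.cu, and sparse_to_dense_results_null.cu.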