From 8ebf0d40669014e42b9c9079cdc1cc3ab5c4f7a8 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 17 Oct 2024 18:27:58 -0700 Subject: [PATCH] Add device aggregators used by shared memory groupby (#17031) This work is part of splitting the original bulk shared memory groupby PR #16619. It introduces two device-side element aggregators: - `shmem_element_aggregator`: aggregates data from global memory sources to shared memory targets, - `gmem_element_aggregator`: aggregates from shared memory sources to global memory targets. These two aggregators are similar to the `elementwise_aggregator` functionality. Follow-up work is tracked via #17032. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17031 --- .../detail/aggregation/device_aggregators.cuh | 63 ++-- .../groupby/hash/global_memory_aggregator.cuh | 277 ++++++++++++++++++ .../groupby/hash/shared_memory_aggregator.cuh | 263 +++++++++++++++++ 3 files changed, 570 insertions(+), 33 deletions(-) create mode 100644 cpp/src/groupby/hash/global_memory_aggregator.cuh create mode 100644 cpp/src/groupby/hash/shared_memory_aggregator.cuh diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh index 10be5e1d36f..204eee49a2a 100644 --- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh +++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once #include @@ -29,12 +28,31 @@ #include namespace cudf::detail { +/// Checks if an aggregation kind needs to operate on the underlying storage type +template +__device__ constexpr bool uses_underlying_type() +{ + return k == aggregation::MIN or k == aggregation::MAX or k == aggregation::SUM; +} + +/// Gets the underlying target type for the given source type and aggregation kind +template +using underlying_target_t = + cuda::std::conditional_t(), + cudf::device_storage_type_t>, + cudf::detail::target_type_t>; + +/// Gets the underlying source type for the given source type and aggregation kind +template +using underlying_source_t = + cuda::std::conditional_t(), cudf::device_storage_type_t, Source>; + template struct update_target_element { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept + __device__ void operator()(mutable_column_device_view, + size_type, + column_device_view, + size_type) const noexcept { CUDF_UNREACHABLE("Invalid source type and aggregation combination."); } @@ -51,8 +69,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; cudf::detail::atomic_min(&target.element(target_index), static_cast(source.element(source_index))); @@ -72,8 +88,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; using DeviceTarget = device_storage_type_t; using DeviceSource = device_storage_type_t; @@ -96,8 +110,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; cudf::detail::atomic_max(&target.element(target_index), static_cast(source.element(source_index))); @@ -117,8 +129,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; using DeviceTarget = device_storage_type_t; using DeviceSource = device_storage_type_t; @@ -141,8 +151,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; cudf::detail::atomic_add(&target.element(target_index), static_cast(source.element(source_index))); @@ -162,8 +170,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; using DeviceTarget = device_storage_type_t; using DeviceSource = device_storage_type_t; @@ -197,10 +203,10 @@ struct update_target_from_dictionary { template ()>* = nullptr> - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept + __device__ void operator()(mutable_column_device_view, + size_type, + column_device_view, + size_type) const noexcept { } }; @@ -227,8 +233,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - dispatch_type_and_aggregation( source.child(cudf::dictionary_column_view::keys_column_index).type(), k, @@ -249,8 +253,6 @@ struct update_target_element; auto value = static_cast(source.element(source_index)); cudf::detail::atomic_add(&target.element(target_index), value * value); @@ -267,8 +269,6 @@ struct update_target_element; cudf::detail::atomic_mul(&target.element(target_index), static_cast(source.element(source_index))); @@ -286,8 +286,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; cudf::detail::atomic_add(&target.element(target_index), Target{1}); @@ -323,8 +321,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; auto old = cudf::detail::atomic_cas( &target.element(target_index), ARGMAX_SENTINEL, source_index); @@ -349,8 +345,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; auto old = cudf::detail::atomic_cas( &target.element(target_index), ARGMIN_SENTINEL, source_index); @@ -376,6 +370,9 @@ struct elementwise_aggregator { column_device_view source, size_type source_index) const noexcept { + if constexpr (k != cudf::aggregation::COUNT_ALL) { + if (source.is_null(source_index)) { return; } + } update_target_element{}(target, target_index, source, source_index); } }; diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh new file mode 100644 index 00000000000..50e89c727ff --- /dev/null +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +struct update_target_element_gmem { + __device__ void operator()(cudf::mutable_column_device_view, + cudf::size_type, + cudf::column_device_view, + cuda::std::byte*, + cudf::size_type) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MIN, + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using DeviceType = cudf::detail::underlying_target_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MAX, + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using DeviceType = cudf::detail::underlying_target_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::SUM, + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_timestamp()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using DeviceType = cudf::detail::underlying_target_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +// The shared memory will already have it squared +template +struct update_target_element_gmem< + Source, + cudf::aggregation::SUM_OF_SQUARES, + cuda::std::enable_if_t()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + Target value = static_cast(source_casted[source_index]); + + cudf::detail::atomic_add(&target.element(target_index), value); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::PRODUCT, + cuda::std::enable_if_t()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_mul(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +// Assuming that the target column of COUNT_VALID, COUNT_ALL would be using fixed_width column and +// non-fixed point column +template +struct update_target_element_gmem< + Source, + cudf::aggregation::COUNT_VALID, + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + // It is assumed the output for COUNT_VALID is initialized to be all valid + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::COUNT_ALL, + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + // It is assumed the output for COUNT_ALL is initialized to be all valid + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::ARGMAX, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + auto source_argmax_index = source_casted[source_index]; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), cudf::detail::ARGMAX_SENTINEL, source_argmax_index); + if (old != cudf::detail::ARGMAX_SENTINEL) { + while (source_column.element(source_argmax_index) > + source_column.element(old)) { + old = + cudf::detail::atomic_cas(&target.element(target_index), old, source_argmax_index); + } + } + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; +template +struct update_target_element_gmem< + Source, + cudf::aggregation::ARGMIN, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + auto source_argmin_index = source_casted[source_index]; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), cudf::detail::ARGMIN_SENTINEL, source_argmin_index); + if (old != cudf::detail::ARGMIN_SENTINEL) { + while (source_column.element(source_argmin_index) < + source_column.element(old)) { + old = + cudf::detail::atomic_cas(&target.element(target_index), old, source_argmin_index); + } + } + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +/** + * @brief A functor that updates a single element in the target column stored in global memory by + * applying an aggregation operation to a corresponding element from a source column in shared + * memory. + * + * This functor can NOT be used for dictionary columns. + * + * This is a redundant copy replicating the behavior of `elementwise_aggregator` from + * `cudf/detail/aggregation/device_aggregators.cuh`. The key difference is that this functor accepts + * a pointer to raw bytes as the source, as `column_device_view` cannot yet be constructed from + * shared memory. + */ +struct gmem_element_aggregator { + template + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + bool* source_mask, + cudf::size_type source_index) const noexcept + { + // Early exit for all aggregation kinds since shared memory aggregation of + // `COUNT_ALL` is always valid + if (!source_mask[source_index]) { return; } + + update_target_element_gmem{}( + target, target_index, source_column, source, source_index); + } +}; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh new file mode 100644 index 00000000000..9cbeeb34b86 --- /dev/null +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +struct update_target_element_shmem { + __device__ void operator()( + cuda::std::byte*, bool*, cudf::size_type, cudf::column_device_view, cudf::size_type) const + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MIN, + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_min(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MAX, + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_max(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::SUM, + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_timestamp()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::SUM_OF_SQUARES, + cuda::std::enable_if_t()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto value = static_cast(source.element(source_index)); + cudf::detail::atomic_add(&target_casted[target_index], value * value); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::PRODUCT, + cuda::std::enable_if_t()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_mul(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::COUNT_VALID, + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + // The nullability was checked prior to this call in the `shmem_element_aggregator` functor + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], Target{1}); + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::COUNT_ALL, + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], Target{1}); + + // Assumes target is already set to be valid + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::ARGMAX, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto old = cudf::detail::atomic_cas( + &target_casted[target_index], cudf::detail::ARGMAX_SENTINEL, source_index); + if (old != cudf::detail::ARGMAX_SENTINEL) { + while (source.element(source_index) > source.element(old)) { + old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); + } + } + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::ARGMIN, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto old = cudf::detail::atomic_cas( + &target_casted[target_index], cudf::detail::ARGMIN_SENTINEL, source_index); + if (old != cudf::detail::ARGMIN_SENTINEL) { + while (source.element(source_index) < source.element(old)) { + old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); + } + } + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +/** + * @brief A functor that updates a single element in the target column stored in shared memory by + * applying an aggregation operation to a corresponding element from a source column in global + * memory. + * + * This functor can NOT be used for dictionary columns. + * + * This is a redundant copy replicating the behavior of `elementwise_aggregator` from + * `cudf/detail/aggregation/device_aggregators.cuh`. The key difference is that this functor accepts + * a pointer to raw bytes as the source, as `column_device_view` cannot yet be constructed from + * shared memory. + */ +struct shmem_element_aggregator { + template + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + // Check nullability for all aggregation kinds but `COUNT_ALL` + if constexpr (k != cudf::aggregation::COUNT_ALL) { + if (source.is_null(source_index)) { return; } + } + update_target_element_shmem{}( + target, target_mask, target_index, source, source_index); + } +}; +} // namespace cudf::groupby::detail::hash