From 4e3afd26127ebd0c04b739032873d3fce01eb1b7 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 25 Nov 2024 20:00:11 -0500 Subject: [PATCH] Change binops for-each kernel to thrust::for_each_n (#17419) Replaces the custom `for_each_kernel` in `binary_ops.cuh` with `thrust::for_each_n` Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17419 --- cpp/src/binaryop/compiled/binary_ops.cuh | 56 ++++------------------- cpp/src/binaryop/compiled/equality_ops.cu | 38 ++++++++------- 2 files changed, 31 insertions(+), 63 deletions(-) diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index 06987139188..ec63504a414 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -244,44 +244,6 @@ struct binary_op_double_device_dispatcher { } }; -/** - * @brief Simplified for_each kernel - * - * @param size number of elements to process. - * @param f Functor object to call for each element. - */ -template -CUDF_KERNEL void for_each_kernel(cudf::size_type size, Functor f) -{ - auto start = cudf::detail::grid_1d::global_thread_id(); - auto const stride = cudf::detail::grid_1d::grid_stride(); - -#pragma unroll - for (auto i = start; i < size; i += stride) { - f(i); - } -} - -/** - * @brief Launches Simplified for_each kernel with maximum occupancy grid dimensions. - * - * @tparam Functor - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param size number of elements to process. - * @param f Functor object to call for each element. - */ -template -void for_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f) -{ - int block_size; - int min_grid_size; - CUDF_CUDA_TRY( - cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, for_each_kernel)); - auto grid = cudf::detail::grid_1d(size, block_size, 2 /* elements_per_thread */); - for_each_kernel<<>>( - size, std::forward(f)); -} - template void apply_binary_op(mutable_column_view& out, column_view const& lhs, @@ -298,16 +260,18 @@ void apply_binary_op(mutable_column_view& out, // Create binop functor instance if (common_dtype) { // Execute it on every element - for_each(stream, - out.size(), - binary_op_device_dispatcher{ - *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + out.size(), + binary_op_device_dispatcher{ + *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } else { // Execute it on every element - for_each(stream, - out.size(), - binary_op_double_device_dispatcher{ - *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + out.size(), + binary_op_double_device_dispatcher{ + *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } } diff --git a/cpp/src/binaryop/compiled/equality_ops.cu b/cpp/src/binaryop/compiled/equality_ops.cu index 041fca76494..d8c50683026 100644 --- a/cpp/src/binaryop/compiled/equality_ops.cu +++ b/cpp/src/binaryop/compiled/equality_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,27 +34,31 @@ void dispatch_equality_op(mutable_column_view& out, auto rhsd = column_device_view::create(rhs, stream); if (common_dtype) { if (op == binary_operator::EQUAL) { - for_each(stream, - out.size(), - binary_op_device_dispatcher{ - *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + out.size(), + binary_op_device_dispatcher{ + *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } else if (op == binary_operator::NOT_EQUAL) { - for_each(stream, - out.size(), - binary_op_device_dispatcher{ - *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + out.size(), + binary_op_device_dispatcher{ + *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } } else { if (op == binary_operator::EQUAL) { - for_each(stream, - out.size(), - binary_op_double_device_dispatcher{ - *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + out.size(), + binary_op_double_device_dispatcher{ + *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } else if (op == binary_operator::NOT_EQUAL) { - for_each(stream, - out.size(), - binary_op_double_device_dispatcher{ - *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + out.size(), + binary_op_double_device_dispatcher{ + *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } } }