From 4e3afd26127ebd0c04b739032873d3fce01eb1b7 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 25 Nov 2024 20:00:11 -0500
Subject: [PATCH] Change binops for-each kernel to thrust::for_each_n (#17419)

Replaces the custom `for_each_kernel` in `binary_ops.cuh` with `thrust::for_each_n`

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17419
---
 cpp/src/binaryop/compiled/binary_ops.cuh  | 56 ++++-------------------
 cpp/src/binaryop/compiled/equality_ops.cu | 38 ++++++++-------
 2 files changed, 31 insertions(+), 63 deletions(-)
diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh
index 06987139188..ec63504a414 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cuh
+++ b/cpp/src/binaryop/compiled/binary_ops.cuh
@@ -244,44 +244,6 @@ struct binary_op_double_device_dispatcher {
   }
 };
 
-/**
- * @brief Simplified for_each kernel
- *
- * @param size number of elements to process.
- * @param f Functor object to call for each element.
- */
-template <typename Functor>
-CUDF_KERNEL void for_each_kernel(cudf::size_type size, Functor f)
-{
-  auto start        = cudf::detail::grid_1d::global_thread_id();
-  auto const stride = cudf::detail::grid_1d::grid_stride();
-
-#pragma unroll
-  for (auto i = start; i < size; i += stride) {
-    f(i);
-  }
-}
-
-/**
- * @brief Launches Simplified for_each kernel with maximum occupancy grid dimensions.
- *
- * @tparam Functor
- * @param stream CUDA stream used for device memory operations and kernel launches.
- * @param size number of elements to process.
- * @param f Functor object to call for each element.
- */
-template <typename Functor>
-void for_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f)
-{
-  int block_size;
-  int min_grid_size;
-  CUDF_CUDA_TRY(
-    cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, for_each_kernel<decltype(f)>));
-  auto grid = cudf::detail::grid_1d(size, block_size, 2 /* elements_per_thread */);
-  for_each_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-    size, std::forward<Functor&&>(f));
-}
-
 template <class BinaryOperator>
 void apply_binary_op(mutable_column_view& out,
                      column_view const& lhs,
@@ -298,16 +260,18 @@ void apply_binary_op(mutable_column_view& out,
   // Create binop functor instance
   if (common_dtype) {
     // Execute it on every element
-    for_each(stream,
-             out.size(),
-             binary_op_device_dispatcher<BinaryOperator>{
-               *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
+    thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                       thrust::counting_iterator<size_type>(0),
+                       out.size(),
+                       binary_op_device_dispatcher<BinaryOperator>{
+                         *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
   } else {
     // Execute it on every element
-    for_each(stream,
-             out.size(),
-             binary_op_double_device_dispatcher<BinaryOperator>{
-               *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
+    thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                       thrust::counting_iterator<size_type>(0),
+                       out.size(),
+                       binary_op_double_device_dispatcher<BinaryOperator>{
+                         *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
   }
 }
 
diff --git a/cpp/src/binaryop/compiled/equality_ops.cu b/cpp/src/binaryop/compiled/equality_ops.cu
index 041fca76494..d8c50683026 100644
--- a/cpp/src/binaryop/compiled/equality_ops.cu
+++ b/cpp/src/binaryop/compiled/equality_ops.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -34,27 +34,31 @@ void dispatch_equality_op(mutable_column_view& out,
   auto rhsd         = column_device_view::create(rhs, stream);
   if (common_dtype) {
     if (op == binary_operator::EQUAL) {
-      for_each(stream,
-               out.size(),
-               binary_op_device_dispatcher<ops::Equal>{
-                 *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
+      thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                         thrust::counting_iterator<size_type>(0),
+                         out.size(),
+                         binary_op_device_dispatcher<ops::Equal>{
+                           *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
     } else if (op == binary_operator::NOT_EQUAL) {
-      for_each(stream,
-               out.size(),
-               binary_op_device_dispatcher<ops::NotEqual>{
-                 *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
+      thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                         thrust::counting_iterator<size_type>(0),
+                         out.size(),
+                         binary_op_device_dispatcher<ops::NotEqual>{
+                           *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
     }
   } else {
     if (op == binary_operator::EQUAL) {
-      for_each(stream,
-               out.size(),
-               binary_op_double_device_dispatcher<ops::Equal>{
-                 *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
+      thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                         thrust::counting_iterator<size_type>(0),
+                         out.size(),
+                         binary_op_double_device_dispatcher<ops::Equal>{
+                           *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
     } else if (op == binary_operator::NOT_EQUAL) {
-      for_each(stream,
-               out.size(),
-               binary_op_double_device_dispatcher<ops::NotEqual>{
-                 *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
+      thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                         thrust::counting_iterator<size_type>(0),
+                         out.size(),
+                         binary_op_double_device_dispatcher<ops::NotEqual>{
+                           *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar});
     }
   }
 }