From 96f2cc5262e5b6b0f50109d327857e306214b3a4 Mon Sep 17 00:00:00 2001
From: Robert Maynard
Date: Mon, 26 Aug 2024 10:21:48 -0400
Subject: [PATCH] Remove CUDA whole compilation ODR violations (#16603)

CUDA whole compilation mode requires that all kernels are only launched
from TUs that compile them. Previously libcudf would compile a subset of
kernels in separate TUs from where they are launched.

To keep compile times (and library size) as low as possible, I have
introduced a single C++ function call between the original call site and
the kernel launch. In testing, this showed negligible differences in
compile time and binary size.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16603
---
 cpp/src/join/mixed_join.cu                   | 191 ++++++++++---------
 cpp/src/join/mixed_join_kernel.cu            |  10 +-
 cpp/src/join/mixed_join_kernel.cuh           |  64 +++++--
 cpp/src/join/mixed_join_kernel.hpp           |  80 ++++++++
 cpp/src/join/mixed_join_kernel_nulls.cu      |  10 +-
 cpp/src/join/mixed_join_kernels.cuh          | 124 ------------
 cpp/src/join/mixed_join_kernels_semi.cu      |  86 +++++----
 cpp/src/join/mixed_join_kernels_semi.cuh     |  29 +--
 cpp/src/join/mixed_join_semi.cu              |  38 ++--
 cpp/src/join/mixed_join_size_kernel.cu       |  12 +-
 cpp/src/join/mixed_join_size_kernel.cuh      |  64 +++++--
 cpp/src/join/mixed_join_size_kernel.hpp      |  85 +++++++++
 cpp/src/join/mixed_join_size_kernel_nulls.cu |  12 +-
 13 files changed, 472 insertions(+), 333 deletions(-)
 create mode 100644 cpp/src/join/mixed_join_kernel.hpp
 delete mode 100644 cpp/src/join/mixed_join_kernels.cuh
 create mode 100644 cpp/src/join/mixed_join_size_kernel.hpp

diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu
index 48b94c777de..eb12065c6a9 100644
--- a/cpp/src/join/mixed_join.cu
+++ b/cpp/src/join/mixed_join.cu
@@ -16,7 +16,8 @@

 #include "join_common_utils.cuh"
 #include "join_common_utils.hpp"
-#include "mixed_join_kernels.cuh"
+#include "mixed_join_kernel.hpp"
+#include "mixed_join_size_kernel.hpp"

 #include
 #include
@@ -178,9 +179,6 @@ mixed_join(
     join_size = output_size_data->first;
     matches_per_row_span = output_size_data->second;
   } else {
-    // Allocate storage for the counter used to get the size of the join output
-    rmm::device_scalar size(0, stream, mr);
-
     matches_per_row = rmm::device_uvector{static_cast(outer_num_rows), stream, mr};
     // Note that the view goes out of scope after this else statement, but the
@@ -190,37 +188,38 @@ mixed_join(
     matches_per_row_span = cudf::device_span{
       matches_per_row->begin(), static_cast(outer_num_rows)};
     if (has_nulls) {
-      compute_mixed_join_output_size
-        <<>>(
-          *left_conditional_view,
-          *right_conditional_view,
-          *probe_view,
-          *build_view,
-          hash_probe,
-          equality_probe,
-          kernel_join_type,
-          hash_table_view,
-          parser.device_expression_data,
-          swap_tables,
-          size.data(),
-          mutable_matches_per_row_span);
+      join_size = launch_compute_mixed_join_output_size(*left_conditional_view,
+                                                        *right_conditional_view,
+                                                        *probe_view,
+                                                        *build_view,
+                                                        hash_probe,
+                                                        equality_probe,
+                                                        kernel_join_type,
+                                                        hash_table_view,
+                                                        parser.device_expression_data,
+                                                        swap_tables,
+                                                        mutable_matches_per_row_span,
+                                                        config,
+                                                        shmem_size_per_block,
+                                                        stream,
+                                                        mr);
     } else {
-      compute_mixed_join_output_size
-        <<>>(
-          *left_conditional_view,
-          *right_conditional_view,
-          *probe_view,
-          *build_view,
-          hash_probe,
-          equality_probe,
-          kernel_join_type,
-          hash_table_view,
-          parser.device_expression_data,
-          swap_tables,
-          size.data(),
-
mutable_matches_per_row_span); + join_size = launch_compute_mixed_join_output_size(*left_conditional_view, + *right_conditional_view, + *probe_view, + *build_view, + hash_probe, + equality_probe, + kernel_join_type, + hash_table_view, + parser.device_expression_data, + swap_tables, + mutable_matches_per_row_span, + config, + shmem_size_per_block, + stream, + mr); } - join_size = size.value(stream); } // The initial early exit clauses guarantee that we will not reach this point @@ -249,37 +248,39 @@ mixed_join( auto const& join_output_r = right_indices->data(); if (has_nulls) { - mixed_join - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - kernel_join_type, - hash_table_view, - join_output_l, - join_output_r, - parser.device_expression_data, - join_result_offsets.data(), - swap_tables); + launch_mixed_join(*left_conditional_view, + *right_conditional_view, + *probe_view, + *build_view, + hash_probe, + equality_probe, + kernel_join_type, + hash_table_view, + join_output_l, + join_output_r, + parser.device_expression_data, + join_result_offsets.data(), + swap_tables, + config, + shmem_size_per_block, + stream); } else { - mixed_join - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - kernel_join_type, - hash_table_view, - join_output_l, - join_output_r, - parser.device_expression_data, - join_result_offsets.data(), - swap_tables); + launch_mixed_join(*left_conditional_view, + *right_conditional_view, + *probe_view, + *build_view, + hash_probe, + equality_probe, + kernel_join_type, + hash_table_view, + join_output_l, + join_output_r, + parser.device_expression_data, + join_result_offsets.data(), + swap_tables, + config, + shmem_size_per_block, + stream); } auto join_indices = std::pair(std::move(left_indices), std::move(right_indices)); @@ -423,9 +424,6 @@ compute_mixed_join_output_size(table_view const& left_equality, detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; - // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); - auto const preprocessed_probe = experimental::row::equality::preprocessed_table::create(probe, stream); auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; @@ -436,39 +434,42 @@ compute_mixed_join_output_size(table_view const& left_equality, // Determine number of output rows without actually building the output to simply // find what the size of the output will be. 
+ std::size_t size = 0; if (has_nulls) { - compute_mixed_join_output_size - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - join_type, - hash_table_view, - parser.device_expression_data, - swap_tables, - size.data(), - matches_per_row_span); + size = launch_compute_mixed_join_output_size(*left_conditional_view, + *right_conditional_view, + *probe_view, + *build_view, + hash_probe, + equality_probe, + join_type, + hash_table_view, + parser.device_expression_data, + swap_tables, + matches_per_row_span, + config, + shmem_size_per_block, + stream, + mr); } else { - compute_mixed_join_output_size - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - join_type, - hash_table_view, - parser.device_expression_data, - swap_tables, - size.data(), - matches_per_row_span); + size = launch_compute_mixed_join_output_size(*left_conditional_view, + *right_conditional_view, + *probe_view, + *build_view, + hash_probe, + equality_probe, + join_type, + hash_table_view, + parser.device_expression_data, + swap_tables, + matches_per_row_span, + config, + shmem_size_per_block, + stream, + mr); } - return {size.value(stream), std::move(matches_per_row)}; + return {size, std::move(matches_per_row)}; } } // namespace detail diff --git a/cpp/src/join/mixed_join_kernel.cu b/cpp/src/join/mixed_join_kernel.cu index 61cfa168b03..cd4016837cc 100644 --- a/cpp/src/join/mixed_join_kernel.cu +++ b/cpp/src/join/mixed_join_kernel.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,11 +15,12 @@ */ #include "mixed_join_kernel.cuh" +#include "mixed_join_kernel.hpp" namespace cudf { namespace detail { -template __global__ void mixed_join( +template void launch_mixed_join( table_device_view left_table, table_device_view right_table, table_device_view probe, @@ -32,7 +33,10 @@ template __global__ void mixed_join( size_type* join_output_r, cudf::ast::detail::expression_device_view device_expression_data, cudf::size_type const* join_result_offsets, - bool const swap_tables); + bool const swap_tables, + detail::grid_1d const config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream); } // namespace detail diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh index ea59f23c77f..9d011d43de6 100644 --- a/cpp/src/join/mixed_join_kernel.cuh +++ b/cpp/src/join/mixed_join_kernel.cuh @@ -19,6 +19,7 @@ #include "join_common_utils.cuh" #include "join_common_utils.hpp" #include "mixed_join_common_utils.cuh" +#include "mixed_join_kernel.hpp" #include #include @@ -39,20 +40,20 @@ namespace cg = cooperative_groups; #pragma GCC diagnostic ignored "-Wattributes" template -CUDF_HIDDEN __launch_bounds__(block_size) __global__ - void mixed_join(table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - size_type* join_output_l, - size_type* join_output_r, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables) +CUDF_KERNEL void __launch_bounds__(block_size) + mixed_join(table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + size_type* join_output_l, + size_type* join_output_r, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const* join_result_offsets, + bool const swap_tables) { // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is @@ -111,6 +112,41 @@ CUDF_HIDDEN __launch_bounds__(block_size) __global__ } } +template +void launch_mixed_join(table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + size_type* join_output_l, + size_type* join_output_r, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const* join_result_offsets, + bool const swap_tables, + detail::grid_1d const config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream) +{ + mixed_join + <<>>( + left_table, + right_table, + probe, + build, + hash_probe, + equality_probe, + join_type, + hash_table_view, + join_output_l, + join_output_r, + device_expression_data, + join_result_offsets, + swap_tables); +} + } // namespace detail } // namespace cudf diff --git a/cpp/src/join/mixed_join_kernel.hpp b/cpp/src/join/mixed_join_kernel.hpp new file mode 100644 index 00000000000..cc92e9d8ba4 --- /dev/null +++ b/cpp/src/join/mixed_join_kernel.hpp @@ -0,0 +1,80 @@ +/* + * 
Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "join/join_common_utils.hpp" +#include "join/mixed_join_common_utils.cuh" + +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace detail { + +/** + * @brief Performs a join using the combination of a hash lookup to identify + * equal rows between one pair of tables and the evaluation of an expression + * containing an arbitrary expression. + * + * This method probes the hash table with each row in the probe table using a + * custom equality comparator that also checks that the conditional expression + * evaluates to true between the left/right tables when a match is found + * between probe and build rows. + * + * @tparam block_size The number of threads per block for this kernel + * @tparam has_nulls Whether or not the inputs may contain nulls. + * + * @param[in] left_table The left table + * @param[in] right_table The right table + * @param[in] probe The table with which to probe the hash table for matches. + * @param[in] build The table with which the hash table was built. + * @param[in] hash_probe The hasher used for the probe table. + * @param[in] equality_probe The equality comparator used when probing the hash table. + * @param[in] join_type The type of join to be performed + * @param[in] hash_table_view The hash table built from `build`. + * @param[out] join_output_l The left result of the join operation + * @param[out] join_output_r The right result of the join operation + * @param[in] device_expression_data Container of device data required to evaluate the desired + * expression. + * @param[in] join_result_offsets The starting indices in join_output[l|r] + * where the matches for each row begin. Equivalent to a prefix sum of + * matches_per_row. + * @param[in] swap_tables If true, the kernel was launched with one thread per right row and + * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. + */ +template +void launch_mixed_join(table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + size_type* join_output_l, + size_type* join_output_r, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const* join_result_offsets, + bool const swap_tables, + detail::grid_1d const config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream); + +} // namespace detail + +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/join/mixed_join_kernel_nulls.cu b/cpp/src/join/mixed_join_kernel_nulls.cu index 518f8ed8555..185aa133f2d 100644 --- a/cpp/src/join/mixed_join_kernel_nulls.cu +++ b/cpp/src/join/mixed_join_kernel_nulls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,11 +15,12 @@ */ #include "mixed_join_kernel.cuh" +#include "mixed_join_kernel.hpp" namespace cudf { namespace detail { -template __global__ void mixed_join( +template void launch_mixed_join( table_device_view left_table, table_device_view right_table, table_device_view probe, @@ -32,7 +33,10 @@ template __global__ void mixed_join( size_type* join_output_r, cudf::ast::detail::expression_device_view device_expression_data, cudf::size_type const* join_result_offsets, - bool const swap_tables); + bool const swap_tables, + detail::grid_1d const config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream); } // namespace detail diff --git a/cpp/src/join/mixed_join_kernels.cuh b/cpp/src/join/mixed_join_kernels.cuh deleted file mode 100644 index 037c02666d4..00000000000 --- a/cpp/src/join/mixed_join_kernels.cuh +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "join/join_common_utils.hpp" -#include "join/mixed_join_common_utils.cuh" - -#include -#include -#include - -namespace cudf { -namespace detail { - -/** - * @brief Computes the output size of joining the left table to the right table. - * - * This method probes the hash table with each row in the probe table using a - * custom equality comparator that also checks that the conditional expression - * evaluates to true between the left/right tables when a match is found - * between probe and build rows. - * - * @tparam block_size The number of threads per block for this kernel - * @tparam has_nulls Whether or not the inputs may contain nulls. - * - * @param[in] left_table The left table - * @param[in] right_table The right table - * @param[in] probe The table with which to probe the hash table for matches. - * @param[in] build The table with which the hash table was built. - * @param[in] hash_probe The hasher used for the probe table. - * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] join_type The type of join to be performed - * @param[in] hash_table_view The hash table built from `build`. - * @param[in] device_expression_data Container of device data required to evaluate the desired - * expression. - * @param[in] swap_tables If true, the kernel was launched with one thread per right row and - * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. - * @param[out] output_size The resulting output size - * @param[out] matches_per_row The number of matches in one pair of - * equality/conditional tables for each row in the other pair of tables. If - * swap_tables is true, matches_per_row corresponds to the right_table, - * otherwise it corresponds to the left_table. 
Note that corresponding swap of - * left/right tables to determine which is the build table and which is the - * probe table has already happened on the host. - */ - -template -__global__ void compute_mixed_join_output_size( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row); - -/** - * @brief Performs a join using the combination of a hash lookup to identify - * equal rows between one pair of tables and the evaluation of an expression - * containing an arbitrary expression. - * - * This method probes the hash table with each row in the probe table using a - * custom equality comparator that also checks that the conditional expression - * evaluates to true between the left/right tables when a match is found - * between probe and build rows. - * - * @tparam block_size The number of threads per block for this kernel - * @tparam has_nulls Whether or not the inputs may contain nulls. - * - * @param[in] left_table The left table - * @param[in] right_table The right table - * @param[in] probe The table with which to probe the hash table for matches. - * @param[in] build The table with which the hash table was built. - * @param[in] hash_probe The hasher used for the probe table. - * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] join_type The type of join to be performed - * @param[in] hash_table_view The hash table built from `build`. - * @param[out] join_output_l The left result of the join operation - * @param[out] join_output_r The right result of the join operation - * @param[in] device_expression_data Container of device data required to evaluate the desired - * expression. - * @param[in] join_result_offsets The starting indices in join_output[l|r] - * where the matches for each row begin. Equivalent to a prefix sum of - * matches_per_row. - * @param[in] swap_tables If true, the kernel was launched with one thread per right row and - * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. - */ -template -__global__ void mixed_join(table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - size_type* join_output_l, - size_type* join_output_r, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables); - -} // namespace detail - -} // namespace cudf diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index 1f31eaa7878..7459ac3e99c 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -14,9 +14,7 @@ * limitations under the License. 
*/ -#include "join/join_common_utils.cuh" -#include "join/join_common_utils.hpp" -#include "join/mixed_join_common_utils.cuh" +#include "join/mixed_join_kernels_semi.cuh" #include #include @@ -35,16 +33,16 @@ namespace cg = cooperative_groups; #pragma GCC diagnostic ignored "-Wattributes" template -CUDF_HIDDEN __launch_bounds__(block_size) __global__ - void mixed_join_semi(table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, - cudf::device_span left_table_keep_mask, - cudf::ast::detail::expression_device_view device_expression_data) +CUDF_KERNEL void __launch_bounds__(block_size) + mixed_join_semi(table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + cudf::detail::semi_map_type::device_view hash_table_view, + cudf::device_span left_table_keep_mask, + cudf::ast::detail::expression_device_view device_expression_data) { // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is @@ -75,28 +73,46 @@ CUDF_HIDDEN __launch_bounds__(block_size) __global__ } } -template __global__ void mixed_join_semi( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, - cudf::device_span left_table_keep_mask, - cudf::ast::detail::expression_device_view device_expression_data); - -template __global__ void mixed_join_semi( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, - cudf::device_span left_table_keep_mask, - cudf::ast::detail::expression_device_view device_expression_data); +void launch_mixed_join_semi(bool has_nulls, + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + cudf::detail::semi_map_type::device_view hash_table_view, + cudf::device_span left_table_keep_mask, + cudf::ast::detail::expression_device_view device_expression_data, + detail::grid_1d const config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream) +{ + if (has_nulls) { + mixed_join_semi + <<>>( + left_table, + right_table, + probe, + build, + hash_probe, + equality_probe, + hash_table_view, + left_table_keep_mask, + device_expression_data); + } else { + mixed_join_semi + <<>>( + left_table, + right_table, + probe, + build, + hash_probe, + equality_probe, + hash_table_view, + left_table_keep_mask, + device_expression_data); + } +} } // namespace detail - } // namespace cudf diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh index 4ea404d451c..43714ffb36a 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cuh +++ b/cpp/src/join/mixed_join_kernels_semi.cuh @@ -16,8 +16,9 @@ #pragma once -#include "join/join_common_utils.hpp" -#include "join/mixed_join_common_utils.cuh" +#include "join_common_utils.cuh" +#include "join_common_utils.hpp" +#include 
"mixed_join_common_utils.cuh" #include #include @@ -39,6 +40,7 @@ namespace detail { * @tparam block_size The number of threads per block for this kernel * @tparam has_nulls Whether or not the inputs may contain nulls. * + * @param[in] has_nulls If the input has nulls * @param[in] left_table The left table * @param[in] right_table The right table * @param[in] probe The table with which to probe the hash table for matches. @@ -51,16 +53,19 @@ namespace detail { * @param[in] device_expression_data Container of device data required to evaluate the desired * expression. */ -template -__global__ void mixed_join_semi(table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, - cudf::device_span left_table_keep_mask, - cudf::ast::detail::expression_device_view device_expression_data); +void launch_mixed_join_semi(bool has_nulls, + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + cudf::detail::semi_map_type::device_view hash_table_view, + cudf::device_span left_table_keep_mask, + cudf::ast::detail::expression_device_view device_expression_data, + detail::grid_1d const config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream); } // namespace detail diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index 3e4188a0fbd..a79aa6673d6 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -227,31 +227,19 @@ std::unique_ptr> mixed_join_semi( // Vector used to indicate indices from left/probe table which are present in output auto left_table_keep_mask = rmm::device_uvector(probe.num_rows(), stream); - if (has_nulls) { - mixed_join_semi - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - hash_table_view, - cudf::device_span(left_table_keep_mask), - parser.device_expression_data); - } else { - mixed_join_semi - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - hash_table_view, - cudf::device_span(left_table_keep_mask), - parser.device_expression_data); - } + launch_mixed_join_semi(has_nulls, + *left_conditional_view, + *right_conditional_view, + *probe_view, + *build_view, + hash_probe, + equality_probe, + hash_table_view, + cudf::device_span(left_table_keep_mask), + parser.device_expression_data, + config, + shmem_size_per_block, + stream); auto gather_map = std::make_unique>(probe.num_rows(), stream, mr); diff --git a/cpp/src/join/mixed_join_size_kernel.cu b/cpp/src/join/mixed_join_size_kernel.cu index 4011acb65d6..4882c8769e6 100644 --- a/cpp/src/join/mixed_join_size_kernel.cu +++ b/cpp/src/join/mixed_join_size_kernel.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,11 +15,12 @@ */ #include "mixed_join_size_kernel.cuh" +#include "mixed_join_size_kernel.hpp" namespace cudf { namespace detail { -template __global__ void compute_mixed_join_output_size( +template std::size_t launch_compute_mixed_join_output_size( table_device_view left_table, table_device_view right_table, table_device_view probe, @@ -30,8 +31,11 @@ template __global__ void compute_mixed_join_output_size matches_per_row); + cudf::device_span matches_per_row, + detail::grid_1d const config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index 00a90f8273f..a1066e32331 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -36,19 +36,19 @@ namespace cg = cooperative_groups; #pragma GCC diagnostic ignored "-Wattributes" template -CUDF_HIDDEN __launch_bounds__(block_size) __global__ void compute_mixed_join_output_size( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row) +CUDF_KERNEL void __launch_bounds__(block_size) + compute_mixed_join_output_size(table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row) { // The (required) extern storage of the shared memory array leads to // conflicting declarations between different templates. 
The easiest @@ -103,5 +103,43 @@ CUDF_HIDDEN __launch_bounds__(block_size) __global__ void compute_mixed_join_out } } +template +std::size_t launch_compute_mixed_join_output_size( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + cudf::device_span matches_per_row, + detail::grid_1d const config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + // Allocate storage for the counter used to get the size of the join output + rmm::device_scalar size(0, stream, mr); + + compute_mixed_join_output_size + <<>>( + left_table, + right_table, + probe, + build, + hash_probe, + equality_probe, + join_type, + hash_table_view, + device_expression_data, + swap_tables, + size.data(), + matches_per_row); + return size.value(stream); +} + } // namespace detail } // namespace cudf diff --git a/cpp/src/join/mixed_join_size_kernel.hpp b/cpp/src/join/mixed_join_size_kernel.hpp new file mode 100644 index 00000000000..b09805c14dc --- /dev/null +++ b/cpp/src/join/mixed_join_size_kernel.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "join_common_utils.cuh" +#include "join_common_utils.hpp" +#include "mixed_join_common_utils.cuh" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace detail { + +/** + * @brief Computes the output size of joining the left table to the right table. + * + * This method probes the hash table with each row in the probe table using a + * custom equality comparator that also checks that the conditional expression + * evaluates to true between the left/right tables when a match is found + * between probe and build rows. + * + * @tparam block_size The number of threads per block for this kernel + * @tparam has_nulls Whether or not the inputs may contain nulls. + * + * @param[in] left_table The left table + * @param[in] right_table The right table + * @param[in] probe The table with which to probe the hash table for matches. + * @param[in] build The table with which the hash table was built. + * @param[in] hash_probe The hasher used for the probe table. + * @param[in] equality_probe The equality comparator used when probing the hash table. + * @param[in] join_type The type of join to be performed + * @param[in] hash_table_view The hash table built from `build`. + * @param[in] device_expression_data Container of device data required to evaluate the desired + * expression. + * @param[in] swap_tables If true, the kernel was launched with one thread per right row and + * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. 
+ * @param[out] output_size The resulting output size + * @param[out] matches_per_row The number of matches in one pair of + * equality/conditional tables for each row in the other pair of tables. If + * swap_tables is true, matches_per_row corresponds to the right_table, + * otherwise it corresponds to the left_table. Note that corresponding swap of + * left/right tables to determine which is the build table and which is the + * probe table has already happened on the host. + */ + +template +std::size_t launch_compute_mixed_join_output_size( + cudf::table_device_view left_table, + cudf::table_device_view right_table, + cudf::table_device_view probe, + cudf::table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + cudf::device_span matches_per_row, + detail::grid_1d const config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/join/mixed_join_size_kernel_nulls.cu b/cpp/src/join/mixed_join_size_kernel_nulls.cu index 2868113bf33..11f9103da4d 100644 --- a/cpp/src/join/mixed_join_size_kernel_nulls.cu +++ b/cpp/src/join/mixed_join_size_kernel_nulls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ namespace cudf { namespace detail { -template __global__ void compute_mixed_join_output_size( +template std::size_t launch_compute_mixed_join_output_size( table_device_view left_table, table_device_view right_table, table_device_view probe, @@ -30,8 +30,10 @@ template __global__ void compute_mixed_join_output_size matches_per_row); - + cudf::device_span matches_per_row, + detail::grid_1d const config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf
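
Editor's note: the shape of the change is the same in every file touched above. The __global__ kernel keeps its definition in a .cuh, a thin host-side launch_* function defined right next to it performs the <<<...>>> launch, only that host function is declared in a plain .hpp, and the per-variant .cu files explicitly instantiate the launcher instead of the kernel. The following is a minimal, self-contained sketch of that pattern, not cudf code; the names (scale_kernel, launch_scale) and the single-file layout are invented for illustration.

// All names here are illustrative placeholders, not cudf symbols.
#include <cuda_runtime.h>

#include <cstdio>
#include <vector>

// --- would live in scale_kernel.cuh: the kernel plus its launcher ------------
template <int block_size>
__global__ void __launch_bounds__(block_size)
  scale_kernel(float* data, int n, float factor)
{
  int const i = blockIdx.x * block_size + threadIdx.x;
  if (i < n) { data[i] *= factor; }
}

// Ordinary host function wrapping the launch. Because the <<<...>>> sits next
// to the kernel definition, every TU that launches the kernel also compiles
// it, which is what CUDA whole compilation mode requires.
template <int block_size>
void launch_scale(float* data, int n, float factor, cudaStream_t stream)
{
  int const num_blocks = (n + block_size - 1) / block_size;
  scale_kernel<block_size><<<num_blocks, block_size, 0, stream>>>(data, n, factor);
}

// --- would live in scale_kernel.cu: instantiate the launcher, not the kernel -
template void launch_scale<256>(float*, int, float, cudaStream_t);

// --- call site: would include only a scale_kernel.hpp declaring launch_scale -
int main()
{
  int const n = 1024;
  std::vector<float> host(n, 1.0f);

  float* device_data = nullptr;
  cudaMalloc(&device_data, n * sizeof(float));
  cudaMemcpy(device_data, host.data(), n * sizeof(float), cudaMemcpyHostToDevice);

  // The caller never writes <<<...>>> itself; it only calls the host wrapper.
  cudaStream_t stream = nullptr;  // default stream
  launch_scale<256>(device_data, n, 2.0f, stream);

  cudaMemcpy(host.data(), device_data, n * sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(device_data);

  std::printf("first element after scaling: %f\n", host[0]);
  return 0;
}

When the wrapper also has to return a value computed on the device, as launch_compute_mixed_join_output_size does in this patch, the same idea applies: the wrapper owns the rmm::device_scalar counter, passes size.data() to the kernel, and returns size.value(stream), so call sites only ever deal with a host-side std::size_t.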