From 96f2cc5262e5b6b0f50109d327857e306214b3a4 Mon Sep 17 00:00:00 2001
From: Robert Maynard
Date: Mon, 26 Aug 2024 10:21:48 -0400
Subject: [PATCH] Remove CUDA whole compilation ODR violations (#16603)

CUDA whole compilation mode requires that all kernels are only launched
from TUs that compile them. Previously libcudf would compile a subset of
kernels in separate TUs from where they are launched.

To keep compile times (and library size) as low as possible, I have
introduced a single C++ function call between the original call site and
the kernel launch. In testing, this showed negligible differences in
compile time and binary size.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16603
---
 cpp/src/join/mixed_join.cu                   | 191 ++++++++++---------
 cpp/src/join/mixed_join_kernel.cu            |  10 +-
 cpp/src/join/mixed_join_kernel.cuh           |  64 +++++--
 cpp/src/join/mixed_join_kernel.hpp           |  80 ++++++++
 cpp/src/join/mixed_join_kernel_nulls.cu      |  10 +-
 cpp/src/join/mixed_join_kernels.cuh          | 124 ------------
 cpp/src/join/mixed_join_kernels_semi.cu      |  86 +++++----
 cpp/src/join/mixed_join_kernels_semi.cuh     |  29 +--
 cpp/src/join/mixed_join_semi.cu              |  38 ++--
 cpp/src/join/mixed_join_size_kernel.cu       |  12 +-
 cpp/src/join/mixed_join_size_kernel.cuh      |  64 +++++--
 cpp/src/join/mixed_join_size_kernel.hpp      |  85 +++++++++
 cpp/src/join/mixed_join_size_kernel_nulls.cu |  12 +-
 13 files changed, 472 insertions(+), 333 deletions(-)
 create mode 100644 cpp/src/join/mixed_join_kernel.hpp
 delete mode 100644 cpp/src/join/mixed_join_kernels.cuh
 create mode 100644 cpp/src/join/mixed_join_size_kernel.hpp

diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu
index 48b94c777de..eb12065c6a9 100644
--- a/cpp/src/join/mixed_join.cu
+++ b/cpp/src/join/mixed_join.cu
@@ -16,7 +16,8 @@

 #include "join_common_utils.cuh"
 #include "join_common_utils.hpp"
-#include "mixed_join_kernels.cuh"
+#include "mixed_join_kernel.hpp"
+#include "mixed_join_size_kernel.hpp"

 #include
 #include
@@ -178,9 +179,6 @@ mixed_join(
     join_size = output_size_data->first;
     matches_per_row_span = output_size_data->second;
   } else {
-    // Allocate storage for the counter used to get the size of the join output
-    rmm::device_scalar size(0, stream, mr);
-
     matches_per_row = rmm::device_uvector{static_cast(outer_num_rows), stream, mr};
     // Note that the view goes out of scope after this else statement, but the
@@ -190,37 +188,38 @@ mixed_join(
     matches_per_row_span = cudf::device_span{
       matches_per_row->begin(), static_cast(outer_num_rows)};
     if (has_nulls) {
-      compute_mixed_join_output_size
-        <<>>(
-          *left_conditional_view,
-          *right_conditional_view,
-          *probe_view,
-          *build_view,
-          hash_probe,
-          equality_probe,
-          kernel_join_type,
-          hash_table_view,
-          parser.device_expression_data,
-          swap_tables,
-          size.data(),
-          mutable_matches_per_row_span);
+      join_size = launch_compute_mixed_join_output_size(*left_conditional_view,
+                                                        *right_conditional_view,
+                                                        *probe_view,
+                                                        *build_view,
+                                                        hash_probe,
+                                                        equality_probe,
+                                                        kernel_join_type,
+                                                        hash_table_view,
+                                                        parser.device_expression_data,
+                                                        swap_tables,
+                                                        mutable_matches_per_row_span,
+                                                        config,
+                                                        shmem_size_per_block,
+                                                        stream,
+                                                        mr);
     } else {
-      compute_mixed_join_output_size
-        <<>>(
-          *left_conditional_view,
-          *right_conditional_view,
-          *probe_view,
-          *build_view,
-          hash_probe,
-          equality_probe,
-          kernel_join_type,
-          hash_table_view,
-          parser.device_expression_data,
-          swap_tables,
-          size.data(),
-
mutable_matches_per_row_span); + join_size = launch_compute_mixed_join_output_size(*left_conditional_view, + *right_conditional_view, + *probe_view, + *build_view, + hash_probe, + equality_probe, + kernel_join_type, + hash_table_view, + parser.device_expression_data, + swap_tables, + mutable_matches_per_row_span, + config, + shmem_size_per_block, + stream, + mr); } - join_size = size.value(stream); } // The initial early exit clauses guarantee that we will not reach this point @@ -249,37 +248,39 @@ mixed_join( auto const& join_output_r = right_indices->data(); if (has_nulls) { - mixed_join - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - kernel_join_type, - hash_table_view, - join_output_l, - join_output_r, - parser.device_expression_data, - join_result_offsets.data(), - swap_tables); + launch_mixed_join(*left_conditional_view, + *right_conditional_view, + *probe_view, + *build_view, + hash_probe, + equality_probe, + kernel_join_type, + hash_table_view, + join_output_l, + join_output_r, + parser.device_expression_data, + join_result_offsets.data(), + swap_tables, + config, + shmem_size_per_block, + stream); } else { - mixed_join - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - kernel_join_type, - hash_table_view, - join_output_l, - join_output_r, - parser.device_expression_data, - join_result_offsets.data(), - swap_tables); + launch_mixed_join(*left_conditional_view, + *right_conditional_view, + *probe_view, + *build_view, + hash_probe, + equality_probe, + kernel_join_type, + hash_table_view, + join_output_l, + join_output_r, + parser.device_expression_data, + join_result_offsets.data(), + swap_tables, + config, + shmem_size_per_block, + stream); } auto join_indices = std::pair(std::move(left_indices), std::move(right_indices)); @@ -423,9 +424,6 @@ compute_mixed_join_output_size(table_view const& left_equality, detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; - // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); - auto const preprocessed_probe = experimental::row::equality::preprocessed_table::create(probe, stream); auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; @@ -436,39 +434,42 @@ compute_mixed_join_output_size(table_view const& left_equality, // Determine number of output rows without actually building the output to simply // find what the size of the output will be. 
+ std::size_t size = 0; if (has_nulls) { - compute_mixed_join_output_size - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - join_type, - hash_table_view, - parser.device_expression_data, - swap_tables, - size.data(), - matches_per_row_span); + size = launch_compute_mixed_join_output_size(*left_conditional_view, + *right_conditional_view, + *probe_view, + *build_view, + hash_probe, + equality_probe, + join_type, + hash_table_view, + parser.device_expression_data, + swap_tables, + matches_per_row_span, + config, + shmem_size_per_block, + stream, + mr); } else { - compute_mixed_join_output_size - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - join_type, - hash_table_view, - parser.device_expression_data, - swap_tables, - size.data(), - matches_per_row_span); + size = launch_compute_mixed_join_output_size(*left_conditional_view, + *right_conditional_view, + *probe_view, + *build_view, + hash_probe, + equality_probe, + join_type, + hash_table_view, + parser.device_expression_data, + swap_tables, + matches_per_row_span, + config, + shmem_size_per_block, + stream, + mr); } - return {size.value(stream), std::move(matches_per_row)}; + return {size, std::move(matches_per_row)}; } } // namespace detail diff --git a/cpp/src/join/mixed_join_kernel.cu b/cpp/src/join/mixed_join_kernel.cu index 61cfa168b03..cd4016837cc 100644 --- a/cpp/src/join/mixed_join_kernel.cu +++ b/cpp/src/join/mixed_join_kernel.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,11 +15,12 @@ */ #include "mixed_join_kernel.cuh" +#include "mixed_join_kernel.hpp" namespace cudf { namespace detail { -template __global__ void mixed_join( +template void launch_mixed_join( table_device_view left_table, table_device_view right_table, table_device_view probe, @@ -32,7 +33,10 @@ template __global__ void mixed_join( size_type* join_output_r, cudf::ast::detail::expression_device_view device_expression_data, cudf::size_type const* join_result_offsets, - bool const swap_tables); + bool const swap_tables, + detail::grid_1d const config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream); } // namespace detail diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh index ea59f23c77f..9d011d43de6 100644 --- a/cpp/src/join/mixed_join_kernel.cuh +++ b/cpp/src/join/mixed_join_kernel.cuh @@ -19,6 +19,7 @@ #include "join_common_utils.cuh" #include "join_common_utils.hpp" #include "mixed_join_common_utils.cuh" +#include "mixed_join_kernel.hpp" #include #include @@ -39,20 +40,20 @@ namespace cg = cooperative_groups; #pragma GCC diagnostic ignored "-Wattributes" template -CUDF_HIDDEN __launch_bounds__(block_size) __global__ - void mixed_join(table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - size_type* join_output_l, - size_type* join_output_r, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables) +CUDF_KERNEL void __launch_bounds__(block_size) + mixed_join(table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + size_type* join_output_l, + size_type* join_output_r, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const* join_result_offsets, + bool const swap_tables) { // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is @@ -111,6 +112,41 @@ CUDF_HIDDEN __launch_bounds__(block_size) __global__ } } +template +void launch_mixed_join(table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + size_type* join_output_l, + size_type* join_output_r, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const* join_result_offsets, + bool const swap_tables, + detail::grid_1d const config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream) +{ + mixed_join + <<>>( + left_table, + right_table, + probe, + build, + hash_probe, + equality_probe, + join_type, + hash_table_view, + join_output_l, + join_output_r, + device_expression_data, + join_result_offsets, + swap_tables); +} + } // namespace detail } // namespace cudf diff --git a/cpp/src/join/mixed_join_kernel.hpp b/cpp/src/join/mixed_join_kernel.hpp new file mode 100644 index 00000000000..cc92e9d8ba4 --- /dev/null +++ b/cpp/src/join/mixed_join_kernel.hpp @@ -0,0 +1,80 @@ +/* + * 
Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "join/join_common_utils.hpp" +#include "join/mixed_join_common_utils.cuh" + +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace detail { + +/** + * @brief Performs a join using the combination of a hash lookup to identify + * equal rows between one pair of tables and the evaluation of an expression + * containing an arbitrary expression. + * + * This method probes the hash table with each row in the probe table using a + * custom equality comparator that also checks that the conditional expression + * evaluates to true between the left/right tables when a match is found + * between probe and build rows. + * + * @tparam block_size The number of threads per block for this kernel + * @tparam has_nulls Whether or not the inputs may contain nulls. + * + * @param[in] left_table The left table + * @param[in] right_table The right table + * @param[in] probe The table with which to probe the hash table for matches. + * @param[in] build The table with which the hash table was built. + * @param[in] hash_probe The hasher used for the probe table. + * @param[in] equality_probe The equality comparator used when probing the hash table. + * @param[in] join_type The type of join to be performed + * @param[in] hash_table_view The hash table built from `build`. + * @param[out] join_output_l The left result of the join operation + * @param[out] join_output_r The right result of the join operation + * @param[in] device_expression_data Container of device data required to evaluate the desired + * expression. + * @param[in] join_result_offsets The starting indices in join_output[l|r] + * where the matches for each row begin. Equivalent to a prefix sum of + * matches_per_row. + * @param[in] swap_tables If true, the kernel was launched with one thread per right row and + * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. + */ +template +void launch_mixed_join(table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + size_type* join_output_l, + size_type* join_output_r, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const* join_result_offsets, + bool const swap_tables, + detail::grid_1d const config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream); + +} // namespace detail + +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/join/mixed_join_kernel_nulls.cu b/cpp/src/join/mixed_join_kernel_nulls.cu index 518f8ed8555..185aa133f2d 100644 --- a/cpp/src/join/mixed_join_kernel_nulls.cu +++ b/cpp/src/join/mixed_join_kernel_nulls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,11 +15,12 @@ */ #include "mixed_join_kernel.cuh" +#include "mixed_join_kernel.hpp" namespace cudf { namespace detail { -template __global__ void mixed_join( +template void launch_mixed_join( table_device_view left_table, table_device_view right_table, table_device_view probe, @@ -32,7 +33,10 @@ template __global__ void mixed_join( size_type* join_output_r, cudf::ast::detail::expression_device_view device_expression_data, cudf::size_type const* join_result_offsets, - bool const swap_tables); + bool const swap_tables, + detail::grid_1d const config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream); } // namespace detail diff --git a/cpp/src/join/mixed_join_kernels.cuh b/cpp/src/join/mixed_join_kernels.cuh deleted file mode 100644 index 037c02666d4..00000000000 --- a/cpp/src/join/mixed_join_kernels.cuh +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "join/join_common_utils.hpp" -#include "join/mixed_join_common_utils.cuh" - -#include -#include -#include - -namespace cudf { -namespace detail { - -/** - * @brief Computes the output size of joining the left table to the right table. - * - * This method probes the hash table with each row in the probe table using a - * custom equality comparator that also checks that the conditional expression - * evaluates to true between the left/right tables when a match is found - * between probe and build rows. - * - * @tparam block_size The number of threads per block for this kernel - * @tparam has_nulls Whether or not the inputs may contain nulls. - * - * @param[in] left_table The left table - * @param[in] right_table The right table - * @param[in] probe The table with which to probe the hash table for matches. - * @param[in] build The table with which the hash table was built. - * @param[in] hash_probe The hasher used for the probe table. - * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] join_type The type of join to be performed - * @param[in] hash_table_view The hash table built from `build`. - * @param[in] device_expression_data Container of device data required to evaluate the desired - * expression. - * @param[in] swap_tables If true, the kernel was launched with one thread per right row and - * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. - * @param[out] output_size The resulting output size - * @param[out] matches_per_row The number of matches in one pair of - * equality/conditional tables for each row in the other pair of tables. If - * swap_tables is true, matches_per_row corresponds to the right_table, - * otherwise it corresponds to the left_table. 
Note that corresponding swap of - * left/right tables to determine which is the build table and which is the - * probe table has already happened on the host. - */ - -template -__global__ void compute_mixed_join_output_size( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row); - -/** - * @brief Performs a join using the combination of a hash lookup to identify - * equal rows between one pair of tables and the evaluation of an expression - * containing an arbitrary expression. - * - * This method probes the hash table with each row in the probe table using a - * custom equality comparator that also checks that the conditional expression - * evaluates to true between the left/right tables when a match is found - * between probe and build rows. - * - * @tparam block_size The number of threads per block for this kernel - * @tparam has_nulls Whether or not the inputs may contain nulls. - * - * @param[in] left_table The left table - * @param[in] right_table The right table - * @param[in] probe The table with which to probe the hash table for matches. - * @param[in] build The table with which the hash table was built. - * @param[in] hash_probe The hasher used for the probe table. - * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] join_type The type of join to be performed - * @param[in] hash_table_view The hash table built from `build`. - * @param[out] join_output_l The left result of the join operation - * @param[out] join_output_r The right result of the join operation - * @param[in] device_expression_data Container of device data required to evaluate the desired - * expression. - * @param[in] join_result_offsets The starting indices in join_output[l|r] - * where the matches for each row begin. Equivalent to a prefix sum of - * matches_per_row. - * @param[in] swap_tables If true, the kernel was launched with one thread per right row and - * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. - */ -template -__global__ void mixed_join(table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - size_type* join_output_l, - size_type* join_output_r, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables); - -} // namespace detail - -} // namespace cudf diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index 1f31eaa7878..7459ac3e99c 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -14,9 +14,7 @@ * limitations under the License. 
*/ -#include "join/join_common_utils.cuh" -#include "join/join_common_utils.hpp" -#include "join/mixed_join_common_utils.cuh" +#include "join/mixed_join_kernels_semi.cuh" #include #include @@ -35,16 +33,16 @@ namespace cg = cooperative_groups; #pragma GCC diagnostic ignored "-Wattributes" template -CUDF_HIDDEN __launch_bounds__(block_size) __global__ - void mixed_join_semi(table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, - cudf::device_span left_table_keep_mask, - cudf::ast::detail::expression_device_view device_expression_data) +CUDF_KERNEL void __launch_bounds__(block_size) + mixed_join_semi(table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + cudf::detail::semi_map_type::device_view hash_table_view, + cudf::device_span left_table_keep_mask, + cudf::ast::detail::expression_device_view device_expression_data) { // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is @@ -75,28 +73,46 @@ CUDF_HIDDEN __launch_bounds__(block_size) __global__ } } -template __global__ void mixed_join_semi( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, - cudf::device_span left_table_keep_mask, - cudf::ast::detail::expression_device_view device_expression_data); - -template __global__ void mixed_join_semi( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, - cudf::device_span left_table_keep_mask, - cudf::ast::detail::expression_device_view device_expression_data); +void launch_mixed_join_semi(bool has_nulls, + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + cudf::detail::semi_map_type::device_view hash_table_view, + cudf::device_span left_table_keep_mask, + cudf::ast::detail::expression_device_view device_expression_data, + detail::grid_1d const config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream) +{ + if (has_nulls) { + mixed_join_semi + <<>>( + left_table, + right_table, + probe, + build, + hash_probe, + equality_probe, + hash_table_view, + left_table_keep_mask, + device_expression_data); + } else { + mixed_join_semi + <<>>( + left_table, + right_table, + probe, + build, + hash_probe, + equality_probe, + hash_table_view, + left_table_keep_mask, + device_expression_data); + } +} } // namespace detail - } // namespace cudf diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh index 4ea404d451c..43714ffb36a 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cuh +++ b/cpp/src/join/mixed_join_kernels_semi.cuh @@ -16,8 +16,9 @@ #pragma once -#include "join/join_common_utils.hpp" -#include "join/mixed_join_common_utils.cuh" +#include "join_common_utils.cuh" +#include "join_common_utils.hpp" +#include 
"mixed_join_common_utils.cuh" #include #include @@ -39,6 +40,7 @@ namespace detail { * @tparam block_size The number of threads per block for this kernel * @tparam has_nulls Whether or not the inputs may contain nulls. * + * @param[in] has_nulls If the input has nulls * @param[in] left_table The left table * @param[in] right_table The right table * @param[in] probe The table with which to probe the hash table for matches. @@ -51,16 +53,19 @@ namespace detail { * @param[in] device_expression_data Container of device data required to evaluate the desired * expression. */ -template -__global__ void mixed_join_semi(table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, - cudf::device_span left_table_keep_mask, - cudf::ast::detail::expression_device_view device_expression_data); +void launch_mixed_join_semi(bool has_nulls, + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + cudf::detail::semi_map_type::device_view hash_table_view, + cudf::device_span left_table_keep_mask, + cudf::ast::detail::expression_device_view device_expression_data, + detail::grid_1d const config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream); } // namespace detail diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index 3e4188a0fbd..a79aa6673d6 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -227,31 +227,19 @@ std::unique_ptr> mixed_join_semi( // Vector used to indicate indices from left/probe table which are present in output auto left_table_keep_mask = rmm::device_uvector(probe.num_rows(), stream); - if (has_nulls) { - mixed_join_semi - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - hash_table_view, - cudf::device_span(left_table_keep_mask), - parser.device_expression_data); - } else { - mixed_join_semi - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - hash_table_view, - cudf::device_span(left_table_keep_mask), - parser.device_expression_data); - } + launch_mixed_join_semi(has_nulls, + *left_conditional_view, + *right_conditional_view, + *probe_view, + *build_view, + hash_probe, + equality_probe, + hash_table_view, + cudf::device_span(left_table_keep_mask), + parser.device_expression_data, + config, + shmem_size_per_block, + stream); auto gather_map = std::make_unique>(probe.num_rows(), stream, mr); diff --git a/cpp/src/join/mixed_join_size_kernel.cu b/cpp/src/join/mixed_join_size_kernel.cu index 4011acb65d6..4882c8769e6 100644 --- a/cpp/src/join/mixed_join_size_kernel.cu +++ b/cpp/src/join/mixed_join_size_kernel.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,11 +15,12 @@ */ #include "mixed_join_size_kernel.cuh" +#include "mixed_join_size_kernel.hpp" namespace cudf { namespace detail { -template __global__ void compute_mixed_join_output_size( +template std::size_t launch_compute_mixed_join_output_size( table_device_view left_table, table_device_view right_table, table_device_view probe, @@ -30,8 +31,11 @@ template __global__ void compute_mixed_join_output_size matches_per_row); + cudf::device_span matches_per_row, + detail::grid_1d const config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index 00a90f8273f..a1066e32331 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -36,19 +36,19 @@ namespace cg = cooperative_groups; #pragma GCC diagnostic ignored "-Wattributes" template -CUDF_HIDDEN __launch_bounds__(block_size) __global__ void compute_mixed_join_output_size( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row) +CUDF_KERNEL void __launch_bounds__(block_size) + compute_mixed_join_output_size(table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row) { // The (required) extern storage of the shared memory array leads to // conflicting declarations between different templates. 
The easiest @@ -103,5 +103,43 @@ CUDF_HIDDEN __launch_bounds__(block_size) __global__ void compute_mixed_join_out } } +template +std::size_t launch_compute_mixed_join_output_size( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + cudf::device_span matches_per_row, + detail::grid_1d const config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + // Allocate storage for the counter used to get the size of the join output + rmm::device_scalar size(0, stream, mr); + + compute_mixed_join_output_size + <<>>( + left_table, + right_table, + probe, + build, + hash_probe, + equality_probe, + join_type, + hash_table_view, + device_expression_data, + swap_tables, + size.data(), + matches_per_row); + return size.value(stream); +} + } // namespace detail } // namespace cudf diff --git a/cpp/src/join/mixed_join_size_kernel.hpp b/cpp/src/join/mixed_join_size_kernel.hpp new file mode 100644 index 00000000000..b09805c14dc --- /dev/null +++ b/cpp/src/join/mixed_join_size_kernel.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "join_common_utils.cuh" +#include "join_common_utils.hpp" +#include "mixed_join_common_utils.cuh" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace detail { + +/** + * @brief Computes the output size of joining the left table to the right table. + * + * This method probes the hash table with each row in the probe table using a + * custom equality comparator that also checks that the conditional expression + * evaluates to true between the left/right tables when a match is found + * between probe and build rows. + * + * @tparam block_size The number of threads per block for this kernel + * @tparam has_nulls Whether or not the inputs may contain nulls. + * + * @param[in] left_table The left table + * @param[in] right_table The right table + * @param[in] probe The table with which to probe the hash table for matches. + * @param[in] build The table with which the hash table was built. + * @param[in] hash_probe The hasher used for the probe table. + * @param[in] equality_probe The equality comparator used when probing the hash table. + * @param[in] join_type The type of join to be performed + * @param[in] hash_table_view The hash table built from `build`. + * @param[in] device_expression_data Container of device data required to evaluate the desired + * expression. + * @param[in] swap_tables If true, the kernel was launched with one thread per right row and + * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. 
+ * @param[out] output_size The resulting output size + * @param[out] matches_per_row The number of matches in one pair of + * equality/conditional tables for each row in the other pair of tables. If + * swap_tables is true, matches_per_row corresponds to the right_table, + * otherwise it corresponds to the left_table. Note that corresponding swap of + * left/right tables to determine which is the build table and which is the + * probe table has already happened on the host. + */ + +template +std::size_t launch_compute_mixed_join_output_size( + cudf::table_device_view left_table, + cudf::table_device_view right_table, + cudf::table_device_view probe, + cudf::table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + cudf::device_span matches_per_row, + detail::grid_1d const config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/join/mixed_join_size_kernel_nulls.cu b/cpp/src/join/mixed_join_size_kernel_nulls.cu index 2868113bf33..11f9103da4d 100644 --- a/cpp/src/join/mixed_join_size_kernel_nulls.cu +++ b/cpp/src/join/mixed_join_size_kernel_nulls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ namespace cudf { namespace detail { -template __global__ void compute_mixed_join_output_size( +template std::size_t launch_compute_mixed_join_output_size( table_device_view left_table, table_device_view right_table, table_device_view probe, @@ -30,8 +30,10 @@ template __global__ void compute_mixed_join_output_size matches_per_row); - + cudf::device_span matches_per_row, + detail::grid_1d const config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf
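
Editor's note: the shape of the change is the same in every file touched above. The __global__ kernel keeps its definition in a .cuh, a thin host-side launch_* function defined right next to it performs the <<<...>>> launch, only that host function is declared in a plain .hpp, and the per-variant .cu files explicitly instantiate the launcher instead of the kernel. The following is a minimal, self-contained sketch of that pattern, not cudf code; the names (scale_kernel, launch_scale) and the single-file layout are invented for illustration.

// All names here are illustrative placeholders, not cudf symbols.
#include <cuda_runtime.h>

#include <cstdio>
#include <vector>

// --- would live in scale_kernel.cuh: the kernel plus its launcher ------------
template <int block_size>
__global__ void __launch_bounds__(block_size)
  scale_kernel(float* data, int n, float factor)
{
  int const i = blockIdx.x * block_size + threadIdx.x;
  if (i < n) { data[i] *= factor; }
}

// Ordinary host function wrapping the launch. Because the <<<...>>> sits next
// to the kernel definition, every TU that launches the kernel also compiles
// it, which is what CUDA whole compilation mode requires.
template <int block_size>
void launch_scale(float* data, int n, float factor, cudaStream_t stream)
{
  int const num_blocks = (n + block_size - 1) / block_size;
  scale_kernel<block_size><<<num_blocks, block_size, 0, stream>>>(data, n, factor);
}

// --- would live in scale_kernel.cu: instantiate the launcher, not the kernel -
template void launch_scale<256>(float*, int, float, cudaStream_t);

// --- call site: would include only a scale_kernel.hpp declaring launch_scale -
int main()
{
  int const n = 1024;
  std::vector<float> host(n, 1.0f);

  float* device_data = nullptr;
  cudaMalloc(&device_data, n * sizeof(float));
  cudaMemcpy(device_data, host.data(), n * sizeof(float), cudaMemcpyHostToDevice);

  // The caller never writes <<<...>>> itself; it only calls the host wrapper.
  cudaStream_t stream = nullptr;  // default stream
  launch_scale<256>(device_data, n, 2.0f, stream);

  cudaMemcpy(host.data(), device_data, n * sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(device_data);

  std::printf("first element after scaling: %f\n", host[0]);
  return 0;
}

When the wrapper also has to return a value computed on the device, as launch_compute_mixed_join_output_size does in this patch, the same idea applies: the wrapper owns the rmm::device_scalar counter, passes size.data() to the kernel, and returns size.value(stream), so call sites only ever deal with a host-side std::size_t.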