From 439ffdc413eabed49a24f7effddcadb0beabe5fa Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 9 Jan 2024 16:13:30 -0500 Subject: [PATCH] Mark all cuco kernels as static so they have hidden visibility --- cpp/cmake/thirdparty/get_cucollections.cmake | 7 +- .../thirdparty/patches/cuco_hide_kernels.diff | 481 ++++++++++++++++++ .../thirdparty/patches/cuco_override.json | 14 + 3 files changed, 501 insertions(+), 1 deletion(-) create mode 100644 cpp/cmake/thirdparty/patches/cuco_hide_kernels.diff create mode 100644 cpp/cmake/thirdparty/patches/cuco_override.json diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index 9758958b44f..6ec35ddcaf1 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -15,6 +15,11 @@ # This function finds cuCollections and performs any additional configuration. function(find_and_configure_cucollections) include(${rapids-cmake-dir}/cpm/cuco.cmake) + include(${rapids-cmake-dir}/cpm/package_override.cmake) + + set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") + rapids_cpm_package_override("${cudf_patch_dir}/cuco_override.json") + if(BUILD_SHARED_LIBS) rapids_cpm_cuco(BUILD_EXPORT_SET cudf-exports) else() diff --git a/cpp/cmake/thirdparty/patches/cuco_hide_kernels.diff b/cpp/cmake/thirdparty/patches/cuco_hide_kernels.diff new file mode 100644 index 00000000000..13287046b0d --- /dev/null +++ b/cpp/cmake/thirdparty/patches/cuco_hide_kernels.diff @@ -0,0 +1,481 @@ +From b520b69b059d0082742da3f69910806c6b1cf86c Mon Sep 17 00:00:00 2001 +From: Robert Maynard +Date: Tue, 9 Jan 2024 15:49:53 -0500 +Subject: [PATCH] Mark all cuco kernels as static so they have hidden + visibility + +--- + include/cuco/detail/dynamic_map_kernels.cuh | 17 ++++++++-------- + .../cuco/detail/open_addressing/kernels.cuh | 12 +++++------ + include/cuco/detail/static_map/kernels.cuh | 4 ++-- + include/cuco/detail/static_map_kernels.cuh | 20 +++++++++---------- + .../cuco/detail/static_multimap/kernels.cuh | 17 ++++++++-------- + include/cuco/detail/static_set/kernels.cuh | 2 +- + include/cuco/detail/storage/kernels.cuh | 2 +- + .../detail/trie/dynamic_bitset/kernels.cuh | 18 ++++++++--------- + include/cuco/detail/utility/cuda.cuh | 4 ++++ + include/cuco/static_map.cuh | 2 +- + 10 files changed, 52 insertions(+), 46 deletions(-) + +diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh +index 566576e..79aab00 100644 +--- a/include/cuco/detail/dynamic_map_kernels.cuh ++++ b/include/cuco/detail/dynamic_map_kernels.cuh +@@ -14,6 +14,7 @@ + * limitations under the License. + */ + #pragma once ++#include + + #include + +@@ -62,7 +63,7 @@ template +-__global__ void insert(InputIt first, ++CUCO_KERNEL void insert(InputIt first, + InputIt last, + viewT* submap_views, + mutableViewT* submap_mutable_views, +@@ -147,7 +148,7 @@ template +-__global__ void insert(InputIt first, ++CUCO_KERNEL void insert(InputIt first, + InputIt last, + viewT* submap_views, + mutableViewT* submap_mutable_views, +@@ -225,7 +226,7 @@ template +-__global__ void erase(InputIt first, ++CUCO_KERNEL void erase(InputIt first, + InputIt last, + mutableViewT* submap_mutable_views, + atomicT** submap_num_successes, +@@ -296,7 +297,7 @@ template +-__global__ void erase(InputIt first, ++CUCO_KERNEL void erase(InputIt first, + InputIt last, + mutableViewT* submap_mutable_views, + atomicT** submap_num_successes, +@@ -368,7 +369,7 @@ template +-__global__ void find(InputIt first, ++CUCO_KERNEL void find(InputIt first, + InputIt last, + OutputIt output_begin, + viewT* submap_views, +@@ -443,7 +444,7 @@ template +-__global__ void find(InputIt first, ++CUCO_KERNEL void find(InputIt first, + InputIt last, + OutputIt output_begin, + viewT* submap_views, +@@ -514,7 +515,7 @@ template +-__global__ void contains(InputIt first, ++CUCO_KERNEL void contains(InputIt first, + InputIt last, + OutputIt output_begin, + viewT* submap_views, +@@ -582,7 +583,7 @@ template +-__global__ void contains(InputIt first, ++CUCO_KERNEL void contains(InputIt first, + InputIt last, + OutputIt output_begin, + viewT* submap_views, +diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh +index 6772014..ec45581 100644 +--- a/include/cuco/detail/open_addressing/kernels.cuh ++++ b/include/cuco/detail/open_addressing/kernels.cuh +@@ -61,7 +61,7 @@ template +-__global__ void insert_if_n(InputIt first, ++CUCO_KERNEL void insert_if_n(InputIt first, + cuco::detail::index_type n, + StencilIt stencil, + Predicate pred, +@@ -127,7 +127,7 @@ template +-__global__ void insert_if_n( ++CUCO_KERNEL void insert_if_n( + InputIt first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref) + { + auto const loop_stride = cuco::detail::grid_stride() / CGSize; +@@ -162,7 +162,7 @@ __global__ void insert_if_n( + * @param ref Non-owning container device ref used to access the slot storage + */ + template +-__global__ void erase(InputIt first, cuco::detail::index_type n, Ref ref) ++CUCO_KERNEL void erase(InputIt first, cuco::detail::index_type n, Ref ref) + { + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; +@@ -212,7 +212,7 @@ template +-__global__ void contains_if_n(InputIt first, ++CUCO_KERNEL void contains_if_n(InputIt first, + cuco::detail::index_type n, + StencilIt stencil, + Predicate pred, +@@ -267,7 +267,7 @@ __global__ void contains_if_n(InputIt first, + * @param count Number of filled slots + */ + template +-__global__ void size(StorageRef storage, Predicate is_filled, AtomicT* count) ++CUCO_KERNEL void size(StorageRef storage, Predicate is_filled, AtomicT* count) + { + using size_type = typename StorageRef::size_type; + +@@ -293,7 +293,7 @@ __global__ void size(StorageRef storage, Predicate is_filled, AtomicT* count) + } + + template +-__global__ void rehash(typename ContainerRef::storage_ref_type storage_ref, ++CUCO_KERNEL void rehash(typename ContainerRef::storage_ref_type storage_ref, + ContainerRef container_ref, + Predicate is_filled) + { +diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh +index 76923f8..d566d82 100644 +--- a/include/cuco/detail/static_map/kernels.cuh ++++ b/include/cuco/detail/static_map/kernels.cuh +@@ -48,7 +48,7 @@ namespace detail { + * @param ref Non-owning container device ref used to access the slot storage + */ + template +-__global__ void insert_or_assign(InputIt first, cuco::detail::index_type n, Ref ref) ++CUCO_KERNEL void insert_or_assign(InputIt first, cuco::detail::index_type n, Ref ref) + { + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; +@@ -87,7 +87,7 @@ __global__ void insert_or_assign(InputIt first, cuco::detail::index_type n, Ref + * @param ref Non-owning map device ref used to access the slot storage + */ + template +-__global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) ++CUCO_KERNEL void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) + { + namespace cg = cooperative_groups; + +diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh +index aa4f56d..2b6b386 100644 +--- a/include/cuco/detail/static_map_kernels.cuh ++++ b/include/cuco/detail/static_map_kernels.cuh +@@ -47,7 +47,7 @@ template +-__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t size) ++CUCO_KERNEL void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t size) + { + int64_t const loop_stride = gridDim.x * block_size; + int64_t idx = block_size * blockIdx.x + threadIdx.x; +@@ -85,7 +85,7 @@ template +-__global__ void insert( ++CUCO_KERNEL void insert( + InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) + { + typedef cub::BlockReduce BlockReduce; +@@ -140,7 +140,7 @@ template +-__global__ void insert( ++CUCO_KERNEL void insert( + InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) + { + typedef cub::BlockReduce BlockReduce; +@@ -194,7 +194,7 @@ template +-__global__ void erase( ++CUCO_KERNEL void erase( + InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) + { + using BlockReduce = cub::BlockReduce; +@@ -247,7 +247,7 @@ template +-__global__ void erase( ++CUCO_KERNEL void erase( + InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) + { + typedef cub::BlockReduce BlockReduce; +@@ -311,7 +311,7 @@ template +-__global__ void insert_if_n(InputIt first, ++CUCO_KERNEL void insert_if_n(InputIt first, + int64_t n, + atomicT* num_successes, + viewT view, +@@ -375,7 +375,7 @@ template +-__global__ void find( ++CUCO_KERNEL void find( + InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) + { + int64_t const loop_stride = gridDim.x * block_size; +@@ -437,7 +437,7 @@ template +-__global__ void find( ++CUCO_KERNEL void find( + InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) + { + auto tile = cg::tiled_partition(cg::this_thread_block()); +@@ -494,7 +494,7 @@ template +-__global__ void contains( ++CUCO_KERNEL void contains( + InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) + { + int64_t const loop_stride = gridDim.x * block_size; +@@ -551,7 +551,7 @@ template +-__global__ void contains( ++CUCO_KERNEL void contains( + InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) + { + auto tile = cg::tiled_partition(cg::this_thread_block()); +diff --git a/include/cuco/detail/static_multimap/kernels.cuh b/include/cuco/detail/static_multimap/kernels.cuh +index 67fb360..61f341b 100644 +--- a/include/cuco/detail/static_multimap/kernels.cuh ++++ b/include/cuco/detail/static_multimap/kernels.cuh +@@ -15,6 +15,7 @@ + */ + #pragma once + ++#include + #include + + #include +@@ -51,7 +52,7 @@ template +-__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t size) ++CUCO_KERNEL void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t size) + { + int64_t const loop_stride = gridDim.x * blockDim.x; + int64_t idx = threadIdx.x + blockIdx.x * blockDim.x; +@@ -82,7 +83,7 @@ __global__ void initialize(pair_atomic_type* const slots, Key k, Value v, int64_ + * @param view Mutable device view used to access the hash map's slot storage + */ + template +-__global__ void insert(InputIt first, int64_t n, viewT view) ++CUCO_KERNEL void insert(InputIt first, int64_t n, viewT view) + { + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; +@@ -130,7 +131,7 @@ template +-__global__ void insert_if_n(InputIt first, StencilIt s, int64_t n, viewT view, Predicate pred) ++CUCO_KERNEL void insert_if_n(InputIt first, StencilIt s, int64_t n, viewT view, Predicate pred) + { + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; +@@ -177,7 +178,7 @@ template +-__global__ void contains(InputIt first, int64_t n, OutputIt output_begin, viewT view, Equal equal) ++CUCO_KERNEL void contains(InputIt first, int64_t n, OutputIt output_begin, viewT view, Equal equal) + { + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; +@@ -235,7 +236,7 @@ template +-__global__ void count( ++CUCO_KERNEL void count( + InputIt first, int64_t n, atomicT* num_matches, viewT view, KeyEqual key_equal) + { + auto tile = cg::tiled_partition(cg::this_thread_block()); +@@ -294,7 +295,7 @@ template +-__global__ void pair_count( ++CUCO_KERNEL void pair_count( + InputIt first, int64_t n, atomicT* num_matches, viewT view, PairEqual pair_equal) + { + auto tile = cg::tiled_partition(cg::this_thread_block()); +@@ -363,7 +364,7 @@ template +-__global__ void retrieve(InputIt first, ++CUCO_KERNEL void retrieve(InputIt first, + int64_t n, + OutputIt output_begin, + atomicT* num_matches, +@@ -476,7 +477,7 @@ template +-__global__ void pair_retrieve(InputIt first, ++CUCO_KERNEL void pair_retrieve(InputIt first, + int64_t n, + OutputIt1 probe_output_begin, + OutputIt2 contained_output_begin, +diff --git a/include/cuco/detail/static_set/kernels.cuh b/include/cuco/detail/static_set/kernels.cuh +index dce4dc8..d19922b 100644 +--- a/include/cuco/detail/static_set/kernels.cuh ++++ b/include/cuco/detail/static_set/kernels.cuh +@@ -50,7 +50,7 @@ namespace detail { + * @param ref Non-owning set device ref used to access the slot storage + */ + template +-__global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) ++CUCO_KERNEL void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) + { + namespace cg = cooperative_groups; + +diff --git a/include/cuco/detail/storage/kernels.cuh b/include/cuco/detail/storage/kernels.cuh +index 6b3b102..2ac7edb 100644 +--- a/include/cuco/detail/storage/kernels.cuh ++++ b/include/cuco/detail/storage/kernels.cuh +@@ -32,7 +32,7 @@ namespace detail { + * @param value Value to which all values in `slots` are initialized + */ + template +-__global__ void initialize(WindowT* windows, ++CUCO_KERNEL void initialize(WindowT* windows, + cuco::detail::index_type n, + typename WindowT::value_type value) + { +diff --git a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh +index c92ab60..ce61c67 100644 +--- a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh ++++ b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh +@@ -41,10 +41,10 @@ namespace detail { + * @param num_keys Number of input keys + */ + template +-__global__ void bitset_test_kernel(BitsetRef ref, +- KeyIt keys, +- OutputIt outputs, +- cuco::detail::index_type num_keys) ++CUCO_KERNEL void bitset_test_kernel(BitsetRef ref, ++ KeyIt keys, ++ OutputIt outputs, ++ cuco::detail::index_type num_keys) + { + auto key_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); +@@ -70,7 +70,7 @@ __global__ void bitset_test_kernel(BitsetRef ref, + * @param num_keys Number of input keys + */ + template +-__global__ void bitset_rank_kernel(BitsetRef ref, ++CUCO_KERNEL void bitset_rank_kernel(BitsetRef ref, + KeyIt keys, + OutputIt outputs, + cuco::detail::index_type num_keys) +@@ -99,7 +99,7 @@ __global__ void bitset_rank_kernel(BitsetRef ref, + * @param num_keys Number of input keys + */ + template +-__global__ void bitset_select_kernel(BitsetRef ref, ++CUCO_KERNEL void bitset_select_kernel(BitsetRef ref, + KeyIt keys, + OutputIt outputs, + cuco::detail::index_type num_keys) +@@ -125,7 +125,7 @@ __global__ void bitset_select_kernel(BitsetRef ref, + * @param flip_bits Boolean to request negation of words before counting bits + */ + template +-__global__ void bit_counts_kernel(WordType const* words, ++CUCO_KERNEL void bit_counts_kernel(WordType const* words, + SizeType* bit_counts, + cuco::detail::index_type num_words, + bool flip_bits) +@@ -157,7 +157,7 @@ __global__ void bit_counts_kernel(WordType const* words, + * @param words_per_block Number of words in each block + */ + template +-__global__ void encode_ranks_from_prefix_bit_counts(const SizeType* prefix_bit_counts, ++CUCO_KERNEL void encode_ranks_from_prefix_bit_counts(const SizeType* prefix_bit_counts, + rank* ranks, + SizeType num_words, + SizeType num_blocks, +@@ -200,7 +200,7 @@ __global__ void encode_ranks_from_prefix_bit_counts(const SizeType* prefix_bit_c + * @param bits_per_block Number of bits in each block + */ + template +-__global__ void mark_blocks_with_select_entries(SizeType const* prefix_bit_counts, ++CUCO_KERNEL void mark_blocks_with_select_entries(SizeType const* prefix_bit_counts, + SizeType* select_markers, + SizeType num_blocks, + SizeType words_per_block, +diff --git a/include/cuco/detail/utility/cuda.cuh b/include/cuco/detail/utility/cuda.cuh +index 6e5f13f..d063e50 100644 +--- a/include/cuco/detail/utility/cuda.cuh ++++ b/include/cuco/detail/utility/cuda.cuh +@@ -17,6 +17,10 @@ + + #include + ++#ifndef CUCO_KERNEL ++#define CUCO_KERNEL __global__ static ++#endif ++ + namespace cuco { + namespace detail { + +diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh +index 187f930..d662370 100644 +--- a/include/cuco/static_map.cuh ++++ b/include/cuco/static_map.cuh +@@ -1820,7 +1820,7 @@ class static_map { + * Example: + * @code{.cpp} + * template +- * __global__ void use_device_view(const typename MapType::device_view device_view, ++ * CUCO_KERNEL void use_device_view(const typename MapType::device_view device_view, + * map_key_t const* const keys_to_search, + * map_value_t* const values_found, + * const size_t number_of_elements) +-- +2.43.0 diff --git a/cpp/cmake/thirdparty/patches/cuco_override.json b/cpp/cmake/thirdparty/patches/cuco_override.json new file mode 100644 index 00000000000..98a35738d27 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/cuco_override.json @@ -0,0 +1,14 @@ + +{ + "packages" : { + "cuco" : { + "patches" : [ + { + "file" : "${current_json_dir}/cuco_hide_kernels.diff", + "issue" : "Fix hide kernel visibility [https://github.com/NVIDIA/cuCollections/pull/422]", + "fixed_in" : "" + } + ] + } + } +}