From 38820ff0e8cd7cd54793fd5c49fb1566a24686b1 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 4 Dec 2024 11:43:37 -0600 Subject: [PATCH] Update to CCCL 2.7.0-rc2. (#17233) This PR updates to CCCL 2.7.0-rc2. Do not merge until all of RAPIDS is ready to update. Depends on https://github.com/rapidsai/rapids-cmake/pull/710 and should be admin-merged immediately after that PR. Part of https://github.com/rapidsai/build-planning/issues/115. --------- Co-authored-by: Michael Schellenberger Costa --- .../thirdparty/patches/cccl_override.json | 5 -- .../patches/cccl_symbol_visibility.diff | 27 -------- .../thrust_disable_64bit_dispatching.diff | 66 ++++++++++++++----- .../thrust_faster_sort_compile_times.diff | 12 ++-- 4 files changed, 56 insertions(+), 54 deletions(-) delete mode 100644 cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index dcf9c1139f9..2f29578f7ae 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -3,11 +3,6 @@ "packages" : { "CCCL" : { "patches" : [ - { - "file" : "${current_json_dir}/cccl_symbol_visibility.diff", - "issue" : "Correct symbol visibility issues in libcudacxx [https://github.com/NVIDIA/cccl/pull/1832/]", - "fixed_in" : "2.6" - }, { "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", diff --git a/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff b/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff deleted file mode 100644 index f745d5fa314..00000000000 --- a/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff +++ /dev/null @@ -1,27 +0,0 @@ -diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__config b/libcudacxx/include/cuda/std/detail/libcxx/include/__config -index e7c62c031b..5db861853a 100644 ---- a/libcudacxx/include/cuda/std/detail/libcxx/include/__config -+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__config -@@ -1049,7 +1049,6 @@ typedef __char32_t char32_t; - # define _LIBCUDACXX_EXPORTED_FROM_ABI __declspec(dllimport) - # endif - --# define _LIBCUDACXX_TYPE_VIS _LIBCUDACXX_DLL_VIS - # define _LIBCUDACXX_FUNC_VIS _LIBCUDACXX_DLL_VIS - # define _LIBCUDACXX_EXCEPTION_ABI _LIBCUDACXX_DLL_VIS - # define _LIBCUDACXX_HIDDEN -@@ -1448,14 +1447,6 @@ __sanitizer_annotate_contiguous_container(const void*, const void*, const void*, - # define _LIBCUDACXX_WEAK __attribute__((__weak__)) - # endif - --// Redefine some macros for internal use --# if defined(__cuda_std__) --# undef _LIBCUDACXX_FUNC_VIS --# define _LIBCUDACXX_FUNC_VIS _LIBCUDACXX_INLINE_VISIBILITY --# undef _LIBCUDACXX_TYPE_VIS --# define _LIBCUDACXX_TYPE_VIS --# endif // __cuda_std__ -- - // Thread API - # ifndef _LIBCUDACXX_HAS_THREAD_API_EXTERNAL - # if defined(_CCCL_COMPILER_NVRTC) || defined(__EMSCRIPTEN__) diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff index 6ae1e1c917b..291eabe25fd 100644 --- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff +++ b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff @@ -1,25 +1,59 @@ diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h -index 2a3cc4e33..8fb337b26 100644 +index 971b93d62..0d6b25b07 100644 --- a/thrust/thrust/system/cuda/detail/dispatch.h +++ b/thrust/thrust/system/cuda/detail/dispatch.h -@@ -44,8 +44,7 @@ - } \ - else \ - { \ -- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -- status = call arguments; \ +@@ -36,16 +36,15 @@ + * that callables for both branches consist of the same tokens, and is intended to be used with Thrust-style dispatch + * interfaces, that always deduce the size type from the arguments. + */ +-#define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \ +- if (count <= thrust::detail::integer_traits::const_max) \ +- { \ +- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ +- status = call arguments; \ +- } \ +- else \ +- { \ +- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ +- status = call arguments; \ ++#define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \ ++ if (count <= thrust::detail::integer_traits::const_max) \ ++ { \ ++ auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ ++ status = call arguments; \ ++ } \ ++ else \ ++ { \ + throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ } - + /** -@@ -66,9 +65,7 @@ - } \ - else \ - { \ -- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -- status = call arguments; \ +@@ -55,18 +54,16 @@ + * + * This version of the macro supports providing two count variables, which is necessary for set algorithms. + */ +-#define THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, call, count1, count2, arguments) \ +- if (count1 + count2 <= thrust::detail::integer_traits::const_max) \ +- { \ +- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ +- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ +- status = call arguments; \ +- } \ +- else \ +- { \ +- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ +- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ +- status = call arguments; \ ++#define THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, call, count1, count2, arguments) \ ++ if (count1 + count2 <= thrust::detail::integer_traits::const_max) \ ++ { \ ++ auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ ++ auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ ++ status = call arguments; \ ++ } \ ++ else \ ++ { \ + throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ } + /** - * Dispatch between 32-bit and 64-bit index based versions of the same algorithm diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff index cb0cc55f4d2..5f1981e9806 100644 --- a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff +++ b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff @@ -1,20 +1,20 @@ diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh -index eb76ebb0b..c6c529a50 100644 +index 29510db5e..cf57e5786 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh @@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( KeyT key1 = keys_shared[keys1_beg]; KeyT key2 = keys_shared[keys2_beg]; - + -#pragma unroll +#pragma unroll 1 for (int item = 0; item < ITEMS_PER_THREAD; ++item) { - bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); -@@ -376,7 +376,7 @@ public: + const bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); +@@ -374,7 +374,7 @@ public: // KeyT max_key = oob_default; - + -#pragma unroll +#pragma unroll 1 for (int item = 1; item < ITEMS_PER_THREAD; ++item) @@ -27,7 +27,7 @@ index 7d9e8622f..da5627306 100644 @@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE { constexpr bool KEYS_ONLY = ::cuda::std::is_same::value; - + -#pragma unroll +#pragma unroll 1 for (int i = 0; i < ITEMS_PER_THREAD; ++i)