Mark all CUDA kernel launches as hidden
Effect on binary size of libraft.a:

  23.12: 133,361,630 bytes
  pr:    129,748,904 bytes (-3,612,726 bytes, roughly 2.7% smaller)

Effect on binary size of libraft.so:

  23.12: 83,603,224 bytes
  pr:    83,873,088 bytes (+269,864 bytes, roughly 0.3% larger)
robertmaynard committed Oct 12, 2023
1 parent 1a660d2 commit e630d0f
Showing 146 changed files with 303 additions and 309 deletions.
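Every hunk below applies the same mechanical substitution: a raw __global__ void (or static __global__ void) kernel signature becomes RAFT_KERNEL. For context, here is a minimal sketch of such a macro, assuming it expands to static __global__ void; the exact definition lives in RAFT's macro headers and may differ:

// Sketch of a RAFT_KERNEL-style macro. This is an assumption consistent with
// most substitutions in this diff, not necessarily RAFT's exact definition.
// Making kernels static gives them internal linkage, so these header-only
// templates no longer export a kernel symbol from every translation unit that
// instantiates them; the linker can then drop duplicate definitions, which is
// where the libraft.a size reduction above comes from.
#if defined(__CUDACC__)
#define RAFT_KERNEL static __global__ void
#else
#define RAFT_KERNEL static void  // keeps host-only compilation of shared headers working
#endif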
2 changes: 1 addition & 1 deletion cpp/bench/prims/distance/masked_nn.cu
@@ -46,7 +46,7 @@ struct Params {
   AdjacencyPattern pattern;
 };  // struct Params
 
-__global__ void init_adj(AdjacencyPattern pattern,
+RAFT_KERNEL init_adj(AdjacencyPattern pattern,
                          int n,
                          raft::device_matrix_view<bool, int, raft::layout_c_contiguous> adj,
                          raft::device_vector_view<int, int, raft::layout_c_contiguous> group_idxs)
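Call sites are unaffected by this rename, since only the declaration changes. A sketch of a launch of the kernel above; the grid, block, and stream values here are hypothetical rather than taken from the benchmark:

// Hypothetical launch site: the triple-chevron syntax is identical whether the
// kernel was declared with __global__ void or with RAFT_KERNEL.
dim3 block(32, 32);
dim3 grid((n + block.x - 1) / block.x, (n + block.y - 1) / block.y);
init_adj<<<grid, block, 0, stream>>>(pattern, n, adj, group_idxs);
RAFT_CUDA_TRY(cudaPeekAtLastError());  // surface launch-configuration errors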
2 changes: 1 addition & 1 deletion cpp/bench/prims/sparse/convert_csr.cu
@@ -30,7 +30,7 @@ struct bench_param {
 };
 
 template <typename index_t>
-__global__ void init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor)
+RAFT_KERNEL init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor)
 {
   index_t r = blockDim.y * blockIdx.y + threadIdx.y;
   index_t c = blockDim.x * blockIdx.x + threadIdx.x;
4 changes: 2 additions & 2 deletions cpp/include/raft/cluster/detail/agglomerative.cuh
@@ -155,7 +155,7 @@ void build_dendrogram_host(raft::resources const& handle,
 }
 
 template <typename value_idx>
-__global__ void write_levels_kernel(const value_idx* children,
+RAFT_KERNEL write_levels_kernel(const value_idx* children,
                                     value_idx* parents,
                                     value_idx n_vertices)
 {
@@ -179,7 +179,7 @@ __global__ void write_levels_kernel(const value_idx* children,
  * @param labels
  */
 template <typename value_idx>
-__global__ void inherit_labels(const value_idx* children,
+RAFT_KERNEL inherit_labels(const value_idx* children,
                                const value_idx* levels,
                                std::size_t n_leaves,
                                value_idx* labels,
2 changes: 1 addition & 1 deletion cpp/include/raft/cluster/detail/connectivities.cuh
@@ -107,7 +107,7 @@ struct distance_graph_impl<raft::cluster::LinkageDistance::KNN_GRAPH, value_idx,
 };
 
 template <typename value_idx>
-__global__ void fill_indices2(value_idx* indices, size_t m, size_t nnz)
+RAFT_KERNEL fill_indices2(value_idx* indices, size_t m, size_t nnz)
 {
   value_idx tid = (blockIdx.x * blockDim.x) + threadIdx.x;
   if (tid >= nnz) return;
2 changes: 1 addition & 1 deletion cpp/include/raft/cluster/detail/kmeans_balanced.cuh
@@ -434,7 +434,7 @@ template <uint32_t BlockDimY,
           typename LabelT,
           typename CounterT,
           typename MappingOpT>
-__global__ void __launch_bounds__((WarpSize * BlockDimY))
+__launch_bounds__((WarpSize * BlockDimY)) RAFT_KERNEL
   adjust_centers_kernel(MathT* centers,  // [n_clusters, dim]
                         IdxT n_clusters,
                         IdxT dim,
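Note the reordering in this hunk: __launch_bounds__ moves in front of the macro, while other files in this commit (for example merge_labels.cuh below) leave it after. Both placements compile under nvcc, assuming the macro expands to static __global__ void, because CUDA accepts __launch_bounds__ either before the declaration or between the return type and the kernel name. A self-contained sketch; RAFT_KERNEL_SKETCH is a hypothetical stand-in, not RAFT's macro:

// Both __launch_bounds__ placements seen in this diff, side by side.
#define RAFT_KERNEL_SKETCH static __global__ void  // assumed expansion

__launch_bounds__(256) RAFT_KERNEL_SKETCH kernel_a(int* out)  // attribute first
{
  out[threadIdx.x] = 0;
}

RAFT_KERNEL_SKETCH __launch_bounds__(256) kernel_b(int* out)  // attribute after the return type
{
  out[threadIdx.x] = 1;
}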
10 changes: 5 additions & 5 deletions cpp/include/raft/cluster/detail/kmeans_deprecated.cuh
@@ -92,7 +92,7 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE);
  *   initialized to zero.
  */
 template <typename index_type_t, typename value_type_t>
-static __global__ void computeDistances(index_type_t n,
+RAFT_KERNEL computeDistances(index_type_t n,
                                         index_type_t d,
                                         index_type_t k,
                                         const value_type_t* __restrict__ obs,
@@ -173,7 +173,7 @@ static __global__ void computeDistances(index_type_t n,
  * cluster. Entries must be initialized to zero.
  */
 template <typename index_type_t, typename value_type_t>
-static __global__ void minDistances(index_type_t n,
+RAFT_KERNEL minDistances(index_type_t n,
                                     index_type_t k,
                                     value_type_t* __restrict__ dists,
                                     index_type_t* __restrict__ codes,
@@ -233,7 +233,7 @@ static __global__ void minDistances(index_type_t n,
  * @param code_new Index associated with new centroid.
  */
 template <typename index_type_t, typename value_type_t>
-static __global__ void minDistances2(index_type_t n,
+RAFT_KERNEL minDistances2(index_type_t n,
                                      value_type_t* __restrict__ dists_old,
                                      const value_type_t* __restrict__ dists_new,
                                      index_type_t* __restrict__ codes_old,
@@ -275,7 +275,7 @@ static __global__ void minDistances2(index_type_t n,
  * cluster. Entries must be initialized to zero.
  */
 template <typename index_type_t>
-static __global__ void computeClusterSizes(index_type_t n,
+RAFT_KERNEL computeClusterSizes(index_type_t n,
                                            const index_type_t* __restrict__ codes,
                                            index_type_t* __restrict__ clusterSizes)
 {
@@ -308,7 +308,7 @@ static __global__ void computeClusterSizes(index_type_t n,
  * column is the mean position of a cluster).
  */
 template <typename index_type_t, typename value_type_t>
-static __global__ void divideCentroids(index_type_t d,
+RAFT_KERNEL divideCentroids(index_type_t d,
                                        index_type_t k,
                                        const index_type_t* __restrict__ clusterSizes,
                                        value_type_t* __restrict__ centroids)
2 changes: 1 addition & 1 deletion cpp/include/raft/common/detail/scatter.cuh
@@ -22,7 +22,7 @@
 
 namespace raft::detail {
 
 template <typename DataT, int VecLen, typename Lambda, typename IdxT>
-__global__ void scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op)
+RAFT_KERNEL scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op)
 {
   typedef TxN_t<DataT, VecLen> DataVec;
   typedef TxN_t<IdxT, VecLen> IdxVec;
2 changes: 1 addition & 1 deletion cpp/include/raft/core/detail/copy.hpp
@@ -329,7 +329,7 @@ __device__ auto increment_indices(IdxType* indices,
  * parameters.
  */
 template <typename DstType, typename SrcType>
-__global__ mdspan_copyable_with_kernel_t<DstType, SrcType> mdspan_copy_kernel(DstType dst,
+RAFT_KERNEL mdspan_copyable_with_kernel_t<DstType, SrcType> mdspan_copy_kernel(DstType dst,
                                                                                SrcType src)
 {
   using config = mdspan_copyable<true, DstType, SrcType>;
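This hunk is the odd one out: the kernel's return type is the SFINAE alias mdspan_copyable_with_kernel_t rather than a plain void, so here the macro can only contribute the qualifiers for the resulting line to parse. A sketch of the same constrained-kernel pattern; the macro and alias below are hypothetical illustrations, not RAFT's definitions:

// A SFINAE-constrained kernel: the alias resolves to void only when Src is
// assignable to Dst, removing the kernel from overload resolution otherwise.
#include <type_traits>

template <typename Dst, typename Src>
using copyable_t = std::enable_if_t<std::is_assignable_v<Dst&, Src>, void>;

#define KERNEL_SKETCH static __global__  // note: no trailing void here

template <typename Dst, typename Src>
KERNEL_SKETCH copyable_t<Dst, Src> copy_one(Dst* dst, const Src* src)
{
  *dst = *src;  // placeholder body; a real kernel would index by thread
}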
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/compress_to_bits.cuh
@@ -35,7 +35,7 @@ namespace raft::distance::detail {
  * Note: the division (`/`) is a ceilDiv.
  */
 template <typename T = uint64_t, typename = std::enable_if_t<std::is_integral<T>::value>>
-__global__ void compress_to_bits_kernel(
+RAFT_KERNEL compress_to_bits_kernel(
   raft::device_matrix_view<const bool, int, raft::layout_c_contiguous> in,
   raft::device_matrix_view<T, int, raft::layout_c_contiguous> out)
 {
4 changes: 2 additions & 2 deletions cpp/include/raft/distance/detail/fused_l2_nn.cuh
@@ -87,7 +87,7 @@ struct MinReduceOpImpl {
 };
 
 template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT>
-__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp)
+RAFT_KERNEL initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp)
 {
   auto tid = IdxT(blockIdx.x) * blockDim.x + threadIdx.x;
   if (tid < m) { redOp.init(min + tid, maxVal); }
@@ -139,7 +139,7 @@ template <typename DataT,
           typename KVPReduceOpT,
           typename OpT,
           typename FinalLambda>
-__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min,
+__launch_bounds__(P::Nthreads, 2) RAFT_KERNEL fusedL2NNkernel(OutT* min,
                                                                   const DataT* x,
                                                                   const DataT* y,
                                                                   const DataT* xn,
10 changes: 5 additions & 5 deletions cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh
@@ -36,7 +36,7 @@ namespace raft::distance::kernels::detail {
  * @param offset
  */
 template <typename math_t, typename exp_t>
-__global__ void polynomial_kernel_nopad(
+RAFT_KERNEL polynomial_kernel_nopad(
   math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset)
 {
   for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len;
@@ -56,7 +56,7 @@ __global__ void polynomial_kernel_nopad(
  * @param offset
  */
 template <typename math_t, typename exp_t>
-__global__ void polynomial_kernel(
+RAFT_KERNEL polynomial_kernel(
   math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset)
 {
   for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
@@ -75,7 +75,7 @@ __global__ void polynomial_kernel(
  * @param offset
  */
 template <typename math_t>
-__global__ void tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset)
+RAFT_KERNEL tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset)
 {
   for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len;
        tid += blockDim.x * gridDim.x) {
@@ -93,7 +93,7 @@ __global__ void tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t
  * @param offset
  */
 template <typename math_t>
-__global__ void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset)
+RAFT_KERNEL tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset)
 {
   for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
        tidy += blockDim.y * gridDim.y)
@@ -121,7 +121,7 @@ __global__ void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t ga
  * @param gain
  */
 template <typename math_t>
-__global__ void rbf_kernel_expanded(
+RAFT_KERNEL rbf_kernel_expanded(
   math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain)
 {
   for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/masked_nn.cuh
@@ -40,7 +40,7 @@ template <typename DataT,
           typename KVPReduceOpT,
           typename CoreLambda,
           typename FinalLambda>
-__global__ __launch_bounds__(P::Nthreads, 2) void masked_l2_nn_kernel(OutT* min,
+__launch_bounds__(P::Nthreads, 2) RAFT_KERNEL masked_l2_nn_kernel(OutT* min,
                                                                       const DataT* x,
                                                                       const DataT* y,
                                                                       const DataT* xn,
@@ -31,7 +31,7 @@ template <typename Policy,
           typename DataT,
           typename OutT,
           typename FinOpT>
-__global__ __launch_bounds__(Policy::Nthreads, 2) void pairwise_matrix_kernel(
+__launch_bounds__(Policy::Nthreads, 2) RAFT_KERNEL pairwise_matrix_kernel(
   OpT distance_op, pairwise_matrix_params<IdxT, DataT, OutT, FinOpT> params)
 {
   // Early exit to minimize the size of the kernel when it is not supposed to be compiled.
2 changes: 1 addition & 1 deletion cpp/include/raft/label/detail/classlabels.cuh
@@ -119,7 +119,7 @@ void getOvrlabels(
 // +/-1, return array with the new class labels and corresponding indices.
 
 template <typename Type, int TPB_X, typename Lambda>
-__global__ void map_label_kernel(Type* map_ids,
+RAFT_KERNEL map_label_kernel(Type* map_ids,
                                  size_t N_labels,
                                  Type* in,
                                  Type* out,
4 changes: 2 additions & 2 deletions cpp/include/raft/label/detail/merge_labels.cuh
@@ -32,7 +32,7 @@ namespace detail {
  * For an additional cost we can build the graph with edges
  * E={(A[i], B[i]) | M[i]=1} and make this step faster */
 template <typename value_idx, int TPB_X = 256>
-__global__ void __launch_bounds__(TPB_X)
+RAFT_KERNEL __launch_bounds__(TPB_X)
   propagate_label_kernel(const value_idx* __restrict__ labels_a,
                          const value_idx* __restrict__ labels_b,
                          value_idx* __restrict__ R,
@@ -65,7 +65,7 @@ __global__ void __launch_bounds__(TPB_X)
 }
 
 template <typename value_idx, int TPB_X = 256>
-__global__ void __launch_bounds__(TPB_X)
+RAFT_KERNEL __launch_bounds__(TPB_X)
   reassign_label_kernel(value_idx* __restrict__ labels_a,
                         const value_idx* __restrict__ labels_b,
                         const value_idx* __restrict__ R,
2 changes: 1 addition & 1 deletion cpp/include/raft/linalg/detail/add.cuh
@@ -38,7 +38,7 @@ void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t st
 }
 
 template <class InT, typename IdxType, typename OutT = InT>
-__global__ void add_dev_scalar_kernel(OutT* outDev,
+RAFT_KERNEL add_dev_scalar_kernel(OutT* outDev,
                                       const InT* inDev,
                                       const InT* singleScalarDev,
                                       IdxType len)
6 changes: 3 additions & 3 deletions cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh
@@ -40,7 +40,7 @@ template <typename Policy,
           typename MainLambda,
           typename ReduceLambda,
           typename FinalLambda>
-__global__ void __launch_bounds__(Policy::ThreadsPerBlock)
+RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock)
   coalescedReductionThinKernel(OutType* dots,
                                const InType* data,
                                IdxType D,
@@ -137,7 +137,7 @@ template <int TPB,
           typename MainLambda,
           typename ReduceLambda,
           typename FinalLambda>
-__global__ void __launch_bounds__(TPB) coalescedReductionMediumKernel(OutType* dots,
+RAFT_KERNEL __launch_bounds__(TPB) coalescedReductionMediumKernel(OutType* dots,
                                                                       const InType* data,
                                                                       IdxType D,
                                                                       IdxType N,
@@ -225,7 +225,7 @@ template <typename Policy,
           typename IdxType,
           typename MainLambda,
           typename ReduceLambda>
-__global__ void __launch_bounds__(Policy::ThreadsPerBlock)
+RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock)
   coalescedReductionThickKernel(OutType* buffer,
                                 const InType* data,
                                 IdxType D,
2 changes: 1 addition & 1 deletion cpp/include/raft/linalg/detail/map.cuh
@@ -65,7 +65,7 @@ __device__ __forceinline__ void map_kernel_mainloop(
 }
 
 template <int R, bool PassOffset, typename OutT, typename IdxT, typename Func, typename... InTs>
-__global__ void map_kernel(OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs)
+RAFT_KERNEL map_kernel(OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs)
 {
   const IdxT tid = blockIdx.x * blockDim.x + threadIdx.x;
   if constexpr (R <= 1) {
2 changes: 1 addition & 1 deletion cpp/include/raft/linalg/detail/map_then_reduce.cuh
@@ -52,7 +52,7 @@ template <typename InType,
           typename ReduceLambda,
           int TPB,
           typename... Args>
-__global__ void mapThenReduceKernel(OutType* out,
+RAFT_KERNEL mapThenReduceKernel(OutType* out,
                                     IdxType len,
                                     OutType neutral,
                                     MapOp map,
4 changes: 2 additions & 2 deletions cpp/include/raft/linalg/detail/normalize.cuh
@@ -35,7 +35,7 @@ template <typename Policy,
           typename MainLambda,
           typename ReduceLambda,
           typename FinalLambda>
-__global__ void __launch_bounds__(Policy::ThreadsPerBlock)
+RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock)
   coalesced_normalize_thin_kernel(Type* out,
                                   const Type* in,
                                   IdxType D,
@@ -92,7 +92,7 @@ template <int TPB,
           typename MainLambda,
           typename ReduceLambda,
           typename FinalLambda>
-__global__ void __launch_bounds__(TPB) coalesced_normalize_medium_kernel(Type* out,
+RAFT_KERNEL __launch_bounds__(TPB) coalesced_normalize_medium_kernel(Type* out,
                                                                          const Type* in,
                                                                          IdxType D,
                                                                          IdxType N,
4 changes: 2 additions & 2 deletions cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh
@@ -29,7 +29,7 @@ namespace detail {
 ///@todo: specialize this to support shared-mem based atomics
 
 template <typename T, typename KeyIteratorT, typename IdxType>
-__global__ void reduce_cols_by_key_direct_kernel(
+RAFT_KERNEL reduce_cols_by_key_direct_kernel(
   const T* data, const KeyIteratorT keys, T* out, IdxType nrows, IdxType ncols, IdxType nkeys)
 {
   typedef typename std::iterator_traits<KeyIteratorT>::value_type KeyType;
@@ -44,7 +44,7 @@ __global__ void reduce_cols_by_key_direct_kernel(
 }
 
 template <typename T, typename KeyIteratorT, typename IdxType>
-__global__ void reduce_cols_by_key_cached_kernel(
+RAFT_KERNEL reduce_cols_by_key_cached_kernel(
   const T* data, const KeyIteratorT keys, T* out, IdxType nrows, IdxType ncols, IdxType nkeys)
 {
   typedef typename std::iterator_traits<KeyIteratorT>::value_type KeyType;
8 changes: 4 additions & 4 deletions cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
@@ -33,7 +33,7 @@ namespace detail {
 //
 
 template <typename IteratorT1, typename IteratorT2>
-void __global__ convert_array_kernel(IteratorT1 dst, IteratorT2 src, int n)
+RAFT_KERNEL convert_array_kernel(IteratorT1 dst, IteratorT2 src, int n)
 {
   for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < n; idx += gridDim.x * blockDim.x) {
     dst[idx] = src[idx];
@@ -95,7 +95,7 @@ struct quadSum {
 template <typename DataIteratorT, typename WeightT, typename SumsT, typename IdxT>
 __launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4)
 
-__global__ void sum_rows_by_key_small_nkeys_kernel(const DataIteratorT d_A,
+RAFT_KERNEL sum_rows_by_key_small_nkeys_kernel(const DataIteratorT d_A,
                                                    IdxT lda,
                                                    const char* d_keys,
                                                    const WeightT* d_weights,
@@ -193,7 +193,7 @@ template <typename DataIteratorT,
           typename WeightT,
           typename SumsT,
           typename IdxT>
-__global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT d_A,
+RAFT_KERNEL sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT d_A,
                                                             IdxT lda,
                                                             KeysIteratorT d_keys,
                                                             const WeightT* d_weights,
@@ -269,7 +269,7 @@ template <typename DataIteratorT,
           typename WeightT,
           typename SumsT,
           typename IdxT>
-__global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT d_A,
+RAFT_KERNEL sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT d_A,
                                                             IdxT lda,
                                                             const WeightT* d_weights,
                                                             KeysIteratorT d_keys,
4 changes: 2 additions & 2 deletions cpp/include/raft/linalg/detail/strided_reduction.cuh
@@ -30,7 +30,7 @@ namespace detail {
 // of the matrix, i.e. reduce along columns for row major or reduce along rows
 // for column major layout
 template <typename Type, typename MainLambda>
-__global__ void stridedSummationKernel(
+RAFT_KERNEL stridedSummationKernel(
   Type* dots, const Type* data, int D, int N, Type init, MainLambda main_op)
 {
   // Thread reduction
@@ -68,7 +68,7 @@ template <typename InType,
           typename IdxType,
           typename MainLambda,
           typename ReduceLambda>
-__global__ void stridedReductionKernel(OutType* dots,
+RAFT_KERNEL stridedReductionKernel(OutType* dots,
                                        const InType* data,
                                        int D,
                                        int N,
2 changes: 1 addition & 1 deletion cpp/include/raft/linalg/detail/subtract.cuh
@@ -38,7 +38,7 @@ void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream
 }
 
 template <class math_t, typename IdxType>
-__global__ void subtract_dev_scalar_kernel(math_t* outDev,
+RAFT_KERNEL subtract_dev_scalar_kernel(math_t* outDev,
                                            const math_t* inDev,
                                            const math_t* singleScalarDev,
                                            IdxType len)