Mark all cugraph CUDA kernels with internal linkage
robertmaynard committed Jan 17, 2024
1 parent 8672534 commit ffcfbcc
Showing 19 changed files with 243 additions and 219 deletions.
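The change itself is mechanical: each `__global__` kernel definition gains the `static` keyword, giving the kernel internal linkage. A minimal before/after sketch of the pattern (illustrative kernel, not cugraph code):

```cuda
// Before: external linkage. The mangled kernel symbol is visible to the
// linker and lands in the shared library's symbol table.
__global__ void scale_kernel(float* x, float a, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { x[i] *= a; }
}

// After: internal linkage. The symbol stays private to this translation
// unit, so an identically named kernel in another .cu file cannot collide
// with it at link time.
__global__ static void scale_kernel_internal(float* x, float a, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { x[i] *= a; }
}
```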
9 changes: 9 additions & 0 deletions .github/workflows/pr.yaml
@@ -15,6 +15,7 @@ jobs:
       - checks
       - conda-cpp-build
       - conda-cpp-tests
+      - conda-cpp-checks
      - conda-notebook-tests
      - conda-python-build
      - conda-python-tests
@@ -50,6 +51,14 @@ jobs:
    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
    with:
      build_type: pull-request
+  conda-cpp-checks:
+    needs: conda-cpp-build
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: pull-request
+      enable_check_symbols: true
+      symbol_exclusions: (hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel)
  conda-python-build:
    needs: conda-cpp-build
    secrets: inherit
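The new `conda-cpp-checks` job enables `enable_check_symbols`, which, judging by the option name, fails the check when kernel symbols remain publicly visible in the built libraries; `symbol_exclusions` grandfathers in the hornet kernels that still have external linkage. A sketch of the distinction the checker cares about (illustrative names, not cugraph code):

```cuda
// External linkage: the mangled name of this kernel shows up in the shared
// library's dynamic symbol table (e.g. in `nm -D libexample.so`).
__global__ void leaky_kernel(int* out) { *out = 1; }

// Internal linkage: absent from the dynamic symbol table entirely.
__global__ static void quiet_kernel(int* out) { *out = 2; }
```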
10 changes: 10 additions & 0 deletions .github/workflows/test.yaml
@@ -14,6 +14,16 @@ on:
        type: string

jobs:
+  conda-cpp-checks:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      enable_check_symbols: true
+      symbol_exclusions: (hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel)
  conda-cpp-tests:
    secrets: inherit
    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
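The nightly job runs the same symbol check with the same exclusions. Note that `static` is not the only route to internal linkage; an anonymous namespace achieves the same effect, as in this sketch (the commit simply standardizes on `static`):

```cuda
namespace {  // everything defined here has internal linkage

__global__ void init_kernel(float* x, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { x[i] = 0.0f; }
}

}  // namespace
```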
6 changes: 3 additions & 3 deletions cpp/include/cugraph/detail/decompress_edge_partition.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -43,7 +43,7 @@ namespace detail {
int32_t constexpr decompress_edge_partition_block_size = 1024;

template <typename vertex_t, typename edge_t, bool multi_gpu>
-__global__ void decompress_to_edgelist_mid_degree(
+__global__ static void decompress_to_edgelist_mid_degree(
edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition,
vertex_t major_range_first,
vertex_t major_range_last,
@@ -73,7 +73,7 @@ __global__ void decompress_to_edgelist_mid_degree(
}

template <typename vertex_t, typename edge_t, bool multi_gpu>
-__global__ void decompress_to_edgelist_high_degree(
+__global__ static void decompress_to_edgelist_high_degree(
edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition,
vertex_t major_range_first,
vertex_t major_range_last,
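The kernels in this header are function templates included by many .cu files. Template instantiations ordinarily get weak external linkage, so even a header-only kernel surfaces as a weak symbol in the library; marking the template `static` makes every instantiation internal to the including translation unit. A sketch of the pattern (hypothetical header, not this one):

```cuda
// fill.cuh -- included from several .cu files
template <typename T>
__global__ static void fill_kernel(T* data, T value, int n)
{
  // each including translation unit gets its own private instantiation
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { data[i] = value; }
}
```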
4 changes: 2 additions & 2 deletions cpp/src/community/legacy/ecg.cu
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -50,7 +50,7 @@ binsearch_maxle(const IndexType* vec, const IndexType val, IndexType low, IndexT
// FIXME: This shouldn't need to be a custom kernel, this
// seems like it should just be a thrust::transform
template <typename IdxT, typename ValT>
-__global__ void match_check_kernel(
+__global__ static void match_check_kernel(
IdxT size, IdxT num_verts, IdxT* offsets, IdxT* indices, IdxT* parts, ValT* weights)
{
IdxT tid = blockIdx.x * blockDim.x + threadIdx.x;
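The FIXME above suggests replacing this kernel with a stock thrust algorithm. A hypothetical sketch of that direction, not the actual cugraph code: it assumes a flat src/dst edge list (the real kernel walks CSR offsets/indices via the `binsearch_maxle` helper above), assumes the semantics are to bump an edge's weight when both endpoints land in the same partition, and the device lambda requires nvcc's `--extended-lambda` flag.

```cuda
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>

template <typename IdxT, typename ValT>
void match_check(IdxT num_edges,
                 IdxT const* src,
                 IdxT const* dst,
                 IdxT const* parts,
                 ValT* weights)
{
  thrust::for_each(thrust::device,
                   thrust::counting_iterator<IdxT>(0),
                   thrust::counting_iterator<IdxT>(num_edges),
                   [=] __device__(IdxT e) {
                     // assumed semantics: reward edges whose endpoints agree
                     if (parts[src[e]] == parts[dst[e]]) { weights[e] += ValT{1}; }
                   });
}
```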
32 changes: 16 additions & 16 deletions cpp/src/components/legacy/weak_cc.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -57,15 +57,15 @@ class WeakCCState {
};

template <typename vertex_t, typename edge_t, int TPB_X = 32>
-__global__ void weak_cc_label_device(vertex_t* labels,
-                                     edge_t const* offsets,
-                                     vertex_t const* indices,
-                                     edge_t nnz,
-                                     bool* fa,
-                                     bool* xa,
-                                     bool* m,
-                                     vertex_t startVertexId,
-                                     vertex_t batchSize)
+__global__ static void weak_cc_label_device(vertex_t* labels,
+                                            edge_t const* offsets,
+                                            vertex_t const* indices,
+                                            edge_t nnz,
+                                            bool* fa,
+                                            bool* xa,
+                                            bool* m,
+                                            vertex_t startVertexId,
+                                            vertex_t batchSize)
{
vertex_t tid = threadIdx.x + blockIdx.x * TPB_X;
if (tid < batchSize) {
@@ -116,11 +116,11 @@ __global__ void weak_cc_label_device(vertex_t* labels,
}

template <typename vertex_t, int TPB_X = 32, typename Lambda>
-__global__ void weak_cc_init_label_kernel(vertex_t* labels,
-                                          vertex_t startVertexId,
-                                          vertex_t batchSize,
-                                          vertex_t MAX_LABEL,
-                                          Lambda filter_op)
+__global__ static void weak_cc_init_label_kernel(vertex_t* labels,
+                                                 vertex_t startVertexId,
+                                                 vertex_t batchSize,
+                                                 vertex_t MAX_LABEL,
+                                                 Lambda filter_op)
{
/** F1 and F2 in the paper correspond to fa and xa */
/** Cd in paper corresponds to db_cluster */
@@ -132,7 +132,7 @@ __global__ void weak_cc_init_label_kernel(vertex_t* labels,
}

template <typename vertex_t, int TPB_X = 32>
-__global__ void weak_cc_init_all_kernel(
+__global__ static void weak_cc_init_all_kernel(
vertex_t* labels, bool* fa, bool* xa, vertex_t N, vertex_t MAX_LABEL)
{
vertex_t tid = threadIdx.x + blockIdx.x * TPB_X;
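Internal linkage changes nothing about how these kernels are launched: host functions in the same translation unit keep using the ordinary `<<<...>>>` syntax, and those host functions remain the symbols the library exports. A minimal sketch (illustrative names):

```cuda
__global__ static void increment_kernel(int* data, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { data[i] += 1; }
}

// Exported host wrapper: the only symbol other translation units need.
void increment(int* d_data, int n, cudaStream_t stream)
{
  int block = 256;
  int grid  = (n + block - 1) / block;
  increment_kernel<<<grid, block, 0, stream>>>(d_data, n);
}
```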
136 changes: 69 additions & 67 deletions cpp/src/layout/legacy/bh_kernels.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -42,9 +42,9 @@ namespace detail {
/**
 * Initializes the states of objects. This speeds the overall kernel up.
*/
-__global__ void InitializationKernel(unsigned* restrict limiter,
-                                     int* restrict maxdepthd,
-                                     float* restrict radiusd)
+__global__ static void InitializationKernel(unsigned* restrict limiter,
+                                            int* restrict maxdepthd,
+                                            float* restrict radiusd)
{
maxdepthd[0] = 1;
limiter[0] = 0;
@@ -54,10 +54,10 @@ __global__ void InitializationKernel(unsigned* restrict limiter,
/**
* Reset root.
*/
-__global__ void ResetKernel(float* restrict radiusd_squared,
-                            int* restrict bottomd,
-                            const int NNODES,
-                            const float* restrict radiusd)
+__global__ static void ResetKernel(float* restrict radiusd_squared,
+                                   int* restrict bottomd,
+                                   const int NNODES,
+                                   const float* restrict radiusd)
{
radiusd_squared[0] = radiusd[0] * radiusd[0];
// create root node
@@ -67,20 +67,21 @@ __global__ void ResetKernel(float* restrict radiusd_squared,
/**
* Figures the bounding boxes for every point in the embedding.
*/
-__global__ __launch_bounds__(THREADS1, FACTOR1) void BoundingBoxKernel(int* restrict startd,
-                                                                       int* restrict childd,
-                                                                       int* restrict massd,
-                                                                       float* restrict posxd,
-                                                                       float* restrict posyd,
-                                                                       float* restrict maxxd,
-                                                                       float* restrict maxyd,
-                                                                       float* restrict minxd,
-                                                                       float* restrict minyd,
-                                                                       const int FOUR_NNODES,
-                                                                       const int NNODES,
-                                                                       const int N,
-                                                                       unsigned* restrict limiter,
-                                                                       float* restrict radiusd)
+__global__ static __launch_bounds__(THREADS1,
+                                    FACTOR1) void BoundingBoxKernel(int* restrict startd,
+                                                                    int* restrict childd,
+                                                                    int* restrict massd,
+                                                                    float* restrict posxd,
+                                                                    float* restrict posyd,
+                                                                    float* restrict maxxd,
+                                                                    float* restrict maxyd,
+                                                                    float* restrict minxd,
+                                                                    float* restrict minyd,
+                                                                    const int FOUR_NNODES,
+                                                                    const int NNODES,
+                                                                    const int N,
+                                                                    unsigned* restrict limiter,
+                                                                    float* restrict radiusd)
{
float val, minx, maxx, miny, maxy;
__shared__ float sminx[THREADS1], smaxx[THREADS1], sminy[THREADS1], smaxy[THREADS1];
@@ -158,9 +159,9 @@ __global__ __launch_bounds__(THREADS1, FACTOR1) void BoundingBoxKernel(int* rest
/**
* Clear some of the state vectors up.
*/
-__global__ __launch_bounds__(1024, 1) void ClearKernel1(int* restrict childd,
-                                                        const int FOUR_NNODES,
-                                                        const int FOUR_N)
+__global__ static __launch_bounds__(1024, 1) void ClearKernel1(int* restrict childd,
+                                                               const int FOUR_NNODES,
+                                                               const int FOUR_N)
{
const int inc = blockDim.x * gridDim.x;
int k = (FOUR_N & -32) + threadIdx.x + blockIdx.x * blockDim.x;
@@ -175,15 +176,15 @@ __global__ __launch_bounds__(1024, 1) void ClearKernel1(int* restrict childd,
/**
* Build the actual KD Tree.
*/
-__global__ __launch_bounds__(THREADS2,
-                             FACTOR2) void TreeBuildingKernel(int* restrict childd,
-                                                              const float* restrict posxd,
-                                                              const float* restrict posyd,
-                                                              const int NNODES,
-                                                              const int N,
-                                                              int* restrict maxdepthd,
-                                                              int* restrict bottomd,
-                                                              const float* restrict radiusd)
+__global__ static __launch_bounds__(THREADS2,
+                                    FACTOR2) void TreeBuildingKernel(int* restrict childd,
+                                                                     const float* restrict posxd,
+                                                                     const float* restrict posyd,
+                                                                     const int NNODES,
+                                                                     const int N,
+                                                                     int* restrict maxdepthd,
+                                                                     int* restrict bottomd,
+                                                                     const float* restrict radiusd)
{
int j, depth;
float x, y, r;
@@ -296,10 +297,10 @@ __global__ __launch_bounds__(THREADS2,
/**
* Clean more state vectors.
*/
-__global__ __launch_bounds__(1024, 1) void ClearKernel2(int* restrict startd,
-                                                        int* restrict massd,
-                                                        const int NNODES,
-                                                        const int* restrict bottomd)
+__global__ static __launch_bounds__(1024, 1) void ClearKernel2(int* restrict startd,
+                                                               int* restrict massd,
+                                                               const int NNODES,
+                                                               const int* restrict bottomd)
{
const int bottom = bottomd[0];
const int inc = blockDim.x * gridDim.x;
@@ -317,15 +318,15 @@ __global__ __launch_bounds__(1024, 1) void ClearKernel2(int* restrict startd,
/**
* Summarize the KD Tree via cell gathering
*/
-__global__ __launch_bounds__(THREADS3,
-                             FACTOR3) void SummarizationKernel(int* restrict countd,
-                                                               const int* restrict childd,
-                                                               volatile int* restrict massd,
-                                                               float* restrict posxd,
-                                                               float* restrict posyd,
-                                                               const int NNODES,
-                                                               const int N,
-                                                               const int* restrict bottomd)
+__global__ static __launch_bounds__(THREADS3,
+                                    FACTOR3) void SummarizationKernel(int* restrict countd,
+                                                                      const int* restrict childd,
+                                                                      volatile int* restrict massd,
+                                                                      float* restrict posxd,
+                                                                      float* restrict posyd,
+                                                                      const int NNODES,
+                                                                      const int N,
+                                                                      const int* restrict bottomd)
{
bool flag = 0;
float cm, px, py;
@@ -453,13 +454,14 @@ __global__ __launch_bounds__(THREADS3,
/**
* Sort the cells
*/
-__global__ __launch_bounds__(THREADS4, FACTOR4) void SortKernel(int* restrict sortd,
-                                                                const int* restrict countd,
-                                                                volatile int* restrict startd,
-                                                                int* restrict childd,
-                                                                const int NNODES,
-                                                                const int N,
-                                                                const int* restrict bottomd)
+__global__ static __launch_bounds__(THREADS4,
+                                    FACTOR4) void SortKernel(int* restrict sortd,
+                                                             const int* restrict countd,
+                                                             volatile int* restrict startd,
+                                                             int* restrict childd,
+                                                             const int NNODES,
+                                                             const int N,
+                                                             const int* restrict bottomd)
{
const int bottom = bottomd[0];
const int dec = blockDim.x * gridDim.x;
@@ -502,7 +504,7 @@ __global__ __launch_bounds__(THREADS4, FACTOR4) void SortKernel(int* restrict so
/**
* Calculate the repulsive forces using the KD Tree
*/
-__global__ __launch_bounds__(
+__global__ static __launch_bounds__(
THREADS5, FACTOR5) void RepulsionKernel(/* int *restrict errd, */
const float scaling_ratio,
const float theta,
@@ -612,18 +614,18 @@ __global__ __launch_bounds__(
}
}

-__global__ __launch_bounds__(THREADS6,
-                             FACTOR6) void apply_forces_bh(float* restrict Y_x,
-                                                           float* restrict Y_y,
-                                                           const float* restrict attract_x,
-                                                           const float* restrict attract_y,
-                                                           const float* restrict repel_x,
-                                                           const float* restrict repel_y,
-                                                           float* restrict old_dx,
-                                                           float* restrict old_dy,
-                                                           const float* restrict swinging,
-                                                           const float speed,
-                                                           const int n)
+__global__ static __launch_bounds__(THREADS6,
+                                    FACTOR6) void apply_forces_bh(float* restrict Y_x,
+                                                                  float* restrict Y_y,
+                                                                  const float* restrict attract_x,
+                                                                  const float* restrict attract_y,
+                                                                  const float* restrict repel_x,
+                                                                  const float* restrict repel_y,
+                                                                  float* restrict old_dx,
+                                                                  float* restrict old_dy,
+                                                                  const float* restrict swinging,
+                                                                  const float speed,
+                                                                  const int n)
{
  // For every vertex
for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) {
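Where a kernel also carries `__launch_bounds__`, the commit places `static` between `__global__` and the attribute, and clang-format then rewraps the signature as seen above. A compilable sketch of the resulting shape (`THREADS`/`FACTOR` stand in for this file's THREADS1..THREADS6 and FACTOR1..FACTOR6 tuning constants):

```cuda
constexpr int THREADS = 256;  // upper bound on threads per block
constexpr int FACTOR  = 4;    // minimum resident blocks per SM requested

__global__ static __launch_bounds__(THREADS, FACTOR) void example_kernel(float* x, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { x[i] = 0.0f; }
}
```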