From 2e74f2f46db83942a4662e37acd19b31c07b158e Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Wed, 17 Jan 2024 11:41:54 -0500
Subject: [PATCH 1/5] Mark all cugraph CUDA kernels with internal linkage

---
 .github/workflows/pr.yaml                     |   9 ++
 .github/workflows/test.yaml                   |  10 ++
 .../detail/decompress_edge_partition.cuh      |   4 +-
 cpp/src/community/legacy/ecg.cu               |   2 +-
 cpp/src/components/legacy/weak_cc.cuh         |  30 ++--
 cpp/src/layout/legacy/bh_kernels.cuh          | 136 +++++++++---------
 cpp/src/layout/legacy/exact_repulsion.cuh     |  16 +--
 cpp/src/layout/legacy/fa2_kernels.cuh         |  98 ++++++-------
 .../detail/extract_transform_v_frontier_e.cuh |   6 +-
 ...v_transform_reduce_incoming_outgoing_e.cuh |   8 +-
 cpp/src/prims/transform_e.cuh                 |   2 +-
 cpp/src/prims/transform_reduce_e.cuh          |   8 +-
 .../transform_reduce_e_by_src_dst_key.cuh     |   8 +-
 cpp/src/structure/graph_view_impl.cuh         |   4 +-
 .../traversal/od_shortest_distances_impl.cuh  |   2 +-
 cpp/src/utilities/eidecl_graph_utils.hpp      |  11 +-
 cpp/src/utilities/eidir_graph_utils.hpp       |  17 +--
 cpp/src/utilities/graph_utils.cuh             |  49 ++++---
 cpp/src/utilities/path_retrieval.cu           |  14 +-
 19 files changed, 229 insertions(+), 205 deletions(-)
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 7f0b95e3573..f178a99cc1b 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -15,6 +15,7 @@ jobs:
       - checks
       - conda-cpp-build
       - conda-cpp-tests
+      - conda-cpp-checks
       - conda-notebook-tests
       - conda-python-build
       - conda-python-tests
@@ -52,6 +53,14 @@ jobs:
     uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04
     with:
       build_type: pull-request
+  conda-cpp-checks:
+    needs: conda-cpp-build
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.02
+    with:
+      build_type: pull-request
+      enable_check_symbols: true
+      symbol_exclusions: (hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel)      
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 32fb2d62b29..205c60de007 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -14,6 +14,16 @@ on:
         type: string
 
 jobs:
+  conda-cpp-checks:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.02
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      enable_check_symbols: true
+      symbol_exclusions: (hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel)
   conda-cpp-tests:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04
diff --git a/cpp/include/cugraph/detail/decompress_edge_partition.cuh b/cpp/include/cugraph/detail/decompress_edge_partition.cuh
index dad5ce77e45..6b974a326dd 100644
--- a/cpp/include/cugraph/detail/decompress_edge_partition.cuh
+++ b/cpp/include/cugraph/detail/decompress_edge_partition.cuh
@@ -44,7 +44,7 @@ namespace detail {
 int32_t constexpr decompress_edge_partition_block_size = 1024;
 
 template <typename vertex_t, typename edge_t, bool multi_gpu>
-__global__ void decompress_to_edgelist_mid_degree(
+__global__ static void decompress_to_edgelist_mid_degree(
   edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition,
   vertex_t major_range_first,
   vertex_t major_range_last,
@@ -74,7 +74,7 @@ __global__ void decompress_to_edgelist_mid_degree(
 }
 
 template <typename vertex_t, typename edge_t, bool multi_gpu>
-__global__ void decompress_to_edgelist_high_degree(
+__global__ static void decompress_to_edgelist_high_degree(
   edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition,
   vertex_t major_range_first,
   vertex_t major_range_last,
diff --git a/cpp/src/community/legacy/ecg.cu b/cpp/src/community/legacy/ecg.cu
index d93a4446faa..b2ad79204ed 100644
--- a/cpp/src/community/legacy/ecg.cu
+++ b/cpp/src/community/legacy/ecg.cu
@@ -52,7 +52,7 @@ binsearch_maxle(const IndexType* vec, const IndexType val, IndexType low, IndexT
 // FIXME: This shouldn't need to be a custom kernel, this
 //        seems like it should just be a thrust::transform
 template <typename IdxT, typename ValT>
-__global__ void match_check_kernel(
+__global__ static void match_check_kernel(
   IdxT size, IdxT num_verts, IdxT* offsets, IdxT* indices, IdxT* parts, ValT* weights)
 {
   IdxT tid = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/cpp/src/components/legacy/weak_cc.cuh b/cpp/src/components/legacy/weak_cc.cuh
index ad9aa773590..f4254e2d55d 100644
--- a/cpp/src/components/legacy/weak_cc.cuh
+++ b/cpp/src/components/legacy/weak_cc.cuh
@@ -59,15 +59,15 @@ class WeakCCState {
 };
 
 template <typename vertex_t, typename edge_t, int TPB_X = 32>
-__global__ void weak_cc_label_device(vertex_t* labels,
-                                     edge_t const* offsets,
-                                     vertex_t const* indices,
-                                     edge_t nnz,
-                                     bool* fa,
-                                     bool* xa,
-                                     bool* m,
-                                     vertex_t startVertexId,
-                                     vertex_t batchSize)
+__global__ static void weak_cc_label_device(vertex_t* labels,
+                                            edge_t const* offsets,
+                                            vertex_t const* indices,
+                                            edge_t nnz,
+                                            bool* fa,
+                                            bool* xa,
+                                            bool* m,
+                                            vertex_t startVertexId,
+                                            vertex_t batchSize)
 {
   vertex_t tid = threadIdx.x + blockIdx.x * TPB_X;
   if (tid < batchSize) {
@@ -118,11 +118,11 @@ __global__ void weak_cc_label_device(vertex_t* labels,
 }
 
 template <typename vertex_t, int TPB_X = 32, typename Lambda>
-__global__ void weak_cc_init_label_kernel(vertex_t* labels,
-                                          vertex_t startVertexId,
-                                          vertex_t batchSize,
-                                          vertex_t MAX_LABEL,
-                                          Lambda filter_op)
+__global__ static void weak_cc_init_label_kernel(vertex_t* labels,
+                                                 vertex_t startVertexId,
+                                                 vertex_t batchSize,
+                                                 vertex_t MAX_LABEL,
+                                                 Lambda filter_op)
 {
   /** F1 and F2 in the paper correspond to fa and xa */
   /** Cd in paper corresponds to db_cluster */
@@ -134,7 +134,7 @@ __global__ void weak_cc_init_label_kernel(vertex_t* labels,
 }
 
 template <typename vertex_t, int TPB_X = 32>
-__global__ void weak_cc_init_all_kernel(
+__global__ static void weak_cc_init_all_kernel(
   vertex_t* labels, bool* fa, bool* xa, vertex_t N, vertex_t MAX_LABEL)
 {
   vertex_t tid = threadIdx.x + blockIdx.x * TPB_X;
diff --git a/cpp/src/layout/legacy/bh_kernels.cuh b/cpp/src/layout/legacy/bh_kernels.cuh
index 5b101363314..f6e163ab306 100644
--- a/cpp/src/layout/legacy/bh_kernels.cuh
+++ b/cpp/src/layout/legacy/bh_kernels.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -42,9 +42,9 @@ namespace detail {
 /**
  * Intializes the states of objects. This speeds the overall kernel up.
  */
-__global__ void InitializationKernel(unsigned* restrict limiter,
-                                     int* restrict maxdepthd,
-                                     float* restrict radiusd)
+__global__ static void InitializationKernel(unsigned* restrict limiter,
+                                            int* restrict maxdepthd,
+                                            float* restrict radiusd)
 {
   maxdepthd[0] = 1;
   limiter[0]   = 0;
@@ -54,10 +54,10 @@ __global__ void InitializationKernel(unsigned* restrict limiter,
 /**
  * Reset root.
  */
-__global__ void ResetKernel(float* restrict radiusd_squared,
-                            int* restrict bottomd,
-                            const int NNODES,
-                            const float* restrict radiusd)
+__global__ static void ResetKernel(float* restrict radiusd_squared,
+                                   int* restrict bottomd,
+                                   const int NNODES,
+                                   const float* restrict radiusd)
 {
   radiusd_squared[0] = radiusd[0] * radiusd[0];
   // create root node
@@ -67,20 +67,21 @@ __global__ void ResetKernel(float* restrict radiusd_squared,
 /**
  * Figures the bounding boxes for every point in the embedding.
  */
-__global__ __launch_bounds__(THREADS1, FACTOR1) void BoundingBoxKernel(int* restrict startd,
-                                                                       int* restrict childd,
-                                                                       int* restrict massd,
-                                                                       float* restrict posxd,
-                                                                       float* restrict posyd,
-                                                                       float* restrict maxxd,
-                                                                       float* restrict maxyd,
-                                                                       float* restrict minxd,
-                                                                       float* restrict minyd,
-                                                                       const int FOUR_NNODES,
-                                                                       const int NNODES,
-                                                                       const int N,
-                                                                       unsigned* restrict limiter,
-                                                                       float* restrict radiusd)
+__global__ static __launch_bounds__(THREADS1,
+                                    FACTOR1) void BoundingBoxKernel(int* restrict startd,
+                                                                    int* restrict childd,
+                                                                    int* restrict massd,
+                                                                    float* restrict posxd,
+                                                                    float* restrict posyd,
+                                                                    float* restrict maxxd,
+                                                                    float* restrict maxyd,
+                                                                    float* restrict minxd,
+                                                                    float* restrict minyd,
+                                                                    const int FOUR_NNODES,
+                                                                    const int NNODES,
+                                                                    const int N,
+                                                                    unsigned* restrict limiter,
+                                                                    float* restrict radiusd)
 {
   float val, minx, maxx, miny, maxy;
   __shared__ float sminx[THREADS1], smaxx[THREADS1], sminy[THREADS1], smaxy[THREADS1];
@@ -158,9 +159,9 @@ __global__ __launch_bounds__(THREADS1, FACTOR1) void BoundingBoxKernel(int* rest
 /**
  * Clear some of the state vectors up.
  */
-__global__ __launch_bounds__(1024, 1) void ClearKernel1(int* restrict childd,
-                                                        const int FOUR_NNODES,
-                                                        const int FOUR_N)
+__global__ static __launch_bounds__(1024, 1) void ClearKernel1(int* restrict childd,
+                                                               const int FOUR_NNODES,
+                                                               const int FOUR_N)
 {
   const int inc = blockDim.x * gridDim.x;
   int k         = (FOUR_N & -32) + threadIdx.x + blockIdx.x * blockDim.x;
@@ -175,15 +176,15 @@ __global__ __launch_bounds__(1024, 1) void ClearKernel1(int* restrict childd,
 /**
  * Build the actual KD Tree.
  */
-__global__ __launch_bounds__(THREADS2,
-                             FACTOR2) void TreeBuildingKernel(int* restrict childd,
-                                                              const float* restrict posxd,
-                                                              const float* restrict posyd,
-                                                              const int NNODES,
-                                                              const int N,
-                                                              int* restrict maxdepthd,
-                                                              int* restrict bottomd,
-                                                              const float* restrict radiusd)
+__global__ static __launch_bounds__(THREADS2,
+                                    FACTOR2) void TreeBuildingKernel(int* restrict childd,
+                                                                     const float* restrict posxd,
+                                                                     const float* restrict posyd,
+                                                                     const int NNODES,
+                                                                     const int N,
+                                                                     int* restrict maxdepthd,
+                                                                     int* restrict bottomd,
+                                                                     const float* restrict radiusd)
 {
   int j, depth;
   float x, y, r;
@@ -296,10 +297,10 @@ __global__ __launch_bounds__(THREADS2,
 /**
  * Clean more state vectors.
  */
-__global__ __launch_bounds__(1024, 1) void ClearKernel2(int* restrict startd,
-                                                        int* restrict massd,
-                                                        const int NNODES,
-                                                        const int* restrict bottomd)
+__global__ static __launch_bounds__(1024, 1) void ClearKernel2(int* restrict startd,
+                                                               int* restrict massd,
+                                                               const int NNODES,
+                                                               const int* restrict bottomd)
 {
   const int bottom = bottomd[0];
   const int inc    = blockDim.x * gridDim.x;
@@ -317,15 +318,15 @@ __global__ __launch_bounds__(1024, 1) void ClearKernel2(int* restrict startd,
 /**
  * Summarize the KD Tree via cell gathering
  */
-__global__ __launch_bounds__(THREADS3,
-                             FACTOR3) void SummarizationKernel(int* restrict countd,
-                                                               const int* restrict childd,
-                                                               volatile int* restrict massd,
-                                                               float* restrict posxd,
-                                                               float* restrict posyd,
-                                                               const int NNODES,
-                                                               const int N,
-                                                               const int* restrict bottomd)
+__global__ static __launch_bounds__(THREADS3,
+                                    FACTOR3) void SummarizationKernel(int* restrict countd,
+                                                                      const int* restrict childd,
+                                                                      volatile int* restrict massd,
+                                                                      float* restrict posxd,
+                                                                      float* restrict posyd,
+                                                                      const int NNODES,
+                                                                      const int N,
+                                                                      const int* restrict bottomd)
 {
   bool flag = 0;
   float cm, px, py;
@@ -453,13 +454,14 @@ __global__ __launch_bounds__(THREADS3,
 /**
  * Sort the cells
  */
-__global__ __launch_bounds__(THREADS4, FACTOR4) void SortKernel(int* restrict sortd,
-                                                                const int* restrict countd,
-                                                                volatile int* restrict startd,
-                                                                int* restrict childd,
-                                                                const int NNODES,
-                                                                const int N,
-                                                                const int* restrict bottomd)
+__global__ static __launch_bounds__(THREADS4,
+                                    FACTOR4) void SortKernel(int* restrict sortd,
+                                                             const int* restrict countd,
+                                                             volatile int* restrict startd,
+                                                             int* restrict childd,
+                                                             const int NNODES,
+                                                             const int N,
+                                                             const int* restrict bottomd)
 {
   const int bottom = bottomd[0];
   const int dec    = blockDim.x * gridDim.x;
@@ -502,7 +504,7 @@ __global__ __launch_bounds__(THREADS4, FACTOR4) void SortKernel(int* restrict so
 /**
  * Calculate the repulsive forces using the KD Tree
  */
-__global__ __launch_bounds__(
+__global__ static __launch_bounds__(
   THREADS5, FACTOR5) void RepulsionKernel(/* int *restrict errd, */
                                           const float scaling_ratio,
                                           const float theta,
@@ -612,18 +614,18 @@ __global__ __launch_bounds__(
   }
 }
 
-__global__ __launch_bounds__(THREADS6,
-                             FACTOR6) void apply_forces_bh(float* restrict Y_x,
-                                                           float* restrict Y_y,
-                                                           const float* restrict attract_x,
-                                                           const float* restrict attract_y,
-                                                           const float* restrict repel_x,
-                                                           const float* restrict repel_y,
-                                                           float* restrict old_dx,
-                                                           float* restrict old_dy,
-                                                           const float* restrict swinging,
-                                                           const float speed,
-                                                           const int n)
+__global__ static __launch_bounds__(THREADS6,
+                                    FACTOR6) void apply_forces_bh(float* restrict Y_x,
+                                                                  float* restrict Y_y,
+                                                                  const float* restrict attract_x,
+                                                                  const float* restrict attract_y,
+                                                                  const float* restrict repel_x,
+                                                                  const float* restrict repel_y,
+                                                                  float* restrict old_dx,
+                                                                  float* restrict old_dy,
+                                                                  const float* restrict swinging,
+                                                                  const float speed,
+                                                                  const int n)
 {
   // For evrery vertex
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) {
diff --git a/cpp/src/layout/legacy/exact_repulsion.cuh b/cpp/src/layout/legacy/exact_repulsion.cuh
index fe895bae6a0..8530202afd5 100644
--- a/cpp/src/layout/legacy/exact_repulsion.cuh
+++ b/cpp/src/layout/legacy/exact_repulsion.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,13 +22,13 @@ namespace cugraph {
 namespace detail {
 
 template <typename vertex_t>
-__global__ void repulsion_kernel(const float* restrict x_pos,
-                                 const float* restrict y_pos,
-                                 float* restrict repel_x,
-                                 float* restrict repel_y,
-                                 const int* restrict mass,
-                                 const float scaling_ratio,
-                                 const vertex_t n)
+__global__ static void repulsion_kernel(const float* restrict x_pos,
+                                        const float* restrict y_pos,
+                                        float* restrict repel_x,
+                                        float* restrict repel_y,
+                                        const int* restrict mass,
+                                        const float scaling_ratio,
+                                        const vertex_t n)
 {
   int j = (blockIdx.x * blockDim.x) + threadIdx.x;  // for every item in row
   int i = (blockIdx.y * blockDim.y) + threadIdx.y;  // for every row
diff --git a/cpp/src/layout/legacy/fa2_kernels.cuh b/cpp/src/layout/legacy/fa2_kernels.cuh
index 4f1ce520387..33e7841a380 100644
--- a/cpp/src/layout/legacy/fa2_kernels.cuh
+++ b/cpp/src/layout/legacy/fa2_kernels.cuh
@@ -23,19 +23,19 @@ namespace cugraph {
 namespace detail {
 
 template <typename vertex_t, typename edge_t, typename weight_t>
-__global__ void attraction_kernel(const vertex_t* restrict row,
-                                  const vertex_t* restrict col,
-                                  const weight_t* restrict v,
-                                  const edge_t e,
-                                  const float* restrict x_pos,
-                                  const float* restrict y_pos,
-                                  float* restrict attract_x,
-                                  float* restrict attract_y,
-                                  const int* restrict mass,
-                                  bool outbound_attraction_distribution,
-                                  bool lin_log_mode,
-                                  const float edge_weight_influence,
-                                  const float coef)
+__global__ static void attraction_kernel(const vertex_t* restrict row,
+                                         const vertex_t* restrict col,
+                                         const weight_t* restrict v,
+                                         const edge_t e,
+                                         const float* restrict x_pos,
+                                         const float* restrict y_pos,
+                                         float* restrict attract_x,
+                                         float* restrict attract_y,
+                                         const int* restrict mass,
+                                         bool outbound_attraction_distribution,
+                                         bool lin_log_mode,
+                                         const float edge_weight_influence,
+                                         const float coef)
 {
   vertex_t i, src, dst;
   weight_t weight = 1;
@@ -116,13 +116,13 @@ void apply_attraction(const vertex_t* restrict row,
 }
 
 template <typename vertex_t>
-__global__ void linear_gravity_kernel(const float* restrict x_pos,
-                                      const float* restrict y_pos,
-                                      float* restrict attract_x,
-                                      float* restrict attract_y,
-                                      const int* restrict mass,
-                                      const float gravity,
-                                      const vertex_t n)
+__global__ static void linear_gravity_kernel(const float* restrict x_pos,
+                                             const float* restrict y_pos,
+                                             float* restrict attract_x,
+                                             float* restrict attract_y,
+                                             const int* restrict mass,
+                                             const float gravity,
+                                             const vertex_t n)
 {
   // For every node.
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) {
@@ -136,14 +136,14 @@ __global__ void linear_gravity_kernel(const float* restrict x_pos,
 }
 
 template <typename vertex_t>
-__global__ void strong_gravity_kernel(const float* restrict x_pos,
-                                      const float* restrict y_pos,
-                                      float* restrict attract_x,
-                                      float* restrict attract_y,
-                                      const int* restrict mass,
-                                      const float gravity,
-                                      const float scaling_ratio,
-                                      const vertex_t n)
+__global__ static void strong_gravity_kernel(const float* restrict x_pos,
+                                             const float* restrict y_pos,
+                                             float* restrict attract_x,
+                                             float* restrict attract_y,
+                                             const int* restrict mass,
+                                             const float gravity,
+                                             const float scaling_ratio,
+                                             const vertex_t n)
 {
   // For every node.
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) {
@@ -187,16 +187,16 @@ void apply_gravity(const float* restrict x_pos,
 }
 
 template <typename vertex_t>
-__global__ void local_speed_kernel(const float* restrict repel_x,
-                                   const float* restrict repel_y,
-                                   const float* restrict attract_x,
-                                   const float* restrict attract_y,
-                                   const float* restrict old_dx,
-                                   const float* restrict old_dy,
-                                   const int* restrict mass,
-                                   float* restrict swinging,
-                                   float* restrict traction,
-                                   const vertex_t n)
+__global__ static void local_speed_kernel(const float* restrict repel_x,
+                                          const float* restrict repel_y,
+                                          const float* restrict attract_x,
+                                          const float* restrict attract_y,
+                                          const float* restrict old_dx,
+                                          const float* restrict old_dy,
+                                          const int* restrict mass,
+                                          float* restrict swinging,
+                                          float* restrict traction,
+                                          const vertex_t n)
 {
   // For every node.
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) {
@@ -272,17 +272,17 @@ void adapt_speed(const float jitter_tolerance,
 }
 
 template <typename vertex_t>
-__global__ void update_positions_kernel(float* restrict x_pos,
-                                        float* restrict y_pos,
-                                        const float* restrict repel_x,
-                                        const float* restrict repel_y,
-                                        const float* restrict attract_x,
-                                        const float* restrict attract_y,
-                                        float* restrict old_dx,
-                                        float* restrict old_dy,
-                                        const float* restrict swinging,
-                                        const float speed,
-                                        const vertex_t n)
+__global__ static void update_positions_kernel(float* restrict x_pos,
+                                               float* restrict y_pos,
+                                               const float* restrict repel_x,
+                                               const float* restrict repel_y,
+                                               const float* restrict attract_x,
+                                               const float* restrict attract_y,
+                                               float* restrict old_dx,
+                                               float* restrict old_dy,
+                                               const float* restrict swinging,
+                                               const float speed,
+                                               const vertex_t n)
 {
   // For every node.
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) {
diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
index fc3da3cac07..0b6447f50d9 100644
--- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
+++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
@@ -127,7 +127,7 @@ template <bool hypersparse,
           typename BufferKeyOutputIterator,
           typename BufferValueOutputIterator,
           typename EdgeOp>
-__global__ void extract_transform_v_frontier_e_hypersparse_or_low_degree(
+__global__ static void extract_transform_v_frontier_e_hypersparse_or_low_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -295,7 +295,7 @@ template <typename GraphViewType,
           typename BufferKeyOutputIterator,
           typename BufferValueOutputIterator,
           typename EdgeOp>
-__global__ void extract_transform_v_frontier_e_mid_degree(
+__global__ static void extract_transform_v_frontier_e_mid_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -396,7 +396,7 @@ template <typename GraphViewType,
           typename BufferKeyOutputIterator,
           typename BufferValueOutputIterator,
           typename EdgeOp>
-__global__ void extract_transform_v_frontier_e_high_degree(
+__global__ static void extract_transform_v_frontier_e_high_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
index 083487fa5b4..509ab56d3fe 100644
--- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
@@ -149,7 +149,7 @@ template <bool update_major,
           typename EdgeOp,
           typename ReduceOp,
           typename T>
-__global__ void per_v_transform_reduce_e_hypersparse(
+__global__ static void per_v_transform_reduce_e_hypersparse(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -251,7 +251,7 @@ template <bool update_major,
           typename EdgeOp,
           typename ReduceOp,
           typename T>
-__global__ void per_v_transform_reduce_e_low_degree(
+__global__ static void per_v_transform_reduce_e_low_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -350,7 +350,7 @@ template <bool update_major,
           typename EdgeOp,
           typename ReduceOp,
           typename T>
-__global__ void per_v_transform_reduce_e_mid_degree(
+__global__ static void per_v_transform_reduce_e_mid_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -466,7 +466,7 @@ template <bool update_major,
           typename EdgeOp,
           typename ReduceOp,
           typename T>
-__global__ void per_v_transform_reduce_e_high_degree(
+__global__ static void per_v_transform_reduce_e_high_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
diff --git a/cpp/src/prims/transform_e.cuh b/cpp/src/prims/transform_e.cuh
index 2cb1a5358b0..9c7670f68d2 100644
--- a/cpp/src/prims/transform_e.cuh
+++ b/cpp/src/prims/transform_e.cuh
@@ -51,7 +51,7 @@ template <bool check_edge_mask,
           typename EdgePartitionEdgeMaskWrapper,
           typename EdgePartitionEdgeValueOutputWrapper,
           typename EdgeOp>
-__global__ void transform_e_packed_bool(
+__global__ static void transform_e_packed_bool(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
diff --git a/cpp/src/prims/transform_reduce_e.cuh b/cpp/src/prims/transform_reduce_e.cuh
index e5855b105ee..43722550c58 100644
--- a/cpp/src/prims/transform_reduce_e.cuh
+++ b/cpp/src/prims/transform_reduce_e.cuh
@@ -61,7 +61,7 @@ template <typename GraphViewType,
           typename EdgePartitionEdgeMaskWrapper,
           typename ResultIterator,
           typename EdgeOp>
-__global__ void transform_reduce_e_hypersparse(
+__global__ static void transform_reduce_e_hypersparse(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -153,7 +153,7 @@ template <typename GraphViewType,
           typename EdgePartitionEdgeMaskWrapper,
           typename ResultIterator,
           typename EdgeOp>
-__global__ void transform_reduce_e_low_degree(
+__global__ static void transform_reduce_e_low_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -242,7 +242,7 @@ template <typename GraphViewType,
           typename EdgePartitionEdgeMaskWrapper,
           typename ResultIterator,
           typename EdgeOp>
-__global__ void transform_reduce_e_mid_degree(
+__global__ static void transform_reduce_e_mid_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -320,7 +320,7 @@ template <typename GraphViewType,
           typename EdgePartitionEdgeMaskWrapper,
           typename ResultIterator,
           typename EdgeOp>
-__global__ void transform_reduce_e_high_degree(
+__global__ static void transform_reduce_e_high_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
diff --git a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh
index 42203085077..eee0ed03d1c 100644
--- a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh
+++ b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh
@@ -97,7 +97,7 @@ template <bool edge_partition_src_key,
           typename EdgePartitionSrcDstKeyInputWrapper,
           typename EdgeOp,
           typename ValueIterator>
-__global__ void transform_reduce_by_src_dst_key_hypersparse(
+__global__ static void transform_reduce_by_src_dst_key_hypersparse(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -156,7 +156,7 @@ template <bool edge_partition_src_key,
           typename EdgePartitionSrcDstKeyInputWrapper,
           typename EdgeOp,
           typename ValueIterator>
-__global__ void transform_reduce_by_src_dst_key_low_degree(
+__global__ static void transform_reduce_by_src_dst_key_low_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -214,7 +214,7 @@ template <bool edge_partition_src_key,
           typename EdgePartitionSrcDstKeyInputWrapper,
           typename EdgeOp,
           typename ValueIterator>
-__global__ void transform_reduce_by_src_dst_key_mid_degree(
+__global__ static void transform_reduce_by_src_dst_key_mid_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -274,7 +274,7 @@ template <bool edge_partition_src_key,
           typename EdgePartitionSrcDstKeyInputWrapper,
           typename EdgeOp,
           typename ValueIterator>
-__global__ void transform_reduce_by_src_dst_key_high_degree(
+__global__ static void transform_reduce_by_src_dst_key_high_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh
index 4ee5ad5ca02..29dca6ef409 100644
--- a/cpp/src/structure/graph_view_impl.cuh
+++ b/cpp/src/structure/graph_view_impl.cuh
@@ -241,7 +241,7 @@ rmm::device_uvector<edge_t> compute_minor_degrees(
 int32_t constexpr count_edge_partition_multi_edges_block_size = 1024;
 
 template <typename vertex_t, typename edge_t, bool multi_gpu>
-__global__ void for_all_major_for_all_nbr_mid_degree(
+__global__ static void for_all_major_for_all_nbr_mid_degree(
   edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition,
   vertex_t major_range_first,
   vertex_t major_range_last,
@@ -275,7 +275,7 @@ __global__ void for_all_major_for_all_nbr_mid_degree(
 }
 
 template <typename vertex_t, typename edge_t, bool multi_gpu>
-__global__ void for_all_major_for_all_nbr_high_degree(
+__global__ static void for_all_major_for_all_nbr_high_degree(
   edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition,
   vertex_t major_range_first,
   vertex_t major_range_last,
diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh
index c2a3f1160ca..612eb0c48f2 100644
--- a/cpp/src/traversal/od_shortest_distances_impl.cuh
+++ b/cpp/src/traversal/od_shortest_distances_impl.cuh
@@ -215,7 +215,7 @@ template <int32_t max_num_partitions,
           typename key_t,
           typename PartitionOp,
           typename KeyOp>
-__global__ void multi_partition_copy(
+__global__ static void multi_partition_copy(
   InputIterator input_first,
   InputIterator input_last,
   raft::device_span<key_t*> output_buffer_ptrs,
diff --git a/cpp/src/utilities/eidecl_graph_utils.hpp b/cpp/src/utilities/eidecl_graph_utils.hpp
index 84240ba2845..abf026cbbfe 100644
--- a/cpp/src/utilities/eidecl_graph_utils.hpp
+++ b/cpp/src/utilities/eidecl_graph_utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,9 +29,12 @@ extern template void offsets_to_indices<int, int>(int const*, int, int*);
 extern template void offsets_to_indices<long, int>(long const*, int, int*);
 extern template void offsets_to_indices<long, long>(long const*, long, long*);
 
-extern template __global__ void offsets_to_indices_kernel<int, int>(int const*, int, int*);
-extern template __global__ void offsets_to_indices_kernel<long, int>(long const*, int, int*);
-extern template __global__ void offsets_to_indices_kernel<long, long>(long const*, long, long*);
+extern template __attribute__((visibility("hidden"))) __global__ void
+offsets_to_indices_kernel<int, int>(int const*, int, int*);
+extern template __attribute__((visibility("hidden"))) __global__ void
+offsets_to_indices_kernel<long, int>(long const*, int, int*);
+extern template __attribute__((visibility("hidden"))) __global__ void
+offsets_to_indices_kernel<long, long>(long const*, long, long*);
 
 }  // namespace detail
 }  // namespace cugraph
diff --git a/cpp/src/utilities/eidir_graph_utils.hpp b/cpp/src/utilities/eidir_graph_utils.hpp
index 033bb197ce8..ba06c6f56ea 100644
--- a/cpp/src/utilities/eidir_graph_utils.hpp
+++ b/cpp/src/utilities/eidir_graph_utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,15 +29,12 @@ template void offsets_to_indices<int32_t, int32_t>(int32_t const*, int32_t, int3
 template void offsets_to_indices<int64_t, int32_t>(int64_t const*, int32_t, int32_t*);
 template void offsets_to_indices<int64_t, int64_t>(int64_t const*, int64_t, int64_t*);
 
-template __global__ void offsets_to_indices_kernel<int32_t, int32_t>(int32_t const*,
-                                                                     int32_t,
-                                                                     int32_t*);
-template __global__ void offsets_to_indices_kernel<int64_t, int32_t>(int64_t const*,
-                                                                     int32_t,
-                                                                     int32_t*);
-template __global__ void offsets_to_indices_kernel<int64_t, int64_t>(int64_t const*,
-                                                                     int64_t,
-                                                                     int64_t*);
+template __global__ __attribute__((visibility("hidden"))) void
+offsets_to_indices_kernel<int32_t, int32_t>(int32_t const*, int32_t, int32_t*);
+template __global__ __attribute__((visibility("hidden"))) void
+offsets_to_indices_kernel<int64_t, int32_t>(int64_t const*, int32_t, int32_t*);
+template __global__ __attribute__((visibility("hidden"))) void
+offsets_to_indices_kernel<int64_t, int64_t>(int64_t const*, int64_t, int64_t*);
 
 }  // namespace detail
 }  // namespace cugraph
diff --git a/cpp/src/utilities/graph_utils.cuh b/cpp/src/utilities/graph_utils.cuh
index 2d542956531..0b257e7abde 100644
--- a/cpp/src/utilities/graph_utils.cuh
+++ b/cpp/src/utilities/graph_utils.cuh
@@ -247,34 +247,36 @@ void update_dangling_nodes(size_t n, T* dangling_nodes, T damping_factor)
 
 // google matrix kernels
 template <typename IndexType, typename ValueType>
-__global__ void degree_coo(const IndexType n,
-                           const IndexType e,
-                           const IndexType* ind,
-                           ValueType* degree)
+__global__ static void degree_coo(const IndexType n,
+                                  const IndexType e,
+                                  const IndexType* ind,
+                                  ValueType* degree)
 {
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x)
     atomicAdd(&degree[ind[i]], (ValueType)1.0);
 }
 
 template <typename IndexType, typename ValueType>
-__global__ void flag_leafs_kernel(const size_t n, const IndexType* degree, ValueType* bookmark)
+__global__ static void flag_leafs_kernel(const size_t n,
+                                         const IndexType* degree,
+                                         ValueType* bookmark)
 {
   for (auto i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x)
     if (degree[i] == 0) bookmark[i] = 1.0;
 }
 
 template <typename IndexType, typename ValueType>
-__global__ void degree_offsets(const IndexType n,
-                               const IndexType e,
-                               const IndexType* ind,
-                               ValueType* degree)
+__global__ static void degree_offsets(const IndexType n,
+                                      const IndexType e,
+                                      const IndexType* ind,
+                                      ValueType* degree)
 {
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x)
     degree[i] += ind[i + 1] - ind[i];
 }
 
 template <typename FromType, typename ToType>
-__global__ void type_convert(FromType* array, int n)
+__global__ static void type_convert(FromType* array, int n)
 {
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) {
     ToType val   = array[i];
@@ -284,12 +286,12 @@ __global__ void type_convert(FromType* array, int n)
 }
 
 template <typename IndexType, typename ValueType>
-__global__ void equi_prob3(const IndexType n,
-                           const IndexType e,
-                           const IndexType* csrPtr,
-                           const IndexType* csrInd,
-                           ValueType* val,
-                           IndexType* degree)
+__global__ static void equi_prob3(const IndexType n,
+                                  const IndexType e,
+                                  const IndexType* csrPtr,
+                                  const IndexType* csrInd,
+                                  ValueType* val,
+                                  IndexType* degree)
 {
   int j, row, col;
   for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) {
@@ -303,12 +305,12 @@ __global__ void equi_prob3(const IndexType n,
 }
 
 template <typename IndexType, typename ValueType>
-__global__ void equi_prob2(const IndexType n,
-                           const IndexType e,
-                           const IndexType* csrPtr,
-                           const IndexType* csrInd,
-                           ValueType* val,
-                           IndexType* degree)
+__global__ static void equi_prob2(const IndexType n,
+                                  const IndexType e,
+                                  const IndexType* csrPtr,
+                                  const IndexType* csrInd,
+                                  ValueType* val,
+                                  IndexType* degree)
 {
   int row = blockIdx.x * blockDim.x + threadIdx.x;
   if (row < n) {
@@ -372,7 +374,8 @@ void HT_matrix_csc_coo(const IndexType n,
 }
 
 template <typename offsets_t, typename index_t>
-__global__ void offsets_to_indices_kernel(const offsets_t* offsets, index_t v, index_t* indices)
+__attribute__((visibility("hidden"))) __global__ void offsets_to_indices_kernel(
+  const offsets_t* offsets, index_t v, index_t* indices)
 {
   auto tid{threadIdx.x};
   auto ctaStart{blockIdx.x};
diff --git a/cpp/src/utilities/path_retrieval.cu b/cpp/src/utilities/path_retrieval.cu
index e37ce3a3ced..eda60941c23 100644
--- a/cpp/src/utilities/path_retrieval.cu
+++ b/cpp/src/utilities/path_retrieval.cu
@@ -29,13 +29,13 @@ namespace cugraph {
 namespace detail {
 
 template <typename vertex_t, typename weight_t>
-__global__ void get_traversed_cost_kernel(vertex_t const* vertices,
-                                          vertex_t const* preds,
-                                          vertex_t const* vtx_map,
-                                          weight_t const* info_weights,
-                                          weight_t* out,
-                                          vertex_t stop_vertex,
-                                          vertex_t num_vertices)
+__global__ static void get_traversed_cost_kernel(vertex_t const* vertices,
+                                                 vertex_t const* preds,
+                                                 vertex_t const* vtx_map,
+                                                 weight_t const* info_weights,
+                                                 weight_t* out,
+                                                 vertex_t stop_vertex,
+                                                 vertex_t num_vertices)
 {
   for (vertex_t i = threadIdx.x + blockIdx.x * blockDim.x; i < num_vertices;
        i += gridDim.x * blockDim.x) {

From 129180d5cbaebf3b3c357bfae35b4ffab8c05015 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Wed, 17 Jan 2024 17:22:08 -0500
Subject: [PATCH 2/5] Mark all cugraph etl CUDA kernels with internal linkage

---
 .../include/hash/helper_functions.cuh         |  4 ++--
 cpp/libcugraph_etl/src/renumbering.cu         | 20 +++++++++----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/cpp/libcugraph_etl/include/hash/helper_functions.cuh b/cpp/libcugraph_etl/include/hash/helper_functions.cuh
index db377f938d2..8a11867f7e2 100644
--- a/cpp/libcugraph_etl/include/hash/helper_functions.cuh
+++ b/cpp/libcugraph_etl/include/hash/helper_functions.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2017-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -131,7 +131,7 @@ __forceinline__ __device__ void store_pair_vectorized(pair_type* __restrict__ co
 }
 
 template <typename value_type, typename size_type, typename key_type, typename elem_type>
-__global__ void init_hashtbl(value_type* __restrict__ const hashtbl_values,
+__global__ static void init_hashtbl(value_type* __restrict__ const hashtbl_values,
                              const size_type n,
                              const key_type key_val,
                              const elem_type elem_val)
diff --git a/cpp/libcugraph_etl/src/renumbering.cu b/cpp/libcugraph_etl/src/renumbering.cu
index 08759702ab4..1cbeeeeea05 100644
--- a/cpp/libcugraph_etl/src/renumbering.cu
+++ b/cpp/libcugraph_etl/src/renumbering.cu
@@ -270,7 +270,7 @@ __device__ __inline__ int32_t validate_ht_col_insert(volatile int32_t* ptr_col)
   return col;
 }
 
-__global__ void concat_and_create_histogram(int8_t* col_1,
+__global__ static void concat_and_create_histogram(int8_t* col_1,
                                             int32_t* offset_1,
                                             int8_t* col_2,
                                             int32_t* offset_2,
@@ -349,7 +349,7 @@ __global__ void concat_and_create_histogram(int8_t* col_1,
   }
 }
 
-__global__ void concat_and_create_histogram_2(int8_t* col_1,
+__global__ static void concat_and_create_histogram_2(int8_t* col_1,
                                               int32_t* offset_1,
                                               int8_t* col_2,
                                               int32_t* offset_2,
@@ -452,7 +452,7 @@ __global__ void concat_and_create_histogram_2(int8_t* col_1,
 }
 
 template <typename T>
-__global__ void set_src_vertex_idx(int8_t* col_1,
+__global__ static void set_src_vertex_idx(int8_t* col_1,
                                    int32_t* offset_1,
                                    int8_t* col_2,
                                    int32_t* offset_2,
@@ -509,7 +509,7 @@ __global__ void set_src_vertex_idx(int8_t* col_1,
 }
 
 template <typename T>
-__global__ void set_dst_vertex_idx(int8_t* col_1,
+__global__ static void set_dst_vertex_idx(int8_t* col_1,
                                    int32_t* offset_1,
                                    int8_t* col_2,
                                    int32_t* offset_2,
@@ -585,7 +585,7 @@ __global__ void set_dst_vertex_idx(int8_t* col_1,
   }
 }
 
-__global__ void create_mapping_histogram(uint32_t* hash_value,
+__global__ static void create_mapping_histogram(uint32_t* hash_value,
                                          str_hash_value* payload,
                                          cudf_map_type hash_map,
                                          accum_type count)
@@ -595,7 +595,7 @@ __global__ void create_mapping_histogram(uint32_t* hash_value,
   if (idx < count) { auto it = hash_map.insert(thrust::make_pair(hash_value[idx], payload[idx])); }
 }
 
-__global__ void assign_histogram_idx(cudf_map_type cuda_map_obj,
+__global__ static void assign_histogram_idx(cudf_map_type cuda_map_obj,
                                      size_t slot_count,
                                      str_hash_value* key,
                                      uint32_t* value,
@@ -621,7 +621,7 @@ __global__ void assign_histogram_idx(cudf_map_type cuda_map_obj,
   }
 }
 
-__global__ void set_vertex_indices(str_hash_value* ht_value_payload, accum_type count)
+__global__ static void set_vertex_indices(str_hash_value* ht_value_payload, accum_type count)
 {
   accum_type tid = threadIdx.x + blockIdx.x * blockDim.x;
   // change count_ to renumber_idx
@@ -630,7 +630,7 @@ __global__ void set_vertex_indices(str_hash_value* ht_value_payload, accum_type
   }
 }
 
-__global__ void set_output_col_offsets(str_hash_value* row_col_pair,
+__global__ static void set_output_col_offsets(str_hash_value* row_col_pair,
                                        int32_t* out_col1_offset,
                                        int32_t* out_col2_offset,
                                        int dst_pair_match,
@@ -653,7 +653,7 @@ __global__ void set_output_col_offsets(str_hash_value* row_col_pair,
   }
 }
 
-__global__ void offset_buffer_size_comp(int32_t* out_col1_length,
+__global__ static void offset_buffer_size_comp(int32_t* out_col1_length,
                                         int32_t* out_col2_length,
                                         int32_t* out_col1_offsets,
                                         int32_t* out_col2_offsets,
@@ -673,7 +673,7 @@ __global__ void offset_buffer_size_comp(int32_t* out_col1_length,
   }
 }
 
-__global__ void select_unrenumber_string(str_hash_value* idx_to_col_row,
+__global__ static void select_unrenumber_string(str_hash_value* idx_to_col_row,
                                          int32_t total_elements,
                                          int8_t* src_col1,
                                          int8_t* src_col2,

From 8770d421d9ac079f8d043966db4518fce12e4ade Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Thu, 18 Jan 2024 16:49:33 -0500
Subject: [PATCH 3/5] Add CI checks for external CUDA kernels

---
 .github/workflows/pr.yaml   | 2 +-
 .github/workflows/test.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index f178a99cc1b..d35c3840fd3 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -60,7 +60,7 @@ jobs:
     with:
       build_type: pull-request
       enable_check_symbols: true
-      symbol_exclusions: (hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel)      
+      symbol_exclusions: (cugraph::ops|hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel)
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 205c60de007..9feb8f5e7ce 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -23,7 +23,7 @@ jobs:
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
       enable_check_symbols: true
-      symbol_exclusions: (hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel)
+      symbol_exclusions: (cugraph::ops|hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel)
   conda-cpp-tests:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04

From 28e6e8b6a2a134b81d31b39c11012e0fb17bf121 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Mon, 11 Mar 2024 08:54:38 -0400
Subject: [PATCH 4/5] Mark all new cugraph CUDA kernels with internal linkage

---
 cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
index 9cb3365116e..5240c49cb80 100644
--- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
@@ -328,7 +328,7 @@ struct return_value_compute_offset_t {
 };
 
 template <typename vertex_t, typename edge_t, bool multi_gpu>
-__global__ void compute_valid_local_nbr_inclusive_sums_mid_local_degree(
+__global__ static void compute_valid_local_nbr_inclusive_sums_mid_local_degree(
   edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition,
   edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool> edge_partition_e_mask,
   raft::device_span<vertex_t const> edge_partition_frontier_majors,
@@ -382,7 +382,7 @@ __global__ void compute_valid_local_nbr_inclusive_sums_mid_local_degree(
 }
 
 template <typename vertex_t, typename edge_t, bool multi_gpu>
-__global__ void compute_valid_local_nbr_inclusive_sums_high_local_degree(
+__global__ static void compute_valid_local_nbr_inclusive_sums_high_local_degree(
   edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition,
   edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool> edge_partition_e_mask,
   raft::device_span<vertex_t const> edge_partition_frontier_majors,

From 46fca71402ee5d4daf5fb923a30840f329660bab Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Mon, 11 Mar 2024 08:56:42 -0400
Subject: [PATCH 5/5] Update symbol vis ci hooks to 24.04

---
 .github/workflows/pr.yaml   | 2 +-
 .github/workflows/test.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index d35c3840fd3..7c8c9973462 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -56,7 +56,7 @@ jobs:
   conda-cpp-checks:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.02
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04
     with:
       build_type: pull-request
       enable_check_symbols: true
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 9feb8f5e7ce..0bd095bfa94 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.02
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}