diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 84d22f8e896..0cc731ca622 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -15,6 +15,7 @@ jobs: - checks - conda-cpp-build - conda-cpp-tests + - conda-cpp-checks - conda-notebook-tests - conda-python-build - conda-python-tests @@ -50,6 +51,14 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02 with: build_type: pull-request + conda-cpp-checks: + needs: conda-cpp-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.02 + with: + build_type: pull-request + enable_check_symbols: true + symbol_exclusions: (hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel) conda-python-build: needs: conda-cpp-build secrets: inherit diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 773358ede8d..4d467656cc4 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -14,6 +14,16 @@ on: type: string jobs: + conda-cpp-checks: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.02 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + enable_check_symbols: true + symbol_exclusions: (hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel) conda-cpp-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02 diff --git a/cpp/include/cugraph/detail/decompress_edge_partition.cuh b/cpp/include/cugraph/detail/decompress_edge_partition.cuh index 4b256a0413a..75e4fdd09e0 100644 --- a/cpp/include/cugraph/detail/decompress_edge_partition.cuh +++ b/cpp/include/cugraph/detail/decompress_edge_partition.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,7 +43,7 @@ namespace detail { int32_t constexpr decompress_edge_partition_block_size = 1024; template -__global__ void decompress_to_edgelist_mid_degree( +__global__ static void decompress_to_edgelist_mid_degree( edge_partition_device_view_t edge_partition, vertex_t major_range_first, vertex_t major_range_last, @@ -73,7 +73,7 @@ __global__ void decompress_to_edgelist_mid_degree( } template -__global__ void decompress_to_edgelist_high_degree( +__global__ static void decompress_to_edgelist_high_degree( edge_partition_device_view_t edge_partition, vertex_t major_range_first, vertex_t major_range_last, diff --git a/cpp/src/community/legacy/ecg.cu b/cpp/src/community/legacy/ecg.cu index 78c3412f16a..046db594286 100644 --- a/cpp/src/community/legacy/ecg.cu +++ b/cpp/src/community/legacy/ecg.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
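The two workflow hunks above opt cuGraph into the shared conda-cpp-post-build-checks workflow, whose symbol check fails the build if the compiled libraries export unexpected kernel symbols; symbol_exclusions is a regex for the hornet symbols that are intentionally exempted. The C++ hunks that follow all serve that check: most kernels simply become static. A minimal sketch of the pattern, using a hypothetical kernel that is not part of this PR:

// Before: a __global__ function at namespace scope has external linkage, so
// every translation unit that instantiates it emits a weak symbol that lands
// in the shared library's dynamic symbol table.
// After: `static` gives it internal linkage and nothing is exported.
template <typename T>
__global__ static void scale_kernel(T* data, T factor, int n)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n) { data[i] *= factor; }
}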
@@ -50,7 +50,7 @@ binsearch_maxle(const IndexType* vec, const IndexType val, IndexType low, IndexT // FIXME: This shouldn't need to be a custom kernel, this // seems like it should just be a thrust::transform template -__global__ void match_check_kernel( +__global__ static void match_check_kernel( IdxT size, IdxT num_verts, IdxT* offsets, IdxT* indices, IdxT* parts, ValT* weights) { IdxT tid = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/cpp/src/components/legacy/weak_cc.cuh b/cpp/src/components/legacy/weak_cc.cuh index ef301473379..911df852d1f 100644 --- a/cpp/src/components/legacy/weak_cc.cuh +++ b/cpp/src/components/legacy/weak_cc.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,15 +57,15 @@ class WeakCCState { }; template -__global__ void weak_cc_label_device(vertex_t* labels, - edge_t const* offsets, - vertex_t const* indices, - edge_t nnz, - bool* fa, - bool* xa, - bool* m, - vertex_t startVertexId, - vertex_t batchSize) +__global__ static void weak_cc_label_device(vertex_t* labels, + edge_t const* offsets, + vertex_t const* indices, + edge_t nnz, + bool* fa, + bool* xa, + bool* m, + vertex_t startVertexId, + vertex_t batchSize) { vertex_t tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < batchSize) { @@ -116,11 +116,11 @@ __global__ void weak_cc_label_device(vertex_t* labels, } template -__global__ void weak_cc_init_label_kernel(vertex_t* labels, - vertex_t startVertexId, - vertex_t batchSize, - vertex_t MAX_LABEL, - Lambda filter_op) +__global__ static void weak_cc_init_label_kernel(vertex_t* labels, + vertex_t startVertexId, + vertex_t batchSize, + vertex_t MAX_LABEL, + Lambda filter_op) { /** F1 and F2 in the paper correspond to fa and xa */ /** Cd in paper corresponds to db_cluster */ @@ -132,7 +132,7 @@ __global__ void weak_cc_init_label_kernel(vertex_t* labels, } template -__global__ void weak_cc_init_all_kernel( +__global__ static void weak_cc_init_all_kernel( vertex_t* labels, bool* fa, bool* xa, vertex_t N, vertex_t MAX_LABEL) { vertex_t tid = threadIdx.x + blockIdx.x * TPB_X; diff --git a/cpp/src/layout/legacy/bh_kernels.cuh b/cpp/src/layout/legacy/bh_kernels.cuh index 5b101363314..f6e163ab306 100644 --- a/cpp/src/layout/legacy/bh_kernels.cuh +++ b/cpp/src/layout/legacy/bh_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,9 +42,9 @@ namespace detail { /** * Intializes the states of objects. This speeds the overall kernel up. */ -__global__ void InitializationKernel(unsigned* restrict limiter, - int* restrict maxdepthd, - float* restrict radiusd) +__global__ static void InitializationKernel(unsigned* restrict limiter, + int* restrict maxdepthd, + float* restrict radiusd) { maxdepthd[0] = 1; limiter[0] = 0; @@ -54,10 +54,10 @@ __global__ void InitializationKernel(unsigned* restrict limiter, /** * Reset root. 
*/ -__global__ void ResetKernel(float* restrict radiusd_squared, - int* restrict bottomd, - const int NNODES, - const float* restrict radiusd) +__global__ static void ResetKernel(float* restrict radiusd_squared, + int* restrict bottomd, + const int NNODES, + const float* restrict radiusd) { radiusd_squared[0] = radiusd[0] * radiusd[0]; // create root node @@ -67,20 +67,21 @@ __global__ void ResetKernel(float* restrict radiusd_squared, /** * Figures the bounding boxes for every point in the embedding. */ -__global__ __launch_bounds__(THREADS1, FACTOR1) void BoundingBoxKernel(int* restrict startd, - int* restrict childd, - int* restrict massd, - float* restrict posxd, - float* restrict posyd, - float* restrict maxxd, - float* restrict maxyd, - float* restrict minxd, - float* restrict minyd, - const int FOUR_NNODES, - const int NNODES, - const int N, - unsigned* restrict limiter, - float* restrict radiusd) +__global__ static __launch_bounds__(THREADS1, + FACTOR1) void BoundingBoxKernel(int* restrict startd, + int* restrict childd, + int* restrict massd, + float* restrict posxd, + float* restrict posyd, + float* restrict maxxd, + float* restrict maxyd, + float* restrict minxd, + float* restrict minyd, + const int FOUR_NNODES, + const int NNODES, + const int N, + unsigned* restrict limiter, + float* restrict radiusd) { float val, minx, maxx, miny, maxy; __shared__ float sminx[THREADS1], smaxx[THREADS1], sminy[THREADS1], smaxy[THREADS1]; @@ -158,9 +159,9 @@ __global__ __launch_bounds__(THREADS1, FACTOR1) void BoundingBoxKernel(int* rest /** * Clear some of the state vectors up. */ -__global__ __launch_bounds__(1024, 1) void ClearKernel1(int* restrict childd, - const int FOUR_NNODES, - const int FOUR_N) +__global__ static __launch_bounds__(1024, 1) void ClearKernel1(int* restrict childd, + const int FOUR_NNODES, + const int FOUR_N) { const int inc = blockDim.x * gridDim.x; int k = (FOUR_N & -32) + threadIdx.x + blockIdx.x * blockDim.x; @@ -175,15 +176,15 @@ __global__ __launch_bounds__(1024, 1) void ClearKernel1(int* restrict childd, /** * Build the actual KD Tree. */ -__global__ __launch_bounds__(THREADS2, - FACTOR2) void TreeBuildingKernel(int* restrict childd, - const float* restrict posxd, - const float* restrict posyd, - const int NNODES, - const int N, - int* restrict maxdepthd, - int* restrict bottomd, - const float* restrict radiusd) +__global__ static __launch_bounds__(THREADS2, + FACTOR2) void TreeBuildingKernel(int* restrict childd, + const float* restrict posxd, + const float* restrict posyd, + const int NNODES, + const int N, + int* restrict maxdepthd, + int* restrict bottomd, + const float* restrict radiusd) { int j, depth; float x, y, r; @@ -296,10 +297,10 @@ __global__ __launch_bounds__(THREADS2, /** * Clean more state vectors. 
*/ -__global__ __launch_bounds__(1024, 1) void ClearKernel2(int* restrict startd, - int* restrict massd, - const int NNODES, - const int* restrict bottomd) +__global__ static __launch_bounds__(1024, 1) void ClearKernel2(int* restrict startd, + int* restrict massd, + const int NNODES, + const int* restrict bottomd) { const int bottom = bottomd[0]; const int inc = blockDim.x * gridDim.x; @@ -317,15 +318,15 @@ __global__ __launch_bounds__(1024, 1) void ClearKernel2(int* restrict startd, /** * Summarize the KD Tree via cell gathering */ -__global__ __launch_bounds__(THREADS3, - FACTOR3) void SummarizationKernel(int* restrict countd, - const int* restrict childd, - volatile int* restrict massd, - float* restrict posxd, - float* restrict posyd, - const int NNODES, - const int N, - const int* restrict bottomd) +__global__ static __launch_bounds__(THREADS3, + FACTOR3) void SummarizationKernel(int* restrict countd, + const int* restrict childd, + volatile int* restrict massd, + float* restrict posxd, + float* restrict posyd, + const int NNODES, + const int N, + const int* restrict bottomd) { bool flag = 0; float cm, px, py; @@ -453,13 +454,14 @@ __global__ __launch_bounds__(THREADS3, /** * Sort the cells */ -__global__ __launch_bounds__(THREADS4, FACTOR4) void SortKernel(int* restrict sortd, - const int* restrict countd, - volatile int* restrict startd, - int* restrict childd, - const int NNODES, - const int N, - const int* restrict bottomd) +__global__ static __launch_bounds__(THREADS4, + FACTOR4) void SortKernel(int* restrict sortd, + const int* restrict countd, + volatile int* restrict startd, + int* restrict childd, + const int NNODES, + const int N, + const int* restrict bottomd) { const int bottom = bottomd[0]; const int dec = blockDim.x * gridDim.x; @@ -502,7 +504,7 @@ __global__ __launch_bounds__(THREADS4, FACTOR4) void SortKernel(int* restrict so /** * Calculate the repulsive forces using the KD Tree */ -__global__ __launch_bounds__( +__global__ static __launch_bounds__( THREADS5, FACTOR5) void RepulsionKernel(/* int *restrict errd, */ const float scaling_ratio, const float theta, @@ -612,18 +614,18 @@ __global__ __launch_bounds__( } } -__global__ __launch_bounds__(THREADS6, - FACTOR6) void apply_forces_bh(float* restrict Y_x, - float* restrict Y_y, - const float* restrict attract_x, - const float* restrict attract_y, - const float* restrict repel_x, - const float* restrict repel_y, - float* restrict old_dx, - float* restrict old_dy, - const float* restrict swinging, - const float speed, - const int n) +__global__ static __launch_bounds__(THREADS6, + FACTOR6) void apply_forces_bh(float* restrict Y_x, + float* restrict Y_y, + const float* restrict attract_x, + const float* restrict attract_y, + const float* restrict repel_x, + const float* restrict repel_y, + float* restrict old_dx, + float* restrict old_dy, + const float* restrict swinging, + const float speed, + const int n) { // For evrery vertex for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { diff --git a/cpp/src/layout/legacy/exact_repulsion.cuh b/cpp/src/layout/legacy/exact_repulsion.cuh index fe895bae6a0..8530202afd5 100644 --- a/cpp/src/layout/legacy/exact_repulsion.cuh +++ b/cpp/src/layout/legacy/exact_repulsion.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
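For the Barnes-Hut kernels above that also declare occupancy hints, the diff places static between __global__ and __launch_bounds__. A small sketch of that combination, with placeholder bounds rather than the THREADS/FACTOR constants from this file:

// kMaxThreads/kMinBlocks are illustrative values, not taken from cuGraph.
constexpr int kMaxThreads = 256;  // max threads per block
constexpr int kMinBlocks  = 4;    // min resident blocks per SM

__global__ static __launch_bounds__(kMaxThreads, kMinBlocks) void zero_kernel(float* out, int n)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n) { out[i] = 0.0f; }
}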
@@ -22,13 +22,13 @@ namespace cugraph { namespace detail { template -__global__ void repulsion_kernel(const float* restrict x_pos, - const float* restrict y_pos, - float* restrict repel_x, - float* restrict repel_y, - const int* restrict mass, - const float scaling_ratio, - const vertex_t n) +__global__ static void repulsion_kernel(const float* restrict x_pos, + const float* restrict y_pos, + float* restrict repel_x, + float* restrict repel_y, + const int* restrict mass, + const float scaling_ratio, + const vertex_t n) { int j = (blockIdx.x * blockDim.x) + threadIdx.x; // for every item in row int i = (blockIdx.y * blockDim.y) + threadIdx.y; // for every row diff --git a/cpp/src/layout/legacy/fa2_kernels.cuh b/cpp/src/layout/legacy/fa2_kernels.cuh index e33111e8a99..070c7eece29 100644 --- a/cpp/src/layout/legacy/fa2_kernels.cuh +++ b/cpp/src/layout/legacy/fa2_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,19 +23,19 @@ namespace cugraph { namespace detail { template -__global__ void attraction_kernel(const vertex_t* restrict row, - const vertex_t* restrict col, - const weight_t* restrict v, - const edge_t e, - const float* restrict x_pos, - const float* restrict y_pos, - float* restrict attract_x, - float* restrict attract_y, - const int* restrict mass, - bool outbound_attraction_distribution, - bool lin_log_mode, - const float edge_weight_influence, - const float coef) +__global__ static void attraction_kernel(const vertex_t* restrict row, + const vertex_t* restrict col, + const weight_t* restrict v, + const edge_t e, + const float* restrict x_pos, + const float* restrict y_pos, + float* restrict attract_x, + float* restrict attract_y, + const int* restrict mass, + bool outbound_attraction_distribution, + bool lin_log_mode, + const float edge_weight_influence, + const float coef) { vertex_t i, src, dst; weight_t weight = 1; @@ -116,13 +116,13 @@ void apply_attraction(const vertex_t* restrict row, } template -__global__ void linear_gravity_kernel(const float* restrict x_pos, - const float* restrict y_pos, - float* restrict attract_x, - float* restrict attract_y, - const int* restrict mass, - const float gravity, - const vertex_t n) +__global__ static void linear_gravity_kernel(const float* restrict x_pos, + const float* restrict y_pos, + float* restrict attract_x, + float* restrict attract_y, + const int* restrict mass, + const float gravity, + const vertex_t n) { // For every node. for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { @@ -136,14 +136,14 @@ __global__ void linear_gravity_kernel(const float* restrict x_pos, } template -__global__ void strong_gravity_kernel(const float* restrict x_pos, - const float* restrict y_pos, - float* restrict attract_x, - float* restrict attract_y, - const int* restrict mass, - const float gravity, - const float scaling_ratio, - const vertex_t n) +__global__ static void strong_gravity_kernel(const float* restrict x_pos, + const float* restrict y_pos, + float* restrict attract_x, + float* restrict attract_y, + const int* restrict mass, + const float gravity, + const float scaling_ratio, + const vertex_t n) { // For every node. 
for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { @@ -187,16 +187,16 @@ void apply_gravity(const float* restrict x_pos, } template -__global__ void local_speed_kernel(const float* restrict repel_x, - const float* restrict repel_y, - const float* restrict attract_x, - const float* restrict attract_y, - const float* restrict old_dx, - const float* restrict old_dy, - const int* restrict mass, - float* restrict swinging, - float* restrict traction, - const vertex_t n) +__global__ static void local_speed_kernel(const float* restrict repel_x, + const float* restrict repel_y, + const float* restrict attract_x, + const float* restrict attract_y, + const float* restrict old_dx, + const float* restrict old_dy, + const int* restrict mass, + float* restrict swinging, + float* restrict traction, + const vertex_t n) { // For every node. for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { @@ -272,17 +272,17 @@ void adapt_speed(const float jitter_tolerance, } template -__global__ void update_positions_kernel(float* restrict x_pos, - float* restrict y_pos, - const float* restrict repel_x, - const float* restrict repel_y, - const float* restrict attract_x, - const float* restrict attract_y, - float* restrict old_dx, - float* restrict old_dy, - const float* restrict swinging, - const float speed, - const vertex_t n) +__global__ static void update_positions_kernel(float* restrict x_pos, + float* restrict y_pos, + const float* restrict repel_x, + const float* restrict repel_y, + const float* restrict attract_x, + const float* restrict attract_y, + float* restrict old_dx, + float* restrict old_dy, + const float* restrict swinging, + const float speed, + const vertex_t n) { // For every node. for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 2d77d64e1ff..bb13f493590 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
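The prims headers below are included from many .cu files, so their templated kernels are where the linkage change matters most: with static, the instantiations made in each translation unit stay local to it instead of being merged into weak external symbols. The possible cost is duplicated kernel code across translation units; the benefit is that no instantiation escapes the library. A sketch of the idea on a hypothetical template kernel:

// Hypothetical CSR degree kernel, for illustration only. Each .cu file that
// instantiates degree_kernel<int, int> gets its own internal-linkage copy;
// none of them appears among the .so's exported symbols.
template <typename vertex_t, typename edge_t>
__global__ static void degree_kernel(edge_t const* offsets, vertex_t n, edge_t* degrees)
{
  vertex_t i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n) { degrees[i] = offsets[i + 1] - offsets[i]; }
}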
@@ -95,7 +95,7 @@ template -__global__ void extract_transform_v_frontier_e_hypersparse( +__global__ static void extract_transform_v_frontier_e_hypersparse( edge_partition_device_view_t edge_partition, @@ -262,7 +262,7 @@ template -__global__ void extract_transform_v_frontier_e_low_degree( +__global__ static void extract_transform_v_frontier_e_low_degree( edge_partition_device_view_t edge_partition, @@ -420,7 +420,7 @@ template -__global__ void extract_transform_v_frontier_e_mid_degree( +__global__ static void extract_transform_v_frontier_e_mid_degree( edge_partition_device_view_t edge_partition, @@ -527,7 +527,7 @@ template -__global__ void extract_transform_v_frontier_e_high_degree( +__global__ static void extract_transform_v_frontier_e_high_degree( edge_partition_device_view_t edge_partition, diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index 1a7fc0130c4..f0362f1dbaf 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -75,7 +75,7 @@ template -__global__ void per_v_transform_reduce_e_hypersparse( +__global__ static void per_v_transform_reduce_e_hypersparse( edge_partition_device_view_t edge_partition, @@ -182,7 +182,7 @@ template -__global__ void per_v_transform_reduce_e_low_degree( +__global__ static void per_v_transform_reduce_e_low_degree( edge_partition_device_view_t edge_partition, @@ -291,7 +291,7 @@ template -__global__ void per_v_transform_reduce_e_mid_degree( +__global__ static void per_v_transform_reduce_e_mid_degree( edge_partition_device_view_t edge_partition, @@ -384,7 +384,7 @@ template -__global__ void per_v_transform_reduce_e_high_degree( +__global__ static void per_v_transform_reduce_e_high_degree( edge_partition_device_view_t edge_partition, diff --git a/cpp/src/prims/transform_e.cuh b/cpp/src/prims/transform_e.cuh index c6623621d24..9c3a9d89853 100644 --- a/cpp/src/prims/transform_e.cuh +++ b/cpp/src/prims/transform_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -49,7 +49,7 @@ template -__global__ void transform_e_packed_bool( +__global__ static void transform_e_packed_bool( edge_partition_device_view_t edge_partition, diff --git a/cpp/src/prims/transform_reduce_e.cuh b/cpp/src/prims/transform_reduce_e.cuh index 483ab64dcd9..16b912f34cf 100644 --- a/cpp/src/prims/transform_reduce_e.cuh +++ b/cpp/src/prims/transform_reduce_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -59,7 +59,7 @@ template -__global__ void transform_reduce_e_hypersparse( +__global__ static void transform_reduce_e_hypersparse( edge_partition_device_view_t edge_partition, @@ -147,7 +147,7 @@ template -__global__ void transform_reduce_e_low_degree( +__global__ static void transform_reduce_e_low_degree( edge_partition_device_view_t edge_partition, @@ -235,7 +235,7 @@ template -__global__ void transform_reduce_e_mid_degree( +__global__ static void transform_reduce_e_mid_degree( edge_partition_device_view_t edge_partition, @@ -305,7 +305,7 @@ template -__global__ void transform_reduce_e_high_degree( +__global__ static void transform_reduce_e_high_degree( edge_partition_device_view_t edge_partition, diff --git a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh index 77bf195b4d7..719c10028b5 100644 --- a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh +++ b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -97,7 +97,7 @@ template -__global__ void transform_reduce_by_src_dst_key_hypersparse( +__global__ static void transform_reduce_by_src_dst_key_hypersparse( edge_partition_device_view_t edge_partition, @@ -156,7 +156,7 @@ template -__global__ void transform_reduce_by_src_dst_key_low_degree( +__global__ static void transform_reduce_by_src_dst_key_low_degree( edge_partition_device_view_t edge_partition, @@ -214,7 +214,7 @@ template -__global__ void transform_reduce_by_src_dst_key_mid_degree( +__global__ static void transform_reduce_by_src_dst_key_mid_degree( edge_partition_device_view_t edge_partition, @@ -274,7 +274,7 @@ template -__global__ void transform_reduce_by_src_dst_key_high_degree( +__global__ static void transform_reduce_by_src_dst_key_high_degree( edge_partition_device_view_t edge_partition, diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index da0ecc991df..8b0196b517a 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -240,7 +240,7 @@ rmm::device_uvector compute_minor_degrees( int32_t constexpr count_edge_partition_multi_edges_block_size = 1024; template -__global__ void for_all_major_for_all_nbr_mid_degree( +__global__ static void for_all_major_for_all_nbr_mid_degree( edge_partition_device_view_t edge_partition, vertex_t major_range_first, vertex_t major_range_last, @@ -274,7 +274,7 @@ __global__ void for_all_major_for_all_nbr_mid_degree( } template -__global__ void for_all_major_for_all_nbr_high_degree( +__global__ static void for_all_major_for_all_nbr_high_degree( edge_partition_device_view_t edge_partition, vertex_t major_range_first, vertex_t major_range_last, diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index cc69cb5f67f..ebd06dc514c 100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -215,7 +215,7 @@ template -__global__ void multi_partition_copy( +__global__ static void multi_partition_copy( InputIterator input_first, InputIterator input_last, raft::device_span output_buffer_ptrs, diff --git a/cpp/src/utilities/eidecl_graph_utils.hpp b/cpp/src/utilities/eidecl_graph_utils.hpp index 84240ba2845..abf026cbbfe 100644 --- a/cpp/src/utilities/eidecl_graph_utils.hpp +++ b/cpp/src/utilities/eidecl_graph_utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,9 +29,12 @@ extern template void offsets_to_indices(int const*, int, int*); extern template void offsets_to_indices(long const*, int, int*); extern template void offsets_to_indices(long const*, long, long*); -extern template __global__ void offsets_to_indices_kernel(int const*, int, int*); -extern template __global__ void offsets_to_indices_kernel(long const*, int, int*); -extern template __global__ void offsets_to_indices_kernel(long const*, long, long*); +extern template __attribute__((visibility("hidden"))) __global__ void +offsets_to_indices_kernel(int const*, int, int*); +extern template __attribute__((visibility("hidden"))) __global__ void +offsets_to_indices_kernel(long const*, int, int*); +extern template __attribute__((visibility("hidden"))) __global__ void +offsets_to_indices_kernel(long const*, long, long*); } // namespace detail } // namespace cugraph diff --git a/cpp/src/utilities/eidir_graph_utils.hpp b/cpp/src/utilities/eidir_graph_utils.hpp index 033bb197ce8..ba06c6f56ea 100644 --- a/cpp/src/utilities/eidir_graph_utils.hpp +++ b/cpp/src/utilities/eidir_graph_utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,15 +29,12 @@ template void offsets_to_indices(int32_t const*, int32_t, int3 template void offsets_to_indices(int64_t const*, int32_t, int32_t*); template void offsets_to_indices(int64_t const*, int64_t, int64_t*); -template __global__ void offsets_to_indices_kernel(int32_t const*, - int32_t, - int32_t*); -template __global__ void offsets_to_indices_kernel(int64_t const*, - int32_t, - int32_t*); -template __global__ void offsets_to_indices_kernel(int64_t const*, - int64_t, - int64_t*); +template __global__ __attribute__((visibility("hidden"))) void +offsets_to_indices_kernel(int32_t const*, int32_t, int32_t*); +template __global__ __attribute__((visibility("hidden"))) void +offsets_to_indices_kernel(int64_t const*, int32_t, int32_t*); +template __global__ __attribute__((visibility("hidden"))) void +offsets_to_indices_kernel(int64_t const*, int64_t, int64_t*); } // namespace detail } // namespace cugraph diff --git a/cpp/src/utilities/graph_utils.cuh b/cpp/src/utilities/graph_utils.cuh index 1def024e1e5..9b2a7a4a12d 100644 --- a/cpp/src/utilities/graph_utils.cuh +++ b/cpp/src/utilities/graph_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. 
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
@@ -245,34 +245,36 @@ void update_dangling_nodes(size_t n, T* dangling_nodes, T damping_factor)
 
 // google matrix kernels
 template <typename IndexType, typename ValueType>
-__global__ void degree_coo(const IndexType n,
-                           const IndexType e,
-                           const IndexType* ind,
-                           ValueType* degree)
+__global__ static void degree_coo(const IndexType n,
+                                  const IndexType e,
+                                  const IndexType* ind,
+                                  ValueType* degree)
 {
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x)
     atomicAdd(&degree[ind[i]], (ValueType)1.0);
 }
 
 template <typename IndexType, typename ValueType>
-__global__ void flag_leafs_kernel(const size_t n, const IndexType* degree, ValueType* bookmark)
+__global__ static void flag_leafs_kernel(const size_t n,
+                                         const IndexType* degree,
+                                         ValueType* bookmark)
 {
   for (auto i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x)
     if (degree[i] == 0) bookmark[i] = 1.0;
 }
 
 template <typename IndexType, typename ValueType>
-__global__ void degree_offsets(const IndexType n,
-                               const IndexType e,
-                               const IndexType* ind,
-                               ValueType* degree)
+__global__ static void degree_offsets(const IndexType n,
+                                      const IndexType e,
+                                      const IndexType* ind,
+                                      ValueType* degree)
 {
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x)
     degree[i] += ind[i + 1] - ind[i];
 }
 
 template <typename FromType, typename ToType>
-__global__ void type_convert(FromType* array, int n)
+__global__ static void type_convert(FromType* array, int n)
 {
   for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) {
     ToType val = array[i];
@@ -282,12 +284,12 @@ __global__ void type_convert(FromType* array, int n)
 }
 
 template <typename IndexType, typename ValueType>
-__global__ void equi_prob3(const IndexType n,
-                           const IndexType e,
-                           const IndexType* csrPtr,
-                           const IndexType* csrInd,
-                           ValueType* val,
-                           IndexType* degree)
+__global__ static void equi_prob3(const IndexType n,
+                                  const IndexType e,
+                                  const IndexType* csrPtr,
+                                  const IndexType* csrInd,
+                                  ValueType* val,
+                                  IndexType* degree)
 {
   int j, row, col;
   for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) {
@@ -301,12 +303,12 @@ __global__ void equi_prob3(const IndexType n,
 }
 
 template <typename IndexType, typename ValueType>
-__global__ void equi_prob2(const IndexType n,
-                           const IndexType e,
-                           const IndexType* csrPtr,
-                           const IndexType* csrInd,
-                           ValueType* val,
-                           IndexType* degree)
+__global__ static void equi_prob2(const IndexType n,
+                                  const IndexType e,
+                                  const IndexType* csrPtr,
+                                  const IndexType* csrInd,
+                                  ValueType* val,
+                                  IndexType* degree)
 {
   int row = blockIdx.x * blockDim.x + threadIdx.x;
   if (row < n) {
@@ -370,7 +372,8 @@ void HT_matrix_csc_coo(const IndexType n,
 }
 
 template <typename offsets_t, typename index_t>
-__global__ void offsets_to_indices_kernel(const offsets_t* offsets, index_t v, index_t* indices)
+__attribute__((visibility("hidden"))) __global__ void offsets_to_indices_kernel(
+  const offsets_t* offsets, index_t v, index_t* indices)
 {
   auto tid{threadIdx.x};
   auto ctaStart{blockIdx.x};
diff --git a/cpp/src/utilities/path_retrieval.cu b/cpp/src/utilities/path_retrieval.cu
index 00c70bdbfd5..14afd30f578 100644
--- a/cpp/src/utilities/path_retrieval.cu
+++ b/cpp/src/utilities/path_retrieval.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
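offsets_to_indices_kernel is the one kernel here that cannot be made static: it is explicitly instantiated once and shared across translation units via extern template, which requires external linkage. The hunks above therefore attach __attribute__((visibility("hidden"))) to the definition, the extern template declarations, and the explicit instantiations, keeping the symbols out of the dynamic symbol table while preserving their linkage; the attribute is kept consistent on every declaration. A condensed sketch of the arrangement, with a hypothetical kernel name:

// header (sketch): hidden-visibility definition plus an extern template
// declaration that suppresses implicit instantiation in including files.
template <typename index_t>
__attribute__((visibility("hidden"))) __global__ void iota_kernel(index_t* out, index_t n)
{
  for (index_t i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x)
    out[i] = i;
}

extern template __attribute__((visibility("hidden"))) __global__ void iota_kernel<int>(int*, int);

// one .cu file (sketch): the single explicit instantiation.
template __attribute__((visibility("hidden"))) __global__ void iota_kernel<int>(int*, int);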
@@ -29,13 +29,13 @@ namespace cugraph { namespace detail { template -__global__ void get_traversed_cost_kernel(vertex_t const* vertices, - vertex_t const* preds, - vertex_t const* vtx_map, - weight_t const* info_weights, - weight_t* out, - vertex_t stop_vertex, - vertex_t num_vertices) +__global__ static void get_traversed_cost_kernel(vertex_t const* vertices, + vertex_t const* preds, + vertex_t const* vtx_map, + weight_t const* info_weights, + weight_t* out, + vertex_t stop_vertex, + vertex_t num_vertices) { for (vertex_t i = threadIdx.x + blockIdx.x * blockDim.x; i < num_vertices; i += gridDim.x * blockDim.x) {
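None of these kernel edits change call sites: a static kernel is launched exactly as before from the translation unit that defines it. A hypothetical sketch, not code from this PR:

#include <cuda_runtime.h>

__global__ static void fill_kernel(int* out, int value, int n)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n) { out[i] = value; }
}

void fill(int* d_out, int value, int n, cudaStream_t stream)
{
  int threads = 128;
  int blocks  = (n + threads - 1) / threads;  // round up to cover all n
  fill_kernel<<<blocks, threads, 0, stream>>>(d_out, value, n);
}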