From 2e74f2f46db83942a4662e37acd19b31c07b158e Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Wed, 17 Jan 2024 11:41:54 -0500 Subject: [PATCH 1/5] Mark all cugraph CUDA kernels with internal linkage --- .github/workflows/pr.yaml | 9 ++ .github/workflows/test.yaml | 10 ++ .../detail/decompress_edge_partition.cuh | 4 +- cpp/src/community/legacy/ecg.cu | 2 +- cpp/src/components/legacy/weak_cc.cuh | 30 ++-- cpp/src/layout/legacy/bh_kernels.cuh | 136 +++++++++--------- cpp/src/layout/legacy/exact_repulsion.cuh | 16 +-- cpp/src/layout/legacy/fa2_kernels.cuh | 98 ++++++------- .../detail/extract_transform_v_frontier_e.cuh | 6 +- ...v_transform_reduce_incoming_outgoing_e.cuh | 8 +- cpp/src/prims/transform_e.cuh | 2 +- cpp/src/prims/transform_reduce_e.cuh | 8 +- .../transform_reduce_e_by_src_dst_key.cuh | 8 +- cpp/src/structure/graph_view_impl.cuh | 4 +- .../traversal/od_shortest_distances_impl.cuh | 2 +- cpp/src/utilities/eidecl_graph_utils.hpp | 11 +- cpp/src/utilities/eidir_graph_utils.hpp | 17 +-- cpp/src/utilities/graph_utils.cuh | 49 ++++--- cpp/src/utilities/path_retrieval.cu | 14 +- 19 files changed, 229 insertions(+), 205 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 7f0b95e3573..f178a99cc1b 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -15,6 +15,7 @@ jobs: - checks - conda-cpp-build - conda-cpp-tests + - conda-cpp-checks - conda-notebook-tests - conda-python-build - conda-python-tests @@ -52,6 +53,14 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 with: build_type: pull-request + conda-cpp-checks: + needs: conda-cpp-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.02 + with: + build_type: pull-request + enable_check_symbols: true + symbol_exclusions: (hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel) conda-python-build: needs: conda-cpp-build secrets: inherit diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 32fb2d62b29..205c60de007 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -14,6 +14,16 @@ on: type: string jobs: + conda-cpp-checks: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.02 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + enable_check_symbols: true + symbol_exclusions: (hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel) conda-cpp-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 diff --git a/cpp/include/cugraph/detail/decompress_edge_partition.cuh b/cpp/include/cugraph/detail/decompress_edge_partition.cuh index dad5ce77e45..6b974a326dd 100644 --- a/cpp/include/cugraph/detail/decompress_edge_partition.cuh +++ b/cpp/include/cugraph/detail/decompress_edge_partition.cuh @@ -44,7 +44,7 @@ namespace detail { int32_t constexpr decompress_edge_partition_block_size = 1024; template -__global__ void decompress_to_edgelist_mid_degree( +__global__ static void decompress_to_edgelist_mid_degree( edge_partition_device_view_t edge_partition, vertex_t major_range_first, vertex_t major_range_last, @@ -74,7 +74,7 @@ __global__ void decompress_to_edgelist_mid_degree( } template -__global__ void decompress_to_edgelist_high_degree( +__global__ static void decompress_to_edgelist_high_degree( edge_partition_device_view_t edge_partition, vertex_t major_range_first, vertex_t major_range_last, diff --git a/cpp/src/community/legacy/ecg.cu b/cpp/src/community/legacy/ecg.cu index d93a4446faa..b2ad79204ed 100644 --- a/cpp/src/community/legacy/ecg.cu +++ b/cpp/src/community/legacy/ecg.cu @@ -52,7 +52,7 @@ binsearch_maxle(const IndexType* vec, const IndexType val, IndexType low, IndexT // FIXME: This shouldn't need to be a custom kernel, this // seems like it should just be a thrust::transform template -__global__ void match_check_kernel( +__global__ static void match_check_kernel( IdxT size, IdxT num_verts, IdxT* offsets, IdxT* indices, IdxT* parts, ValT* weights) { IdxT tid = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/cpp/src/components/legacy/weak_cc.cuh b/cpp/src/components/legacy/weak_cc.cuh index ad9aa773590..f4254e2d55d 100644 --- a/cpp/src/components/legacy/weak_cc.cuh +++ b/cpp/src/components/legacy/weak_cc.cuh @@ -59,15 +59,15 @@ class WeakCCState { }; template -__global__ void weak_cc_label_device(vertex_t* labels, - edge_t const* offsets, - vertex_t const* indices, - edge_t nnz, - bool* fa, - bool* xa, - bool* m, - vertex_t startVertexId, - vertex_t batchSize) +__global__ static void weak_cc_label_device(vertex_t* labels, + edge_t const* offsets, + vertex_t const* indices, + edge_t nnz, + bool* fa, + bool* xa, + bool* m, + vertex_t startVertexId, + vertex_t batchSize) { vertex_t tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < batchSize) { @@ -118,11 +118,11 @@ __global__ void weak_cc_label_device(vertex_t* labels, } template -__global__ void weak_cc_init_label_kernel(vertex_t* labels, - vertex_t startVertexId, - vertex_t batchSize, - vertex_t MAX_LABEL, - Lambda filter_op) +__global__ static void weak_cc_init_label_kernel(vertex_t* labels, + vertex_t startVertexId, + vertex_t batchSize, + vertex_t MAX_LABEL, + Lambda filter_op) { /** F1 and F2 in the paper correspond to fa and xa */ /** Cd in paper corresponds to db_cluster */ @@ -134,7 +134,7 @@ __global__ void weak_cc_init_label_kernel(vertex_t* labels, } template -__global__ void weak_cc_init_all_kernel( +__global__ static void weak_cc_init_all_kernel( vertex_t* labels, bool* fa, bool* xa, vertex_t N, vertex_t MAX_LABEL) { vertex_t tid = threadIdx.x + blockIdx.x * TPB_X; diff --git a/cpp/src/layout/legacy/bh_kernels.cuh b/cpp/src/layout/legacy/bh_kernels.cuh index 5b101363314..f6e163ab306 100644 --- a/cpp/src/layout/legacy/bh_kernels.cuh +++ b/cpp/src/layout/legacy/bh_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,9 +42,9 @@ namespace detail { /** * Intializes the states of objects. This speeds the overall kernel up. */ -__global__ void InitializationKernel(unsigned* restrict limiter, - int* restrict maxdepthd, - float* restrict radiusd) +__global__ static void InitializationKernel(unsigned* restrict limiter, + int* restrict maxdepthd, + float* restrict radiusd) { maxdepthd[0] = 1; limiter[0] = 0; @@ -54,10 +54,10 @@ __global__ void InitializationKernel(unsigned* restrict limiter, /** * Reset root. */ -__global__ void ResetKernel(float* restrict radiusd_squared, - int* restrict bottomd, - const int NNODES, - const float* restrict radiusd) +__global__ static void ResetKernel(float* restrict radiusd_squared, + int* restrict bottomd, + const int NNODES, + const float* restrict radiusd) { radiusd_squared[0] = radiusd[0] * radiusd[0]; // create root node @@ -67,20 +67,21 @@ __global__ void ResetKernel(float* restrict radiusd_squared, /** * Figures the bounding boxes for every point in the embedding. */ -__global__ __launch_bounds__(THREADS1, FACTOR1) void BoundingBoxKernel(int* restrict startd, - int* restrict childd, - int* restrict massd, - float* restrict posxd, - float* restrict posyd, - float* restrict maxxd, - float* restrict maxyd, - float* restrict minxd, - float* restrict minyd, - const int FOUR_NNODES, - const int NNODES, - const int N, - unsigned* restrict limiter, - float* restrict radiusd) +__global__ static __launch_bounds__(THREADS1, + FACTOR1) void BoundingBoxKernel(int* restrict startd, + int* restrict childd, + int* restrict massd, + float* restrict posxd, + float* restrict posyd, + float* restrict maxxd, + float* restrict maxyd, + float* restrict minxd, + float* restrict minyd, + const int FOUR_NNODES, + const int NNODES, + const int N, + unsigned* restrict limiter, + float* restrict radiusd) { float val, minx, maxx, miny, maxy; __shared__ float sminx[THREADS1], smaxx[THREADS1], sminy[THREADS1], smaxy[THREADS1]; @@ -158,9 +159,9 @@ __global__ __launch_bounds__(THREADS1, FACTOR1) void BoundingBoxKernel(int* rest /** * Clear some of the state vectors up. */ -__global__ __launch_bounds__(1024, 1) void ClearKernel1(int* restrict childd, - const int FOUR_NNODES, - const int FOUR_N) +__global__ static __launch_bounds__(1024, 1) void ClearKernel1(int* restrict childd, + const int FOUR_NNODES, + const int FOUR_N) { const int inc = blockDim.x * gridDim.x; int k = (FOUR_N & -32) + threadIdx.x + blockIdx.x * blockDim.x; @@ -175,15 +176,15 @@ __global__ __launch_bounds__(1024, 1) void ClearKernel1(int* restrict childd, /** * Build the actual KD Tree. */ -__global__ __launch_bounds__(THREADS2, - FACTOR2) void TreeBuildingKernel(int* restrict childd, - const float* restrict posxd, - const float* restrict posyd, - const int NNODES, - const int N, - int* restrict maxdepthd, - int* restrict bottomd, - const float* restrict radiusd) +__global__ static __launch_bounds__(THREADS2, + FACTOR2) void TreeBuildingKernel(int* restrict childd, + const float* restrict posxd, + const float* restrict posyd, + const int NNODES, + const int N, + int* restrict maxdepthd, + int* restrict bottomd, + const float* restrict radiusd) { int j, depth; float x, y, r; @@ -296,10 +297,10 @@ __global__ __launch_bounds__(THREADS2, /** * Clean more state vectors. */ -__global__ __launch_bounds__(1024, 1) void ClearKernel2(int* restrict startd, - int* restrict massd, - const int NNODES, - const int* restrict bottomd) +__global__ static __launch_bounds__(1024, 1) void ClearKernel2(int* restrict startd, + int* restrict massd, + const int NNODES, + const int* restrict bottomd) { const int bottom = bottomd[0]; const int inc = blockDim.x * gridDim.x; @@ -317,15 +318,15 @@ __global__ __launch_bounds__(1024, 1) void ClearKernel2(int* restrict startd, /** * Summarize the KD Tree via cell gathering */ -__global__ __launch_bounds__(THREADS3, - FACTOR3) void SummarizationKernel(int* restrict countd, - const int* restrict childd, - volatile int* restrict massd, - float* restrict posxd, - float* restrict posyd, - const int NNODES, - const int N, - const int* restrict bottomd) +__global__ static __launch_bounds__(THREADS3, + FACTOR3) void SummarizationKernel(int* restrict countd, + const int* restrict childd, + volatile int* restrict massd, + float* restrict posxd, + float* restrict posyd, + const int NNODES, + const int N, + const int* restrict bottomd) { bool flag = 0; float cm, px, py; @@ -453,13 +454,14 @@ __global__ __launch_bounds__(THREADS3, /** * Sort the cells */ -__global__ __launch_bounds__(THREADS4, FACTOR4) void SortKernel(int* restrict sortd, - const int* restrict countd, - volatile int* restrict startd, - int* restrict childd, - const int NNODES, - const int N, - const int* restrict bottomd) +__global__ static __launch_bounds__(THREADS4, + FACTOR4) void SortKernel(int* restrict sortd, + const int* restrict countd, + volatile int* restrict startd, + int* restrict childd, + const int NNODES, + const int N, + const int* restrict bottomd) { const int bottom = bottomd[0]; const int dec = blockDim.x * gridDim.x; @@ -502,7 +504,7 @@ __global__ __launch_bounds__(THREADS4, FACTOR4) void SortKernel(int* restrict so /** * Calculate the repulsive forces using the KD Tree */ -__global__ __launch_bounds__( +__global__ static __launch_bounds__( THREADS5, FACTOR5) void RepulsionKernel(/* int *restrict errd, */ const float scaling_ratio, const float theta, @@ -612,18 +614,18 @@ __global__ __launch_bounds__( } } -__global__ __launch_bounds__(THREADS6, - FACTOR6) void apply_forces_bh(float* restrict Y_x, - float* restrict Y_y, - const float* restrict attract_x, - const float* restrict attract_y, - const float* restrict repel_x, - const float* restrict repel_y, - float* restrict old_dx, - float* restrict old_dy, - const float* restrict swinging, - const float speed, - const int n) +__global__ static __launch_bounds__(THREADS6, + FACTOR6) void apply_forces_bh(float* restrict Y_x, + float* restrict Y_y, + const float* restrict attract_x, + const float* restrict attract_y, + const float* restrict repel_x, + const float* restrict repel_y, + float* restrict old_dx, + float* restrict old_dy, + const float* restrict swinging, + const float speed, + const int n) { // For evrery vertex for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { diff --git a/cpp/src/layout/legacy/exact_repulsion.cuh b/cpp/src/layout/legacy/exact_repulsion.cuh index fe895bae6a0..8530202afd5 100644 --- a/cpp/src/layout/legacy/exact_repulsion.cuh +++ b/cpp/src/layout/legacy/exact_repulsion.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,13 +22,13 @@ namespace cugraph { namespace detail { template -__global__ void repulsion_kernel(const float* restrict x_pos, - const float* restrict y_pos, - float* restrict repel_x, - float* restrict repel_y, - const int* restrict mass, - const float scaling_ratio, - const vertex_t n) +__global__ static void repulsion_kernel(const float* restrict x_pos, + const float* restrict y_pos, + float* restrict repel_x, + float* restrict repel_y, + const int* restrict mass, + const float scaling_ratio, + const vertex_t n) { int j = (blockIdx.x * blockDim.x) + threadIdx.x; // for every item in row int i = (blockIdx.y * blockDim.y) + threadIdx.y; // for every row diff --git a/cpp/src/layout/legacy/fa2_kernels.cuh b/cpp/src/layout/legacy/fa2_kernels.cuh index 4f1ce520387..33e7841a380 100644 --- a/cpp/src/layout/legacy/fa2_kernels.cuh +++ b/cpp/src/layout/legacy/fa2_kernels.cuh @@ -23,19 +23,19 @@ namespace cugraph { namespace detail { template -__global__ void attraction_kernel(const vertex_t* restrict row, - const vertex_t* restrict col, - const weight_t* restrict v, - const edge_t e, - const float* restrict x_pos, - const float* restrict y_pos, - float* restrict attract_x, - float* restrict attract_y, - const int* restrict mass, - bool outbound_attraction_distribution, - bool lin_log_mode, - const float edge_weight_influence, - const float coef) +__global__ static void attraction_kernel(const vertex_t* restrict row, + const vertex_t* restrict col, + const weight_t* restrict v, + const edge_t e, + const float* restrict x_pos, + const float* restrict y_pos, + float* restrict attract_x, + float* restrict attract_y, + const int* restrict mass, + bool outbound_attraction_distribution, + bool lin_log_mode, + const float edge_weight_influence, + const float coef) { vertex_t i, src, dst; weight_t weight = 1; @@ -116,13 +116,13 @@ void apply_attraction(const vertex_t* restrict row, } template -__global__ void linear_gravity_kernel(const float* restrict x_pos, - const float* restrict y_pos, - float* restrict attract_x, - float* restrict attract_y, - const int* restrict mass, - const float gravity, - const vertex_t n) +__global__ static void linear_gravity_kernel(const float* restrict x_pos, + const float* restrict y_pos, + float* restrict attract_x, + float* restrict attract_y, + const int* restrict mass, + const float gravity, + const vertex_t n) { // For every node. for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { @@ -136,14 +136,14 @@ __global__ void linear_gravity_kernel(const float* restrict x_pos, } template -__global__ void strong_gravity_kernel(const float* restrict x_pos, - const float* restrict y_pos, - float* restrict attract_x, - float* restrict attract_y, - const int* restrict mass, - const float gravity, - const float scaling_ratio, - const vertex_t n) +__global__ static void strong_gravity_kernel(const float* restrict x_pos, + const float* restrict y_pos, + float* restrict attract_x, + float* restrict attract_y, + const int* restrict mass, + const float gravity, + const float scaling_ratio, + const vertex_t n) { // For every node. for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { @@ -187,16 +187,16 @@ void apply_gravity(const float* restrict x_pos, } template -__global__ void local_speed_kernel(const float* restrict repel_x, - const float* restrict repel_y, - const float* restrict attract_x, - const float* restrict attract_y, - const float* restrict old_dx, - const float* restrict old_dy, - const int* restrict mass, - float* restrict swinging, - float* restrict traction, - const vertex_t n) +__global__ static void local_speed_kernel(const float* restrict repel_x, + const float* restrict repel_y, + const float* restrict attract_x, + const float* restrict attract_y, + const float* restrict old_dx, + const float* restrict old_dy, + const int* restrict mass, + float* restrict swinging, + float* restrict traction, + const vertex_t n) { // For every node. for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { @@ -272,17 +272,17 @@ void adapt_speed(const float jitter_tolerance, } template -__global__ void update_positions_kernel(float* restrict x_pos, - float* restrict y_pos, - const float* restrict repel_x, - const float* restrict repel_y, - const float* restrict attract_x, - const float* restrict attract_y, - float* restrict old_dx, - float* restrict old_dy, - const float* restrict swinging, - const float speed, - const vertex_t n) +__global__ static void update_positions_kernel(float* restrict x_pos, + float* restrict y_pos, + const float* restrict repel_x, + const float* restrict repel_y, + const float* restrict attract_x, + const float* restrict attract_y, + float* restrict old_dx, + float* restrict old_dy, + const float* restrict swinging, + const float speed, + const vertex_t n) { // For every node. for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index fc3da3cac07..0b6447f50d9 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -127,7 +127,7 @@ template -__global__ void extract_transform_v_frontier_e_hypersparse_or_low_degree( +__global__ static void extract_transform_v_frontier_e_hypersparse_or_low_degree( edge_partition_device_view_t edge_partition, @@ -295,7 +295,7 @@ template -__global__ void extract_transform_v_frontier_e_mid_degree( +__global__ static void extract_transform_v_frontier_e_mid_degree( edge_partition_device_view_t edge_partition, @@ -396,7 +396,7 @@ template -__global__ void extract_transform_v_frontier_e_high_degree( +__global__ static void extract_transform_v_frontier_e_high_degree( edge_partition_device_view_t edge_partition, diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index 083487fa5b4..509ab56d3fe 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -149,7 +149,7 @@ template -__global__ void per_v_transform_reduce_e_hypersparse( +__global__ static void per_v_transform_reduce_e_hypersparse( edge_partition_device_view_t edge_partition, @@ -251,7 +251,7 @@ template -__global__ void per_v_transform_reduce_e_low_degree( +__global__ static void per_v_transform_reduce_e_low_degree( edge_partition_device_view_t edge_partition, @@ -350,7 +350,7 @@ template -__global__ void per_v_transform_reduce_e_mid_degree( +__global__ static void per_v_transform_reduce_e_mid_degree( edge_partition_device_view_t edge_partition, @@ -466,7 +466,7 @@ template -__global__ void per_v_transform_reduce_e_high_degree( +__global__ static void per_v_transform_reduce_e_high_degree( edge_partition_device_view_t edge_partition, diff --git a/cpp/src/prims/transform_e.cuh b/cpp/src/prims/transform_e.cuh index 2cb1a5358b0..9c7670f68d2 100644 --- a/cpp/src/prims/transform_e.cuh +++ b/cpp/src/prims/transform_e.cuh @@ -51,7 +51,7 @@ template -__global__ void transform_e_packed_bool( +__global__ static void transform_e_packed_bool( edge_partition_device_view_t edge_partition, diff --git a/cpp/src/prims/transform_reduce_e.cuh b/cpp/src/prims/transform_reduce_e.cuh index e5855b105ee..43722550c58 100644 --- a/cpp/src/prims/transform_reduce_e.cuh +++ b/cpp/src/prims/transform_reduce_e.cuh @@ -61,7 +61,7 @@ template -__global__ void transform_reduce_e_hypersparse( +__global__ static void transform_reduce_e_hypersparse( edge_partition_device_view_t edge_partition, @@ -153,7 +153,7 @@ template -__global__ void transform_reduce_e_low_degree( +__global__ static void transform_reduce_e_low_degree( edge_partition_device_view_t edge_partition, @@ -242,7 +242,7 @@ template -__global__ void transform_reduce_e_mid_degree( +__global__ static void transform_reduce_e_mid_degree( edge_partition_device_view_t edge_partition, @@ -320,7 +320,7 @@ template -__global__ void transform_reduce_e_high_degree( +__global__ static void transform_reduce_e_high_degree( edge_partition_device_view_t edge_partition, diff --git a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh index 42203085077..eee0ed03d1c 100644 --- a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh +++ b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh @@ -97,7 +97,7 @@ template -__global__ void transform_reduce_by_src_dst_key_hypersparse( +__global__ static void transform_reduce_by_src_dst_key_hypersparse( edge_partition_device_view_t edge_partition, @@ -156,7 +156,7 @@ template -__global__ void transform_reduce_by_src_dst_key_low_degree( +__global__ static void transform_reduce_by_src_dst_key_low_degree( edge_partition_device_view_t edge_partition, @@ -214,7 +214,7 @@ template -__global__ void transform_reduce_by_src_dst_key_mid_degree( +__global__ static void transform_reduce_by_src_dst_key_mid_degree( edge_partition_device_view_t edge_partition, @@ -274,7 +274,7 @@ template -__global__ void transform_reduce_by_src_dst_key_high_degree( +__global__ static void transform_reduce_by_src_dst_key_high_degree( edge_partition_device_view_t edge_partition, diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index 4ee5ad5ca02..29dca6ef409 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -241,7 +241,7 @@ rmm::device_uvector compute_minor_degrees( int32_t constexpr count_edge_partition_multi_edges_block_size = 1024; template -__global__ void for_all_major_for_all_nbr_mid_degree( +__global__ static void for_all_major_for_all_nbr_mid_degree( edge_partition_device_view_t edge_partition, vertex_t major_range_first, vertex_t major_range_last, @@ -275,7 +275,7 @@ __global__ void for_all_major_for_all_nbr_mid_degree( } template -__global__ void for_all_major_for_all_nbr_high_degree( +__global__ static void for_all_major_for_all_nbr_high_degree( edge_partition_device_view_t edge_partition, vertex_t major_range_first, vertex_t major_range_last, diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index c2a3f1160ca..612eb0c48f2 100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -215,7 +215,7 @@ template -__global__ void multi_partition_copy( +__global__ static void multi_partition_copy( InputIterator input_first, InputIterator input_last, raft::device_span output_buffer_ptrs, diff --git a/cpp/src/utilities/eidecl_graph_utils.hpp b/cpp/src/utilities/eidecl_graph_utils.hpp index 84240ba2845..abf026cbbfe 100644 --- a/cpp/src/utilities/eidecl_graph_utils.hpp +++ b/cpp/src/utilities/eidecl_graph_utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,9 +29,12 @@ extern template void offsets_to_indices(int const*, int, int*); extern template void offsets_to_indices(long const*, int, int*); extern template void offsets_to_indices(long const*, long, long*); -extern template __global__ void offsets_to_indices_kernel(int const*, int, int*); -extern template __global__ void offsets_to_indices_kernel(long const*, int, int*); -extern template __global__ void offsets_to_indices_kernel(long const*, long, long*); +extern template __attribute__((visibility("hidden"))) __global__ void +offsets_to_indices_kernel(int const*, int, int*); +extern template __attribute__((visibility("hidden"))) __global__ void +offsets_to_indices_kernel(long const*, int, int*); +extern template __attribute__((visibility("hidden"))) __global__ void +offsets_to_indices_kernel(long const*, long, long*); } // namespace detail } // namespace cugraph diff --git a/cpp/src/utilities/eidir_graph_utils.hpp b/cpp/src/utilities/eidir_graph_utils.hpp index 033bb197ce8..ba06c6f56ea 100644 --- a/cpp/src/utilities/eidir_graph_utils.hpp +++ b/cpp/src/utilities/eidir_graph_utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,15 +29,12 @@ template void offsets_to_indices(int32_t const*, int32_t, int3 template void offsets_to_indices(int64_t const*, int32_t, int32_t*); template void offsets_to_indices(int64_t const*, int64_t, int64_t*); -template __global__ void offsets_to_indices_kernel(int32_t const*, - int32_t, - int32_t*); -template __global__ void offsets_to_indices_kernel(int64_t const*, - int32_t, - int32_t*); -template __global__ void offsets_to_indices_kernel(int64_t const*, - int64_t, - int64_t*); +template __global__ __attribute__((visibility("hidden"))) void +offsets_to_indices_kernel(int32_t const*, int32_t, int32_t*); +template __global__ __attribute__((visibility("hidden"))) void +offsets_to_indices_kernel(int64_t const*, int32_t, int32_t*); +template __global__ __attribute__((visibility("hidden"))) void +offsets_to_indices_kernel(int64_t const*, int64_t, int64_t*); } // namespace detail } // namespace cugraph diff --git a/cpp/src/utilities/graph_utils.cuh b/cpp/src/utilities/graph_utils.cuh index 2d542956531..0b257e7abde 100644 --- a/cpp/src/utilities/graph_utils.cuh +++ b/cpp/src/utilities/graph_utils.cuh @@ -247,34 +247,36 @@ void update_dangling_nodes(size_t n, T* dangling_nodes, T damping_factor) // google matrix kernels template -__global__ void degree_coo(const IndexType n, - const IndexType e, - const IndexType* ind, - ValueType* degree) +__global__ static void degree_coo(const IndexType n, + const IndexType e, + const IndexType* ind, + ValueType* degree) { for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x) atomicAdd(°ree[ind[i]], (ValueType)1.0); } template -__global__ void flag_leafs_kernel(const size_t n, const IndexType* degree, ValueType* bookmark) +__global__ static void flag_leafs_kernel(const size_t n, + const IndexType* degree, + ValueType* bookmark) { for (auto i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) if (degree[i] == 0) bookmark[i] = 1.0; } template -__global__ void degree_offsets(const IndexType n, - const IndexType e, - const IndexType* ind, - ValueType* degree) +__global__ static void degree_offsets(const IndexType n, + const IndexType e, + const IndexType* ind, + ValueType* degree) { for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) degree[i] += ind[i + 1] - ind[i]; } template -__global__ void type_convert(FromType* array, int n) +__global__ static void type_convert(FromType* array, int n) { for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { ToType val = array[i]; @@ -284,12 +286,12 @@ __global__ void type_convert(FromType* array, int n) } template -__global__ void equi_prob3(const IndexType n, - const IndexType e, - const IndexType* csrPtr, - const IndexType* csrInd, - ValueType* val, - IndexType* degree) +__global__ static void equi_prob3(const IndexType n, + const IndexType e, + const IndexType* csrPtr, + const IndexType* csrInd, + ValueType* val, + IndexType* degree) { int j, row, col; for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) { @@ -303,12 +305,12 @@ __global__ void equi_prob3(const IndexType n, } template -__global__ void equi_prob2(const IndexType n, - const IndexType e, - const IndexType* csrPtr, - const IndexType* csrInd, - ValueType* val, - IndexType* degree) +__global__ static void equi_prob2(const IndexType n, + const IndexType e, + const IndexType* csrPtr, + const IndexType* csrInd, + ValueType* val, + IndexType* degree) { int row = blockIdx.x * blockDim.x + threadIdx.x; if (row < n) { @@ -372,7 +374,8 @@ void HT_matrix_csc_coo(const IndexType n, } template -__global__ void offsets_to_indices_kernel(const offsets_t* offsets, index_t v, index_t* indices) +__attribute__((visibility("hidden"))) __global__ void offsets_to_indices_kernel( + const offsets_t* offsets, index_t v, index_t* indices) { auto tid{threadIdx.x}; auto ctaStart{blockIdx.x}; diff --git a/cpp/src/utilities/path_retrieval.cu b/cpp/src/utilities/path_retrieval.cu index e37ce3a3ced..eda60941c23 100644 --- a/cpp/src/utilities/path_retrieval.cu +++ b/cpp/src/utilities/path_retrieval.cu @@ -29,13 +29,13 @@ namespace cugraph { namespace detail { template -__global__ void get_traversed_cost_kernel(vertex_t const* vertices, - vertex_t const* preds, - vertex_t const* vtx_map, - weight_t const* info_weights, - weight_t* out, - vertex_t stop_vertex, - vertex_t num_vertices) +__global__ static void get_traversed_cost_kernel(vertex_t const* vertices, + vertex_t const* preds, + vertex_t const* vtx_map, + weight_t const* info_weights, + weight_t* out, + vertex_t stop_vertex, + vertex_t num_vertices) { for (vertex_t i = threadIdx.x + blockIdx.x * blockDim.x; i < num_vertices; i += gridDim.x * blockDim.x) { From 129180d5cbaebf3b3c357bfae35b4ffab8c05015 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Wed, 17 Jan 2024 17:22:08 -0500 Subject: [PATCH 2/5] Mark all cugraph etl CUDA kernels with internal linkage --- .../include/hash/helper_functions.cuh | 4 ++-- cpp/libcugraph_etl/src/renumbering.cu | 20 +++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/libcugraph_etl/include/hash/helper_functions.cuh b/cpp/libcugraph_etl/include/hash/helper_functions.cuh index db377f938d2..8a11867f7e2 100644 --- a/cpp/libcugraph_etl/include/hash/helper_functions.cuh +++ b/cpp/libcugraph_etl/include/hash/helper_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. + * Copyright (c) 2017-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -131,7 +131,7 @@ __forceinline__ __device__ void store_pair_vectorized(pair_type* __restrict__ co } template -__global__ void init_hashtbl(value_type* __restrict__ const hashtbl_values, +__global__ static void init_hashtbl(value_type* __restrict__ const hashtbl_values, const size_type n, const key_type key_val, const elem_type elem_val) diff --git a/cpp/libcugraph_etl/src/renumbering.cu b/cpp/libcugraph_etl/src/renumbering.cu index 08759702ab4..1cbeeeeea05 100644 --- a/cpp/libcugraph_etl/src/renumbering.cu +++ b/cpp/libcugraph_etl/src/renumbering.cu @@ -270,7 +270,7 @@ __device__ __inline__ int32_t validate_ht_col_insert(volatile int32_t* ptr_col) return col; } -__global__ void concat_and_create_histogram(int8_t* col_1, +__global__ static void concat_and_create_histogram(int8_t* col_1, int32_t* offset_1, int8_t* col_2, int32_t* offset_2, @@ -349,7 +349,7 @@ __global__ void concat_and_create_histogram(int8_t* col_1, } } -__global__ void concat_and_create_histogram_2(int8_t* col_1, +__global__ static void concat_and_create_histogram_2(int8_t* col_1, int32_t* offset_1, int8_t* col_2, int32_t* offset_2, @@ -452,7 +452,7 @@ __global__ void concat_and_create_histogram_2(int8_t* col_1, } template -__global__ void set_src_vertex_idx(int8_t* col_1, +__global__ static void set_src_vertex_idx(int8_t* col_1, int32_t* offset_1, int8_t* col_2, int32_t* offset_2, @@ -509,7 +509,7 @@ __global__ void set_src_vertex_idx(int8_t* col_1, } template -__global__ void set_dst_vertex_idx(int8_t* col_1, +__global__ static void set_dst_vertex_idx(int8_t* col_1, int32_t* offset_1, int8_t* col_2, int32_t* offset_2, @@ -585,7 +585,7 @@ __global__ void set_dst_vertex_idx(int8_t* col_1, } } -__global__ void create_mapping_histogram(uint32_t* hash_value, +__global__ static void create_mapping_histogram(uint32_t* hash_value, str_hash_value* payload, cudf_map_type hash_map, accum_type count) @@ -595,7 +595,7 @@ __global__ void create_mapping_histogram(uint32_t* hash_value, if (idx < count) { auto it = hash_map.insert(thrust::make_pair(hash_value[idx], payload[idx])); } } -__global__ void assign_histogram_idx(cudf_map_type cuda_map_obj, +__global__ static void assign_histogram_idx(cudf_map_type cuda_map_obj, size_t slot_count, str_hash_value* key, uint32_t* value, @@ -621,7 +621,7 @@ __global__ void assign_histogram_idx(cudf_map_type cuda_map_obj, } } -__global__ void set_vertex_indices(str_hash_value* ht_value_payload, accum_type count) +__global__ static void set_vertex_indices(str_hash_value* ht_value_payload, accum_type count) { accum_type tid = threadIdx.x + blockIdx.x * blockDim.x; // change count_ to renumber_idx @@ -630,7 +630,7 @@ __global__ void set_vertex_indices(str_hash_value* ht_value_payload, accum_type } } -__global__ void set_output_col_offsets(str_hash_value* row_col_pair, +__global__ static void set_output_col_offsets(str_hash_value* row_col_pair, int32_t* out_col1_offset, int32_t* out_col2_offset, int dst_pair_match, @@ -653,7 +653,7 @@ __global__ void set_output_col_offsets(str_hash_value* row_col_pair, } } -__global__ void offset_buffer_size_comp(int32_t* out_col1_length, +__global__ static void offset_buffer_size_comp(int32_t* out_col1_length, int32_t* out_col2_length, int32_t* out_col1_offsets, int32_t* out_col2_offsets, @@ -673,7 +673,7 @@ __global__ void offset_buffer_size_comp(int32_t* out_col1_length, } } -__global__ void select_unrenumber_string(str_hash_value* idx_to_col_row, +__global__ static void select_unrenumber_string(str_hash_value* idx_to_col_row, int32_t total_elements, int8_t* src_col1, int8_t* src_col2, From 8770d421d9ac079f8d043966db4518fce12e4ade Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Thu, 18 Jan 2024 16:49:33 -0500 Subject: [PATCH 3/5] Add CI checks for external CUDA kernels --- .github/workflows/pr.yaml | 2 +- .github/workflows/test.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index f178a99cc1b..d35c3840fd3 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -60,7 +60,7 @@ jobs: with: build_type: pull-request enable_check_symbols: true - symbol_exclusions: (hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel) + symbol_exclusions: (cugraph::ops|hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel) conda-python-build: needs: conda-cpp-build secrets: inherit diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 205c60de007..9feb8f5e7ce 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -23,7 +23,7 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} enable_check_symbols: true - symbol_exclusions: (hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel) + symbol_exclusions: (cugraph::ops|hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel) conda-cpp-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 From 28e6e8b6a2a134b81d31b39c11012e0fb17bf121 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Mon, 11 Mar 2024 08:54:38 -0400 Subject: [PATCH 4/5] Mark all new cugraph CUDA kernels with internal linkage --- cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index 9cb3365116e..5240c49cb80 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -328,7 +328,7 @@ struct return_value_compute_offset_t { }; template -__global__ void compute_valid_local_nbr_inclusive_sums_mid_local_degree( +__global__ static void compute_valid_local_nbr_inclusive_sums_mid_local_degree( edge_partition_device_view_t edge_partition, edge_partition_edge_property_device_view_t edge_partition_e_mask, raft::device_span edge_partition_frontier_majors, @@ -382,7 +382,7 @@ __global__ void compute_valid_local_nbr_inclusive_sums_mid_local_degree( } template -__global__ void compute_valid_local_nbr_inclusive_sums_high_local_degree( +__global__ static void compute_valid_local_nbr_inclusive_sums_high_local_degree( edge_partition_device_view_t edge_partition, edge_partition_edge_property_device_view_t edge_partition_e_mask, raft::device_span edge_partition_frontier_majors, From 46fca71402ee5d4daf5fb923a30840f329660bab Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Mon, 11 Mar 2024 08:56:42 -0400 Subject: [PATCH 5/5] Update symbol vis ci hooks to 24.04 --- .github/workflows/pr.yaml | 2 +- .github/workflows/test.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index d35c3840fd3..7c8c9973462 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -56,7 +56,7 @@ jobs: conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04 with: build_type: pull-request enable_check_symbols: true diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 9feb8f5e7ce..0bd095bfa94 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }}