diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 7f0b95e3573..7c8c9973462 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -15,6 +15,7 @@ jobs: - checks - conda-cpp-build - conda-cpp-tests + - conda-cpp-checks - conda-notebook-tests - conda-python-build - conda-python-tests @@ -52,6 +53,14 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 with: build_type: pull-request + conda-cpp-checks: + needs: conda-cpp-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04 + with: + build_type: pull-request + enable_check_symbols: true + symbol_exclusions: (cugraph::ops|hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel) conda-python-build: needs: conda-cpp-build secrets: inherit diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 32fb2d62b29..0bd095bfa94 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -14,6 +14,16 @@ on: type: string jobs: + conda-cpp-checks: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + enable_check_symbols: true + symbol_exclusions: (cugraph::ops|hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel) conda-cpp-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 diff --git a/cpp/include/cugraph/detail/decompress_edge_partition.cuh b/cpp/include/cugraph/detail/decompress_edge_partition.cuh index dad5ce77e45..6b974a326dd 100644 --- a/cpp/include/cugraph/detail/decompress_edge_partition.cuh +++ b/cpp/include/cugraph/detail/decompress_edge_partition.cuh @@ -44,7 +44,7 @@ namespace detail { int32_t constexpr decompress_edge_partition_block_size = 1024; template <typename vertex_t, typename edge_t, bool multi_gpu> -__global__ void decompress_to_edgelist_mid_degree( +__global__ static void decompress_to_edgelist_mid_degree( edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition, vertex_t major_range_first, vertex_t major_range_last, @@ -74,7 +74,7 @@ __global__ void decompress_to_edgelist_mid_degree( } template <typename vertex_t, typename edge_t, bool multi_gpu> -__global__ void decompress_to_edgelist_high_degree( +__global__ static void decompress_to_edgelist_high_degree( edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition, vertex_t major_range_first, vertex_t major_range_last, diff --git a/cpp/libcugraph_etl/include/hash/helper_functions.cuh b/cpp/libcugraph_etl/include/hash/helper_functions.cuh index db377f938d2..8a11867f7e2 100644 --- a/cpp/libcugraph_etl/include/hash/helper_functions.cuh +++ b/cpp/libcugraph_etl/include/hash/helper_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. + * Copyright (c) 2017-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -131,7 +131,7 @@ __forceinline__ __device__ void store_pair_vectorized(pair_type* __restrict__ co } template <typename value_type, typename size_type, typename key_type, typename elem_type> -__global__ void init_hashtbl(value_type* __restrict__ const hashtbl_values, +__global__ static void init_hashtbl(value_type* __restrict__ const hashtbl_values, const size_type n, const key_type key_val, const elem_type elem_val) diff --git a/cpp/libcugraph_etl/src/renumbering.cu b/cpp/libcugraph_etl/src/renumbering.cu index 08759702ab4..1cbeeeeea05 100644 --- a/cpp/libcugraph_etl/src/renumbering.cu +++ b/cpp/libcugraph_etl/src/renumbering.cu @@ -270,7 +270,7 @@ __device__ __inline__ int32_t validate_ht_col_insert(volatile int32_t* ptr_col) return col; } -__global__ void concat_and_create_histogram(int8_t* col_1, +__global__ static void concat_and_create_histogram(int8_t* col_1, int32_t* offset_1, int8_t* col_2, int32_t* offset_2, @@ -349,7 +349,7 @@ __global__ void concat_and_create_histogram(int8_t* col_1, } } -__global__ void concat_and_create_histogram_2(int8_t* col_1, +__global__ static void concat_and_create_histogram_2(int8_t* col_1, int32_t* offset_1, int8_t* col_2, int32_t* offset_2, @@ -452,7 +452,7 @@ __global__ void concat_and_create_histogram_2(int8_t* col_1, } template <typename T> -__global__ void set_src_vertex_idx(int8_t* col_1, +__global__ static void set_src_vertex_idx(int8_t* col_1, int32_t* offset_1, int8_t* col_2, int32_t* offset_2, @@ -509,7 +509,7 @@ __global__ void set_src_vertex_idx(int8_t* col_1, } template <typename T> -__global__ void set_dst_vertex_idx(int8_t* col_1, +__global__ static void set_dst_vertex_idx(int8_t* col_1, int32_t* offset_1, int8_t* col_2, int32_t* offset_2, @@ -585,7 +585,7 @@ __global__ void set_dst_vertex_idx(int8_t* col_1, } } -__global__ void create_mapping_histogram(uint32_t* hash_value, +__global__ static void create_mapping_histogram(uint32_t* hash_value, str_hash_value* payload, cudf_map_type hash_map, accum_type count) @@ -595,7 +595,7 @@ __global__ void create_mapping_histogram(uint32_t* hash_value, if (idx < count) { auto it = hash_map.insert(thrust::make_pair(hash_value[idx], payload[idx])); } } -__global__ void assign_histogram_idx(cudf_map_type cuda_map_obj, +__global__ static void assign_histogram_idx(cudf_map_type cuda_map_obj, size_t slot_count, str_hash_value* key, uint32_t* value, @@ -621,7 +621,7 @@ __global__ void assign_histogram_idx(cudf_map_type cuda_map_obj, } } -__global__ void set_vertex_indices(str_hash_value* ht_value_payload, accum_type count) +__global__ static void set_vertex_indices(str_hash_value* ht_value_payload, accum_type count) { accum_type tid = threadIdx.x + blockIdx.x * blockDim.x; // change count_ to renumber_idx @@ -630,7 +630,7 @@ __global__ void set_vertex_indices(str_hash_value* ht_value_payload, accum_type } } -__global__ void set_output_col_offsets(str_hash_value* row_col_pair, +__global__ static void set_output_col_offsets(str_hash_value* row_col_pair, int32_t* out_col1_offset, int32_t* out_col2_offset, int dst_pair_match, @@ -653,7 +653,7 @@ __global__ void set_output_col_offsets(str_hash_value* row_col_pair, } } -__global__ void offset_buffer_size_comp(int32_t* out_col1_length, +__global__ static void offset_buffer_size_comp(int32_t* out_col1_length, int32_t* out_col2_length, int32_t* out_col1_offsets, int32_t* out_col2_offsets, @@ -673,7 +673,7 @@ __global__ void offset_buffer_size_comp(int32_t* out_col1_length, } } -__global__ void select_unrenumber_string(str_hash_value* idx_to_col_row, +__global__ static void select_unrenumber_string(str_hash_value* idx_to_col_row, int32_t total_elements, int8_t* src_col1, int8_t* src_col2, diff --git a/cpp/src/community/legacy/ecg.cu b/cpp/src/community/legacy/ecg.cu index d93a4446faa..b2ad79204ed 100644 --- a/cpp/src/community/legacy/ecg.cu +++ b/cpp/src/community/legacy/ecg.cu @@ -52,7 +52,7 @@ binsearch_maxle(const IndexType* vec, const IndexType val, IndexType low, IndexT // FIXME: This shouldn't need to be a custom kernel, this // seems like it should just be a thrust::transform template <typename IdxT, typename ValT> -__global__ void match_check_kernel( +__global__ static void match_check_kernel( IdxT size, IdxT num_verts, IdxT* offsets, IdxT* indices, IdxT* parts, ValT* weights) { IdxT tid = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/cpp/src/components/legacy/weak_cc.cuh b/cpp/src/components/legacy/weak_cc.cuh index ad9aa773590..f4254e2d55d 100644 --- a/cpp/src/components/legacy/weak_cc.cuh +++ b/cpp/src/components/legacy/weak_cc.cuh @@ -59,15 +59,15 @@ class WeakCCState { }; template <typename vertex_t, typename edge_t, int TPB_X = 32> -__global__ void weak_cc_label_device(vertex_t* labels, - edge_t const* offsets, - vertex_t const* indices, - edge_t nnz, - bool* fa, - bool* xa, - bool* m, - vertex_t startVertexId, - vertex_t batchSize) +__global__ static void weak_cc_label_device(vertex_t* labels, + edge_t const* offsets, + vertex_t const* indices, + edge_t nnz, + bool* fa, + bool* xa, + bool* m, + vertex_t startVertexId, + vertex_t batchSize) { vertex_t tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < batchSize) { @@ -118,11 +118,11 @@ __global__ void weak_cc_label_device(vertex_t* labels, } template <typename vertex_t, int TPB_X = 32, typename Lambda> -__global__ void weak_cc_init_label_kernel(vertex_t* labels, - vertex_t startVertexId, - vertex_t batchSize, - vertex_t MAX_LABEL, - Lambda filter_op) +__global__ static void weak_cc_init_label_kernel(vertex_t* labels, + vertex_t startVertexId, + vertex_t batchSize, + vertex_t MAX_LABEL, + Lambda filter_op) { /** F1 and F2 in the paper correspond to fa and xa */ /** Cd in paper corresponds to db_cluster */ @@ -134,7 +134,7 @@ __global__ void weak_cc_init_label_kernel(vertex_t* labels, } template <typename vertex_t, int TPB_X = 32> -__global__ void weak_cc_init_all_kernel( +__global__ static void weak_cc_init_all_kernel( vertex_t* labels, bool* fa, bool* xa, vertex_t N, vertex_t MAX_LABEL) { vertex_t tid = threadIdx.x + blockIdx.x * TPB_X; diff --git a/cpp/src/layout/legacy/bh_kernels.cuh b/cpp/src/layout/legacy/bh_kernels.cuh index 5b101363314..f6e163ab306 100644 --- a/cpp/src/layout/legacy/bh_kernels.cuh +++ b/cpp/src/layout/legacy/bh_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,9 +42,9 @@ namespace detail { /** * Intializes the states of objects. This speeds the overall kernel up. */ -__global__ void InitializationKernel(unsigned* restrict limiter, - int* restrict maxdepthd, - float* restrict radiusd) +__global__ static void InitializationKernel(unsigned* restrict limiter, + int* restrict maxdepthd, + float* restrict radiusd) { maxdepthd[0] = 1; limiter[0] = 0; @@ -54,10 +54,10 @@ __global__ void InitializationKernel(unsigned* restrict limiter, /** * Reset root. */ -__global__ void ResetKernel(float* restrict radiusd_squared, - int* restrict bottomd, - const int NNODES, - const float* restrict radiusd) +__global__ static void ResetKernel(float* restrict radiusd_squared, + int* restrict bottomd, + const int NNODES, + const float* restrict radiusd) { radiusd_squared[0] = radiusd[0] * radiusd[0]; // create root node @@ -67,20 +67,21 @@ __global__ void ResetKernel(float* restrict radiusd_squared, /** * Figures the bounding boxes for every point in the embedding. */ -__global__ __launch_bounds__(THREADS1, FACTOR1) void BoundingBoxKernel(int* restrict startd, - int* restrict childd, - int* restrict massd, - float* restrict posxd, - float* restrict posyd, - float* restrict maxxd, - float* restrict maxyd, - float* restrict minxd, - float* restrict minyd, - const int FOUR_NNODES, - const int NNODES, - const int N, - unsigned* restrict limiter, - float* restrict radiusd) +__global__ static __launch_bounds__(THREADS1, + FACTOR1) void BoundingBoxKernel(int* restrict startd, + int* restrict childd, + int* restrict massd, + float* restrict posxd, + float* restrict posyd, + float* restrict maxxd, + float* restrict maxyd, + float* restrict minxd, + float* restrict minyd, + const int FOUR_NNODES, + const int NNODES, + const int N, + unsigned* restrict limiter, + float* restrict radiusd) { float val, minx, maxx, miny, maxy; __shared__ float sminx[THREADS1], smaxx[THREADS1], sminy[THREADS1], smaxy[THREADS1]; @@ -158,9 +159,9 @@ __global__ __launch_bounds__(THREADS1, FACTOR1) void BoundingBoxKernel(int* rest /** * Clear some of the state vectors up. */ -__global__ __launch_bounds__(1024, 1) void ClearKernel1(int* restrict childd, - const int FOUR_NNODES, - const int FOUR_N) +__global__ static __launch_bounds__(1024, 1) void ClearKernel1(int* restrict childd, + const int FOUR_NNODES, + const int FOUR_N) { const int inc = blockDim.x * gridDim.x; int k = (FOUR_N & -32) + threadIdx.x + blockIdx.x * blockDim.x; @@ -175,15 +176,15 @@ __global__ __launch_bounds__(1024, 1) void ClearKernel1(int* restrict childd, /** * Build the actual KD Tree. */ -__global__ __launch_bounds__(THREADS2, - FACTOR2) void TreeBuildingKernel(int* restrict childd, - const float* restrict posxd, - const float* restrict posyd, - const int NNODES, - const int N, - int* restrict maxdepthd, - int* restrict bottomd, - const float* restrict radiusd) +__global__ static __launch_bounds__(THREADS2, + FACTOR2) void TreeBuildingKernel(int* restrict childd, + const float* restrict posxd, + const float* restrict posyd, + const int NNODES, + const int N, + int* restrict maxdepthd, + int* restrict bottomd, + const float* restrict radiusd) { int j, depth; float x, y, r; @@ -296,10 +297,10 @@ __global__ __launch_bounds__(THREADS2, /** * Clean more state vectors. */ -__global__ __launch_bounds__(1024, 1) void ClearKernel2(int* restrict startd, - int* restrict massd, - const int NNODES, - const int* restrict bottomd) +__global__ static __launch_bounds__(1024, 1) void ClearKernel2(int* restrict startd, + int* restrict massd, + const int NNODES, + const int* restrict bottomd) { const int bottom = bottomd[0]; const int inc = blockDim.x * gridDim.x; @@ -317,15 +318,15 @@ __global__ __launch_bounds__(1024, 1) void ClearKernel2(int* restrict startd, /** * Summarize the KD Tree via cell gathering */ -__global__ __launch_bounds__(THREADS3, - FACTOR3) void SummarizationKernel(int* restrict countd, - const int* restrict childd, - volatile int* restrict massd, - float* restrict posxd, - float* restrict posyd, - const int NNODES, - const int N, - const int* restrict bottomd) +__global__ static __launch_bounds__(THREADS3, + FACTOR3) void SummarizationKernel(int* restrict countd, + const int* restrict childd, + volatile int* restrict massd, + float* restrict posxd, + float* restrict posyd, + const int NNODES, + const int N, + const int* restrict bottomd) { bool flag = 0; float cm, px, py; @@ -453,13 +454,14 @@ __global__ __launch_bounds__(THREADS3, /** * Sort the cells */ -__global__ __launch_bounds__(THREADS4, FACTOR4) void SortKernel(int* restrict sortd, - const int* restrict countd, - volatile int* restrict startd, - int* restrict childd, - const int NNODES, - const int N, - const int* restrict bottomd) +__global__ static __launch_bounds__(THREADS4, + FACTOR4) void SortKernel(int* restrict sortd, + const int* restrict countd, + volatile int* restrict startd, + int* restrict childd, + const int NNODES, + const int N, + const int* restrict bottomd) { const int bottom = bottomd[0]; const int dec = blockDim.x * gridDim.x; @@ -502,7 +504,7 @@ __global__ __launch_bounds__(THREADS4, FACTOR4) void SortKernel(int* restrict so /** * Calculate the repulsive forces using the KD Tree */ -__global__ __launch_bounds__( +__global__ static __launch_bounds__( THREADS5, FACTOR5) void RepulsionKernel(/* int *restrict errd, */ const float scaling_ratio, const float theta, @@ -612,18 +614,18 @@ __global__ __launch_bounds__( } } -__global__ __launch_bounds__(THREADS6, - FACTOR6) void apply_forces_bh(float* restrict Y_x, - float* restrict Y_y, - const float* restrict attract_x, - const float* restrict attract_y, - const float* restrict repel_x, - const float* restrict repel_y, - float* restrict old_dx, - float* restrict old_dy, - const float* restrict swinging, - const float speed, - const int n) +__global__ static __launch_bounds__(THREADS6, + FACTOR6) void apply_forces_bh(float* restrict Y_x, + float* restrict Y_y, + const float* restrict attract_x, + const float* restrict attract_y, + const float* restrict repel_x, + const float* restrict repel_y, + float* restrict old_dx, + float* restrict old_dy, + const float* restrict swinging, + const float speed, + const int n) { // For evrery vertex for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { diff --git a/cpp/src/layout/legacy/exact_repulsion.cuh b/cpp/src/layout/legacy/exact_repulsion.cuh index fe895bae6a0..8530202afd5 100644 --- a/cpp/src/layout/legacy/exact_repulsion.cuh +++ b/cpp/src/layout/legacy/exact_repulsion.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,13 +22,13 @@ namespace cugraph { namespace detail { template <typename vertex_t> -__global__ void repulsion_kernel(const float* restrict x_pos, - const float* restrict y_pos, - float* restrict repel_x, - float* restrict repel_y, - const int* restrict mass, - const float scaling_ratio, - const vertex_t n) +__global__ static void repulsion_kernel(const float* restrict x_pos, + const float* restrict y_pos, + float* restrict repel_x, + float* restrict repel_y, + const int* restrict mass, + const float scaling_ratio, + const vertex_t n) { int j = (blockIdx.x * blockDim.x) + threadIdx.x; // for every item in row int i = (blockIdx.y * blockDim.y) + threadIdx.y; // for every row diff --git a/cpp/src/layout/legacy/fa2_kernels.cuh b/cpp/src/layout/legacy/fa2_kernels.cuh index 4f1ce520387..33e7841a380 100644 --- a/cpp/src/layout/legacy/fa2_kernels.cuh +++ b/cpp/src/layout/legacy/fa2_kernels.cuh @@ -23,19 +23,19 @@ namespace cugraph { namespace detail { template <typename vertex_t, typename edge_t, typename weight_t> -__global__ void attraction_kernel(const vertex_t* restrict row, - const vertex_t* restrict col, - const weight_t* restrict v, - const edge_t e, - const float* restrict x_pos, - const float* restrict y_pos, - float* restrict attract_x, - float* restrict attract_y, - const int* restrict mass, - bool outbound_attraction_distribution, - bool lin_log_mode, - const float edge_weight_influence, - const float coef) +__global__ static void attraction_kernel(const vertex_t* restrict row, + const vertex_t* restrict col, + const weight_t* restrict v, + const edge_t e, + const float* restrict x_pos, + const float* restrict y_pos, + float* restrict attract_x, + float* restrict attract_y, + const int* restrict mass, + bool outbound_attraction_distribution, + bool lin_log_mode, + const float edge_weight_influence, + const float coef) { vertex_t i, src, dst; weight_t weight = 1; @@ -116,13 +116,13 @@ void apply_attraction(const vertex_t* restrict row, } template <typename vertex_t> -__global__ void linear_gravity_kernel(const float* restrict x_pos, - const float* restrict y_pos, - float* restrict attract_x, - float* restrict attract_y, - const int* restrict mass, - const float gravity, - const vertex_t n) +__global__ static void linear_gravity_kernel(const float* restrict x_pos, + const float* restrict y_pos, + float* restrict attract_x, + float* restrict attract_y, + const int* restrict mass, + const float gravity, + const vertex_t n) { // For every node. for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { @@ -136,14 +136,14 @@ __global__ void linear_gravity_kernel(const float* restrict x_pos, } template <typename vertex_t> -__global__ void strong_gravity_kernel(const float* restrict x_pos, - const float* restrict y_pos, - float* restrict attract_x, - float* restrict attract_y, - const int* restrict mass, - const float gravity, - const float scaling_ratio, - const vertex_t n) +__global__ static void strong_gravity_kernel(const float* restrict x_pos, + const float* restrict y_pos, + float* restrict attract_x, + float* restrict attract_y, + const int* restrict mass, + const float gravity, + const float scaling_ratio, + const vertex_t n) { // For every node. for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { @@ -187,16 +187,16 @@ void apply_gravity(const float* restrict x_pos, } template <typename vertex_t> -__global__ void local_speed_kernel(const float* restrict repel_x, - const float* restrict repel_y, - const float* restrict attract_x, - const float* restrict attract_y, - const float* restrict old_dx, - const float* restrict old_dy, - const int* restrict mass, - float* restrict swinging, - float* restrict traction, - const vertex_t n) +__global__ static void local_speed_kernel(const float* restrict repel_x, + const float* restrict repel_y, + const float* restrict attract_x, + const float* restrict attract_y, + const float* restrict old_dx, + const float* restrict old_dy, + const int* restrict mass, + float* restrict swinging, + float* restrict traction, + const vertex_t n) { // For every node. for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { @@ -272,17 +272,17 @@ void adapt_speed(const float jitter_tolerance, } template <typename vertex_t> -__global__ void update_positions_kernel(float* restrict x_pos, - float* restrict y_pos, - const float* restrict repel_x, - const float* restrict repel_y, - const float* restrict attract_x, - const float* restrict attract_y, - float* restrict old_dx, - float* restrict old_dy, - const float* restrict swinging, - const float speed, - const vertex_t n) +__global__ static void update_positions_kernel(float* restrict x_pos, + float* restrict y_pos, + const float* restrict repel_x, + const float* restrict repel_y, + const float* restrict attract_x, + const float* restrict attract_y, + float* restrict old_dx, + float* restrict old_dy, + const float* restrict swinging, + const float speed, + const vertex_t n) { // For every node. for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index fc3da3cac07..0b6447f50d9 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -127,7 +127,7 @@ template <bool hypersparse, typename BufferKeyOutputIterator, typename BufferValueOutputIterator, typename EdgeOp> -__global__ void extract_transform_v_frontier_e_hypersparse_or_low_degree( +__global__ static void extract_transform_v_frontier_e_hypersparse_or_low_degree( edge_partition_device_view_t<typename GraphViewType::vertex_type, typename GraphViewType::edge_type, GraphViewType::is_multi_gpu> edge_partition, @@ -295,7 +295,7 @@ template <typename GraphViewType, typename BufferKeyOutputIterator, typename BufferValueOutputIterator, typename EdgeOp> -__global__ void extract_transform_v_frontier_e_mid_degree( +__global__ static void extract_transform_v_frontier_e_mid_degree( edge_partition_device_view_t<typename GraphViewType::vertex_type, typename GraphViewType::edge_type, GraphViewType::is_multi_gpu> edge_partition, @@ -396,7 +396,7 @@ template <typename GraphViewType, typename BufferKeyOutputIterator, typename BufferValueOutputIterator, typename EdgeOp> -__global__ void extract_transform_v_frontier_e_high_degree( +__global__ static void extract_transform_v_frontier_e_high_degree( edge_partition_device_view_t<typename GraphViewType::vertex_type, typename GraphViewType::edge_type, GraphViewType::is_multi_gpu> edge_partition, diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index 9cb3365116e..5240c49cb80 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -328,7 +328,7 @@ struct return_value_compute_offset_t { }; template <typename vertex_t, typename edge_t, bool multi_gpu> -__global__ void compute_valid_local_nbr_inclusive_sums_mid_local_degree( +__global__ static void compute_valid_local_nbr_inclusive_sums_mid_local_degree( edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition, edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool> edge_partition_e_mask, raft::device_span<vertex_t const> edge_partition_frontier_majors, @@ -382,7 +382,7 @@ __global__ void compute_valid_local_nbr_inclusive_sums_mid_local_degree( } template <typename vertex_t, typename edge_t, bool multi_gpu> -__global__ void compute_valid_local_nbr_inclusive_sums_high_local_degree( +__global__ static void compute_valid_local_nbr_inclusive_sums_high_local_degree( edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition, edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool> edge_partition_e_mask, raft::device_span<vertex_t const> edge_partition_frontier_majors, diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index 083487fa5b4..509ab56d3fe 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -149,7 +149,7 @@ template <bool update_major, typename EdgeOp, typename ReduceOp, typename T> -__global__ void per_v_transform_reduce_e_hypersparse( +__global__ static void per_v_transform_reduce_e_hypersparse( edge_partition_device_view_t<typename GraphViewType::vertex_type, typename GraphViewType::edge_type, GraphViewType::is_multi_gpu> edge_partition, @@ -251,7 +251,7 @@ template <bool update_major, typename EdgeOp, typename ReduceOp, typename T> -__global__ void per_v_transform_reduce_e_low_degree( +__global__ static void per_v_transform_reduce_e_low_degree( edge_partition_device_view_t<typename GraphViewType::vertex_type, typename GraphViewType::edge_type, GraphViewType::is_multi_gpu> edge_partition, @@ -350,7 +350,7 @@ template <bool update_major, typename EdgeOp, typename ReduceOp, typename T> -__global__ void per_v_transform_reduce_e_mid_degree( +__global__ static void per_v_transform_reduce_e_mid_degree( edge_partition_device_view_t<typename GraphViewType::vertex_type, typename GraphViewType::edge_type, GraphViewType::is_multi_gpu> edge_partition, @@ -466,7 +466,7 @@ template <bool update_major, typename EdgeOp, typename ReduceOp, typename T> -__global__ void per_v_transform_reduce_e_high_degree( +__global__ static void per_v_transform_reduce_e_high_degree( edge_partition_device_view_t<typename GraphViewType::vertex_type, typename GraphViewType::edge_type, GraphViewType::is_multi_gpu> edge_partition, diff --git a/cpp/src/prims/transform_e.cuh b/cpp/src/prims/transform_e.cuh index 2cb1a5358b0..9c7670f68d2 100644 --- a/cpp/src/prims/transform_e.cuh +++ b/cpp/src/prims/transform_e.cuh @@ -51,7 +51,7 @@ template <bool check_edge_mask, typename EdgePartitionEdgeMaskWrapper, typename EdgePartitionEdgeValueOutputWrapper, typename EdgeOp> -__global__ void transform_e_packed_bool( +__global__ static void transform_e_packed_bool( edge_partition_device_view_t<typename GraphViewType::vertex_type, typename GraphViewType::edge_type, GraphViewType::is_multi_gpu> edge_partition, diff --git a/cpp/src/prims/transform_reduce_e.cuh b/cpp/src/prims/transform_reduce_e.cuh index e5855b105ee..43722550c58 100644 --- a/cpp/src/prims/transform_reduce_e.cuh +++ b/cpp/src/prims/transform_reduce_e.cuh @@ -61,7 +61,7 @@ template <typename GraphViewType, typename EdgePartitionEdgeMaskWrapper, typename ResultIterator, typename EdgeOp> -__global__ void transform_reduce_e_hypersparse( +__global__ static void transform_reduce_e_hypersparse( edge_partition_device_view_t<typename GraphViewType::vertex_type, typename GraphViewType::edge_type, GraphViewType::is_multi_gpu> edge_partition, @@ -153,7 +153,7 @@ template <typename GraphViewType, typename EdgePartitionEdgeMaskWrapper, typename ResultIterator, typename EdgeOp> -__global__ void transform_reduce_e_low_degree( +__global__ static void transform_reduce_e_low_degree( edge_partition_device_view_t<typename GraphViewType::vertex_type, typename GraphViewType::edge_type, GraphViewType::is_multi_gpu> edge_partition, @@ -242,7 +242,7 @@ template <typename GraphViewType, typename EdgePartitionEdgeMaskWrapper, typename ResultIterator, typename EdgeOp> -__global__ void transform_reduce_e_mid_degree( +__global__ static void transform_reduce_e_mid_degree( edge_partition_device_view_t<typename GraphViewType::vertex_type, typename GraphViewType::edge_type, GraphViewType::is_multi_gpu> edge_partition, @@ -320,7 +320,7 @@ template <typename GraphViewType, typename EdgePartitionEdgeMaskWrapper, typename ResultIterator, typename EdgeOp> -__global__ void transform_reduce_e_high_degree( +__global__ static void transform_reduce_e_high_degree( edge_partition_device_view_t<typename GraphViewType::vertex_type, typename GraphViewType::edge_type, GraphViewType::is_multi_gpu> edge_partition, diff --git a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh index 42203085077..eee0ed03d1c 100644 --- a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh +++ b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh @@ -97,7 +97,7 @@ template <bool edge_partition_src_key, typename EdgePartitionSrcDstKeyInputWrapper, typename EdgeOp, typename ValueIterator> -__global__ void transform_reduce_by_src_dst_key_hypersparse( +__global__ static void transform_reduce_by_src_dst_key_hypersparse( edge_partition_device_view_t<typename GraphViewType::vertex_type, typename GraphViewType::edge_type, GraphViewType::is_multi_gpu> edge_partition, @@ -156,7 +156,7 @@ template <bool edge_partition_src_key, typename EdgePartitionSrcDstKeyInputWrapper, typename EdgeOp, typename ValueIterator> -__global__ void transform_reduce_by_src_dst_key_low_degree( +__global__ static void transform_reduce_by_src_dst_key_low_degree( edge_partition_device_view_t<typename GraphViewType::vertex_type, typename GraphViewType::edge_type, GraphViewType::is_multi_gpu> edge_partition, @@ -214,7 +214,7 @@ template <bool edge_partition_src_key, typename EdgePartitionSrcDstKeyInputWrapper, typename EdgeOp, typename ValueIterator> -__global__ void transform_reduce_by_src_dst_key_mid_degree( +__global__ static void transform_reduce_by_src_dst_key_mid_degree( edge_partition_device_view_t<typename GraphViewType::vertex_type, typename GraphViewType::edge_type, GraphViewType::is_multi_gpu> edge_partition, @@ -274,7 +274,7 @@ template <bool edge_partition_src_key, typename EdgePartitionSrcDstKeyInputWrapper, typename EdgeOp, typename ValueIterator> -__global__ void transform_reduce_by_src_dst_key_high_degree( +__global__ static void transform_reduce_by_src_dst_key_high_degree( edge_partition_device_view_t<typename GraphViewType::vertex_type, typename GraphViewType::edge_type, GraphViewType::is_multi_gpu> edge_partition, diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index 4ee5ad5ca02..29dca6ef409 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -241,7 +241,7 @@ rmm::device_uvector<edge_t> compute_minor_degrees( int32_t constexpr count_edge_partition_multi_edges_block_size = 1024; template <typename vertex_t, typename edge_t, bool multi_gpu> -__global__ void for_all_major_for_all_nbr_mid_degree( +__global__ static void for_all_major_for_all_nbr_mid_degree( edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition, vertex_t major_range_first, vertex_t major_range_last, @@ -275,7 +275,7 @@ __global__ void for_all_major_for_all_nbr_mid_degree( } template <typename vertex_t, typename edge_t, bool multi_gpu> -__global__ void for_all_major_for_all_nbr_high_degree( +__global__ static void for_all_major_for_all_nbr_high_degree( edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> edge_partition, vertex_t major_range_first, vertex_t major_range_last, diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index c2a3f1160ca..612eb0c48f2 100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -215,7 +215,7 @@ template <int32_t max_num_partitions, typename key_t, typename PartitionOp, typename KeyOp> -__global__ void multi_partition_copy( +__global__ static void multi_partition_copy( InputIterator input_first, InputIterator input_last, raft::device_span<key_t*> output_buffer_ptrs, diff --git a/cpp/src/utilities/eidecl_graph_utils.hpp b/cpp/src/utilities/eidecl_graph_utils.hpp index 84240ba2845..abf026cbbfe 100644 --- a/cpp/src/utilities/eidecl_graph_utils.hpp +++ b/cpp/src/utilities/eidecl_graph_utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,9 +29,12 @@ extern template void offsets_to_indices<int, int>(int const*, int, int*); extern template void offsets_to_indices<long, int>(long const*, int, int*); extern template void offsets_to_indices<long, long>(long const*, long, long*); -extern template __global__ void offsets_to_indices_kernel<int, int>(int const*, int, int*); -extern template __global__ void offsets_to_indices_kernel<long, int>(long const*, int, int*); -extern template __global__ void offsets_to_indices_kernel<long, long>(long const*, long, long*); +extern template __attribute__((visibility("hidden"))) __global__ void +offsets_to_indices_kernel<int, int>(int const*, int, int*); +extern template __attribute__((visibility("hidden"))) __global__ void +offsets_to_indices_kernel<long, int>(long const*, int, int*); +extern template __attribute__((visibility("hidden"))) __global__ void +offsets_to_indices_kernel<long, long>(long const*, long, long*); } // namespace detail } // namespace cugraph diff --git a/cpp/src/utilities/eidir_graph_utils.hpp b/cpp/src/utilities/eidir_graph_utils.hpp index 033bb197ce8..ba06c6f56ea 100644 --- a/cpp/src/utilities/eidir_graph_utils.hpp +++ b/cpp/src/utilities/eidir_graph_utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,15 +29,12 @@ template void offsets_to_indices<int32_t, int32_t>(int32_t const*, int32_t, int3 template void offsets_to_indices<int64_t, int32_t>(int64_t const*, int32_t, int32_t*); template void offsets_to_indices<int64_t, int64_t>(int64_t const*, int64_t, int64_t*); -template __global__ void offsets_to_indices_kernel<int32_t, int32_t>(int32_t const*, - int32_t, - int32_t*); -template __global__ void offsets_to_indices_kernel<int64_t, int32_t>(int64_t const*, - int32_t, - int32_t*); -template __global__ void offsets_to_indices_kernel<int64_t, int64_t>(int64_t const*, - int64_t, - int64_t*); +template __global__ __attribute__((visibility("hidden"))) void +offsets_to_indices_kernel<int32_t, int32_t>(int32_t const*, int32_t, int32_t*); +template __global__ __attribute__((visibility("hidden"))) void +offsets_to_indices_kernel<int64_t, int32_t>(int64_t const*, int32_t, int32_t*); +template __global__ __attribute__((visibility("hidden"))) void +offsets_to_indices_kernel<int64_t, int64_t>(int64_t const*, int64_t, int64_t*); } // namespace detail } // namespace cugraph diff --git a/cpp/src/utilities/graph_utils.cuh b/cpp/src/utilities/graph_utils.cuh index 2d542956531..0b257e7abde 100644 --- a/cpp/src/utilities/graph_utils.cuh +++ b/cpp/src/utilities/graph_utils.cuh @@ -247,34 +247,36 @@ void update_dangling_nodes(size_t n, T* dangling_nodes, T damping_factor) // google matrix kernels template <typename IndexType, typename ValueType> -__global__ void degree_coo(const IndexType n, - const IndexType e, - const IndexType* ind, - ValueType* degree) +__global__ static void degree_coo(const IndexType n, + const IndexType e, + const IndexType* ind, + ValueType* degree) { for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x) atomicAdd(°ree[ind[i]], (ValueType)1.0); } template <typename IndexType, typename ValueType> -__global__ void flag_leafs_kernel(const size_t n, const IndexType* degree, ValueType* bookmark) +__global__ static void flag_leafs_kernel(const size_t n, + const IndexType* degree, + ValueType* bookmark) { for (auto i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) if (degree[i] == 0) bookmark[i] = 1.0; } template <typename IndexType, typename ValueType> -__global__ void degree_offsets(const IndexType n, - const IndexType e, - const IndexType* ind, - ValueType* degree) +__global__ static void degree_offsets(const IndexType n, + const IndexType e, + const IndexType* ind, + ValueType* degree) { for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) degree[i] += ind[i + 1] - ind[i]; } template <typename FromType, typename ToType> -__global__ void type_convert(FromType* array, int n) +__global__ static void type_convert(FromType* array, int n) { for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { ToType val = array[i]; @@ -284,12 +286,12 @@ __global__ void type_convert(FromType* array, int n) } template <typename IndexType, typename ValueType> -__global__ void equi_prob3(const IndexType n, - const IndexType e, - const IndexType* csrPtr, - const IndexType* csrInd, - ValueType* val, - IndexType* degree) +__global__ static void equi_prob3(const IndexType n, + const IndexType e, + const IndexType* csrPtr, + const IndexType* csrInd, + ValueType* val, + IndexType* degree) { int j, row, col; for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) { @@ -303,12 +305,12 @@ __global__ void equi_prob3(const IndexType n, } template <typename IndexType, typename ValueType> -__global__ void equi_prob2(const IndexType n, - const IndexType e, - const IndexType* csrPtr, - const IndexType* csrInd, - ValueType* val, - IndexType* degree) +__global__ static void equi_prob2(const IndexType n, + const IndexType e, + const IndexType* csrPtr, + const IndexType* csrInd, + ValueType* val, + IndexType* degree) { int row = blockIdx.x * blockDim.x + threadIdx.x; if (row < n) { @@ -372,7 +374,8 @@ void HT_matrix_csc_coo(const IndexType n, } template <typename offsets_t, typename index_t> -__global__ void offsets_to_indices_kernel(const offsets_t* offsets, index_t v, index_t* indices) +__attribute__((visibility("hidden"))) __global__ void offsets_to_indices_kernel( + const offsets_t* offsets, index_t v, index_t* indices) { auto tid{threadIdx.x}; auto ctaStart{blockIdx.x}; diff --git a/cpp/src/utilities/path_retrieval.cu b/cpp/src/utilities/path_retrieval.cu index e37ce3a3ced..eda60941c23 100644 --- a/cpp/src/utilities/path_retrieval.cu +++ b/cpp/src/utilities/path_retrieval.cu @@ -29,13 +29,13 @@ namespace cugraph { namespace detail { template <typename vertex_t, typename weight_t> -__global__ void get_traversed_cost_kernel(vertex_t const* vertices, - vertex_t const* preds, - vertex_t const* vtx_map, - weight_t const* info_weights, - weight_t* out, - vertex_t stop_vertex, - vertex_t num_vertices) +__global__ static void get_traversed_cost_kernel(vertex_t const* vertices, + vertex_t const* preds, + vertex_t const* vtx_map, + weight_t const* info_weights, + weight_t* out, + vertex_t stop_vertex, + vertex_t num_vertices) { for (vertex_t i = threadIdx.x + blockIdx.x * blockDim.x; i < num_vertices; i += gridDim.x * blockDim.x) {