diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index d1c2cf3df52..c286c81683e 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -92,6 +93,54 @@ struct local_degree_op_t { } }; +template +struct local_degree_with_mask_op_t { + raft::device_span offsets{}; + std::conditional_t major_range_first{}; + + std::conditional_t, std::byte /* dummy */> + dcs_nzd_vertices{}; + std::conditional_t major_hypersparse_first{}; + + MaskIterator mask_first{}; + + __device__ return_type_t operator()(vertex_t major) const + { + if constexpr (multi_gpu) { + vertex_t idx{}; + if constexpr (use_dcs) { + if (major < major_hypersparse_first) { + idx = major - major_range_first; + return static_cast( + count_set_bits(mask_first, offsets[idx], offsets[idx + 1] - offsets[idx])); + } else { + auto major_hypersparse_idx = + major_hypersparse_idx_from_major_nocheck_impl(dcs_nzd_vertices, major); + if (major_hypersparse_idx) { + idx = (major_hypersparse_first - major_range_first) + *major_hypersparse_idx; + return static_cast( + count_set_bits(mask_first, offsets[idx], offsets[idx + 1] - offsets[idx])); + } else { + return return_type_t{0}; + } + } + } else { + idx = major - major_range_first; + return static_cast( + count_set_bits(mask_first, offsets[idx], offsets[idx + 1] - offsets[idx])); + } + } else { + return static_cast( + count_set_bits(mask_first, offsets[major], offsets[major + 1] - offsets[major])); + } + } +}; + template class edge_partition_device_view_base_t { public: @@ -255,6 +304,122 @@ class edge_partition_device_view_t + size_t compute_number_of_edges_with_mask(MaskIterator mask_first, + MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const + { + return dcs_nzd_vertices_ ? thrust::transform_reduce( + rmm::exec_policy(stream), + major_first, + major_last, + detail::local_degree_with_mask_op_t< + vertex_t, + edge_t, + size_t /* no limit on majors.size(), so edge_t can overflow */, + multi_gpu, + true, + MaskIterator>{this->offsets_, + major_range_first_, + *dcs_nzd_vertices_, + *major_hypersparse_first_, + mask_first}, + size_t{0}, + thrust::plus()) + : thrust::transform_reduce( + rmm::exec_policy(stream), + major_first, + major_last, + detail::local_degree_with_mask_op_t< + vertex_t, + edge_t, + size_t /* no limit on majors.size(), so edge_t can overflow */, + multi_gpu, + false, + MaskIterator>{this->offsets_, + major_range_first_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + mask_first}, + size_t{0}, + thrust::plus()); + } + + template + rmm::device_uvector compute_local_degrees_with_mask(MaskIterator mask_first, + rmm::cuda_stream_view stream) const + { + rmm::device_uvector local_degrees(this->major_range_size(), stream); + if (dcs_nzd_vertices_) { + assert(major_hypersparse_first_); + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(this->major_range_first()), + thrust::make_counting_iterator(this->major_range_last()), + local_degrees.begin(), + detail:: + local_degree_with_mask_op_t{ + this->offsets_, + major_range_first_, + *dcs_nzd_vertices_, + major_hypersparse_first_.value_or(vertex_t{0}), + mask_first}); + } else { + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(this->major_range_first()), + thrust::make_counting_iterator(this->major_range_last()), + local_degrees.begin(), + detail:: + local_degree_with_mask_op_t{ + this->offsets_, + major_range_first_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + mask_first}); + } + return local_degrees; + } + + template + rmm::device_uvector compute_local_degrees_with_mask(MaskIterator mask_first, + MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const + { + rmm::device_uvector local_degrees(thrust::distance(major_first, major_last), stream); + if (dcs_nzd_vertices_) { + assert(major_hypersparse_first_); + thrust::transform( + rmm::exec_policy(stream), + major_first, + major_last, + local_degrees.begin(), + detail:: + local_degree_with_mask_op_t{ + this->offsets_, + major_range_first_, + dcs_nzd_vertices_.value(), + major_hypersparse_first_.value_or(vertex_t{0}), + mask_first}); + } else { + thrust::transform( + rmm::exec_policy(stream), + major_first, + major_last, + local_degrees.begin(), + detail:: + local_degree_with_mask_op_t{ + this->offsets_, + major_range_first_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + mask_first}); + } + return local_degrees; + } + __host__ __device__ vertex_t major_value_start_offset() const { return major_value_start_offset_; @@ -440,6 +605,71 @@ class edge_partition_device_view_t + size_t compute_number_of_edges_with_mask(MaskIterator mask_first, + MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const + { + return thrust::transform_reduce( + rmm::exec_policy(stream), + major_first, + major_last, + detail::local_degree_with_mask_op_t< + vertex_t, + edge_t, + size_t /* no limit on majors.size(), so edge_t can overflow */, + multi_gpu, + false, + MaskIterator>{this->offsets_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + mask_first}, + size_t{0}, + thrust::plus()); + } + + template + rmm::device_uvector compute_local_degrees_with_mask(MaskIterator mask_first, + rmm::cuda_stream_view stream) const + { + rmm::device_uvector local_degrees(this->major_range_size(), stream); + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(this->major_range_first()), + thrust::make_counting_iterator(this->major_range_last()), + local_degrees.begin(), + detail::local_degree_with_mask_op_t{ + this->offsets_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + mask_first}); + return local_degrees; + } + + template + rmm::device_uvector compute_local_degrees_with_mask(MaskIterator mask_first, + MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const + { + rmm::device_uvector local_degrees(thrust::distance(major_first, major_last), stream); + thrust::transform( + rmm::exec_policy(stream), + major_first, + major_last, + local_degrees.begin(), + detail::local_degree_with_mask_op_t{ + this->offsets_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + mask_first}); + return local_degrees; + } + __host__ __device__ vertex_t major_value_start_offset() const { return vertex_t{0}; } __host__ __device__ thrust::optional major_hypersparse_first() const noexcept diff --git a/cpp/include/cugraph/utilities/mask_utils.cuh b/cpp/include/cugraph/utilities/mask_utils.cuh index ab1403d019b..5621b1267e9 100644 --- a/cpp/include/cugraph/utilities/mask_utils.cuh +++ b/cpp/include/cugraph/utilities/mask_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ #include #include #include +#include namespace cugraph { diff --git a/cpp/src/centrality/betweenness_centrality_impl.cuh b/cpp/src/centrality/betweenness_centrality_impl.cuh index e496344583c..08907716412 100644 --- a/cpp/src/centrality/betweenness_centrality_impl.cuh +++ b/cpp/src/centrality/betweenness_centrality_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -69,7 +69,7 @@ struct extract_edge_e_op_t { vertex_t dst, thrust::tuple src_props, thrust::tuple dst_props, - weight_t edge_centrality) + weight_t edge_centrality) const { return ((thrust::get<0>(dst_props) == d) && (thrust::get<0>(src_props) == (d - 1))) ? thrust::optional>{thrust::make_tuple(src, dst)} diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 2d77d64e1ff..608a824c57e 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -61,13 +62,13 @@ namespace detail { int32_t constexpr extract_transform_v_frontier_e_kernel_block_size = 512; -template -__device__ void push_buffer_element(e_op_result_t e_op_result, - BufferKeyOutputIterator buffer_key_output_first, +template +__device__ void push_buffer_element(BufferKeyOutputIterator buffer_key_output_first, BufferValueOutputIterator buffer_value_output_first, - size_t buffer_idx) + size_t buffer_idx, + e_op_result_t e_op_result) { using output_key_t = typename optional_dataframe_buffer_value_type_t::value; @@ -87,15 +88,45 @@ __device__ void push_buffer_element(e_op_result_t e_op_result, } } -template +__device__ void warp_push_buffer_elements( + BufferKeyOutputIterator buffer_key_output_first, + BufferValueOutputIterator buffer_value_output_first, + cuda::atomic_ref& buffer_idx, + int lane_id, + e_op_result_t e_op_result) +{ + auto ballot = __ballot_sync(raft::warp_full_mask(), e_op_result ? uint32_t{1} : uint32_t{0}); + if (ballot > 0) { + size_t warp_buffer_start_idx{}; + if (lane_id == 0) { + auto increment = __popc(ballot); + warp_buffer_start_idx = buffer_idx.fetch_add(increment, cuda::std::memory_order_relaxed); + } + warp_buffer_start_idx = __shfl_sync(raft::warp_full_mask(), warp_buffer_start_idx, int{0}); + if (e_op_result) { + auto buffer_warp_offset = __popc(ballot & ~(raft::warp_full_mask() << lane_id)); + push_buffer_element(buffer_key_output_first, + buffer_value_output_first, + warp_buffer_start_idx + buffer_warp_offset, + e_op_result); + } + } +} + +template -__global__ void extract_transform_v_frontier_e_hypersparse( +__global__ void extract_transform_v_frontier_e_hypersparse_or_low_degree( edge_partition_device_view_t edge_partition, @@ -104,6 +135,7 @@ __global__ void extract_transform_v_frontier_e_hypersparse( EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, BufferKeyOutputIterator buffer_key_output_first, BufferValueOutputIterator buffer_value_output_first, size_t* buffer_idx_ptr, @@ -127,6 +159,8 @@ __global__ void extract_transform_v_frontier_e_hypersparse( edge_partition.major_range_first()); auto idx = static_cast(tid); + cuda::atomic_ref buffer_idx(*buffer_idx_ptr); + __shared__ edge_t warp_local_degree_inclusive_sums[extract_transform_v_frontier_e_kernel_block_size]; __shared__ edge_t warp_key_local_edge_offsets[extract_transform_v_frontier_e_kernel_block_size]; @@ -134,9 +168,6 @@ __global__ void extract_transform_v_frontier_e_hypersparse( using WarpScan = cub::WarpScan; __shared__ typename WarpScan::TempStorage temp_storage; - __shared__ size_t - buffer_warp_start_indices[extract_transform_v_frontier_e_kernel_block_size / raft::warp_size()]; - auto indices = edge_partition.indices(); vertex_t num_keys = static_cast(thrust::distance(key_first, key_last)); @@ -160,251 +191,93 @@ __global__ void extract_transform_v_frontier_e_hypersparse( } else { major = thrust::get<0>(key); } - auto major_hypersparse_idx = edge_partition.major_hypersparse_idx_from_major_nocheck(major); - if (major_hypersparse_idx) { - auto major_idx = major_start_offset + *major_hypersparse_idx; - local_degree = edge_partition.local_degree(major_idx); - warp_key_local_edge_offsets[threadIdx.x] = edge_partition.local_offset(major_idx); - } else { - local_degree = edge_t{0}; - warp_key_local_edge_offsets[threadIdx.x] = edge_t{0}; // dummy - } - } - WarpScan(temp_storage) - .InclusiveSum(local_degree, warp_local_degree_inclusive_sums[threadIdx.x]); - __syncwarp(); - - // process local edges for the keys in [key_first + min_key_idx, key_first + max_key_idx) - - auto num_edges_this_warp = warp_local_degree_inclusive_sums[warp_id * raft::warp_size() + - (max_key_idx - min_key_idx) - 1]; - auto rounded_up_num_edges_this_warp = - ((static_cast(num_edges_this_warp) + (raft::warp_size() - 1)) / raft::warp_size()) * - raft::warp_size(); - - for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { - e_op_result_t e_op_result{}; - - if (i < static_cast(num_edges_this_warp)) { - auto key_idx_this_warp = static_cast(thrust::distance( - warp_local_degree_inclusive_sums + warp_id * raft::warp_size(), - thrust::upper_bound(thrust::seq, - warp_local_degree_inclusive_sums + warp_id * raft::warp_size(), - warp_local_degree_inclusive_sums + warp_id * raft::warp_size() + - (max_key_idx - min_key_idx), - i))); - auto local_edge_offset = - warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + - static_cast(i - - ((key_idx_this_warp == 0) - ? edge_t{0} - : warp_local_degree_inclusive_sums[warp_id * raft::warp_size() + - key_idx_this_warp - 1])); - auto key = *(key_first + (min_key_idx + key_idx_this_warp)); - vertex_t major{}; - if constexpr (std::is_same_v) { - major = key; + if constexpr (hypersparse) { + auto major_hypersparse_idx = edge_partition.major_hypersparse_idx_from_major_nocheck(major); + if (major_hypersparse_idx) { + auto major_idx = major_start_offset + *major_hypersparse_idx; + local_degree = edge_partition.local_degree(major_idx); + warp_key_local_edge_offsets[threadIdx.x] = edge_partition.local_offset(major_idx); } else { - major = thrust::get<0>(key); - } - auto minor = indices[local_edge_offset]; - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - std::conditional_t - key_or_src{}; // key if major - std::conditional_t - key_or_dst{}; // key if major - if constexpr (GraphViewType::is_storage_transposed) { - key_or_src = minor; - key_or_dst = key; - } else { - key_or_src = key; - key_or_dst = minor; - } - auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset; - auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset; - e_op_result = e_op(key_or_src, - key_or_dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(local_edge_offset)); - } - auto ballot_e_op = - __ballot_sync(uint32_t{0xffffffff}, e_op_result ? uint32_t{1} : uint32_t{0}); - if (ballot_e_op) { - if (lane_id == 0) { - auto increment = __popc(ballot_e_op); - static_assert(sizeof(unsigned long long int) == sizeof(size_t)); - buffer_warp_start_indices[warp_id] = - static_cast(atomicAdd(reinterpret_cast(buffer_idx_ptr), - static_cast(increment))); - } - __syncwarp(); - if (e_op_result) { - auto buffer_warp_offset = - static_cast(__popc(ballot_e_op & ~(uint32_t{0xffffffff} << lane_id))); - push_buffer_element(e_op_result, - buffer_key_output_first, - buffer_value_output_first, - buffer_warp_start_indices[warp_id] + buffer_warp_offset); + local_degree = edge_t{0}; + warp_key_local_edge_offsets[threadIdx.x] = edge_t{0}; // dummy } - } - } - idx += gridDim.x * blockDim.x; - } -} - -template -__global__ void extract_transform_v_frontier_e_low_degree( - edge_partition_device_view_t edge_partition, - KeyIterator key_first, - KeyIterator key_last, - EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, - EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - BufferKeyOutputIterator buffer_key_output_first, - BufferValueOutputIterator buffer_value_output_first, - size_t* buffer_idx_ptr, - EdgeOp e_op) -{ - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using key_t = typename thrust::iterator_traits::value_type; - using e_op_result_t = - typename edge_op_result_type::type; - - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - auto const warp_id = threadIdx.x / raft::warp_size(); - auto const lane_id = tid % raft::warp_size(); - auto idx = static_cast(tid); - - __shared__ edge_t - warp_local_degree_inclusive_sums[extract_transform_v_frontier_e_kernel_block_size]; - __shared__ edge_t warp_key_local_edge_offsets[extract_transform_v_frontier_e_kernel_block_size]; - - using WarpScan = cub::WarpScan; - __shared__ typename WarpScan::TempStorage temp_storage; - - __shared__ size_t - buffer_warp_start_indices[extract_transform_v_frontier_e_kernel_block_size / raft::warp_size()]; - - auto indices = edge_partition.indices(); - - vertex_t num_keys = static_cast(thrust::distance(key_first, key_last)); - auto rounded_up_num_keys = - ((static_cast(num_keys) + (raft::warp_size() - 1)) / raft::warp_size()) * - raft::warp_size(); - while (idx < rounded_up_num_keys) { - auto min_key_idx = static_cast(idx - (idx % raft::warp_size())); // inclusive - auto max_key_idx = - static_cast(std::min(static_cast(min_key_idx) + raft::warp_size(), - static_cast(num_keys))); // exclusive - - // update warp_local_degree_inclusive_sums & warp_key_local_edge_offsets - - edge_t local_degree{0}; - if (lane_id < static_cast(max_key_idx - min_key_idx)) { - auto key = *(key_first + idx); - vertex_t major{}; - if constexpr (std::is_same_v) { - major = key; } else { - major = thrust::get<0>(key); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + local_degree = edge_partition.local_degree(major_offset); + warp_key_local_edge_offsets[threadIdx.x] = edge_partition.local_offset(major_offset); } - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - local_degree = edge_partition.local_degree(major_offset); - warp_key_local_edge_offsets[threadIdx.x] = edge_partition.local_offset(major_offset); } WarpScan(temp_storage) .InclusiveSum(local_degree, warp_local_degree_inclusive_sums[threadIdx.x]); __syncwarp(); - // processes local edges for the keys in [key_first + min_key_idx, key_first + max_key_idx) + // all the threads in a warp collectively process local edges for the keys in [key_first + + // min_key_idx, key_first + max_key_idx) auto num_edges_this_warp = warp_local_degree_inclusive_sums[warp_id * raft::warp_size() + (max_key_idx - min_key_idx) - 1]; auto rounded_up_num_edges_this_warp = ((static_cast(num_edges_this_warp) + (raft::warp_size() - 1)) / raft::warp_size()) * raft::warp_size(); - for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { - e_op_result_t e_op_result{}; - - if (i < static_cast(num_edges_this_warp)) { - auto key_idx_this_warp = static_cast(thrust::distance( - warp_local_degree_inclusive_sums + warp_id * raft::warp_size(), - thrust::upper_bound(thrust::seq, - warp_local_degree_inclusive_sums + warp_id * raft::warp_size(), - warp_local_degree_inclusive_sums + warp_id * raft::warp_size() + - (max_key_idx - min_key_idx), - i))); - auto local_edge_offset = - warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + - static_cast(i - - ((key_idx_this_warp == 0) - ? edge_t{0} - : warp_local_degree_inclusive_sums[warp_id * raft::warp_size() + - key_idx_this_warp - 1])); - auto key = *(key_first + (min_key_idx + key_idx_this_warp)); - vertex_t major{}; - if constexpr (std::is_same_v) { - major = key; - } else { - major = thrust::get<0>(key); - } - auto minor = indices[local_edge_offset]; - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - std::conditional_t - key_or_src{}; // key if major - std::conditional_t - key_or_dst{}; // key if major - if constexpr (GraphViewType::is_storage_transposed) { - key_or_src = minor; - key_or_dst = key; - } else { - key_or_src = key; - key_or_dst = minor; + + auto call_e_op = call_e_op_with_key_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op}; + + auto this_warp_inclusive_sum_first = + warp_local_degree_inclusive_sums + warp_id * raft::warp_size(); + auto this_warp_inclusive_sum_last = this_warp_inclusive_sum_first + (max_key_idx - min_key_idx); + + if (edge_partition_e_mask) { + for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { + e_op_result_t e_op_result{thrust::nullopt}; + + if (i < static_cast(num_edges_this_warp)) { + auto key_idx_this_warp = static_cast(thrust::distance( + this_warp_inclusive_sum_first, + thrust::upper_bound( + thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); + auto local_edge_offset = + warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + + static_cast(i - ((key_idx_this_warp == 0) ? edge_t{0} + : *(this_warp_inclusive_sum_first + + (key_idx_this_warp - 1)))); + if ((*edge_partition_e_mask).get(local_edge_offset)) { + auto key = *(key_first + (min_key_idx + key_idx_this_warp)); + e_op_result = call_e_op(key, local_edge_offset); + } } - auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset; - auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset; - e_op_result = e_op(key_or_src, - key_or_dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(local_edge_offset)); + + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } - auto ballot = __ballot_sync(uint32_t{0xffffffff}, e_op_result ? uint32_t{1} : uint32_t{0}); - if (ballot > 0) { - if (lane_id == 0) { - auto increment = __popc(ballot); - static_assert(sizeof(unsigned long long int) == sizeof(size_t)); - buffer_warp_start_indices[warp_id] = - static_cast(atomicAdd(reinterpret_cast(buffer_idx_ptr), - static_cast(increment))); - } - __syncwarp(); - if (e_op_result) { - auto buffer_warp_offset = - static_cast(__popc(ballot & ~(uint32_t{0xffffffff} << lane_id))); - push_buffer_element(e_op_result, - buffer_key_output_first, - buffer_value_output_first, - buffer_warp_start_indices[warp_id] + buffer_warp_offset); + } else { + for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { + e_op_result_t e_op_result{thrust::nullopt}; + + if (i < static_cast(num_edges_this_warp)) { + auto key_idx_this_warp = static_cast(thrust::distance( + this_warp_inclusive_sum_first, + thrust::upper_bound( + thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); + auto local_edge_offset = + warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + + static_cast(i - ((key_idx_this_warp == 0) ? edge_t{0} + : *(this_warp_inclusive_sum_first + + (key_idx_this_warp - 1)))); + auto key = *(key_first + (min_key_idx + key_idx_this_warp)); + e_op_result = call_e_op(key, local_edge_offset); } + + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } } @@ -417,6 +290,7 @@ template @@ -429,6 +303,7 @@ __global__ void extract_transform_v_frontier_e_mid_degree( EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, BufferKeyOutputIterator buffer_key_output_first, BufferValueOutputIterator buffer_value_output_first, size_t* buffer_idx_ptr, @@ -451,8 +326,8 @@ __global__ void extract_transform_v_frontier_e_mid_degree( auto const lane_id = tid % raft::warp_size(); auto idx = static_cast(tid / raft::warp_size()); - __shared__ size_t - buffer_warp_start_indices[extract_transform_v_frontier_e_kernel_block_size / raft::warp_size()]; + cuda::atomic_ref buffer_idx(*buffer_idx_ptr); + while (idx < static_cast(thrust::distance(key_first, key_last))) { auto key = *(key_first + idx); vertex_t major{}; @@ -470,48 +345,40 @@ __global__ void extract_transform_v_frontier_e_mid_degree( auto rounded_up_local_out_degree = ((static_cast(local_out_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * raft::warp_size(); - for (size_t i = lane_id; i < rounded_up_local_out_degree; i += raft::warp_size()) { - e_op_result_t e_op_result{}; - if (i < static_cast(local_out_degree)) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - std::conditional_t - key_or_src{}; // key if major - std::conditional_t - key_or_dst{}; // key if major - if constexpr (GraphViewType::is_storage_transposed) { - key_or_src = minor; - key_or_dst = key; - } else { - key_or_src = key; - key_or_dst = minor; + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + local_edge_offset}; + + if (edge_partition_e_mask) { + for (size_t i = lane_id; i < rounded_up_local_out_degree; i += raft::warp_size()) { + e_op_result_t e_op_result{thrust::nullopt}; + if ((i < static_cast(local_out_degree)) && + ((*edge_partition_e_mask).get(local_edge_offset + i))) { + e_op_result = call_e_op(i); } - auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset; - auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset; - e_op_result = e_op(key_or_src, - key_or_dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(local_edge_offset + i)); + + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } - auto ballot = __ballot_sync(uint32_t{0xffffffff}, e_op_result ? uint32_t{1} : uint32_t{0}); - if (ballot > 0) { - if (lane_id == 0) { - auto increment = __popc(ballot); - static_assert(sizeof(unsigned long long int) == sizeof(size_t)); - buffer_warp_start_indices[warp_id] = - static_cast(atomicAdd(reinterpret_cast(buffer_idx_ptr), - static_cast(increment))); - } - __syncwarp(); - if (e_op_result) { - auto buffer_warp_offset = - static_cast(__popc(ballot & ~(uint32_t{0xffffffff} << lane_id))); - push_buffer_element(e_op_result, - buffer_key_output_first, - buffer_value_output_first, - buffer_warp_start_indices[warp_id] + buffer_warp_offset); - } + } else { + for (size_t i = lane_id; i < rounded_up_local_out_degree; i += raft::warp_size()) { + e_op_result_t e_op_result{thrust::nullopt}; + if (i < static_cast(local_out_degree)) { e_op_result = call_e_op(i); } + + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } } @@ -524,6 +391,7 @@ template @@ -536,6 +404,7 @@ __global__ void extract_transform_v_frontier_e_high_degree( EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, BufferKeyOutputIterator buffer_key_output_first, BufferValueOutputIterator buffer_value_output_first, size_t* buffer_idx_ptr, @@ -552,11 +421,11 @@ __global__ void extract_transform_v_frontier_e_high_degree( typename EdgePartitionEdgeValueInputWrapper::value_type, EdgeOp>::type; - auto idx = static_cast(blockIdx.x); + auto const warp_id = threadIdx.x / raft::warp_size(); + auto const lane_id = threadIdx.x % raft::warp_size(); + auto idx = static_cast(blockIdx.x); - using BlockScan = cub::BlockScan; - __shared__ typename BlockScan::TempStorage temp_storage; - __shared__ size_t buffer_block_start_idx; + cuda::atomic_ref buffer_idx(*buffer_idx_ptr); while (idx < static_cast(thrust::distance(key_first, key_last))) { auto key = *(key_first + idx); @@ -576,49 +445,40 @@ __global__ void extract_transform_v_frontier_e_high_degree( (extract_transform_v_frontier_e_kernel_block_size - 1)) / extract_transform_v_frontier_e_kernel_block_size) * extract_transform_v_frontier_e_kernel_block_size; - for (size_t i = threadIdx.x; i < rounded_up_local_out_degree; i += blockDim.x) { - e_op_result_t e_op_result{}; - edge_t buffer_block_offset{0}; - - if (i < static_cast(local_out_degree)) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - std::conditional_t - key_or_src{}; // key if major - std::conditional_t - key_or_dst{}; // key if major - if constexpr (GraphViewType::is_storage_transposed) { - key_or_src = minor; - key_or_dst = key; - } else { - key_or_src = key; - key_or_dst = minor; + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + local_edge_offset}; + + if (edge_partition_e_mask) { + for (size_t i = threadIdx.x; i < rounded_up_local_out_degree; i += blockDim.x) { + e_op_result_t e_op_result{thrust::nullopt}; + if ((i < static_cast(local_out_degree)) && + ((*edge_partition_e_mask).get(local_edge_offset + i))) { + e_op_result = call_e_op(i); } - auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset; - auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset; - e_op_result = e_op(key_or_src, - key_or_dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(local_edge_offset + i)); - } - BlockScan(temp_storage) - .ExclusiveSum(e_op_result ? edge_t{1} : edge_t{0}, buffer_block_offset); - if (threadIdx.x == (blockDim.x - 1)) { - auto increment = buffer_block_offset + (e_op_result ? edge_t{1} : edge_t{0}); - static_assert(sizeof(unsigned long long int) == sizeof(size_t)); - buffer_block_start_idx = increment > 0 - ? static_cast(atomicAdd( - reinterpret_cast(buffer_idx_ptr), - static_cast(increment))) - : size_t{0} /* dummy */; + + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } - __syncthreads(); - if (e_op_result) { - push_buffer_element(e_op_result, - buffer_key_output_first, - buffer_value_output_first, - buffer_block_start_idx + buffer_block_offset); + } else { + for (size_t i = threadIdx.x; i < rounded_up_local_out_degree; i += blockDim.x) { + e_op_result_t e_op_result{thrust::nullopt}; + if (i < static_cast(local_out_degree)) { e_op_result = call_e_op(i); } + + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } } @@ -757,10 +617,18 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, static_cast(thrust::distance(frontier_key_first, frontier_key_last)))}; } + auto edge_mask_view = graph_view.edge_mask_view(); + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { auto edge_partition = edge_partition_device_view_t( graph_view.local_edge_partition_view(i)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, i) + : thrust::nullopt; auto edge_partition_frontier_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); @@ -846,8 +714,8 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, RAFT_CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); h_offsets.push_back(edge_partition_frontier_size); // FIXME: we may further improve performance by 1) concurrently running kernels on different - // segments; 2) individually tuning block sizes for different segments; and 3) adding one more - // segment for very high degree vertices and running segmented reduction + // segments; 2) individually tuning block sizes for different segments; and 3) adding one + // more segment for very high degree vertices and running segmented reduction if (h_offsets[0] > 0) { raft::grid_1d_block_t update_grid(h_offsets[0], extract_transform_v_frontier_e_kernel_block_size, @@ -860,6 +728,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, get_optional_dataframe_buffer_begin(key_buffer), get_optional_dataframe_buffer_begin(value_buffer), buffer_idx.data(), @@ -877,6 +746,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, get_optional_dataframe_buffer_begin(key_buffer), get_optional_dataframe_buffer_begin(value_buffer), buffer_idx.data(), @@ -886,7 +756,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, raft::grid_1d_thread_t update_grid(h_offsets[2] - h_offsets[1], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_low_degree + extract_transform_v_frontier_e_hypersparse_or_low_degree <<>>( edge_partition, edge_partition_frontier_key_first + h_offsets[1], @@ -894,6 +764,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, get_optional_dataframe_buffer_begin(key_buffer), get_optional_dataframe_buffer_begin(value_buffer), buffer_idx.data(), @@ -903,7 +774,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, raft::grid_1d_thread_t update_grid(h_offsets[3] - h_offsets[2], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_hypersparse + extract_transform_v_frontier_e_hypersparse_or_low_degree <<>>( edge_partition, edge_partition_frontier_key_first + h_offsets[2], @@ -911,6 +782,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, get_optional_dataframe_buffer_begin(key_buffer), get_optional_dataframe_buffer_begin(value_buffer), buffer_idx.data(), @@ -922,7 +794,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_low_degree + extract_transform_v_frontier_e_hypersparse_or_low_degree <<>>( edge_partition, edge_partition_frontier_key_first, @@ -930,6 +802,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, get_optional_dataframe_buffer_begin(key_buffer), get_optional_dataframe_buffer_begin(value_buffer), buffer_idx.data(), diff --git a/cpp/src/prims/detail/nbr_intersection.cuh b/cpp/src/prims/detail/nbr_intersection.cuh index 26b87f21dbb..e3453ffdec5 100644 --- a/cpp/src/prims/detail/nbr_intersection.cuh +++ b/cpp/src/prims/detail/nbr_intersection.cuh @@ -213,8 +213,7 @@ struct update_rx_major_local_nbrs_t { auto mask_first = (*edge_partition_e_mask).value_first(); if constexpr (!std::is_same_v) { auto input_first = - thrust::make_zip_iterator(indices, edge_partition_e_value_input.value_first()) + - edge_offset; + thrust::make_zip_iterator(indices, edge_partition_e_value_input.value_first()); copy_if_mask_set(input_first, mask_first, thrust::make_zip_iterator(local_nbrs_for_rx_majors.begin(), diff --git a/cpp/src/prims/detail/prim_functors.cuh b/cpp/src/prims/detail/prim_functors.cuh index 2785ba38dfd..d142aed1051 100644 --- a/cpp/src/prims/detail/prim_functors.cuh +++ b/cpp/src/prims/detail/prim_functors.cuh @@ -22,6 +22,7 @@ namespace cugraph { namespace detail { template + key_or_src{}; + std::conditional_t + key_or_dst{}; + if constexpr (GraphViewType::is_storage_transposed) { + key_or_src = minor; + key_or_dst = key; + } else { + key_or_src = key; + key_or_dst = minor; + } + auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset; + auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset; + return e_op(key_or_src, + key_or_dst, edge_partition_src_value_input.get(src_offset), edge_partition_dst_value_input.get(dst_offset), edge_partition_e_value_input.get(edge_offset + i)); } }; +template +struct call_e_op_with_key_t { + edge_partition_device_view_t const& edge_partition{}; + EdgePartitionSrcValueInputWrapper const& edge_partition_src_value_input{}; + EdgePartitionDstValueInputWrapper const& edge_partition_dst_value_input{}; + EdgePartitionEdgeValueInputWrapper const& edge_partition_e_value_input{}; + EdgeOp const& e_op{}; + + __device__ auto operator()( + key_t key, typename GraphViewType::edge_type i /* index in edge_partition's edge list */) const + { + typename GraphViewType::vertex_type major{}; + if constexpr (std::is_same_v) { + major = key; + } else { + major = thrust::get<0>(key); + } + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + auto minor = *(edge_partition.indices() + i); + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + std::conditional_t + key_or_src{}; + std::conditional_t + key_or_dst{}; + if constexpr (GraphViewType::is_storage_transposed) { + key_or_src = minor; + key_or_dst = key; + } else { + key_or_src = key; + key_or_dst = minor; + } + auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset; + auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset; + return e_op(key_or_src, + key_or_dst, + edge_partition_src_value_input.get(src_offset), + edge_partition_dst_value_input.get(dst_offset), + edge_partition_e_value_input.get(i)); + } +}; + } // namespace detail } // namespace cugraph diff --git a/cpp/src/prims/extract_transform_e.cuh b/cpp/src/prims/extract_transform_e.cuh index f135b76d6e3..25e04fff83a 100644 --- a/cpp/src/prims/extract_transform_e.cuh +++ b/cpp/src/prims/extract_transform_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -107,8 +107,6 @@ extract_transform_e(raft::handle_t const& handle, static_assert(!std::is_same_v); using payload_t = typename e_op_result_t::value_type; - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - // FIXME: Consider updating detail::extract_transform_v_forntier_e to take std::nullopt to as a // frontier or create a new key bucket type that just stores [vertex_first, vertex_last) for // further optimization. Better revisit this once this becomes a performance bottleneck and after diff --git a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh index 42af8a1164d..f3b85da53ea 100644 --- a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh +++ b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -98,8 +98,6 @@ extract_transform_v_frontier_outgoing_e(raft::handle_t const& handle, static_assert(!std::is_same_v); using payload_t = typename e_op_result_t::value_type; - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - auto value_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); std::tie(std::ignore, value_buffer) = detail::extract_transform_v_frontier_e(handle, diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 86e23a1a04e..5a7fe6b5044 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -123,8 +123,6 @@ void fill_edge_src_property(raft::handle_t const& handle, edge_src_property_t& edge_src_property_output, bool do_expensive_check = false) { - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - if (do_expensive_check) { // currently, nothing to do } @@ -161,8 +159,6 @@ void fill_edge_dst_property(raft::handle_t const& handle, edge_dst_property_t& edge_dst_property_output, bool do_expensive_check = false) { - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - if (do_expensive_check) { // currently, nothing to do } diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index 24b4f0857b1..c519d8aefed 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -190,6 +190,7 @@ __global__ void per_v_transform_reduce_e_hypersparse( edge_partition.local_edges(static_cast(major_idx)); auto call_e_op = call_e_op_t(major_offset)); auto call_e_op = call_e_op_t{static_cast(frontier.size())}; } + + auto edge_mask_view = graph_view.edge_mask_view(); + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { auto edge_partition = edge_partition_device_view_t( graph_view.local_edge_partition_view(i)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, i) + : thrust::nullopt; if constexpr (GraphViewType::is_multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); @@ -225,14 +232,30 @@ size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, static_cast(i), handle.get_stream()); - ret += edge_partition.compute_number_of_edges(edge_partition_frontier_vertices.begin(), - edge_partition_frontier_vertices.end(), - handle.get_stream()); + if (edge_partition_e_mask) { + ret += + edge_partition.compute_number_of_edges_with_mask((*edge_partition_e_mask).value_first(), + edge_partition_frontier_vertices.begin(), + edge_partition_frontier_vertices.end(), + handle.get_stream()); + } else { + ret += edge_partition.compute_number_of_edges(edge_partition_frontier_vertices.begin(), + edge_partition_frontier_vertices.end(), + handle.get_stream()); + } } else { assert(i == 0); - ret += edge_partition.compute_number_of_edges(local_frontier_vertex_first, - local_frontier_vertex_first + frontier.size(), - handle.get_stream()); + if (edge_partition_e_mask) { + ret += edge_partition.compute_number_of_edges_with_mask( + (*edge_partition_e_mask).value_first(), + local_frontier_vertex_first, + local_frontier_vertex_first + frontier.size(), + handle.get_stream()); + } else { + ret += edge_partition.compute_number_of_edges(local_frontier_vertex_first, + local_frontier_vertex_first + frontier.size(), + handle.get_stream()); + } } } @@ -323,8 +346,6 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, using key_t = typename VertexFrontierBucketType::key_type; using payload_t = typename ReduceOp::value_type; - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - if (do_expensive_check) { // currently, nothing to do } diff --git a/cpp/src/sampling/detail/gather_one_hop_edgelist_impl.cuh b/cpp/src/sampling/detail/gather_one_hop_edgelist_impl.cuh index 74267d02b38..cac648079b0 100644 --- a/cpp/src/sampling/detail/gather_one_hop_edgelist_impl.cuh +++ b/cpp/src/sampling/detail/gather_one_hop_edgelist_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,7 +42,7 @@ struct return_edges_with_properties_e_op { vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, - EdgeProperties edge_properties) + EdgeProperties edge_properties) const { static_assert(std::is_same_v || std::is_same_v>); diff --git a/cpp/src/structure/graph_weight_utils_impl.cuh b/cpp/src/structure/graph_weight_utils_impl.cuh index 1e386792b21..173b4df207b 100644 --- a/cpp/src/structure/graph_weight_utils_impl.cuh +++ b/cpp/src/structure/graph_weight_utils_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -89,8 +89,6 @@ rmm::device_uvector compute_in_weight_sums( graph_view_t const& graph_view, edge_property_view_t edge_weight_view) { - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - if (store_transposed) { return compute_weight_sums(handle, graph_view, edge_weight_view); } else { @@ -108,8 +106,6 @@ rmm::device_uvector compute_out_weight_sums( graph_view_t const& graph_view, edge_property_view_t edge_weight_view) { - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - if (store_transposed) { return compute_weight_sums(handle, graph_view, edge_weight_view); } else { @@ -127,8 +123,6 @@ weight_t compute_max_in_weight_sum( graph_view_t const& graph_view, edge_property_view_t edge_weight_view) { - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - auto in_weight_sums = compute_in_weight_sums(handle, graph_view, edge_weight_view); auto it = thrust::max_element(handle.get_thrust_policy(), in_weight_sums.begin(), in_weight_sums.end()); @@ -153,8 +147,6 @@ weight_t compute_max_out_weight_sum( graph_view_t const& graph_view, edge_property_view_t edge_weight_view) { - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - auto out_weight_sums = compute_out_weight_sums(handle, graph_view, edge_weight_view); auto it = thrust::max_element(handle.get_thrust_policy(), out_weight_sums.begin(), out_weight_sums.end()); @@ -179,8 +171,6 @@ weight_t compute_total_edge_weight( graph_view_t const& graph_view, edge_property_view_t edge_weight_view) { - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - return transform_reduce_e( handle, graph_view, diff --git a/cpp/src/structure/induced_subgraph_impl.cuh b/cpp/src/structure/induced_subgraph_impl.cuh index 18e1af32a71..fa4c5e0f6f2 100644 --- a/cpp/src/structure/induced_subgraph_impl.cuh +++ b/cpp/src/structure/induced_subgraph_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -71,7 +71,7 @@ struct induced_subgraph_weighted_edge_op { vertex_t dst, property_t sv, property_t dv, - weight_t wgt) + weight_t wgt) const { size_t subgraph = thrust::get<1>(tagged_src); return thrust::binary_search(thrust::seq, @@ -95,7 +95,7 @@ struct induced_subgraph_unweighted_edge_op { vertex_t dst, property_t sv, property_t dv, - thrust::nullopt_t) + thrust::nullopt_t) const { size_t subgraph = thrust::get<1>(tagged_src); return thrust::binary_search(thrust::seq, diff --git a/cpp/tests/prims/mg_extract_transform_e.cu b/cpp/tests/prims/mg_extract_transform_e.cu index bca6471a5bb..caa00e13640 100644 --- a/cpp/tests/prims/mg_extract_transform_e.cu +++ b/cpp/tests/prims/mg_extract_transform_e.cu @@ -1,6 +1,6 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -116,6 +116,7 @@ struct e_op_t { }; struct Prims_Usecase { + bool edge_masking{false}; bool check_correctness{true}; }; @@ -180,6 +181,13 @@ class Tests_MGExtractTransformE auto mg_graph_view = mg_graph.view(); + std::optional> edge_mask{std::nullopt}; + if (prims_usecase.edge_masking) { + edge_mask = + cugraph::test::generate::edge_property(*handle_, mg_graph_view, 2); + mg_graph_view.attach_edge_mask((*edge_mask).view()); + } + // 2. run MG extract_transform_e const int hash_bin_count = 5; @@ -400,7 +408,7 @@ INSTANTIATE_TEST_SUITE_P( file_test, Tests_MGExtractTransformE_File, ::testing::Combine( - ::testing::Values(Prims_Usecase{true}), + ::testing::Values(Prims_Usecase{false, true}, Prims_Usecase{true, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), @@ -408,7 +416,8 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P(rmat_small_test, Tests_MGExtractTransformE_Rmat, - ::testing::Combine(::testing::Values(Prims_Usecase{true}), + ::testing::Combine(::testing::Values(Prims_Usecase{false, true}, + Prims_Usecase{true, true}), ::testing::Values(cugraph::test::Rmat_Usecase( 10, 16, 0.57, 0.19, 0.19, 0, false, false)))); @@ -420,7 +429,7 @@ INSTANTIATE_TEST_SUITE_P( factor (to avoid running same benchmarks more than once) */ Tests_MGExtractTransformE_Rmat, ::testing::Combine( - ::testing::Values(Prims_Usecase{false}), + ::testing::Values(Prims_Usecase{false, false}, Prims_Usecase{true, false}), ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu b/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu index 4d9435dd344..09c2fc0b2cb 100644 --- a/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu +++ b/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -115,6 +115,7 @@ struct e_op_t { }; struct Prims_Usecase { + bool edge_masking{false}; bool check_correctness{true}; }; @@ -180,6 +181,13 @@ class Tests_MGExtractTransformVFrontierOutgoingE auto mg_graph_view = mg_graph.view(); + std::optional> edge_mask{std::nullopt}; + if (prims_usecase.edge_masking) { + edge_mask = + cugraph::test::generate::edge_property(*handle_, mg_graph_view, 2); + mg_graph_view.attach_edge_mask((*edge_mask).view()); + } + // 2. run MG extract_transform_v_frontier_outgoing_e const int hash_bin_count = 5; @@ -458,7 +466,7 @@ INSTANTIATE_TEST_SUITE_P( file_test, Tests_MGExtractTransformVFrontierOutgoingE_File, ::testing::Combine( - ::testing::Values(Prims_Usecase{true}), + ::testing::Values(Prims_Usecase{false, true}, Prims_Usecase{true, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), @@ -466,7 +474,8 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P(rmat_small_test, Tests_MGExtractTransformVFrontierOutgoingE_Rmat, - ::testing::Combine(::testing::Values(Prims_Usecase{true}), + ::testing::Combine(::testing::Values(Prims_Usecase{false, true}, + Prims_Usecase{true, true}), ::testing::Values(cugraph::test::Rmat_Usecase( 10, 16, 0.57, 0.19, 0.19, 0, false, false)))); @@ -478,7 +487,7 @@ INSTANTIATE_TEST_SUITE_P( factor (to avoid running same benchmarks more than once) */ Tests_MGExtractTransformVFrontierOutgoingE_Rmat, ::testing::Combine( - ::testing::Values(Prims_Usecase{false}), + ::testing::Values(Prims_Usecase{false, false}, Prims_Usecase{true, false}), ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu index 4d05b0c9e65..3e59bf3bf20 100644 --- a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu +++ b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "property_generator.cuh" + #include #include #include @@ -84,6 +86,7 @@ struct intersection_op_t { struct Prims_Usecase { size_t num_vertex_pairs{0}; + bool edge_masking{false}; bool check_correctness{true}; }; @@ -109,6 +112,13 @@ class Tests_MGPerVPairTransformDstNbrIntersection auto const comm_rank = handle_->get_comms().get_rank(); auto const comm_size = handle_->get_comms().get_size(); + constexpr bool store_transposed = false; + + constexpr bool test_weighted = true; + constexpr bool renumber = true; + constexpr bool drop_self_loops = false; + constexpr bool drop_multi_edges = true; + // 1. create MG graph if (cugraph::test::g_perf) { @@ -117,29 +127,10 @@ class Tests_MGPerVPairTransformDstNbrIntersection hr_timer.start("MG Construct graph"); } - constexpr bool store_transposed = false; - constexpr bool multi_gpu = true; - - cugraph::graph_t mg_graph(*handle_); - std::optional< - cugraph::edge_property_t, - weight_t>> - mg_edge_weight{std::nullopt}; - - std::optional> mg_renumber_map{std::nullopt}; - - constexpr bool test_weighted = true; - constexpr bool renumber = true; - constexpr bool drop_self_loops = false; - constexpr bool drop_multi_edges = true; - - std::tie(mg_graph, mg_edge_weight, mg_renumber_map) = - cugraph::test::construct_graph( + auto [mg_graph, mg_edge_weight, mg_renumber_map] = + cugraph::test::construct_graph( *handle_, input_usecase, test_weighted, renumber, drop_self_loops, drop_multi_edges); - auto mg_graph_view = mg_graph.view(); - auto mg_edge_weight_view = (*mg_edge_weight).view(); - if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement handle_->get_comms().barrier(); @@ -147,6 +138,16 @@ class Tests_MGPerVPairTransformDstNbrIntersection hr_timer.display_and_clear(std::cout); } + auto mg_graph_view = mg_graph.view(); + auto mg_edge_weight_view = (*mg_edge_weight).view(); + + std::optional> edge_mask{std::nullopt}; + if (prims_usecase.edge_masking) { + edge_mask = + cugraph::test::generate::edge_property(*handle_, mg_graph_view, 2); + mg_graph_view.attach_edge_mask((*edge_mask).view()); + } + // 2. run MG per_v_pair_transform_dst_nbr_intersection primitive ASSERT_TRUE( @@ -355,15 +356,18 @@ INSTANTIATE_TEST_SUITE_P( file_test, Tests_MGPerVPairTransformDstNbrIntersection_File, ::testing::Combine( - ::testing::Values(Prims_Usecase{size_t{10}, true}), + ::testing::Values(Prims_Usecase{size_t{10}, false, true}, + Prims_Usecase{size_t{10}, true, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), cugraph::test::File_Usecase("test/datasets/netscience.mtx")))); -INSTANTIATE_TEST_SUITE_P(rmat_small_test, - Tests_MGPerVPairTransformDstNbrIntersection_Rmat, - ::testing::Combine(::testing::Values(Prims_Usecase{size_t{1024}, true}), - ::testing::Values(cugraph::test::Rmat_Usecase( - 10, 16, 0.57, 0.19, 0.19, 0, false, false)))); +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_MGPerVPairTransformDstNbrIntersection_Rmat, + ::testing::Combine( + ::testing::Values(Prims_Usecase{size_t{1024}, false, true}, + Prims_Usecase{size_t{1024}, true, true}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); INSTANTIATE_TEST_SUITE_P( rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with @@ -373,7 +377,8 @@ INSTANTIATE_TEST_SUITE_P( factor (to avoid running same benchmarks more than once) */ Tests_MGPerVPairTransformDstNbrIntersection_Rmat, ::testing::Combine( - ::testing::Values(Prims_Usecase{size_t{1024 * 1024}, false}), + ::testing::Values(Prims_Usecase{size_t{1024 * 1024}, false, false}, + Prims_Usecase{size_t{1024 * 1024}, true, false}), ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu index d0b97065da7..7d1b2dd9412 100644 --- a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu +++ b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -88,6 +88,7 @@ struct e_op_t { }; struct Prims_Usecase { + bool edge_masking{false}; bool check_correctness{true}; }; @@ -152,6 +153,13 @@ class Tests_MGTransformReduceVFrontierOutgoingEByDst auto mg_graph_view = mg_graph.view(); + std::optional> edge_mask{std::nullopt}; + if (prims_usecase.edge_masking) { + edge_mask = + cugraph::test::generate::edge_property(*handle_, mg_graph_view, 2); + mg_graph_view.attach_edge_mask((*edge_mask).view()); + } + // 2. run MG transform reduce const int hash_bin_count = 5; @@ -533,7 +541,7 @@ INSTANTIATE_TEST_SUITE_P( file_test, Tests_MGTransformReduceVFrontierOutgoingEByDst_File, ::testing::Combine( - ::testing::Values(Prims_Usecase{true}), + ::testing::Values(Prims_Usecase{false, true}, Prims_Usecase{true, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), @@ -541,7 +549,8 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P(rmat_small_test, Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat, - ::testing::Combine(::testing::Values(Prims_Usecase{true}), + ::testing::Combine(::testing::Values(Prims_Usecase{false, true}, + Prims_Usecase{true, true}), ::testing::Values(cugraph::test::Rmat_Usecase( 10, 16, 0.57, 0.19, 0.19, 0, false, false)))); @@ -553,7 +562,7 @@ INSTANTIATE_TEST_SUITE_P( factor (to avoid running same benchmarks more than once) */ Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat, ::testing::Combine( - ::testing::Values(Prims_Usecase{false}), + ::testing::Values(Prims_Usecase{false, false}, Prims_Usecase{true, false}), ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/structure/weight_sum_test.cpp b/cpp/tests/structure/weight_sum_test.cpp index a61c0d4eeb4..30de0092a5a 100644 --- a/cpp/tests/structure/weight_sum_test.cpp +++ b/cpp/tests/structure/weight_sum_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -46,8 +47,7 @@ void weight_sum_reference(edge_t const* offsets, if (!major) { std::fill(weight_sums, weight_sums + num_vertices, weight_t{0.0}); } for (vertex_t i = 0; i < num_vertices; ++i) { if (major) { - weight_sums[i] = - std::accumulate(weights + offsets[i], weights + offsets[i + 1], weight_t{0.0}); + weight_sums[i] = std::reduce(weights + offsets[i], weights + offsets[i + 1], weight_t{0.0}); } else { for (auto j = offsets[i]; j < offsets[i + 1]; ++j) { auto nbr = indices[j]; @@ -60,19 +60,13 @@ void weight_sum_reference(edge_t const* offsets, } typedef struct WeightSum_Usecase_t { - std::string graph_file_full_path{}; - - WeightSum_Usecase_t(std::string const& graph_file_path) - { - if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { - graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; - } else { - graph_file_full_path = graph_file_path; - } - }; + bool edge_masking{false}; + bool check_correctness{true}; } WeightSum_Usecase; -class Tests_WeightSum : public ::testing::TestWithParam { +template +class Tests_WeightSum + : public ::testing::TestWithParam> { public: Tests_WeightSum() {} @@ -83,96 +77,189 @@ class Tests_WeightSum : public ::testing::TestWithParam { virtual void TearDown() {} template - void run_current_test(WeightSum_Usecase const& configuration) + void run_current_test(WeightSum_Usecase const& weight_sum_usecase, + input_usecase_t const& input_usecase) { + constexpr bool renumber = true; + constexpr bool test_weighted = true; + raft::handle_t handle{}; + HighResTimer hr_timer{}; + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Construct graph"); + } + + auto [graph, edge_weights, d_renumber_map_labels] = + cugraph::test::construct_graph( + handle, input_usecase, test_weighted, renumber); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } - cugraph::graph_t graph(handle); - std::optional< - cugraph::edge_property_t, - weight_t>> - edge_weights{std::nullopt}; - std::tie(graph, edge_weights, std::ignore) = cugraph::test:: - read_graph_from_matrix_market_file( - handle, configuration.graph_file_full_path, true, false); auto graph_view = graph.view(); auto edge_weight_view = edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt; - auto h_offsets = - cugraph::test::to_host(handle, graph_view.local_edge_partition_view().offsets()); - auto h_indices = - cugraph::test::to_host(handle, graph_view.local_edge_partition_view().indices()); - auto h_weights = cugraph::test::to_host( - handle, - raft::device_span((*edge_weight_view).value_firsts()[0], - (*edge_weight_view).edge_counts()[0])); - - std::vector h_reference_in_weight_sums(graph_view.number_of_vertices()); - std::vector h_reference_out_weight_sums(graph_view.number_of_vertices()); - - weight_sum_reference(h_offsets.data(), - h_indices.data(), - h_weights.data(), - h_reference_in_weight_sums.data(), - graph_view.number_of_vertices(), - store_transposed); - - weight_sum_reference(h_offsets.data(), - h_indices.data(), - h_weights.data(), - h_reference_out_weight_sums.data(), - graph_view.number_of_vertices(), - !store_transposed); - - RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Compute in-weight sums"); + } auto d_in_weight_sums = cugraph::compute_in_weight_sums(handle, graph_view, *edge_weight_view); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Compute out-weight sums"); + } + auto d_out_weight_sums = cugraph::compute_out_weight_sums(handle, graph_view, *edge_weight_view); - RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - - auto h_cugraph_in_weight_sums = cugraph::test::to_host(handle, d_in_weight_sums); - auto h_cugraph_out_weight_sums = cugraph::test::to_host(handle, d_out_weight_sums); - - auto threshold_ratio = weight_t{1e-4}; - auto threshold_magnitude = std::numeric_limits::min(); - auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { - return std::abs(lhs - rhs) < - std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); - }; - - ASSERT_TRUE(std::equal(h_reference_in_weight_sums.begin(), - h_reference_in_weight_sums.end(), - h_cugraph_in_weight_sums.begin(), - nearly_equal)) - << "In-weight-sum values do not match with the reference values."; - ASSERT_TRUE(std::equal(h_reference_out_weight_sums.begin(), - h_reference_out_weight_sums.end(), - h_cugraph_out_weight_sums.begin(), - nearly_equal)) - << "Out-weight-sum values do not match with the reference values."; + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (weight_sum_usecase.check_correctness) { + auto [h_offsets, h_indices, h_weights] = + cugraph::test::graph_to_host_csr(handle, graph_view, edge_weight_view); + + std::vector h_reference_in_weight_sums(graph_view.number_of_vertices()); + std::vector h_reference_out_weight_sums(graph_view.number_of_vertices()); + + weight_sum_reference(h_offsets.data(), + h_indices.data(), + (*h_weights).data(), + h_reference_in_weight_sums.data(), + graph_view.number_of_vertices(), + store_transposed); + + weight_sum_reference(h_offsets.data(), + h_indices.data(), + (*h_weights).data(), + h_reference_out_weight_sums.data(), + graph_view.number_of_vertices(), + !store_transposed); + + auto h_cugraph_in_weight_sums = cugraph::test::to_host(handle, d_in_weight_sums); + auto h_cugraph_out_weight_sums = cugraph::test::to_host(handle, d_out_weight_sums); + + auto threshold_ratio = weight_t{2.0 * 1e-4}; + auto threshold_magnitude = std::numeric_limits::min(); + auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { + auto ret = + std::abs(lhs - rhs) < std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); + return std::abs(lhs - rhs) < + std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); + }; + + ASSERT_TRUE(std::equal(h_reference_in_weight_sums.begin(), + h_reference_in_weight_sums.end(), + h_cugraph_in_weight_sums.begin(), + nearly_equal)) + << "In-weight-sum values do not match with the reference values."; + + ASSERT_TRUE(std::equal(h_reference_out_weight_sums.begin(), + h_reference_out_weight_sums.end(), + h_cugraph_out_weight_sums.begin(), + nearly_equal)) + << "Out-weight-sum values do not match with the reference values."; + } } }; -// FIXME: add tests for type combinations +using Tests_WeightSum_File = Tests_WeightSum; +using Tests_WeightSum_Rmat = Tests_WeightSum; + +TEST_P(Tests_WeightSum_File, CheckInt32Int32FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_WeightSum_File, CheckInt32Int32FloatTransposeTrue) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_WeightSum_Rmat, CheckInt32Int32FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} -TEST_P(Tests_WeightSum, CheckInt32Int32FloatTransposeFalse) +TEST_P(Tests_WeightSum_Rmat, CheckInt32Int32FloatTransposeTrue) { - run_current_test(GetParam()); + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); } -TEST_P(Tests_WeightSum, CheckInt32Int32FloatTransposeTrue) +TEST_P(Tests_WeightSum_Rmat, CheckInt32Int64FloatTransposeFalse) { - run_current_test(GetParam()); + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); } -INSTANTIATE_TEST_SUITE_P(simple_test, - Tests_WeightSum, - ::testing::Values(WeightSum_Usecase("test/datasets/karate.mtx"), - WeightSum_Usecase("test/datasets/web-Google.mtx"), - WeightSum_Usecase("test/datasets/ljournal-2008.mtx"), - WeightSum_Usecase("test/datasets/webbase-1M.mtx"))); +TEST_P(Tests_WeightSum_Rmat, CheckInt32Int64FloatTransposeTrue) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_WeightSum_Rmat, CheckInt64Int64FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_WeightSum_Rmat, CheckInt64Int64FloatTransposeTrue) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_WeightSum_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values(WeightSum_Usecase{false}, WeightSum_Usecase{true}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), + cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_WeightSum_Rmat, + ::testing::Combine( + // enable correctness checks + ::testing::Values(WeightSum_Usecase{false}, WeightSum_Usecase{true}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_WeightSum_Rmat, + ::testing::Combine( + // disable correctness checks for large graphs + ::testing::Values(WeightSum_Usecase{false, false}, WeightSum_Usecase{true, false}), + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_TEST_PROGRAM_MAIN()