From a36eb622973cd977957f03baea339b6114587927 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 5 Jan 2024 18:01:45 -0800 Subject: [PATCH 01/12] update tests to include edge masking --- ..._v_transform_reduce_incoming_outgoing_e.cu | 25 ++++++++++++++++--- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu index 677d6ce5022..a8a273804e4 100644 --- a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu +++ b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu @@ -150,8 +150,9 @@ struct result_compare { }; struct Prims_Usecase { - bool check_correctness{true}; bool test_weighted{false}; + bool edge_masking{false}; + bool check_correctness{true}; }; template @@ -200,6 +201,13 @@ class Tests_MGPerVTransformReduceIncomingOutgoingE auto mg_graph_view = mg_graph.view(); + std::optional> edge_mask{std::nullopt}; + if (prims_usecase.edge_masking) { + edge_mask = + cugraph::test::generate::edge_property(*handle_, mg_graph_view, 2); + mg_graph_view.attach_edge_mask((*edge_mask).view()); + } + // 2. run MG transform reduce const int hash_bin_count = 5; @@ -674,7 +682,10 @@ INSTANTIATE_TEST_SUITE_P( file_test, Tests_MGPerVTransformReduceIncomingOutgoingE_File, ::testing::Combine( - ::testing::Values(Prims_Usecase{true}), + ::testing::Values(Prims_Usecase{false, false, true}, + Prims_Usecase{false, true, true}, + Prims_Usecase{true, false, true}, + Prims_Usecase{true, true, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), @@ -682,7 +693,10 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P(rmat_small_test, Tests_MGPerVTransformReduceIncomingOutgoingE_Rmat, - ::testing::Combine(::testing::Values(Prims_Usecase{true}), + ::testing::Combine(::testing::Values(Prims_Usecase{false, false, true}, + Prims_Usecase{false, true, true}, + Prims_Usecase{true, false, true}, + Prims_Usecase{true, true, true}), ::testing::Values(cugraph::test::Rmat_Usecase( 10, 16, 0.57, 0.19, 0.19, 0, false, false)))); @@ -694,7 +708,10 @@ INSTANTIATE_TEST_SUITE_P( factor (to avoid running same benchmarks more than once) */ Tests_MGPerVTransformReduceIncomingOutgoingE_Rmat, ::testing::Combine( - ::testing::Values(Prims_Usecase{false}), + ::testing::Values(Prims_Usecase{false, false, false}, + Prims_Usecase{false, true, false}, + Prims_Usecase{true, false, false}, + Prims_Usecase{true, true, false}), ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_MG_TEST_PROGRAM_MAIN() From 8bdc17ed731c49ba374cea2c05b48adadad071a1 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sun, 7 Jan 2024 07:48:52 -0800 Subject: [PATCH 02/12] remove unnecessary multiply by 1.0 --- cpp/src/centrality/eigenvector_centrality_impl.cuh | 2 +- cpp/src/link_analysis/pagerank_impl.cuh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/centrality/eigenvector_centrality_impl.cuh b/cpp/src/centrality/eigenvector_centrality_impl.cuh index 8d1bea4004d..40fe2858683 100644 --- a/cpp/src/centrality/eigenvector_centrality_impl.cuh +++ b/cpp/src/centrality/eigenvector_centrality_impl.cuh @@ -117,7 +117,7 @@ rmm::device_uvector eigenvector_centrality( edge_src_centralities.view(), edge_dst_dummy_property_t{}.view(), edge_dummy_property_t{}.view(), - [] __device__(vertex_t, vertex_t, auto src_val, auto, auto) { return src_val * 1.0; }, + [] __device__(vertex_t, vertex_t, auto src_val, auto, auto) { return src_val; }, weight_t{0}, reduce_op::plus{}, centralities.begin()); diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh index 92c70fcff20..6f02390c3bc 100644 --- a/cpp/src/link_analysis/pagerank_impl.cuh +++ b/cpp/src/link_analysis/pagerank_impl.cuh @@ -288,7 +288,7 @@ centrality_algorithm_metadata_t pagerank( edge_dst_dummy_property_t{}.view(), edge_dummy_property_t{}.view(), [alpha] __device__(vertex_t, vertex_t, auto src_val, auto, auto) { - return src_val * 1.0 * alpha; + return src_val * alpha; }, unvarying_part, reduce_op::plus{}, From 9751332e25b8b6c7ef5b364ad28a29dfbd3eb64e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 10 Jan 2024 12:35:43 -0800 Subject: [PATCH 03/12] update to support edge masking --- ...v_transform_reduce_incoming_outgoing_e.cuh | 260 +++++++++++------- 1 file changed, 161 insertions(+), 99 deletions(-) diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index 1a7fc0130c4..cbd1662b37f 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -68,6 +68,7 @@ template edge_partition_e_mask, ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if update_major == true */, + T identity_element /* relevant only if update_major == true */, ReduceOp reduce_op) { static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< @@ -115,22 +118,28 @@ __global__ void per_v_transform_reduce_e_hypersparse( &edge_partition_src_value_input, &edge_partition_dst_value_input, &edge_partition_e_value_input, + &edge_partition_e_mask, &e_op, + identity_element, major, indices, edge_offset] __device__(auto i) { - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - auto src = GraphViewType::is_storage_transposed ? minor : major; - auto dst = GraphViewType::is_storage_transposed ? major : minor; - auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset; - auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset; - return e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + i)); + if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) { + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + auto src = GraphViewType::is_storage_transposed ? minor : major; + auto dst = GraphViewType::is_storage_transposed ? major : minor; + auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset; + auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset; + return e_op(src, + dst, + edge_partition_src_value_input.get(src_offset), + edge_partition_dst_value_input.get(dst_offset), + edge_partition_e_value_input.get(edge_offset + i)); + } else { + return identity_element; + } }; if constexpr (update_major) { @@ -147,22 +156,34 @@ __global__ void per_v_transform_reduce_e_hypersparse( thrust::seq, thrust::make_counting_iterator(edge_t{0}), thrust::make_counting_iterator(local_degree), - [&edge_partition, indices, &result_value_output, &transform_op] __device__(auto i) { - auto e_op_result = transform_op(i); - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + [&edge_partition, + identity_element, + indices, + &result_value_output, + &transform_op] __device__(auto i) { + auto e_op_result = transform_op(i); + if (e_op_result != identity_element) { + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } }); } else { thrust::for_each( thrust::seq, thrust::make_counting_iterator(edge_t{0}), thrust::make_counting_iterator(local_degree), - [&edge_partition, indices, &result_value_output, &transform_op] __device__(auto i) { - auto e_op_result = transform_op(i); - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + [&edge_partition, + identity_element, + indices, + &result_value_output, + &transform_op] __device__(auto i) { + auto e_op_result = transform_op(i); + if (e_op_result != identity_element) { + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } }); } } @@ -175,6 +196,7 @@ template edge_partition_e_mask, ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if update_major == true */, + T identity_element /* relevant only if update_major == true */, ReduceOp reduce_op) { static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< @@ -219,27 +243,33 @@ __global__ void per_v_transform_reduce_e_low_degree( &edge_partition_src_value_input, &edge_partition_dst_value_input, &edge_partition_e_value_input, + &edge_partition_e_mask, &e_op, + identity_element, major_offset, indices, edge_offset] __device__(auto i) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - auto src = GraphViewType::is_storage_transposed - ? minor - : edge_partition.major_from_major_offset_nocheck(major_offset); - auto dst = GraphViewType::is_storage_transposed - ? edge_partition.major_from_major_offset_nocheck(major_offset) - : minor; - auto src_offset = - GraphViewType::is_storage_transposed ? minor_offset : static_cast(major_offset); - auto dst_offset = - GraphViewType::is_storage_transposed ? static_cast(major_offset) : minor_offset; - return e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + i)); + if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) { + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + auto src = GraphViewType::is_storage_transposed + ? minor + : edge_partition.major_from_major_offset_nocheck(major_offset); + auto dst = GraphViewType::is_storage_transposed + ? edge_partition.major_from_major_offset_nocheck(major_offset) + : minor; + auto src_offset = + GraphViewType::is_storage_transposed ? minor_offset : static_cast(major_offset); + auto dst_offset = + GraphViewType::is_storage_transposed ? static_cast(major_offset) : minor_offset; + return e_op(src, + dst, + edge_partition_src_value_input.get(src_offset), + edge_partition_dst_value_input.get(dst_offset), + edge_partition_e_value_input.get(edge_offset + i)); + } else { + return identity_element; + } }; if constexpr (update_major) { @@ -256,22 +286,34 @@ __global__ void per_v_transform_reduce_e_low_degree( thrust::seq, thrust::make_counting_iterator(edge_t{0}), thrust::make_counting_iterator(local_degree), - [&edge_partition, indices, &result_value_output, &transform_op] __device__(auto i) { - auto e_op_result = transform_op(i); - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + [&edge_partition, + identity_element, + indices, + &result_value_output, + &transform_op] __device__(auto i) { + auto e_op_result = transform_op(i); + if (e_op_result != identity_element) { + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } }); } else { thrust::for_each( thrust::seq, thrust::make_counting_iterator(edge_t{0}), thrust::make_counting_iterator(local_degree), - [&edge_partition, indices, &result_value_output, &transform_op] __device__(auto i) { - auto e_op_result = transform_op(i); - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + [&edge_partition, + identity_element, + indices, + &result_value_output, + &transform_op] __device__(auto i) { + auto e_op_result = transform_op(i); + if (e_op_result != identity_element) { + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } }); } } @@ -284,6 +326,7 @@ template edge_partition_e_mask, ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if update_major == true */, @@ -335,30 +379,32 @@ __global__ void per_v_transform_reduce_e_mid_degree( [[maybe_unused]] auto reduced_e_op_result = lane_id == 0 ? init : identity_element; // relevant only if update_major == true for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - auto src = GraphViewType::is_storage_transposed - ? minor - : edge_partition.major_from_major_offset_nocheck(major_offset); - auto dst = GraphViewType::is_storage_transposed - ? edge_partition.major_from_major_offset_nocheck(major_offset) - : minor; - auto src_offset = - GraphViewType::is_storage_transposed ? minor_offset : static_cast(major_offset); - auto dst_offset = - GraphViewType::is_storage_transposed ? static_cast(major_offset) : minor_offset; - auto e_op_result = e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + i)); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) { + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + auto src = GraphViewType::is_storage_transposed + ? minor + : edge_partition.major_from_major_offset_nocheck(major_offset); + auto dst = GraphViewType::is_storage_transposed + ? edge_partition.major_from_major_offset_nocheck(major_offset) + : minor; + auto src_offset = + GraphViewType::is_storage_transposed ? minor_offset : static_cast(major_offset); + auto dst_offset = + GraphViewType::is_storage_transposed ? static_cast(major_offset) : minor_offset; + auto e_op_result = e_op(src, + dst, + edge_partition_src_value_input.get(src_offset), + edge_partition_dst_value_input.get(dst_offset), + edge_partition_e_value_input.get(edge_offset + i)); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } } } } @@ -377,6 +423,7 @@ template edge_partition_e_mask, ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if update_major == true */, @@ -424,30 +472,32 @@ __global__ void per_v_transform_reduce_e_high_degree( [[maybe_unused]] auto reduced_e_op_result = threadIdx.x == 0 ? init : identity_element; // relevant only if update_major == true for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - auto src = GraphViewType::is_storage_transposed - ? minor - : edge_partition.major_from_major_offset_nocheck(major_offset); - auto dst = GraphViewType::is_storage_transposed - ? edge_partition.major_from_major_offset_nocheck(major_offset) - : minor; - auto src_offset = - GraphViewType::is_storage_transposed ? minor_offset : static_cast(major_offset); - auto dst_offset = - GraphViewType::is_storage_transposed ? static_cast(major_offset) : minor_offset; - auto e_op_result = e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + i)); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) { + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + auto src = GraphViewType::is_storage_transposed + ? minor + : edge_partition.major_from_major_offset_nocheck(major_offset); + auto dst = GraphViewType::is_storage_transposed + ? edge_partition.major_from_major_offset_nocheck(major_offset) + : minor; + auto src_offset = + GraphViewType::is_storage_transposed ? minor_offset : static_cast(major_offset); + auto dst_offset = + GraphViewType::is_storage_transposed ? static_cast(major_offset) : minor_offset; + auto e_op_result = e_op(src, + dst, + edge_partition_src_value_input.get(src_offset), + edge_partition_dst_value_input.get(dst_offset), + edge_partition_e_value_input.get(edge_offset + i)); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } } } } @@ -656,10 +706,18 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if (stream_pool_indices) { handle.sync_stream(); } + auto edge_mask_view = graph_view.edge_mask_view(); + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { auto edge_partition = edge_partition_device_view_t( graph_view.local_edge_partition_view(i)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, i) + : thrust::nullopt; auto major_init = ReduceOp::identity_element; if constexpr (update_major) { @@ -737,9 +795,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, segment_output_buffer, e_op, major_init, + ReduceOp::identity_element, reduce_op); } } @@ -761,9 +821,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, segment_output_buffer, e_op, major_init, + ReduceOp::identity_element, reduce_op); } if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { @@ -784,6 +846,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, segment_output_buffer, e_op, major_init, @@ -806,6 +869,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, output_buffer, e_op, major_init, @@ -825,9 +889,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, output_buffer, e_op, major_init, + ReduceOp::identity_element, reduce_op); } } @@ -1056,8 +1122,6 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, VertexValueOutputIterator vertex_value_output_first, bool do_expensive_check = false) { - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - if (do_expensive_check) { // currently, nothing to do } @@ -1137,8 +1201,6 @@ void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle, VertexValueOutputIterator vertex_value_output_first, bool do_expensive_check = false) { - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - if (do_expensive_check) { // currently, nothing to do } From 3782e1b204ba659a6b7083a398f080e476921643 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 23 Jan 2024 10:49:37 -0800 Subject: [PATCH 04/12] rename call_e_op_t to transform_reduce_v_froniter_call_e_op_t to avoid name collision --- ...sform_reduce_v_frontier_outgoing_e_by_dst.cuh | 16 ++++++++-------- cpp/src/traversal/od_shortest_distances_impl.cuh | 15 ++++++++------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh index 745f1a8fd8e..24ade16bbfb 100644 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh @@ -80,7 +80,7 @@ template -struct call_e_op_t { +struct transform_reduce_v_frontier_call_e_op_t { EdgeOp e_op{}; __device__ thrust::optional< @@ -331,13 +331,13 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, // 1. fill the buffer - detail::call_e_op_t + detail::transform_reduce_v_frontier_call_e_op_t e_op_wrapper{e_op}; auto [key_buffer, payload_buffer] = diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index cc69cb5f67f..1629790794e 100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -639,13 +639,14 @@ rmm::device_uvector od_shortest_distances( static_cast(origins.size()), cutoff, invalid_distance}; - detail::call_e_op_t, - weight_t, - vertex_t, - thrust::nullopt_t, - thrust::nullopt_t, - weight_t, - e_op_t> + detail::transform_reduce_v_frontier_call_e_op_t< + thrust::tuple, + weight_t, + vertex_t, + thrust::nullopt_t, + thrust::nullopt_t, + weight_t, + e_op_t> e_op_wrapper{e_op}; auto new_frontier_tagged_vertex_buffer = From 23874af532fc3eb10f44bc19e9a29e3608f191ec Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 24 Jan 2024 09:34:37 -0800 Subject: [PATCH 05/12] fewer checking for edge_partition_e_mask.has_value() if edge masking is disabled --- ...v_transform_reduce_incoming_outgoing_e.cuh | 454 +++++++++++------- 1 file changed, 268 insertions(+), 186 deletions(-) diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index cbd1662b37f..3febc2876cc 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -63,6 +63,107 @@ namespace detail { int32_t constexpr per_v_transform_reduce_e_kernel_block_size = 512; +template +struct per_v_transform_reduce_call_e_op_t { + edge_partition_device_view_t + edge_partition{}; + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input{}; + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input{}; + EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input{}; + EdgeOp e_op{}; + typename GraphViewType::vertex_type major{}; + typename GraphViewType::vertex_type major_offset{}; + typename GraphViewType::vertex_type const* indices{nullptr}; + typename GraphViewType::edge_type edge_offset{}; + + __device__ auto operator()(typename GraphViewType::edge_type i) const + { + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + auto src = GraphViewType::is_storage_transposed ? minor : major; + auto dst = GraphViewType::is_storage_transposed ? major : minor; + auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset; + auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset; + return e_op(src, + dst, + edge_partition_src_value_input.get(src_offset), + edge_partition_dst_value_input.get(dst_offset), + edge_partition_e_value_input.get(edge_offset + i)); + } +}; + +template +__device__ void update_result_value_output( + edge_partition_device_view_t edge_partition, + vertex_t const* indices, + edge_t local_degree, + TransformOp transform_op, + result_t init, + ReduceOp reduce_op, + size_t output_idx /* relevent only when update_major === true */, + result_t identity_element, + ResultValueOutputIteratorOrWrapper result_value_output) +{ + if constexpr (update_major) { + *(result_value_output + output_idx) = + thrust::transform_reduce(thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + transform_op, + init, + reduce_op); + } else { + if constexpr (multi_gpu) { + thrust::for_each( + thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + [&edge_partition, + identity_element, + indices, + &result_value_output, + &transform_op] __device__(auto i) { + auto e_op_result = transform_op(i); + if (e_op_result != identity_element) { + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } + }); + } else { + thrust::for_each( + thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + [&edge_partition, + identity_element, + indices, + &result_value_output, + &transform_op] __device__(auto i) { + auto e_op_result = transform_op(i); + if (e_op_result != identity_element) { + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + }); + } + } +} + template (dcs_nzd_vertex_count)) { auto major = *(edge_partition.major_from_major_hypersparse_idx_nocheck(static_cast(idx))); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); auto major_idx = major_start_offset + idx; // major_offset != major_idx in the hypersparse region vertex_t const* indices{nullptr}; @@ -114,78 +216,52 @@ __global__ void per_v_transform_reduce_e_hypersparse( edge_t local_degree{}; thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(static_cast(major_idx)); - auto transform_op = [&edge_partition, - &edge_partition_src_value_input, - &edge_partition_dst_value_input, - &edge_partition_e_value_input, - &edge_partition_e_mask, - &e_op, - identity_element, - major, - indices, - edge_offset] __device__(auto i) { - if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) { - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - auto src = GraphViewType::is_storage_transposed ? minor : major; - auto dst = GraphViewType::is_storage_transposed ? major : minor; - auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset; - auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset; - return e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + i)); - } else { - return identity_element; - } - }; - if constexpr (update_major) { - *(result_value_output + (major - *(edge_partition.major_hypersparse_first()))) = - thrust::transform_reduce(thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - transform_op, - init, - reduce_op); + auto call_e_op = per_v_transform_reduce_call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + major, + major_offset, + indices, + edge_offset}; + + if (edge_partition_e_mask) { + auto transform_op = + [&edge_partition_e_mask, &call_e_op, identity_element, edge_offset] __device__(auto i) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + return call_e_op(i); + } else { + return identity_element; + } + }; + + update_result_value_output(edge_partition, + indices, + local_degree, + transform_op, + init, + reduce_op, + major - *(edge_partition).major_hypersparse_first(), + identity_element, + result_value_output); } else { - if constexpr (GraphViewType::is_multi_gpu) { - thrust::for_each( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - [&edge_partition, - identity_element, - indices, - &result_value_output, - &transform_op] __device__(auto i) { - auto e_op_result = transform_op(i); - if (e_op_result != identity_element) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } - }); - } else { - thrust::for_each( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - [&edge_partition, - identity_element, - indices, - &result_value_output, - &transform_op] __device__(auto i) { - auto e_op_result = transform_op(i); - if (e_op_result != identity_element) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - }); - } + auto transform_op = [&call_e_op] __device__(auto i) { return call_e_op(i); }; + + update_result_value_output(edge_partition, + indices, + local_degree, + transform_op, + init, + reduce_op, + major - *(edge_partition).major_hypersparse_first(), + identity_element, + result_value_output); } idx += gridDim.x * blockDim.x; } @@ -233,89 +309,59 @@ __global__ void per_v_transform_reduce_e_low_degree( auto idx = static_cast(tid); while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = major_start_offset + idx; + auto major_offset = static_cast(major_start_offset + idx); + auto major = edge_partition.major_from_major_offset_nocheck(major_offset); vertex_t const* indices{nullptr}; edge_t edge_offset{}; edge_t local_degree{}; thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(static_cast(major_offset)); - auto transform_op = [&edge_partition, - &edge_partition_src_value_input, - &edge_partition_dst_value_input, - &edge_partition_e_value_input, - &edge_partition_e_mask, - &e_op, - identity_element, - major_offset, - indices, - edge_offset] __device__(auto i) { - if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - auto src = GraphViewType::is_storage_transposed - ? minor - : edge_partition.major_from_major_offset_nocheck(major_offset); - auto dst = GraphViewType::is_storage_transposed - ? edge_partition.major_from_major_offset_nocheck(major_offset) - : minor; - auto src_offset = - GraphViewType::is_storage_transposed ? minor_offset : static_cast(major_offset); - auto dst_offset = - GraphViewType::is_storage_transposed ? static_cast(major_offset) : minor_offset; - return e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + i)); - } else { - return identity_element; - } - }; - if constexpr (update_major) { - *(result_value_output + idx) = - thrust::transform_reduce(thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - transform_op, - init, - reduce_op); + auto call_e_op = per_v_transform_reduce_call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + major, + major_offset, + indices, + edge_offset}; + + if (edge_partition_e_mask) { + auto transform_op = + [&edge_partition_e_mask, &call_e_op, identity_element, edge_offset] __device__(auto i) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + return call_e_op(i); + } else { + return identity_element; + } + }; + + update_result_value_output(edge_partition, + indices, + local_degree, + transform_op, + init, + reduce_op, + idx, + identity_element, + result_value_output); } else { - if constexpr (GraphViewType::is_multi_gpu) { - thrust::for_each( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - [&edge_partition, - identity_element, - indices, - &result_value_output, - &transform_op] __device__(auto i) { - auto e_op_result = transform_op(i); - if (e_op_result != identity_element) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } - }); - } else { - thrust::for_each( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - [&edge_partition, - identity_element, - indices, - &result_value_output, - &transform_op] __device__(auto i) { - auto e_op_result = transform_op(i); - if (e_op_result != identity_element) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - }); - } + auto transform_op = [&call_e_op] __device__(auto i) { return call_e_op(i); }; + + update_result_value_output(edge_partition, + indices, + local_degree, + transform_op, + init, + reduce_op, + idx, + identity_element, + result_value_output); } idx += gridDim.x * blockDim.x; } @@ -371,35 +417,52 @@ __global__ void per_v_transform_reduce_e_mid_degree( raft::warp_size()]; // relevant only if update_major == true while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = major_start_offset + idx; + auto major_offset = static_cast(major_start_offset + idx); + auto major = edge_partition.major_from_major_offset_nocheck(major_offset); vertex_t const* indices{nullptr}; edge_t edge_offset{}; edge_t local_degree{}; thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); + + auto call_e_op = per_v_transform_reduce_call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + major, + major_offset, + indices, + edge_offset}; + [[maybe_unused]] auto reduced_e_op_result = lane_id == 0 ? init : identity_element; // relevant only if update_major == true - for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - auto src = GraphViewType::is_storage_transposed - ? minor - : edge_partition.major_from_major_offset_nocheck(major_offset); - auto dst = GraphViewType::is_storage_transposed - ? edge_partition.major_from_major_offset_nocheck(major_offset) - : minor; - auto src_offset = - GraphViewType::is_storage_transposed ? minor_offset : static_cast(major_offset); - auto dst_offset = - GraphViewType::is_storage_transposed ? static_cast(major_offset) : minor_offset; - auto e_op_result = e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + i)); + if (edge_partition_e_mask) { + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } else { + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { + auto e_op_result = call_e_op(i); if constexpr (update_major) { reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); if constexpr (GraphViewType::is_multi_gpu) { reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); } else { @@ -408,6 +471,7 @@ __global__ void per_v_transform_reduce_e_mid_degree( } } } + if constexpr (update_major) { reduced_e_op_result = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) .Reduce(reduced_e_op_result, reduce_op); @@ -464,35 +528,52 @@ __global__ void per_v_transform_reduce_e_high_degree( typename BlockReduce::TempStorage temp_storage; // relevant only if update_major == true while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = major_start_offset + idx; + auto major_offset = static_cast(major_start_offset + idx); + auto major = edge_partition.major_from_major_offset_nocheck(major_offset); vertex_t const* indices{nullptr}; edge_t edge_offset{}; edge_t local_degree{}; thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); + + auto call_e_op = per_v_transform_reduce_call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + major, + major_offset, + indices, + edge_offset}; + [[maybe_unused]] auto reduced_e_op_result = threadIdx.x == 0 ? init : identity_element; // relevant only if update_major == true - for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - auto src = GraphViewType::is_storage_transposed - ? minor - : edge_partition.major_from_major_offset_nocheck(major_offset); - auto dst = GraphViewType::is_storage_transposed - ? edge_partition.major_from_major_offset_nocheck(major_offset) - : minor; - auto src_offset = - GraphViewType::is_storage_transposed ? minor_offset : static_cast(major_offset); - auto dst_offset = - GraphViewType::is_storage_transposed ? static_cast(major_offset) : minor_offset; - auto e_op_result = e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + i)); + if (edge_partition_e_mask) { + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } else { + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + auto e_op_result = call_e_op(i); if constexpr (update_major) { reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); if constexpr (GraphViewType::is_multi_gpu) { reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); } else { @@ -501,6 +582,7 @@ __global__ void per_v_transform_reduce_e_high_degree( } } } + if constexpr (update_major) { reduced_e_op_result = BlockReduce(temp_storage).Reduce(reduced_e_op_result, reduce_op); if (threadIdx.x == 0) { *(result_value_output + idx) = reduced_e_op_result; } From 65104df7b2cd35d33ad1e02d74deb0ec166f1029 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 24 Jan 2024 11:14:31 -0800 Subject: [PATCH 06/12] copyright year --- cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh | 2 +- .../prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh | 2 +- cpp/src/traversal/od_shortest_distances_impl.cuh | 2 +- .../prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu | 3 ++- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index 3febc2876cc..9b51d0ac15d 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh index 24ade16bbfb..18e722d62cc 100644 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index 1629790794e..58fae83bca0 100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu index a8a273804e4..0ed697b6a0e 100644 --- a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu +++ b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -200,6 +200,7 @@ class Tests_MGPerVTransformReduceIncomingOutgoingE } auto mg_graph_view = mg_graph.view(); + std::cout << "test_weighted=" << prims_usecase.test_weighted << " edge_masking=" << prims_usecase.edge_masking << std::endl; std::optional> edge_mask{std::nullopt}; if (prims_usecase.edge_masking) { From 55927e9cb91f84044a5ae2d8bdb6f39f2b518fca Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 24 Jan 2024 12:08:11 -0800 Subject: [PATCH 07/12] reduce code repetition --- ...v_transform_reduce_incoming_outgoing_e.cuh | 76 ++++++++++--------- 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index 9b51d0ac15d..b65f294b629 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -98,6 +98,35 @@ struct per_v_transform_reduce_call_e_op_t { } }; +template +struct transform_and_atomic_reduce_t { + edge_partition_device_view_t edge_partition{}; + result_t identity_element{}; + vertex_t const* indices{nullptr}; + TransformOp transform_op{}; + ResultValueOutputIteratorOrWrapper result_value_output{}; + + __device__ void operator()(edge_t i) const + { + auto e_op_result = transform_op(i); + if (e_op_result != identity_element) { + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + if constexpr (multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } +}; + template (result_value_output, minor_offset, e_op_result); - } - }); - } else { - thrust::for_each( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - [&edge_partition, - identity_element, - indices, - &result_value_output, - &transform_op] __device__(auto i) { - auto e_op_result = transform_op(i); - if (e_op_result != identity_element) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - }); - } + thrust::for_each( + thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + transform_and_atomic_reduce_t{ + edge_partition, identity_element, indices, transform_op, result_value_output}); } } From 58799d9ffc274b058bd22db5a7431936027d3de5 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 24 Jan 2024 14:31:22 -0800 Subject: [PATCH 08/12] remove debug printout --- cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu index 0ed697b6a0e..fc8114a4652 100644 --- a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu +++ b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu @@ -200,7 +200,6 @@ class Tests_MGPerVTransformReduceIncomingOutgoingE } auto mg_graph_view = mg_graph.view(); - std::cout << "test_weighted=" << prims_usecase.test_weighted << " edge_masking=" << prims_usecase.edge_masking << std::endl; std::optional> edge_mask{std::nullopt}; if (prims_usecase.edge_masking) { From a421a765d4b5327cc6646e73cf6a0b48e35136f8 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 24 Jan 2024 14:31:43 -0800 Subject: [PATCH 09/12] get large value by reference --- ...v_transform_reduce_incoming_outgoing_e.cuh | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index b65f294b629..a19df920843 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -71,12 +71,11 @@ template - edge_partition{}; - EdgePartitionSrcValueInputWrapper edge_partition_src_value_input{}; - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input{}; - EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input{}; - EdgeOp e_op{}; + GraphViewType::is_multi_gpu> const& edge_partition{}; + EdgePartitionSrcValueInputWrapper const& edge_partition_src_value_input{}; + EdgePartitionDstValueInputWrapper const& edge_partition_dst_value_input{}; + EdgePartitionEdgeValueInputWrapper const& edge_partition_e_value_input{}; + EdgeOp const& e_op{}; typename GraphViewType::vertex_type major{}; typename GraphViewType::vertex_type major_offset{}; typename GraphViewType::vertex_type const* indices{nullptr}; @@ -106,11 +105,11 @@ template struct transform_and_atomic_reduce_t { - edge_partition_device_view_t edge_partition{}; + edge_partition_device_view_t const& edge_partition{}; result_t identity_element{}; vertex_t const* indices{nullptr}; - TransformOp transform_op{}; - ResultValueOutputIteratorOrWrapper result_value_output{}; + TransformOp const& transform_op{}; + ResultValueOutputIteratorOrWrapper& result_value_output{}; __device__ void operator()(edge_t i) const { @@ -136,15 +135,15 @@ template __device__ void update_result_value_output( - edge_partition_device_view_t edge_partition, + edge_partition_device_view_t const& edge_partition, vertex_t const* indices, edge_t local_degree, - TransformOp transform_op, + TransformOp const& transform_op, result_t init, - ReduceOp reduce_op, + ReduceOp const& reduce_op, size_t output_idx /* relevent only when update_major === true */, result_t identity_element, - ResultValueOutputIteratorOrWrapper result_value_output) + ResultValueOutputIteratorOrWrapper& result_value_output) { if constexpr (update_major) { *(result_value_output + output_idx) = From a413978d5f681c0df0539861fe78abefd1d982c8 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 24 Jan 2024 14:59:48 -0800 Subject: [PATCH 10/12] copyright year --- cpp/src/centrality/eigenvector_centrality_impl.cuh | 2 +- cpp/src/link_analysis/pagerank_impl.cuh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/centrality/eigenvector_centrality_impl.cuh b/cpp/src/centrality/eigenvector_centrality_impl.cuh index 40fe2858683..2129dca6985 100644 --- a/cpp/src/centrality/eigenvector_centrality_impl.cuh +++ b/cpp/src/centrality/eigenvector_centrality_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh index 6f02390c3bc..9a76ba73f92 100644 --- a/cpp/src/link_analysis/pagerank_impl.cuh +++ b/cpp/src/link_analysis/pagerank_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From b2308d67461c45d4fbe7631b0f35db491020f843 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 24 Jan 2024 15:55:45 -0800 Subject: [PATCH 11/12] code refactor for re-use for other primitives --- cpp/src/prims/detail/prim_functors.cuh | 60 ++++++++ ...v_transform_reduce_incoming_outgoing_e.cuh | 139 +++++++----------- 2 files changed, 113 insertions(+), 86 deletions(-) create mode 100644 cpp/src/prims/detail/prim_functors.cuh diff --git a/cpp/src/prims/detail/prim_functors.cuh b/cpp/src/prims/detail/prim_functors.cuh new file mode 100644 index 00000000000..2785ba38dfd --- /dev/null +++ b/cpp/src/prims/detail/prim_functors.cuh @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace cugraph { + +namespace detail { + +template +struct call_e_op_t { + edge_partition_device_view_t const& edge_partition{}; + EdgePartitionSrcValueInputWrapper const& edge_partition_src_value_input{}; + EdgePartitionDstValueInputWrapper const& edge_partition_dst_value_input{}; + EdgePartitionEdgeValueInputWrapper const& edge_partition_e_value_input{}; + EdgeOp const& e_op{}; + typename GraphViewType::vertex_type major{}; + typename GraphViewType::vertex_type major_offset{}; + typename GraphViewType::vertex_type const* indices{nullptr}; + typename GraphViewType::edge_type edge_offset{}; + + __device__ auto operator()(typename GraphViewType::edge_type i) const + { + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + auto src = GraphViewType::is_storage_transposed ? minor : major; + auto dst = GraphViewType::is_storage_transposed ? major : minor; + auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset; + auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset; + return e_op(src, + dst, + edge_partition_src_value_input.get(src_offset), + edge_partition_dst_value_input.get(dst_offset), + edge_partition_e_value_input.get(edge_offset + i)); + } +}; + +} // namespace detail + +} // namespace cugraph diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index a19df920843..89ae892f5bc 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -63,40 +64,6 @@ namespace detail { int32_t constexpr per_v_transform_reduce_e_kernel_block_size = 512; -template -struct per_v_transform_reduce_call_e_op_t { - edge_partition_device_view_t const& edge_partition{}; - EdgePartitionSrcValueInputWrapper const& edge_partition_src_value_input{}; - EdgePartitionDstValueInputWrapper const& edge_partition_dst_value_input{}; - EdgePartitionEdgeValueInputWrapper const& edge_partition_e_value_input{}; - EdgeOp const& e_op{}; - typename GraphViewType::vertex_type major{}; - typename GraphViewType::vertex_type major_offset{}; - typename GraphViewType::vertex_type const* indices{nullptr}; - typename GraphViewType::edge_type edge_offset{}; - - __device__ auto operator()(typename GraphViewType::edge_type i) const - { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - auto src = GraphViewType::is_storage_transposed ? minor : major; - auto dst = GraphViewType::is_storage_transposed ? major : minor; - auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset; - auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset; - return e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + i)); - } -}; - template (major_idx)); - auto call_e_op = per_v_transform_reduce_call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - major, - major_offset, - indices, - edge_offset}; + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + major, + major_offset, + indices, + edge_offset}; if (edge_partition_e_mask) { auto transform_op = @@ -322,19 +289,19 @@ __global__ void per_v_transform_reduce_e_low_degree( thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(static_cast(major_offset)); - auto call_e_op = per_v_transform_reduce_call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - major, - major_offset, - indices, - edge_offset}; + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + major, + major_offset, + indices, + edge_offset}; if (edge_partition_e_mask) { auto transform_op = @@ -429,19 +396,19 @@ __global__ void per_v_transform_reduce_e_mid_degree( edge_t local_degree{}; thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); - auto call_e_op = per_v_transform_reduce_call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - major, - major_offset, - indices, - edge_offset}; + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + major, + major_offset, + indices, + edge_offset}; [[maybe_unused]] auto reduced_e_op_result = lane_id == 0 ? init : identity_element; // relevant only if update_major == true @@ -540,19 +507,19 @@ __global__ void per_v_transform_reduce_e_high_degree( edge_t local_degree{}; thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); - auto call_e_op = per_v_transform_reduce_call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - major, - major_offset, - indices, - edge_offset}; + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + major, + major_offset, + indices, + edge_offset}; [[maybe_unused]] auto reduced_e_op_result = threadIdx.x == 0 ? init : identity_element; // relevant only if update_major == true From d6653dbed81e6830379eb6b41f7da648e4b15cdc Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 24 Jan 2024 16:15:36 -0800 Subject: [PATCH 12/12] code cleanup --- .../prims/per_v_transform_reduce_incoming_outgoing_e.cuh | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index 89ae892f5bc..24b4f0857b1 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -223,12 +223,10 @@ __global__ void per_v_transform_reduce_e_hypersparse( identity_element, result_value_output); } else { - auto transform_op = [&call_e_op] __device__(auto i) { return call_e_op(i); }; - update_result_value_output(edge_partition, indices, local_degree, - transform_op, + call_e_op, init, reduce_op, major - *(edge_partition).major_hypersparse_first(), @@ -323,12 +321,10 @@ __global__ void per_v_transform_reduce_e_low_degree( identity_element, result_value_output); } else { - auto transform_op = [&call_e_op] __device__(auto i) { return call_e_op(i); }; - update_result_value_output(edge_partition, indices, local_degree, - transform_op, + call_e_op, init, reduce_op, idx,