From d71424346f7b10a507b2c7abffaaf288dafa4b0b Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com>
Date: Mon, 25 Nov 2024 08:11:02 -0800
Subject: [PATCH] Primitives & BFS performance improvements (#4751)

This PR includes multiple updates to cut peak memory usage in graph creation
and to improve BFS performance on scale-free graphs.

* Add a bitmap for vertices with non-zero local degree in the hypersparse
  region; this information can be used to quickly filter out locally
  zero-degree vertices (which don't need to be processed) in multiple
  instances.
* Store (global-)degree offsets for vertices in the hypersparse region; this
  information can be used to quickly identify the vertices with a certain
  global degree (e.g. for global degree 1 vertices, we can skip inter-GPU
  reduction, as we know each such vertex has only one neighbor).
* Skip kernel invocations when computing edge counts if the vertex list is
  empty.
* Add asynchronous functions to compute edge counts. This helps prevent
  unnecessary serialization when multiple such computations can proceed
  concurrently.
* Replace rmm::exec_policy with rmm::exec_policy_nosync in multiple places;
  the former enforces stream synchronization at the end, the latter does not.
* Enforce cache line alignment in NCCL communication in multiple places (NCCL
  communication performance is significantly affected by cache line alignment,
  often making a difference of 30-40% or more).
* For primitives working on a subset of vertices, broadcast the vertex list
  using a bitmap if the vertex frontier is large. If the vertex frontier is
  small (and vertex_t is 8 bytes while the local vertex partition range fits
  into 4 bytes), send vertex offsets instead of vertices to cut communication
  volume.
* Merge multiple host scalar communication function calls into a single one.
* Increase multi-stream concurrency in detail::extract_transform_e &
  detail::per_v_transform_reduce_e.
* Multiple optimizations in the template specialization (for update_major ==
  true && reduce_op == any && key type is vertex && working on a subset of
  vertices) of detail::per_v_transform_reduce_e: pre-process vertices with
  non-zero local degrees so such vertices don't need to be processed on
  multiple GPUs, pre-filter zero local degree vertices, use allreduce
  communication to reduce shuffle communication volumes, treat global degree 1
  vertices specially, and so on.
* Multiple optimizations & specializations in detail::fill_edge_minor_property
  when working on a subset of vertices: kernel fusion, a specialization for
  bitmap properties (including direct broadcast to the property buffer and
  special treatment of vertex partition boundaries), and so on.
* Multiple optimizations & specializations in
  transform_reduce_v_frontier_outgoing_e (especially for reduce_op::any), to
  cut communication volumes and to filter out (key, value) pairs that won't
  contribute to the final results.
* Multiple low-level optimizations in direction-optimizing BFS (including
  approximations in deciding between bottom-up and top-down).
* Multiple optimizations to cut peak memory usage in graph creation.
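To illustrate the cache-line-alignment point above, the sketch below (a
minimal, hypothetical helper, not code from this PR; compute_rx_displacements
is an invented name) shows how per-rank receive displacements can be padded so
that each rank's aligned data segment starts on an alignment boundary,
mirroring the idea behind the aligned shuffle_values overload this PR adds to
cpp/include/cugraph/utilities/shuffle_comm.cuh:

    #include <cstddef>
    #include <vector>

    // rx_unaligned_counts[i]: # leading elements from rank i that sit before the
    // first alignment boundary in the sender's buffer (always < alignment);
    // rx_aligned_counts[i]: the remaining elements. Returns the receive
    // displacement (in elements) for each rank, inserting gaps so that the
    // aligned segment of every rank starts at a multiple of alignment.
    std::vector<size_t> compute_rx_displacements(
      std::vector<size_t> const& rx_unaligned_counts,
      std::vector<size_t> const& rx_aligned_counts,
      size_t alignment /* # elements, e.g. cache line size / element size */)
    {
      std::vector<size_t> rx_displacements(rx_unaligned_counts.size());
      size_t offset{0};
      for (size_t i = 0; i < rx_unaligned_counts.size(); ++i) {
        // the unaligned prefix must end exactly at an alignment boundary, so
        // rank i's displacement should be congruent to target modulo alignment
        auto target = (alignment - (rx_unaligned_counts[i] % alignment)) % alignment;
        auto cur    = offset % alignment;
        offset += (target >= cur) ? (target - cur) : (target + alignment - cur);
        rx_displacements[i] = offset;
        offset += rx_unaligned_counts[i] + rx_aligned_counts[i];
      }
      return rx_displacements;
    }

With this placement, send and receive offsets share the same alignment for
every rank, so NCCL transfers of the (bulk) aligned segments stay cache-line
aligned at both ends, at the cost of small gaps in the receive buffer.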
Authors: - Seunghwa Kang (https://github.com/seunghwak) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) URL: https://github.com/rapidsai/cugraph/pull/4751 --- .../cugraph/edge_partition_device_view.cuh | 139 +- cpp/include/cugraph/edge_partition_view.hpp | 10 +- cpp/include/cugraph/graph.hpp | 24 +- cpp/include/cugraph/graph_functions.hpp | 41 +- cpp/include/cugraph/graph_view.hpp | 73 +- cpp/include/cugraph/partition_manager.hpp | 26 +- .../cugraph/utilities/dataframe_buffer.hpp | 73 +- cpp/include/cugraph/utilities/device_comm.hpp | 124 +- cpp/include/cugraph/utilities/misc_utils.cuh | 2 +- .../cugraph/utilities/shuffle_comm.cuh | 289 +- .../cugraph/utilities/thrust_tuple_utils.hpp | 26 + .../betweenness_centrality_impl.cuh | 20 +- .../approx_weighted_matching_impl.cuh | 5 +- cpp/src/community/detail/common_methods.cuh | 5 +- cpp/src/community/detail/refine_impl.cuh | 15 +- .../weakly_connected_components_impl.cuh | 39 +- cpp/src/cores/core_number_impl.cuh | 19 +- cpp/src/lookup/lookup_src_dst_impl.cuh | 5 +- .../detail/extract_transform_v_frontier_e.cuh | 1549 ++++-- cpp/src/prims/detail/multi_stream_utils.cuh | 156 + .../detail/optional_dataframe_buffer.hpp | 174 +- .../prims/detail/per_v_transform_reduce_e.cuh | 4374 +++++++++++++++++ cpp/src/prims/detail/prim_functors.cuh | 22 + cpp/src/prims/detail/prim_utils.cuh | 104 + cpp/src/prims/extract_transform_e.cuh | 4 +- ...xtract_transform_v_frontier_outgoing_e.cuh | 24 +- cpp/src/prims/fill_edge_src_dst_property.cuh | 859 +++- ..._v_pair_transform_dst_nbr_intersection.cuh | 95 +- ...r_v_random_select_transform_outgoing_e.cuh | 162 +- ...m_reduce_dst_key_aggregated_outgoing_e.cuh | 5 +- ...ransform_reduce_if_incoming_outgoing_e.cuh | 421 ++ ...v_transform_reduce_incoming_outgoing_e.cuh | 1336 +---- ...rm_reduce_v_frontier_outgoing_e_by_dst.cuh | 1196 +++++ ...educe_v_frontier_outgoing_e_by_src_dst.cuh | 585 --- .../prims/update_edge_src_dst_property.cuh | 318 +- cpp/src/prims/vertex_frontier.cuh | 248 +- .../create_graph_from_edgelist_impl.cuh | 651 ++- cpp/src/structure/detail/structure_utils.cuh | 136 +- cpp/src/structure/graph_impl.cuh | 233 +- cpp/src/structure/graph_view_impl.cuh | 7 +- cpp/src/structure/induced_subgraph_impl.cuh | 13 - cpp/src/structure/renumber_edgelist_impl.cuh | 761 +-- cpp/src/structure/renumber_utils_impl.cuh | 29 +- cpp/src/traversal/bfs_impl.cuh | 582 ++- cpp/src/traversal/extract_bfs_paths_impl.cuh | 14 +- cpp/src/traversal/k_hop_nbrs_impl.cuh | 20 +- .../traversal/od_shortest_distances_impl.cuh | 15 +- cpp/src/traversal/sssp_impl.cuh | 4 +- cpp/src/utilities/collect_comm.cuh | 245 +- cpp/src/utilities/shuffle_vertex_pairs.cuh | 8 +- cpp/tests/CMakeLists.txt | 6 +- cpp/tests/c_api/mg_test_utils.h | 9 + ...rm_reduce_v_frontier_outgoing_e_by_dst.cu} | 194 +- cpp/tests/traversal/mg_bfs_test.cpp | 34 +- cpp/tests/utilities/mg_utilities.cpp | 4 +- cpp/tests/utilities/mg_utilities.hpp | 5 +- cpp/tests/utilities/test_graphs.hpp | 46 +- 57 files changed, 11526 insertions(+), 4057 deletions(-) create mode 100644 cpp/src/prims/detail/multi_stream_utils.cuh create mode 100644 cpp/src/prims/detail/per_v_transform_reduce_e.cuh create mode 100644 cpp/src/prims/detail/prim_utils.cuh create mode 100644 cpp/src/prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh create mode 100644 cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh delete mode 100644 cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh rename 
cpp/tests/prims/{mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu => mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu} (74%) diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index 583b0a37214..628c3cc10cc 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -204,6 +204,7 @@ class edge_partition_device_view_t view) : detail::edge_partition_device_view_base_t(view.offsets(), view.indices()), dcs_nzd_vertices_(detail::to_thrust_optional(view.dcs_nzd_vertices())), + dcs_nzd_range_bitmap_(detail::to_thrust_optional(view.dcs_nzd_range_bitmap())), major_hypersparse_first_(detail::to_thrust_optional(view.major_hypersparse_first())), major_range_first_(view.major_range_first()), major_range_last_(view.major_range_last()), @@ -218,6 +219,7 @@ class edge_partition_device_view_t()); } + template + __host__ void compute_number_of_edges_async(MajorIterator major_first, + MajorIterator major_last, + raft::device_span count /* size = 1 */, + rmm::cuda_stream_view stream) const + { + if (thrust::distance(major_first, major_last) == 0) { + RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); + } + + rmm::device_uvector d_tmp_storage(0, stream); + size_t tmp_storage_bytes{0}; + + if (dcs_nzd_vertices_) { + auto local_degree_first = thrust::make_transform_iterator( + major_first, + detail::local_degree_op_t{ + this->offsets_, major_range_first_, *dcs_nzd_vertices_, *major_hypersparse_first_}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + } else { + auto local_degree_first = thrust::make_transform_iterator( + major_first, + detail::local_degree_op_t{ + this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + } + } + __host__ rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const { rmm::device_uvector local_degrees(this->major_range_size(), stream); if (dcs_nzd_vertices_) { assert(major_hypersparse_first_); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(this->major_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -266,7 +328,7 @@ class edge_partition_device_view_tmajor_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -284,7 +346,7 @@ class edge_partition_device_view_t local_degrees(thrust::distance(major_first, major_last), stream); if (dcs_nzd_vertices_) { assert(major_hypersparse_first_); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), major_first, major_last, local_degrees.begin(), @@ -295,7 +357,7 @@ class edge_partition_device_view_tmajor_range_first()), 
thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -368,7 +431,7 @@ class edge_partition_device_view_tmajor_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -394,7 +457,7 @@ class edge_partition_device_view_t> for consistency (see + // dcs_nzd_range_bitmap()) __host__ __device__ thrust::optional dcs_nzd_vertices() const { return dcs_nzd_vertices_ ? thrust::optional{(*dcs_nzd_vertices_).data()} @@ -528,10 +593,20 @@ class edge_partition_device_view_t> dcs_nzd_range_bitmap() + const + { + return dcs_nzd_range_bitmap_ + ? thrust::make_optional>( + (*dcs_nzd_range_bitmap_).data(), (*dcs_nzd_range_bitmap_).size()) + : thrust::nullopt; + } + private: // should be trivially copyable to device thrust::optional> dcs_nzd_vertices_{thrust::nullopt}; + thrust::optional> dcs_nzd_range_bitmap_{thrust::nullopt}; thrust::optional major_hypersparse_first_{thrust::nullopt}; vertex_t major_range_first_{0}; @@ -558,6 +633,7 @@ class edge_partition_device_view_t()); } + template + __host__ void compute_number_of_edges_async(MajorIterator major_first, + MajorIterator major_last, + raft::device_span count /* size = 1 */, + rmm::cuda_stream_view stream) const + { + if (thrust::distance(major_first, major_last) == 0) { + RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); + } + + rmm::device_uvector d_tmp_storage(0, stream); + size_t tmp_storage_bytes{0}; + + auto local_degree_first = thrust::make_transform_iterator( + major_first, + detail::local_degree_op_t{this->offsets_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + } + __host__ rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const { rmm::device_uvector local_degrees(this->major_range_size(), stream); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(this->major_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -595,7 +709,7 @@ class edge_partition_device_view_t local_degrees(thrust::distance(major_first, major_last), stream); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), major_first, major_last, local_degrees.begin(), @@ -613,6 +727,7 @@ class edge_partition_device_view_t local_degrees(this->major_range_size(), stream); thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(this->major_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -660,7 +775,7 @@ class edge_partition_device_view_t local_degrees(thrust::distance(major_first, major_last), stream); thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), major_first, major_last, local_degrees.begin(), diff --git a/cpp/include/cugraph/edge_partition_view.hpp b/cpp/include/cugraph/edge_partition_view.hpp index 42465273718..27c5705dfcc 100644 --- a/cpp/include/cugraph/edge_partition_view.hpp +++ b/cpp/include/cugraph/edge_partition_view.hpp @@ -1,5 +1,5 @@ /* - * 
Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,6 +56,7 @@ class edge_partition_view_t offsets, raft::device_span indices, std::optional> dcs_nzd_vertices, + std::optional> dcs_nzd_range_bitmap, std::optional major_hypersparse_first, vertex_t major_range_first, vertex_t major_range_last, @@ -64,6 +65,7 @@ class edge_partition_view_t(offsets, indices), dcs_nzd_vertices_(dcs_nzd_vertices), + dcs_nzd_range_bitmap_(dcs_nzd_range_bitmap), major_hypersparse_first_(major_hypersparse_first), major_range_first_(major_range_first), major_range_last_(major_range_last), @@ -78,6 +80,11 @@ class edge_partition_view_t> dcs_nzd_range_bitmap() const + { + return dcs_nzd_range_bitmap_; + } + std::optional major_hypersparse_first() const { return major_hypersparse_first_; } vertex_t major_range_first() const { return major_range_first_; } @@ -90,6 +97,7 @@ class edge_partition_view_t> dcs_nzd_vertices_{std::nullopt}; + std::optional> dcs_nzd_range_bitmap_{std::nullopt}; std::optional major_hypersparse_first_{std::nullopt}; vertex_t major_range_first_{0}; diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index 0607b39153d..290f4b3c4db 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -48,6 +48,7 @@ struct graph_meta_t> { partition_t partition{}; std::vector edge_partition_segment_offsets{}; + std::optional> edge_partition_hypersparse_degree_offsets{}; vertex_t num_local_unique_edge_srcs{}; vertex_t num_local_unique_edge_dsts{}; @@ -61,6 +62,7 @@ struct graph_meta_t> { // segment offsets based on vertex degree, relevant only if vertex IDs are renumbered std::optional> segment_offsets{std::nullopt}; + std::optional> hypersparse_degree_offsets{std::nullopt}; }; // graph_t is an owning graph class (note that graph_view_t is a non-owning graph class) @@ -101,6 +103,11 @@ class graph_t>>( (*edge_partition_dcs_nzd_vertices_).size()) : std::nullopt; + auto dcs_nzd_range_bitmaps = + edge_partition_dcs_nzd_range_bitmaps_ + ? 
std::make_optional>>( + (*edge_partition_dcs_nzd_range_bitmaps_).size()) + : std::nullopt; for (size_t i = 0; i < offsets.size(); ++i) { offsets[i] = raft::device_span(edge_partition_offsets_[i].data(), edge_partition_offsets_[i].size()); @@ -111,6 +118,11 @@ class graph_t((*edge_partition_dcs_nzd_vertices_)[i].data(), (*edge_partition_dcs_nzd_vertices_)[i].size()); } + if (dcs_nzd_range_bitmaps) { + (*dcs_nzd_range_bitmaps)[i] = + raft::device_span((*edge_partition_dcs_nzd_range_bitmaps_)[i].data(), + (*edge_partition_dcs_nzd_range_bitmaps_)[i].size()); + } } std::conditional_t{ this->number_of_vertices(), this->number_of_edges(), this->properties_, partition_, edge_partition_segment_offsets_, + edge_partition_hypersparse_degree_offsets_, local_sorted_unique_edge_srcs, local_sorted_unique_edge_src_chunk_start_offsets, local_sorted_unique_edge_src_chunk_size_, @@ -224,10 +238,13 @@ class graph_t>> edge_partition_dcs_nzd_vertices_{ std::nullopt}; + std::optional>> edge_partition_dcs_nzd_range_bitmaps_{ + std::nullopt}; partition_t partition_{}; // segment offsets within the vertex partition based on vertex degree std::vector edge_partition_segment_offsets_{}; + std::optional> edge_partition_hypersparse_degree_offsets_{}; // if valid, store row/column properties in key/value pairs (this saves memory if # unique edge // sources/destinations << V / major_comm_size|minor_comm_size). @@ -290,7 +307,11 @@ class graph_t(offsets_.data(), offsets_.size()), raft::device_span(indices_.data(), indices_.size()), graph_view_meta_t{ - this->number_of_vertices(), this->number_of_edges(), this->properties_, segment_offsets_}); + this->number_of_vertices(), + this->number_of_edges(), + this->properties_, + segment_offsets_, + hypersparse_degree_offsets_}); } private: @@ -299,6 +320,7 @@ class graph_t> segment_offsets_{}; + std::optional> hypersparse_degree_offsets_{}; }; template diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 866ab16ee97..6a03b9a6454 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -41,11 +41,13 @@ struct renumber_meta_t> edge_t number_of_edges{}; partition_t partition{}; std::vector edge_partition_segment_offsets{}; + std::optional> edge_partition_hypersparse_degree_offsets{}; }; template struct renumber_meta_t> { std::vector segment_offsets{}; + std::optional> hypersparse_degree_offsets{}; }; /** @@ -244,7 +246,7 @@ void unrenumber_int_vertices(raft::handle_t const& handle, * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -284,7 +286,7 @@ std::enable_if_t unrenumber_local_int_edges( * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. 
* @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -346,7 +348,7 @@ void renumber_local_ext_vertices(raft::handle_t const& handle, * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam edge_type_t Type of edge types. Needs to be an integral type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -388,7 +390,7 @@ decompress_to_edgelist( * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -421,7 +423,7 @@ symmetrize_edgelist(raft::handle_t const& handle, * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -463,7 +465,7 @@ symmetrize_graph( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -505,7 +507,7 @@ transpose_graph( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. 
CUDA stream, communicator, and @@ -549,7 +551,7 @@ transpose_graph_storage( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -625,7 +627,7 @@ void relabel(raft::handle_t const& handle, * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -678,7 +680,7 @@ extract_induced_subgraphs( * @tparam edge_type_t Type of edge type. Needs to be an integral type, currently only int32_t is * supported * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -743,7 +745,7 @@ create_graph_from_edgelist(raft::handle_t const& handle, * @tparam edge_type_t Type of edge type. Needs to be an integral type, currently only int32_t is * supported * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -807,7 +809,7 @@ create_graph_from_edgelist( * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -829,7 +831,7 @@ std::tuple, rmm::device_uvector> get_two * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. 
* @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -856,7 +858,7 @@ rmm::device_uvector compute_in_weight_sums( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -883,7 +885,7 @@ rmm::device_uvector compute_out_weight_sums( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -910,7 +912,7 @@ weight_t compute_max_in_weight_sum( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -937,7 +939,7 @@ weight_t compute_max_out_weight_sum( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -963,7 +965,7 @@ weight_t compute_total_edge_weight( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. 
transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -1114,7 +1116,8 @@ shuffle_external_vertex_value_pairs(raft::handle_t const& handle, * @param edge_ids Optional list of edge ids * @param edge_types Optional list of edge types * @return Tuple of vectors storing edge sources, destinations, optional weights, - * optional edge ids, optional edge types mapped to this GPU. + * optional edge ids, optional edge types mapped to this GPU and a vector storing the + * number of edges received from each GPU. */ template std::tuple, diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index a2ff3166fa4..d109fbdac95 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -165,7 +165,12 @@ class partition_t { return vertex_partition_range_last(partition_idx) - vertex_partition_range_first(partition_idx); } - size_t number_of_local_edge_partitions() const { return minor_comm_size_; } + size_t number_of_local_edge_partitions() const { return static_cast(minor_comm_size_); } + size_t coinciding_local_edge_partition_idx() const + { + return static_cast(minor_comm_rank_); + } // the major range of coinciding_local_edge_partition_idx()'th local edge partition coincides + // with the local vertex partition range // major: source of the edge partition (if not transposed) or destination of the edge partition // (if transposed). @@ -249,9 +254,13 @@ double constexpr edge_partition_src_dst_property_values_kv_pair_fill_ratio_thres // use the hypersparse format (currently, DCSR or DCSC) for the vertices with their degrees smaller // than minor_comm_size * hypersparse_threshold_ratio, should be less than 1.0 double constexpr hypersparse_threshold_ratio = 0.5; -size_t constexpr low_degree_threshold{raft::warp_size()}; -size_t constexpr mid_degree_threshold{1024}; -size_t constexpr num_sparse_segments_per_vertex_partition{3}; +size_t constexpr low_degree_threshold{ + raft::warp_size()}; // belongs to the low degree segment if the global degree is smaller than + // this value. +size_t constexpr mid_degree_threshold{ + 1024}; // belongs to the medium degree segment if the global degree is smaller than this value, + // otherwise, belongs to the high degree segment. 
+size_t constexpr num_sparse_segments_per_vertex_partition{3}; // high, mid, low // Common for both graph_view_t & graph_t and both single-GPU & multi-GPU versions template @@ -313,6 +322,7 @@ struct graph_view_meta_t edge_partition_segment_offsets{}; + std::optional> edge_partition_hypersparse_degree_offsets{}; std::conditional_t>, @@ -356,6 +366,7 @@ struct graph_view_meta_t> segment_offsets{std::nullopt}; + std::optional> hypersparse_degree_offsets{std::nullopt}; }; // graph_view_t is a non-owning graph class (note that graph_t is an owning graph class) @@ -380,6 +391,8 @@ class graph_view_t> const& edge_partition_indices, std::optional>> const& edge_partition_dcs_nzd_vertices, + std::optional>> const& + edge_partition_dcs_nzd_range_bitmaps, graph_view_meta_t meta); std::vector vertex_partition_range_offsets() const @@ -552,6 +565,12 @@ class graph_view_t> local_vertex_partition_segment_offsets() const + { + auto partition_idx = partition_.coinciding_local_edge_partition_idx(); + return local_edge_partition_segment_offsets(partition_idx); + } + std::optional> local_edge_partition_segment_offsets( size_t partition_idx) const { @@ -563,6 +582,28 @@ class graph_view_t> local_vertex_partition_hypersparse_degree_offsets() const + { + auto partition_idx = partition_.coinciding_local_edge_partition_idx(); + return local_edge_partition_hypersparse_degree_offsets(partition_idx); + } + + std::optional> local_edge_partition_hypersparse_degree_offsets( + size_t partition_idx) const + { + auto num_degrees_per_vertex_partition = + edge_partition_hypersparse_degree_offsets_ + ? ((*edge_partition_hypersparse_degree_offsets_).size() / edge_partition_offsets_.size()) + : size_t{0}; + return edge_partition_hypersparse_degree_offsets_ + ? std::make_optional>( + (*edge_partition_hypersparse_degree_offsets_).begin() + + partition_idx * num_degrees_per_vertex_partition, + (*edge_partition_hypersparse_degree_offsets_).begin() + + (partition_idx + 1) * num_degrees_per_vertex_partition) + : std::nullopt; + } + vertex_partition_view_t local_vertex_partition_view() const { return vertex_partition_view_t(this->number_of_vertices(), @@ -605,6 +646,9 @@ class graph_view_t>> edge_partition_dcs_nzd_vertices_{}; + std::optional>> + edge_partition_dcs_nzd_range_bitmaps_{}; partition_t partition_{}; // segment offsets based on vertex degree std::vector edge_partition_segment_offsets_{}; + std::optional> edge_partition_hypersparse_degree_offsets_{}; // if valid, store source/destination property values in key/value pairs (this saves memory if # // unique edge sources/destinations << V / major_comm_size|minor_comm_size). 
@@ -903,6 +950,11 @@ class graph_view_t> local_vertex_partition_segment_offsets() const + { + return local_edge_partition_segment_offsets(size_t{0}); + } + std::optional> local_edge_partition_segment_offsets( size_t partition_idx = 0) const { @@ -910,6 +962,18 @@ class graph_view_t> local_vertex_partition_hypersparse_degree_offsets() const + { + return local_edge_partition_hypersparse_degree_offsets(size_t{0}); + } + + std::optional> local_edge_partition_hypersparse_degree_offsets( + size_t partition_idx = 0) const + { + assert(partition_idx == 0); + return hypersparse_degree_offsets_; + } + vertex_partition_view_t local_vertex_partition_view() const { return vertex_partition_view_t(this->number_of_vertices()); @@ -1050,6 +1114,7 @@ class graph_view_t> segment_offsets_{std::nullopt}; + std::optional> hypersparse_degree_offsets_{std::nullopt}; std::optional> edge_mask_view_{std::nullopt}; }; diff --git a/cpp/include/cugraph/partition_manager.hpp b/cpp/include/cugraph/partition_manager.hpp index 309b169e646..2eb210fb7cd 100644 --- a/cpp/include/cugraph/partition_manager.hpp +++ b/cpp/include/cugraph/partition_manager.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -71,6 +71,30 @@ class partition_manager { : (major_comm_rank * minor_comm_size + minor_comm_rank); } +#ifdef __CUDACC__ + __host__ __device__ +#endif + static int + compute_major_comm_rank_from_global_comm_rank(int major_comm_size, + int minor_comm_size, + int comm_rank) + { + return map_major_comm_to_gpu_row_comm ? comm_rank % major_comm_size + : comm_rank / minor_comm_size; + } + +#ifdef __CUDACC__ + __host__ __device__ +#endif + static int + compute_minor_comm_rank_from_global_comm_rank(int major_comm_size, + int minor_comm_size, + int comm_rank) + { + return map_major_comm_to_gpu_row_comm ? 
comm_rank / major_comm_size + : comm_rank % minor_comm_size; + } + #ifdef __CUDACC__ __host__ __device__ #endif diff --git a/cpp/include/cugraph/utilities/dataframe_buffer.hpp b/cpp/include/cugraph/utilities/dataframe_buffer.hpp index a20613c65ef..6d47ec540da 100644 --- a/cpp/include/cugraph/utilities/dataframe_buffer.hpp +++ b/cpp/include/cugraph/utilities/dataframe_buffer.hpp @@ -82,6 +82,53 @@ auto allocate_dataframe_buffer(size_t buffer_size, rmm::cuda_stream_view stream_ std::make_index_sequence(), buffer_size, stream_view); } +template +struct dataframe_buffer_type { + using type = decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{})); +}; + +template +using dataframe_buffer_type_t = typename dataframe_buffer_type::type; + +template +std::optional> try_allocate_dataframe_buffer( + size_t buffer_size, rmm::cuda_stream_view stream_view) +{ + try { + return allocate_dataframe_buffer(buffer_size, stream_view); + } catch (std::exception const& e) { + return std::nullopt; + } +} + +template +struct dataframe_buffer_iterator_type { + using type = typename rmm::device_uvector::iterator; +}; + +template +struct dataframe_buffer_iterator_type> { + using type = thrust::zip_iterator::iterator...>>; +}; + +template +using dataframe_buffer_iterator_type_t = typename dataframe_buffer_iterator_type::type; + +template +struct dataframe_buffer_const_iterator_type { + using type = typename rmm::device_uvector::const_iterator; +}; + +template +struct dataframe_buffer_const_iterator_type> { + using type = + thrust::zip_iterator::const_iterator...>>; +}; + +template +using dataframe_buffer_const_iterator_type_t = + typename dataframe_buffer_const_iterator_type::type; + template void reserve_dataframe_buffer(BufferType& buffer, size_t new_buffer_capacity, @@ -206,30 +253,4 @@ auto get_dataframe_buffer_cend(BufferType& buffer) std::make_index_sequence::value>(), buffer); } -template -struct dataframe_buffer_value_type { - using type = void; -}; - -template -struct dataframe_buffer_value_type> { - using type = T; -}; - -template -struct dataframe_buffer_value_type...>> { - using type = thrust::tuple; -}; - -template -using dataframe_buffer_value_type_t = typename dataframe_buffer_value_type::type; - -template -struct dataframe_buffer_type { - using type = decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{})); -}; - -template -using dataframe_buffer_type_t = typename dataframe_buffer_type::type; - } // namespace cugraph diff --git a/cpp/include/cugraph/utilities/device_comm.hpp b/cpp/include/cugraph/utilities/device_comm.hpp index ffb0f7d9e5b..07de2d06466 100644 --- a/cpp/include/cugraph/utilities/device_comm.hpp +++ b/cpp/include/cugraph/utilities/device_comm.hpp @@ -55,7 +55,7 @@ auto iter_to_raw_ptr(thrust::detail::normal_iterator> iter } template -std::enable_if_t::value, void> +std::enable_if_t, void> device_isend_impl(raft::comms::comms_t const& comm, InputIterator input_first, size_t count, @@ -76,7 +76,7 @@ std::enable_if_t::value, void> device_isend_ raft::comms::request_t* request) { static_assert( - std::is_same::value_type, OutputValueType>::value); + std::is_same_v::value_type, OutputValueType>); comm.isend(iter_to_raw_ptr(input_first), count, dst, tag, request); } @@ -136,7 +136,7 @@ device_irecv_impl(raft::comms::comms_t const& comm, { static_assert( - std::is_same::value_type>::value); + std::is_same_v::value_type>); comm.irecv(iter_to_raw_ptr(output_first), count, src, tag, request); } @@ -200,7 +200,7 @@ device_sendrecv_impl(raft::comms::comms_t const& 
comm, { using value_type = typename std::iterator_traits::value_type; static_assert( - std::is_same::value_type, value_type>::value); + std::is_same_v::value_type, value_type>); comm.device_sendrecv(iter_to_raw_ptr(input_first), tx_count, dst, @@ -286,7 +286,7 @@ device_multicast_sendrecv_impl(raft::comms::comms_t const& comm, { using value_type = typename std::iterator_traits::value_type; static_assert( - std::is_same::value_type, value_type>::value); + std::is_same_v::value_type, value_type>); comm.device_multicast_sendrecv(iter_to_raw_ptr(input_first), tx_counts, tx_offsets, @@ -379,8 +379,8 @@ device_bcast_impl(raft::comms::comms_t const& comm, int root, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.bcast( iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, root, stream_view.value()); } @@ -440,8 +440,8 @@ device_allreduce_impl(raft::comms::comms_t const& comm, raft::comms::op_t op, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.allreduce( iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, op, stream_view.value()); } @@ -503,8 +503,8 @@ device_reduce_impl(raft::comms::comms_t const& comm, int root, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.reduce(iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, @@ -548,6 +548,62 @@ struct device_reduce_tuple_iterator_element_impl +std::enable_if_t::value, void> +device_allgather_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) +{ + // no-op +} + +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allgather_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) +{ + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); + comm.allgather( + iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), sendcount, stream_view.value()); +} + +template +struct device_allgather_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) const + { + device_allgather_impl(comm, + thrust::get(input_first.get_iterator_tuple()), + thrust::get(output_first.get_iterator_tuple()), + sendcount, + stream_view); + device_allgather_tuple_iterator_element_impl().run( + comm, input_first, output_first, sendcount, stream_view); + } +}; + +template +struct device_allgather_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) const + { + } +}; + template std::enable_if_t::value, void> device_allgatherv_impl(raft::comms::comms_t const& comm, @@ -571,8 +627,8 @@ device_allgatherv_impl(raft::comms::comms_t const& comm, std::vector const& 
displacements, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.allgatherv(iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), recvcounts.data(), @@ -639,8 +695,8 @@ device_gatherv_impl(raft::comms::comms_t const& comm, int root, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.gatherv(iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), sendcount, @@ -1000,6 +1056,44 @@ device_reduce(raft::comms::comms_t const& comm, .run(comm, input_first, output_first, count, op, root, stream_view); } +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allgather(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) +{ + detail::device_allgather_impl(comm, input_first, output_first, sendcount, stream_view); +} + +template +std::enable_if_t< + is_thrust_tuple_of_arithmetic::value_type>::value && + is_thrust_tuple::value_type>::value, + void> +device_allgather(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) +{ + static_assert( + thrust::tuple_size::value_type>::value == + thrust::tuple_size::value_type>::value); + + size_t constexpr tuple_size = + thrust::tuple_size::value_type>::value; + + detail::device_allgather_tuple_iterator_element_impl() + .run(comm, input_first, output_first, sendcount, stream_view); +} + template std::enable_if_t< std::is_arithmetic::value_type>::value, diff --git a/cpp/include/cugraph/utilities/misc_utils.cuh b/cpp/include/cugraph/utilities/misc_utils.cuh index 633dabe5b40..91a349007da 100644 --- a/cpp/include/cugraph/utilities/misc_utils.cuh +++ b/cpp/include/cugraph/utilities/misc_utils.cuh @@ -81,7 +81,7 @@ std::tuple, std::vector> compute_offset_aligned_ return std::make_tuple(h_chunk_offsets, h_element_offsets); } else { - return std::make_tuple(std::vector{{0, offsets.size() - 1}}, + return std::make_tuple(std::vector{{0, static_cast(offsets.size() - 1)}}, std::vector{{0, num_elements}}); } } diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index 3cbd35b4bc3..98fa2cb1706 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -50,6 +51,8 @@ namespace cugraph { namespace detail { +constexpr size_t cache_line_size = 128; + template struct compute_group_id_count_pair_t { GroupIdIterator group_id_first{}; @@ -76,6 +79,7 @@ inline std::tuple, std::vector> compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const& comm, rmm::device_uvector const& d_tx_value_counts, + bool drop_empty_ranks, rmm::cuda_stream_view stream_view) { auto const comm_size = comm.get_size(); @@ -111,28 +115,30 @@ compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const& comm, std::partial_sum(tx_counts.begin(), tx_counts.end() - 1, tx_offsets.begin() + 1); std::partial_sum(rx_counts.begin(), rx_counts.end() - 1, rx_offsets.begin() + 1); - int num_tx_dst_ranks{0}; - int num_rx_src_ranks{0}; - for (int i = 0; 
i < comm_size; ++i) { - if (tx_counts[i] != 0) { - tx_counts[num_tx_dst_ranks] = tx_counts[i]; - tx_offsets[num_tx_dst_ranks] = tx_offsets[i]; - tx_dst_ranks[num_tx_dst_ranks] = tx_dst_ranks[i]; - ++num_tx_dst_ranks; - } - if (rx_counts[i] != 0) { - rx_counts[num_rx_src_ranks] = rx_counts[i]; - rx_offsets[num_rx_src_ranks] = rx_offsets[i]; - rx_src_ranks[num_rx_src_ranks] = rx_src_ranks[i]; - ++num_rx_src_ranks; + if (drop_empty_ranks) { + int num_tx_dst_ranks{0}; + int num_rx_src_ranks{0}; + for (int i = 0; i < comm_size; ++i) { + if (tx_counts[i] != 0) { + tx_counts[num_tx_dst_ranks] = tx_counts[i]; + tx_offsets[num_tx_dst_ranks] = tx_offsets[i]; + tx_dst_ranks[num_tx_dst_ranks] = tx_dst_ranks[i]; + ++num_tx_dst_ranks; + } + if (rx_counts[i] != 0) { + rx_counts[num_rx_src_ranks] = rx_counts[i]; + rx_offsets[num_rx_src_ranks] = rx_offsets[i]; + rx_src_ranks[num_rx_src_ranks] = rx_src_ranks[i]; + ++num_rx_src_ranks; + } } + tx_counts.resize(num_tx_dst_ranks); + tx_offsets.resize(num_tx_dst_ranks); + tx_dst_ranks.resize(num_tx_dst_ranks); + rx_counts.resize(num_rx_src_ranks); + rx_offsets.resize(num_rx_src_ranks); + rx_src_ranks.resize(num_rx_src_ranks); } - tx_counts.resize(num_tx_dst_ranks); - tx_offsets.resize(num_tx_dst_ranks); - tx_dst_ranks.resize(num_tx_dst_ranks); - rx_counts.resize(num_rx_src_ranks); - rx_offsets.resize(num_rx_src_ranks); - rx_src_ranks.resize(num_rx_src_ranks); return std::make_tuple(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks); } @@ -823,6 +829,8 @@ auto shuffle_values(raft::comms::comms_t const& comm, std::vector const& tx_value_counts, rmm::cuda_stream_view stream_view) { + using value_t = typename thrust::iterator_traits::value_type; + auto const comm_size = comm.get_size(); rmm::device_uvector d_tx_value_counts(comm_size, stream_view); @@ -836,11 +844,10 @@ auto shuffle_values(raft::comms::comms_t const& comm, std::vector rx_offsets{}; std::vector rx_src_ranks{}; std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = - detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, true, stream_view); - auto rx_value_buffer = - allocate_dataframe_buffer::value_type>( - rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); + auto rx_value_buffer = allocate_dataframe_buffer( + rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); // (if num_tx_dst_ranks == num_rx_src_ranks == comm_size). device_multicast_sendrecv(comm, @@ -866,6 +873,236 @@ auto shuffle_values(raft::comms::comms_t const& comm, return std::make_tuple(std::move(rx_value_buffer), rx_counts); } +// Add gaps in the receive buffer to enforce that the sent data offset and the received data offset +// have the same alignment for every rank. 
This is faster assuming that @p alignment ensures cache +// line alignment in both send & receive buffer (tested with NCCL 2.23.4) +template +auto shuffle_values( + raft::comms::comms_t const& comm, + TxValueIterator tx_value_first, + std::vector const& tx_value_counts, + size_t alignment, // # elements + std::optional::value_type> fill_value, + rmm::cuda_stream_view stream_view) +{ + using value_t = typename thrust::iterator_traits::value_type; + + auto const comm_size = comm.get_size(); + + std::vector tx_value_displacements(tx_value_counts.size()); + std::exclusive_scan( + tx_value_counts.begin(), tx_value_counts.end(), tx_value_displacements.begin(), size_t{0}); + + std::vector tx_unaligned_counts(comm_size); + std::vector tx_displacements(comm_size); + std::vector tx_aligned_counts(comm_size); + std::vector tx_aligned_displacements(comm_size); + std::vector rx_unaligned_counts(comm_size); + std::vector rx_displacements(comm_size); + std::vector rx_aligned_counts(comm_size); + std::vector rx_aligned_displacements(comm_size); + std::vector tx_ranks(comm_size); + std::iota(tx_ranks.begin(), tx_ranks.end(), int{0}); + auto rx_ranks = tx_ranks; + for (size_t i = 0; i < tx_value_counts.size(); ++i) { + tx_unaligned_counts[i] = 0; + if (tx_value_displacements[i] % alignment != 0) { + tx_unaligned_counts[i] = + std::min(alignment - (tx_value_displacements[i] % alignment), tx_value_counts[i]); + } + tx_displacements[i] = tx_value_displacements[i]; + tx_aligned_counts[i] = tx_value_counts[i] - tx_unaligned_counts[i]; + tx_aligned_displacements[i] = tx_value_displacements[i] + tx_unaligned_counts[i]; + } + + rmm::device_uvector d_tx_unaligned_counts(tx_unaligned_counts.size(), stream_view); + rmm::device_uvector d_tx_aligned_counts(tx_aligned_counts.size(), stream_view); + rmm::device_uvector d_rx_unaligned_counts(rx_unaligned_counts.size(), stream_view); + rmm::device_uvector d_rx_aligned_counts(rx_aligned_counts.size(), stream_view); + raft::update_device(d_tx_unaligned_counts.data(), + tx_unaligned_counts.data(), + tx_unaligned_counts.size(), + stream_view); + raft::update_device( + d_tx_aligned_counts.data(), tx_aligned_counts.data(), tx_aligned_counts.size(), stream_view); + std::vector tx_counts(comm_size, size_t{1}); + std::vector tx_offsets(comm_size); + std::iota(tx_offsets.begin(), tx_offsets.end(), size_t{0}); + auto rx_counts = tx_counts; + auto rx_offsets = tx_offsets; + cugraph::device_multicast_sendrecv(comm, + d_tx_unaligned_counts.data(), + tx_counts, + tx_offsets, + tx_ranks, + d_rx_unaligned_counts.data(), + rx_counts, + rx_offsets, + rx_ranks, + stream_view); + cugraph::device_multicast_sendrecv(comm, + d_tx_aligned_counts.data(), + tx_counts, + tx_offsets, + tx_ranks, + d_rx_aligned_counts.data(), + rx_counts, + rx_offsets, + rx_ranks, + stream_view); + raft::update_host(rx_unaligned_counts.data(), + d_rx_unaligned_counts.data(), + d_rx_unaligned_counts.size(), + stream_view); + raft::update_host( + rx_aligned_counts.data(), d_rx_aligned_counts.data(), d_rx_aligned_counts.size(), stream_view); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view)); + size_t offset{0}; + for (size_t i = 0; i < rx_counts.size(); ++i) { + auto target_alignment = (alignment - rx_unaligned_counts[i]) % alignment; + auto cur_alignment = offset % alignment; + if (target_alignment >= cur_alignment) { + offset += target_alignment - cur_alignment; + } else { + offset += (target_alignment + alignment) - cur_alignment; + } + rx_displacements[i] = offset; + rx_aligned_displacements[i] = rx_displacements[i] 
+ rx_unaligned_counts[i]; + offset = rx_aligned_displacements[i] + rx_aligned_counts[i]; + } + + auto rx_values = allocate_dataframe_buffer( + rx_aligned_displacements.back() + rx_aligned_counts.back(), stream_view); + if (fill_value) { + thrust::fill(rmm::exec_policy_nosync(stream_view), + get_dataframe_buffer_begin(rx_values), + get_dataframe_buffer_end(rx_values), + *fill_value); + } + cugraph::device_multicast_sendrecv(comm, + tx_value_first, + tx_unaligned_counts, + tx_displacements, + tx_ranks, + get_dataframe_buffer_begin(rx_values), + rx_unaligned_counts, + rx_displacements, + rx_ranks, + stream_view); + cugraph::device_multicast_sendrecv(comm, + tx_value_first, + tx_aligned_counts, + tx_aligned_displacements, + tx_ranks, + get_dataframe_buffer_begin(rx_values), + rx_aligned_counts, + rx_aligned_displacements, + rx_ranks, + stream_view); + + return std::make_tuple(std::move(rx_values), + tx_unaligned_counts, + tx_aligned_counts, + tx_displacements, + rx_unaligned_counts, + rx_aligned_counts, + rx_displacements); +} + +// this uses less memory than calling shuffle_values then sort & unique but requires comm.get_size() +// - 1 communication steps +template +auto shuffle_and_unique_segment_sorted_values( + raft::comms::comms_t const& comm, + TxValueIterator + segment_sorted_tx_value_first, // sorted within each segment (segment sizes: + // tx_value_counts[i], where i = [0, comm_size); and bettter be + // unique to reduce communication volume + std::vector const& tx_value_counts, + rmm::cuda_stream_view stream_view) +{ + using value_t = typename thrust::iterator_traits::value_type; + + auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); + + auto sorted_unique_values = allocate_dataframe_buffer(0, stream_view); + if (comm_size == 1) { + resize_dataframe_buffer(sorted_unique_values, tx_value_counts[comm_rank], stream_view); + thrust::copy(rmm::exec_policy_nosync(stream_view), + segment_sorted_tx_value_first, + segment_sorted_tx_value_first + tx_value_counts[comm_rank], + get_dataframe_buffer_begin(sorted_unique_values)); + resize_dataframe_buffer( + sorted_unique_values, + thrust::distance(get_dataframe_buffer_begin(sorted_unique_values), + thrust::unique(rmm::exec_policy_nosync(stream_view), + get_dataframe_buffer_begin(sorted_unique_values), + get_dataframe_buffer_end(sorted_unique_values))), + stream_view); + } else { + rmm::device_uvector d_tx_value_counts(comm_size, stream_view); + raft::update_device( + d_tx_value_counts.data(), tx_value_counts.data(), comm_size, stream_view.value()); + + std::vector tx_counts{}; + std::vector tx_offsets{}; + std::vector rx_counts{}; + std::vector rx_offsets{}; + std::tie(tx_counts, tx_offsets, std::ignore, rx_counts, rx_offsets, std::ignore) = + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, false, stream_view); + + d_tx_value_counts.resize(0, stream_view); + d_tx_value_counts.shrink_to_fit(stream_view); + + for (int i = 1; i < comm_size; ++i) { + auto dst = (comm_rank + i) % comm_size; + auto src = + static_cast((static_cast(comm_rank) + static_cast(comm_size - i)) % + static_cast(comm_size)); + auto rx_sorted_values = allocate_dataframe_buffer(rx_counts[src], stream_view); + device_sendrecv(comm, + segment_sorted_tx_value_first + tx_offsets[dst], + tx_counts[dst], + dst, + get_dataframe_buffer_begin(rx_sorted_values), + rx_counts[src], + src, + stream_view); + auto merged_sorted_values = allocate_dataframe_buffer( + (i == 1 ? 
tx_counts[comm_rank] : size_dataframe_buffer(sorted_unique_values)) + + rx_counts[src], + stream_view); + if (i == 1) { + thrust::merge( + rmm::exec_policy_nosync(stream_view), + segment_sorted_tx_value_first + tx_offsets[comm_rank], + segment_sorted_tx_value_first + (tx_offsets[comm_rank] + tx_counts[comm_rank]), + get_dataframe_buffer_begin(rx_sorted_values), + get_dataframe_buffer_end(rx_sorted_values), + get_dataframe_buffer_begin(merged_sorted_values)); + } else { + thrust::merge(rmm::exec_policy_nosync(stream_view), + get_dataframe_buffer_begin(sorted_unique_values), + get_dataframe_buffer_end(sorted_unique_values), + get_dataframe_buffer_begin(rx_sorted_values), + get_dataframe_buffer_end(rx_sorted_values), + get_dataframe_buffer_begin(merged_sorted_values)); + } + resize_dataframe_buffer( + merged_sorted_values, + thrust::distance(get_dataframe_buffer_begin(merged_sorted_values), + thrust::unique(rmm::exec_policy_nosync(stream_view), + get_dataframe_buffer_begin(merged_sorted_values), + get_dataframe_buffer_end(merged_sorted_values))), + stream_view); + sorted_unique_values = std::move(merged_sorted_values); + } + } + shrink_to_fit_dataframe_buffer(sorted_unique_values, stream_view); + return sorted_unique_values; +} + template auto groupby_gpu_id_and_shuffle_values(raft::comms::comms_t const& comm, ValueIterator tx_value_first /* [INOUT */, @@ -889,7 +1126,7 @@ auto groupby_gpu_id_and_shuffle_values(raft::comms::comms_t const& comm, std::vector rx_offsets{}; std::vector rx_src_ranks{}; std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = - detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, true, stream_view); auto rx_value_buffer = allocate_dataframe_buffer::value_type>( @@ -943,7 +1180,7 @@ auto groupby_gpu_id_and_shuffle_kv_pairs(raft::comms::comms_t const& comm, std::vector rx_offsets{}; std::vector rx_src_ranks{}; std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = - detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, true, stream_view); rmm::device_uvector::value_type> rx_keys( rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); diff --git a/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp b/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp index 2c36ed33359..29b9d132ef8 100644 --- a/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp +++ b/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp @@ -64,6 +64,18 @@ size_t sum_thrust_tuple_element_sizes(std::index_sequence) return (... 
+ sizeof(typename thrust::tuple_element::type)); } +template +size_t min_thrust_tuple_element_sizes(std::index_sequence) +{ + return std::min(sizeof(typename thrust::tuple_element::type)...); +} + +template +size_t max_thrust_tuple_element_sizes(std::index_sequence) +{ + return std::max(sizeof(typename thrust::tuple_element::type)...); +} + template auto thrust_tuple_to_std_tuple(TupleType tup, std::index_sequence) { @@ -181,6 +193,20 @@ constexpr size_t sum_thrust_tuple_element_sizes() std::make_index_sequence::value>()); } +template +constexpr size_t min_thrust_tuple_element_sizes() +{ + return detail::min_thrust_tuple_element_sizes( + std::make_index_sequence::value>()); +} + +template +constexpr size_t max_thrust_tuple_element_sizes() +{ + return detail::max_thrust_tuple_element_sizes( + std::make_index_sequence::value>()); +} + template auto thrust_tuple_to_std_tuple(TupleType tup) { diff --git a/cpp/src/centrality/betweenness_centrality_impl.cuh b/cpp/src/centrality/betweenness_centrality_impl.cuh index 8ae49ed207c..88ef3987a03 100644 --- a/cpp/src/centrality/betweenness_centrality_impl.cuh +++ b/cpp/src/centrality/betweenness_centrality_impl.cuh @@ -23,7 +23,7 @@ #include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh" #include "prims/transform_e.cuh" #include "prims/transform_reduce_v.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -133,15 +133,15 @@ std::tuple, rmm::device_uvector> brandes_b update_edge_src_property(handle, graph_view, sigmas.begin(), src_sigmas.mutable_view()); update_edge_dst_property(handle, graph_view, distances.begin(), dst_distances.mutable_view()); - auto [new_frontier, new_sigma] = - transform_reduce_v_frontier_outgoing_e_by_dst(handle, - graph_view, - vertex_frontier.bucket(bucket_idx_cur), - src_sigmas.view(), - dst_distances.view(), - cugraph::edge_dummy_property_t{}.view(), - brandes_e_op_t{}, - reduce_op::plus()); + auto [new_frontier, new_sigma] = cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + src_sigmas.view(), + dst_distances.view(), + cugraph::edge_dummy_property_t{}.view(), + brandes_e_op_t{}, + reduce_op::plus()); update_v_frontier(handle, graph_view, diff --git a/cpp/src/community/approx_weighted_matching_impl.cuh b/cpp/src/community/approx_weighted_matching_impl.cuh index a0ccfa52ffc..869ed4e7ae6 100644 --- a/cpp/src/community/approx_weighted_matching_impl.cuh +++ b/cpp/src/community/approx_weighted_matching_impl.cuh @@ -243,11 +243,12 @@ std::tuple, weight_t> approximate_weighted_matchin major_comm_size, minor_comm_size}; - candidates_of_candidates = cugraph::collect_values_for_keys(handle, + candidates_of_candidates = cugraph::collect_values_for_keys(comm, target_candidate_map.view(), candidates.begin(), candidates.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); } else { candidates_of_candidates.resize(candidates.size(), handle.get_stream()); diff --git a/cpp/src/community/detail/common_methods.cuh b/cpp/src/community/detail/common_methods.cuh index e17abdb3703..18fb3fdb251 100644 --- a/cpp/src/community/detail/common_methods.cuh +++ b/cpp/src/community/detail/common_methods.cuh @@ -289,11 +289,12 @@ rmm::device_uvector update_clustering_by_delta_modularity( invalid_vertex_id::value, 
std::numeric_limits::max(), handle.get_stream()); - vertex_cluster_weights_v = cugraph::collect_values_for_keys(handle, + vertex_cluster_weights_v = cugraph::collect_values_for_keys(comm, cluster_key_weight_map.view(), next_clusters_v.begin(), next_clusters_v.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); src_cluster_weights = edge_src_property_t, weight_t>(handle, diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh index 62b66ed5f41..d69c1463edf 100644 --- a/cpp/src/community/detail/refine_impl.cuh +++ b/cpp/src/community/detail/refine_impl.cuh @@ -181,11 +181,12 @@ refine_clustering( comm_size, major_comm_size, minor_comm_size}; vertex_louvain_cluster_weights = - cugraph::collect_values_for_keys(handle, + cugraph::collect_values_for_keys(comm, cluster_key_weight_map.view(), louvain_assignment_of_vertices.begin(), louvain_assignment_of_vertices.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); } else { vertex_louvain_cluster_weights.resize(louvain_assignment_of_vertices.size(), @@ -473,11 +474,12 @@ refine_clustering( // comm_size, major_comm_size, minor_comm_size}; louvain_of_leiden_keys_used_in_edge_reduction = - cugraph::collect_values_for_keys(handle, + cugraph::collect_values_for_keys(comm, leiden_to_louvain_map.view(), leiden_keys_used_in_edge_reduction.begin(), leiden_keys_used_in_edge_reduction.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); } else { louvain_of_leiden_keys_used_in_edge_reduction.resize( leiden_keys_used_in_edge_reduction.size(), handle.get_stream()); @@ -864,11 +866,12 @@ refine_clustering( // comm_size, major_comm_size, minor_comm_size}; lovain_of_leiden_cluster_keys = - cugraph::collect_values_for_keys(handle, + cugraph::collect_values_for_keys(comm, leiden_to_louvain_map.view(), leiden_keys_to_read_louvain.begin(), leiden_keys_to_read_louvain.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); } else { lovain_of_leiden_cluster_keys.resize(leiden_keys_to_read_louvain.size(), handle.get_stream()); diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh index 468f4f7280f..219bc3c4d1d 100644 --- a/cpp/src/components/weakly_connected_components_impl.cuh +++ b/cpp/src/components/weakly_connected_components_impl.cuh @@ -16,7 +16,7 @@ #pragma once #include "prims/fill_edge_src_dst_property.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -550,24 +550,25 @@ void weakly_connected_components_impl(raft::handle_t const& handle, auto old_num_edge_inserts = num_edge_inserts.value(handle.get_stream()); resize_dataframe_buffer(edge_buffer, old_num_edge_inserts + max_pushes, handle.get_stream()); - auto new_frontier_tagged_vertex_buffer = transform_reduce_v_frontier_outgoing_e_by_dst( - handle, - level_graph_view, - vertex_frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - e_op_t{ - GraphViewType::is_multi_gpu - ? 
detail::edge_partition_endpoint_property_device_view_t( - edge_dst_components.mutable_view()) - : detail::edge_partition_endpoint_property_device_view_t( - detail::edge_minor_property_view_t(level_components, - vertex_t{0})), - level_graph_view.local_edge_partition_dst_range_first(), - get_dataframe_buffer_begin(edge_buffer), - num_edge_inserts.data()}, - reduce_op::null()); + auto new_frontier_tagged_vertex_buffer = + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + handle, + level_graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op_t{ + GraphViewType::is_multi_gpu + ? detail::edge_partition_endpoint_property_device_view_t( + edge_dst_components.mutable_view()) + : detail::edge_partition_endpoint_property_device_view_t( + detail::edge_minor_property_view_t(level_components, + vertex_t{0})), + level_graph_view.local_edge_partition_dst_range_first(), + get_dataframe_buffer_begin(edge_buffer), + num_edge_inserts.data()}, + reduce_op::null()); update_v_frontier(handle, level_graph_view, diff --git a/cpp/src/cores/core_number_impl.cuh b/cpp/src/cores/core_number_impl.cuh index d807ccac5a5..a2b6f6430f0 100644 --- a/cpp/src/cores/core_number_impl.cuh +++ b/cpp/src/cores/core_number_impl.cuh @@ -16,7 +16,7 @@ #pragma once #include "prims/reduce_v.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -222,14 +222,15 @@ void core_number(raft::handle_t const& handle, if (graph_view.is_symmetric() || ((degree_type == k_core_degree_type_t::IN) || (degree_type == k_core_degree_type_t::INOUT))) { auto [new_frontier_vertex_buffer, delta_buffer] = - transform_reduce_v_frontier_outgoing_e_by_dst(handle, - graph_view, - vertex_frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - dst_core_numbers.view(), - edge_dummy_property_t{}.view(), - e_op_t{k, delta}, - reduce_op::plus()); + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + dst_core_numbers.view(), + edge_dummy_property_t{}.view(), + e_op_t{k, delta}, + reduce_op::plus()); update_v_frontier( handle, diff --git a/cpp/src/lookup/lookup_src_dst_impl.cuh b/cpp/src/lookup/lookup_src_dst_impl.cuh index 1c8c39fd6dd..45bbf870d80 100644 --- a/cpp/src/lookup/lookup_src_dst_impl.cuh +++ b/cpp/src/lookup/lookup_src_dst_impl.cuh @@ -115,12 +115,13 @@ struct lookup_container_t::lookup_con auto const minor_comm_size = minor_comm.get_size(); value_buffer = cugraph::collect_values_for_keys( - handle, + comm, kv_store_object->view(), edge_ids_to_lookup.begin(), edge_ids_to_lookup.end(), cugraph::detail::compute_gpu_id_from_ext_edge_id_t{ - comm_size, major_comm_size, minor_comm_size}); + comm_size, major_comm_size, minor_comm_size}, + handle.get_stream()); } else { cugraph::resize_dataframe_buffer( value_buffer, edge_ids_to_lookup.size(), handle.get_stream()); diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 177c79ace87..2b89d214fd7 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -15,9 +15,11 @@ */ #pragma once +#include 
"prims/detail/multi_stream_utils.cuh" #include "prims/detail/optional_dataframe_buffer.hpp" #include "prims/detail/prim_functors.cuh" #include "prims/property_op_utils.cuh" +#include "prims/vertex_frontier.cuh" #include #include @@ -72,9 +74,9 @@ __device__ void push_buffer_element(BufferKeyOutputIterator buffer_key_output_fi e_op_result_t e_op_result) { using output_key_t = - typename optional_dataframe_buffer_value_type_t::value; + typename optional_dataframe_buffer_iterator_value_type_t::value; using output_value_t = - typename optional_dataframe_buffer_value_type_t::value; + typename optional_dataframe_buffer_iterator_value_type_t::value; assert(e_op_result.has_value()); @@ -118,7 +120,6 @@ __device__ void warp_push_buffer_elements( } template buffer_idx(*buffer_idx_ptr); - int32_t constexpr shared_array_size = max_one_e_per_frontier_key - ? int32_t{1} /* dummy */ - : extract_transform_v_frontier_e_kernel_block_size; - __shared__ std::conditional_t - warp_local_degree_inclusive_sums[shared_array_size]; - __shared__ std::conditional_t - warp_key_local_edge_offsets[shared_array_size]; + __shared__ edge_t + warp_local_degree_inclusive_sums[extract_transform_v_frontier_e_kernel_block_size]; + __shared__ edge_t warp_key_local_edge_offsets[extract_transform_v_frontier_e_kernel_block_size]; using WarpScan = cub::WarpScan; - __shared__ std:: - conditional_t - temp_storage; + __shared__ typename WarpScan::TempStorage temp_storage; auto indices = edge_partition.indices(); @@ -216,98 +211,74 @@ __global__ static void extract_transform_v_frontier_e_hypersparse_or_low_degree( } } - if constexpr (max_one_e_per_frontier_key) { - // each thread processes one frontier key, exits if any edge returns a valid output + auto min_key_idx = static_cast(idx - (idx % raft::warp_size())); // inclusive + auto max_key_idx = + static_cast(std::min(static_cast(min_key_idx) + raft::warp_size(), + static_cast(num_keys))); // exclusive - e_op_result_t e_op_result{thrust::nullopt}; - auto key = *(key_first + idx); + // update warp_local_degree_inclusive_sums & warp_key_local_edge_offsets - if (edge_partition_e_mask) { - for (edge_t i = 0; i < local_degree; ++i) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - e_op_result = call_e_op(key, edge_offset + i); - if (e_op_result) { break; } - } - } - } else { - for (edge_t i = 0; i < local_degree; ++i) { - e_op_result = call_e_op(key, edge_offset + i); - if (e_op_result) { break; } - } - } - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } else { - auto min_key_idx = static_cast(idx - (idx % raft::warp_size())); // inclusive - auto max_key_idx = - static_cast(std::min(static_cast(min_key_idx) + raft::warp_size(), - static_cast(num_keys))); // exclusive - - // update warp_local_degree_inclusive_sums & warp_key_local_edge_offsets - - warp_key_local_edge_offsets[threadIdx.x] = edge_offset; - WarpScan(temp_storage) - .InclusiveSum(local_degree, warp_local_degree_inclusive_sums[threadIdx.x]); - __syncwarp(); + warp_key_local_edge_offsets[threadIdx.x] = edge_offset; + WarpScan(temp_storage) + .InclusiveSum(local_degree, warp_local_degree_inclusive_sums[threadIdx.x]); + __syncwarp(); - // all the threads in a warp collectively process local edges for the keys in [key_first + - // min_key_idx, key_first + max_key_idx) + // all the threads in a warp collectively process local edges for the keys in [key_first + + // min_key_idx, key_first + max_key_idx) - auto num_edges_this_warp = 
warp_local_degree_inclusive_sums[warp_id * raft::warp_size() + - (max_key_idx - min_key_idx) - 1]; - auto rounded_up_num_edges_this_warp = - ((static_cast(num_edges_this_warp) + (raft::warp_size() - 1)) / raft::warp_size()) * - raft::warp_size(); + auto num_edges_this_warp = warp_local_degree_inclusive_sums[warp_id * raft::warp_size() + + (max_key_idx - min_key_idx) - 1]; + auto rounded_up_num_edges_this_warp = + ((static_cast(num_edges_this_warp) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); - auto this_warp_inclusive_sum_first = - warp_local_degree_inclusive_sums + warp_id * raft::warp_size(); - auto this_warp_inclusive_sum_last = - this_warp_inclusive_sum_first + (max_key_idx - min_key_idx); + auto this_warp_inclusive_sum_first = + warp_local_degree_inclusive_sums + warp_id * raft::warp_size(); + auto this_warp_inclusive_sum_last = this_warp_inclusive_sum_first + (max_key_idx - min_key_idx); - if (edge_partition_e_mask) { - for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { - e_op_result_t e_op_result{thrust::nullopt}; - - if (i < static_cast(num_edges_this_warp)) { - auto key_idx_this_warp = static_cast(thrust::distance( - this_warp_inclusive_sum_first, - thrust::upper_bound( - thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); - auto local_edge_offset = - warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + - static_cast(i - ((key_idx_this_warp == 0) ? edge_t{0} - : *(this_warp_inclusive_sum_first + - (key_idx_this_warp - 1)))); - if ((*edge_partition_e_mask).get(local_edge_offset)) { - auto key = *(key_first + (min_key_idx + key_idx_this_warp)); - e_op_result = call_e_op(key, local_edge_offset); - } - } + if (edge_partition_e_mask) { + for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { + e_op_result_t e_op_result{thrust::nullopt}; - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } - } else { - for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { - e_op_result_t e_op_result{thrust::nullopt}; - - if (i < static_cast(num_edges_this_warp)) { - auto key_idx_this_warp = static_cast(thrust::distance( - this_warp_inclusive_sum_first, - thrust::upper_bound( - thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); - auto local_edge_offset = - warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + - static_cast(i - ((key_idx_this_warp == 0) ? edge_t{0} - : *(this_warp_inclusive_sum_first + - (key_idx_this_warp - 1)))); + if (i < static_cast(num_edges_this_warp)) { + auto key_idx_this_warp = static_cast(thrust::distance( + this_warp_inclusive_sum_first, + thrust::upper_bound( + thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); + auto local_edge_offset = + warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + + static_cast(i - ((key_idx_this_warp == 0) ? 
edge_t{0} + : *(this_warp_inclusive_sum_first + + (key_idx_this_warp - 1)))); + if ((*edge_partition_e_mask).get(local_edge_offset)) { auto key = *(key_first + (min_key_idx + key_idx_this_warp)); e_op_result = call_e_op(key, local_edge_offset); } + } - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + } + } else { + for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { + e_op_result_t e_op_result{thrust::nullopt}; + + if (i < static_cast(num_edges_this_warp)) { + auto key_idx_this_warp = static_cast(thrust::distance( + this_warp_inclusive_sum_first, + thrust::upper_bound( + thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); + auto local_edge_offset = + warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + + static_cast(i - ((key_idx_this_warp == 0) ? edge_t{0} + : *(this_warp_inclusive_sum_first + + (key_idx_this_warp - 1)))); + auto key = *(key_first + (min_key_idx + key_idx_this_warp)); + e_op_result = call_e_op(key, local_edge_offset); } + + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } } @@ -315,8 +286,7 @@ __global__ static void extract_transform_v_frontier_e_hypersparse_or_low_degree( } } -template buffer_idx(*buffer_idx_ptr); - using WarpReduce = cub::WarpReduce; - __shared__ std::conditional_t - temp_storage[max_one_e_per_frontier_key - ? (extract_transform_v_frontier_e_kernel_block_size / raft::warp_size()) - : int32_t{1} /* dummy */]; - while (idx < static_cast(thrust::distance(key_first, key_last))) { auto key = *(key_first + idx); auto major = thrust_tuple_get_or_identity(key); auto major_offset = edge_partition.major_offset_from_major_nocheck(major); vertex_t const* indices{nullptr}; edge_t local_edge_offset{}; - edge_t local_out_degree{}; - thrust::tie(indices, local_edge_offset, local_out_degree) = + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = edge_partition.local_edges(major_offset); - auto rounded_up_local_out_degree = - ((static_cast(local_out_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * + auto rounded_up_local_degree = + ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * raft::warp_size(); auto call_e_op = call_e_op_t(local_out_degree)) && + if ((i < static_cast(local_degree)) && ((*edge_partition_e_mask).get(local_edge_offset + i))) { e_op_result = call_e_op(i); } - if constexpr (max_one_e_per_frontier_key) { - auto first_valid_lane_id = - WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) - .Reduce(e_op_result ? 
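// [Editor's sketch, not part of the patch] A host-side model of the
// load-balancing index math used by the hypersparse/low-degree kernel above
// (the new high-degree kernel applies the same mapping grid-wide via
// key_local_degree_offsets): given the local degrees of a warp's keys, an
// inclusive sum is built once, and each flat edge index i is mapped back to
// its key with an upper_bound search; the edge's offset within that key's
// adjacency list is i minus the previous inclusive sum.
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
  std::vector<int> local_degrees{3, 0, 5, 1};  // degrees of this warp's keys
  std::vector<int> inclusive_sums(local_degrees.size());
  std::inclusive_scan(local_degrees.begin(), local_degrees.end(), inclusive_sums.begin());
  int const num_edges = inclusive_sums.back();  // 9
  for (int i = 0; i < num_edges; ++i) {         // on the GPU, each lane strides by warp_size
    auto it          = std::upper_bound(inclusive_sums.begin(), inclusive_sums.end(), i);
    auto key_idx     = static_cast<int>(std::distance(inclusive_sums.begin(), it));
    int  edge_offset = i - ((key_idx == 0) ? 0 : inclusive_sums[key_idx - 1]);
    std::printf("edge %d -> key %d, local edge offset %d\n", i, key_idx, edge_offset);
  }
  return 0;
}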
lane_id : raft::warp_size(), cub::Min()); - first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); - if (lane_id == first_valid_lane_id) { - auto push_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); - push_buffer_element( - buffer_key_output_first, buffer_value_output_first, push_idx, e_op_result); - } - if (first_valid_lane_id != raft::warp_size()) { break; } - } else { - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } } else { - for (size_t i = lane_id; i < rounded_up_local_out_degree; i += raft::warp_size()) { + for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { e_op_result_t e_op_result{thrust::nullopt}; - if (i < static_cast(local_out_degree)) { e_op_result = call_e_op(i); } - - if constexpr (max_one_e_per_frontier_key) { - auto first_valid_lane_id = - WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) - .Reduce(e_op_result ? lane_id : raft::warp_size(), cub::Min()); - first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); - if (lane_id == first_valid_lane_id) { - auto push_buffer_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); - push_buffer_element( - buffer_key_output_first, buffer_value_output_first, push_buffer_idx, e_op_result); - } - if (first_valid_lane_id != raft::warp_size()) { break; } - } else { - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } + if (i < static_cast(local_degree)) { e_op_result = call_e_op(i); } + + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } } @@ -446,8 +382,7 @@ __global__ static void extract_transform_v_frontier_e_mid_degree( } } -template edge_partition, KeyIterator key_first, - KeyIterator key_last, + raft::device_span key_local_degree_offsets, EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, @@ -482,123 +417,234 @@ __global__ static void extract_transform_v_frontier_e_high_degree( typename EdgePartitionEdgeValueInputWrapper::value_type, EdgeOp>::type; - auto const warp_id = threadIdx.x / raft::warp_size(); + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; auto const lane_id = threadIdx.x % raft::warp_size(); - auto idx = static_cast(blockIdx.x); - - cuda::atomic_ref buffer_idx(*buffer_idx_ptr); - - using BlockReduce = cub::BlockReduce; - __shared__ std::conditional_t - temp_storage; - __shared__ int32_t output_thread_id; - - while (idx < static_cast(thrust::distance(key_first, key_last))) { - auto key = *(key_first + idx); - auto major = thrust_tuple_get_or_identity(key); - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - vertex_t const* indices{nullptr}; - edge_t local_edge_offset{}; - edge_t local_out_degree{}; - thrust::tie(indices, local_edge_offset, local_out_degree) = - edge_partition.local_edges(major_offset); - auto rounded_up_local_out_degree = ((static_cast(local_out_degree) + - (extract_transform_v_frontier_e_kernel_block_size - 1)) / - extract_transform_v_frontier_e_kernel_block_size) * - extract_transform_v_frontier_e_kernel_block_size; - auto call_e_op = call_e_op_t{edge_partition, - 
edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - key, - major_offset, - indices, - local_edge_offset}; + auto idx = static_cast(tid); - if (edge_partition_e_mask) { - for (size_t i = threadIdx.x; i < rounded_up_local_out_degree; i += blockDim.x) { - e_op_result_t e_op_result{thrust::nullopt}; - if ((i < static_cast(local_out_degree)) && - ((*edge_partition_e_mask).get(local_edge_offset + i))) { - e_op_result = call_e_op(i); - } + cuda::atomic_ref buffer_idx(*buffer_idx_ptr); - if constexpr (max_one_e_per_frontier_key) { - auto first_valid_thread_id = - BlockReduce(temp_storage) - .Reduce(e_op_result ? threadIdx.x : extract_transform_v_frontier_e_kernel_block_size, - cub::Min()); - if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } - __syncthreads(); - if (threadIdx.x == output_thread_id) { - auto push_buffer_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); - push_buffer_element( - buffer_key_output_first, buffer_value_output_first, push_buffer_idx, e_op_result); - } - if (output_thread_id != extract_transform_v_frontier_e_kernel_block_size) { break; } - } else { - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } - } - } else { - for (size_t i = threadIdx.x; i < rounded_up_local_out_degree; i += blockDim.x) { - e_op_result_t e_op_result{thrust::nullopt}; - if (i < static_cast(local_out_degree)) { e_op_result = call_e_op(i); } - - if constexpr (max_one_e_per_frontier_key) { - auto first_valid_thread_id = - BlockReduce(temp_storage) - .Reduce(e_op_result ? threadIdx.x : extract_transform_v_frontier_e_kernel_block_size, - cub::Min()); - if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } - __syncthreads(); - if (threadIdx.x == output_thread_id) { - auto push_buffer_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); - push_buffer_element( - buffer_key_output_first, buffer_value_output_first, push_buffer_idx, e_op_result); - } - if (output_thread_id != extract_transform_v_frontier_e_kernel_block_size) { break; } - } else { - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + auto num_edges = *(key_local_degree_offsets.rbegin()); + size_t rounded_up_num_edges = + ((static_cast(num_edges) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); + while (idx < rounded_up_num_edges) { + e_op_result_t e_op_result{thrust::nullopt}; + if (idx < num_edges) { + auto key_idx = thrust::distance( + key_local_degree_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, key_local_degree_offsets.begin() + 1, key_local_degree_offsets.end(), idx)); + auto key = *(key_first + key_idx); + auto major = thrust_tuple_get_or_identity(key); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t local_edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = + edge_partition.local_edges(major_offset); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + local_edge_offset}; + + auto e_idx = static_cast(idx - key_local_degree_offsets[key_idx]); + if (edge_partition_e_mask) { + if ((*edge_partition_e_mask).get(local_edge_offset + e_idx)) { + e_op_result = call_e_op(e_idx); } + } else { + e_op_result = 
call_e_op(e_idx); } } + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - idx += gridDim.x; + idx += gridDim.x * blockDim.x; + } +} + +template +void extract_transform_v_frontier_e_edge_partition( + raft::handle_t const& handle, + edge_partition_device_view_t edge_partition, + InputKeyIterator edge_partition_frontier_key_first, + InputKeyIterator edge_partition_frontier_key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + OptionalOutputKeyIterator output_key_first, + OptionalOutputValueIterator output_value_first, + raft::device_span count /* size = 1 */, + EdgeOp e_op, + std::optional> high_segment_key_local_degree_offsets, + std::optional high_segment_edge_count, + std::optional> key_segment_offsets, + std::optional> const& edge_partition_stream_pool_indices) +{ + size_t stream_pool_size{0}; + if (edge_partition_stream_pool_indices) { + stream_pool_size = (*edge_partition_stream_pool_indices).size(); + } + if (key_segment_offsets) { + if (((*key_segment_offsets)[1] > 0) && ((*high_segment_edge_count) > 0)) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[0 % stream_pool_size]) + : handle.get_stream(); + + raft::grid_1d_thread_t update_grid((*high_segment_edge_count), + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_high_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first, + raft::device_span((*high_segment_key_local_degree_offsets).data(), + (*high_segment_key_local_degree_offsets).size()), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } + if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[1 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_warp_t update_grid((*key_segment_offsets)[2] - (*key_segment_offsets)[1], + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_mid_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first + (*key_segment_offsets)[1], + edge_partition_frontier_key_first + (*key_segment_offsets)[2], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } + if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2] > 0) { + auto exec_stream = edge_partition_stream_pool_indices + ? 
handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[2 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_thread_t update_grid((*key_segment_offsets)[3] - (*key_segment_offsets)[2], + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_hypersparse_or_low_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first + (*key_segment_offsets)[2], + edge_partition_frontier_key_first + (*key_segment_offsets)[3], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } + if (edge_partition.dcs_nzd_vertex_count() && + ((*key_segment_offsets)[4] - (*key_segment_offsets)[3] > 0)) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[3 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_thread_t update_grid((*key_segment_offsets)[4] - (*key_segment_offsets)[3], + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_hypersparse_or_low_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first + (*key_segment_offsets)[3], + edge_partition_frontier_key_first + (*key_segment_offsets)[4], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } + } else { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[0 % stream_pool_size]) + : handle.get_stream(); + + auto frontier_size = static_cast( + thrust::distance(edge_partition_frontier_key_first, edge_partition_frontier_key_last)); + if (frontier_size > 0) { + raft::grid_1d_thread_t update_grid(frontier_size, + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + + extract_transform_v_frontier_e_hypersparse_or_low_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first, + edge_partition_frontier_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } } } template -std::tuple< - decltype(allocate_optional_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{})), - decltype(allocate_optional_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> +std::tuple, + optional_dataframe_buffer_type_t> extract_transform_v_frontier_e(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& frontier, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, @@ -607,7 +653,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, { using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; - using key_t = typename VertexFrontierBucketType::key_type; + using key_t = typename KeyBucketType::key_type; using output_key_t = OutputKeyT; using output_value_t = OutputValueT; @@ -653,6 +699,9 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, thrust::optional, thrust::optional>>>); + constexpr bool try_bitmap 
= GraphViewType::is_multi_gpu && std::is_same_v && + KeyBucketType::is_sorted_unique; + if (do_expensive_check) { auto frontier_vertex_first = thrust_tuple_get_or_identity(frontier.begin()); @@ -673,10 +722,15 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, "Invalid input argument: frontier includes out-of-range keys."); } + [[maybe_unused]] constexpr auto max_segments = + detail::num_sparse_segments_per_vertex_partition + size_t{1}; + + // 1. pre-process frontier data + auto frontier_key_first = frontier.begin(); auto frontier_key_last = frontier.end(); auto frontier_keys = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - if constexpr (!VertexFrontierBucketType::is_sorted_unique) { + if constexpr (!KeyBucketType::is_sorted_unique) { resize_dataframe_buffer(frontier_keys, frontier.size(), handle.get_stream()); thrust::copy(handle.get_thrust_policy(), frontier_key_first, @@ -689,209 +743,708 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, frontier_key_last = get_dataframe_buffer_end(frontier_keys); } - // 1. fill the buffers + std::optional> key_segment_offsets{std::nullopt}; + { // drop zero degree vertices & compute key_segment_offsets + size_t partition_idx{0}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + partition_idx = static_cast(minor_comm.get_rank()); + } + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + if (segment_offsets) { + if (thrust::distance(frontier_key_first, frontier_key_last) > 0) { + key_segment_offsets = compute_key_segment_offsets( + frontier_key_first, + frontier_key_last, + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); + (*key_segment_offsets).back() = *((*key_segment_offsets).rbegin() + 1); + frontier_key_last = frontier_key_first + (*key_segment_offsets).back(); + } else { + key_segment_offsets = std::vector((*segment_offsets).size(), 0); + } + } + } + + // 2. compute local max_pushes - auto key_buffer = - allocate_optional_dataframe_buffer(size_t{0}, handle.get_stream()); - auto value_buffer = - allocate_optional_dataframe_buffer(size_t{0}, handle.get_stream()); - rmm::device_scalar buffer_idx(size_t{0}, handle.get_stream()); + size_t local_max_pushes{}; + { + size_t partition_idx{}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + partition_idx = static_cast(minor_comm_rank); + } + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto frontier_major_first = + thrust_tuple_get_or_identity(frontier_key_first); + auto frontier_major_last = + thrust_tuple_get_or_identity(frontier_key_last); + // for an edge-masked graph, we can pass edge mask to compute tighter bound (at the expense of + // additional computing) + local_max_pushes = edge_partition.compute_number_of_edges( + frontier_major_first, frontier_major_last, handle.get_stream()); + } + + // 3. 
communication over minor_comm std::vector local_frontier_sizes{}; + std::conditional_t, std::byte /* dummy */> + max_tmp_buffer_sizes{}; + std::conditional_t, std::byte /* dummy */> + tmp_buffer_size_per_loop_approximations{}; + std::conditional_t, std::byte /* dummy */> + local_frontier_range_firsts{}; + std::conditional_t, std::byte /* dummy */> + local_frontier_range_lasts{}; + std::optional>> key_segment_offset_vectors{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - local_frontier_sizes = host_scalar_allgather( - minor_comm, - static_cast(thrust::distance(frontier_key_first, frontier_key_last)), - handle.get_stream()); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + auto max_tmp_buffer_size = + static_cast(static_cast(handle.get_device_properties().totalGlobalMem) * 0.2); + size_t approx_tmp_buffer_size_per_loop{}; + { + size_t key_size{0}; + if constexpr (std::is_arithmetic_v) { + key_size = sizeof(key_t); + } else { + key_size = cugraph::sum_thrust_tuple_element_sizes(); + } + size_t output_key_size{0}; + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + output_key_size = sizeof(output_key_t); + } else { + output_key_size = cugraph::sum_thrust_tuple_element_sizes(); + } + } + size_t output_value_size{0}; + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + output_value_size = sizeof(output_value_t); + } else { + output_value_size = cugraph::sum_thrust_tuple_element_sizes(); + } + } + approx_tmp_buffer_size_per_loop = + static_cast(thrust::distance(frontier_key_first, frontier_key_last)) * key_size + + local_max_pushes * (output_key_size + output_value_size); + } + + size_t num_scalars = + 3; // local_frontier_size, max_tmp_buffer_size, approx_tmp_buffer_size_per_loop + if constexpr (try_bitmap) { + num_scalars += 2; // local_frontier_range_first, local_frontier_range_last + } + if (key_segment_offsets) { num_scalars += (*key_segment_offsets).size(); } + rmm::device_uvector d_aggregate_tmps(minor_comm_size * num_scalars, + handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + d_aggregate_tmps.begin() + num_scalars * minor_comm_rank, + d_aggregate_tmps.begin() + (num_scalars * minor_comm_rank + (try_bitmap ? 
5 : 3)), + [frontier_key_first, + max_tmp_buffer_size, + approx_tmp_buffer_size_per_loop, + v_list_size = static_cast(thrust::distance(frontier_key_first, frontier_key_last)), + vertex_partition_range_first = + graph_view.local_vertex_partition_range_first()] __device__(size_t i) { + if (i == 0) { + return v_list_size; + } else if (i == 1) { + return max_tmp_buffer_size; + } else if (i == 2) { + return approx_tmp_buffer_size_per_loop; + } + if constexpr (try_bitmap) { + if (i == 3) { + vertex_t first{}; + if (v_list_size > 0) { + first = *frontier_key_first; + } else { + first = vertex_partition_range_first; + } + assert(static_cast(static_cast(first)) == first); + return static_cast(first); + } else if (i == 4) { + assert(i == 4); + vertex_t last{}; + if (v_list_size > 0) { + last = *(frontier_key_first + (v_list_size - 1)) + 1; + } else { + last = vertex_partition_range_first; + } + assert(static_cast(static_cast(last)) == last); + return static_cast(last); + } + } + assert(false); + return size_t{0}; + }); + if (key_segment_offsets) { + raft::update_device( + d_aggregate_tmps.data() + (minor_comm_rank * num_scalars + (try_bitmap ? 5 : 3)), + (*key_segment_offsets).data(), + (*key_segment_offsets).size(), + handle.get_stream()); + } + + if (minor_comm_size > 1) { + device_allgather(minor_comm, + d_aggregate_tmps.data() + minor_comm_rank * num_scalars, + d_aggregate_tmps.data(), + num_scalars, + handle.get_stream()); + } + + std::vector h_aggregate_tmps(d_aggregate_tmps.size()); + raft::update_host(h_aggregate_tmps.data(), + d_aggregate_tmps.data(), + d_aggregate_tmps.size(), + handle.get_stream()); + handle.sync_stream(); + local_frontier_sizes = std::vector(minor_comm_size); + max_tmp_buffer_sizes = std::vector(minor_comm_size); + tmp_buffer_size_per_loop_approximations = std::vector(minor_comm_size); + if constexpr (try_bitmap) { + local_frontier_range_firsts = std::vector(minor_comm_size); + local_frontier_range_lasts = std::vector(minor_comm_size); + } + if (key_segment_offsets) { + key_segment_offset_vectors = std::vector>{}; + (*key_segment_offset_vectors).reserve(minor_comm_size); + } + for (int i = 0; i < minor_comm_size; ++i) { + local_frontier_sizes[i] = h_aggregate_tmps[i * num_scalars]; + max_tmp_buffer_sizes[i] = h_aggregate_tmps[i * num_scalars + 1]; + tmp_buffer_size_per_loop_approximations[i] = h_aggregate_tmps[i * num_scalars + 2]; + if constexpr (try_bitmap) { + local_frontier_range_firsts[i] = + static_cast(h_aggregate_tmps[i * num_scalars + 3]); + local_frontier_range_lasts[i] = + static_cast(h_aggregate_tmps[i * num_scalars + 4]); + } + if (key_segment_offsets) { + (*key_segment_offset_vectors) + .emplace_back(h_aggregate_tmps.begin() + (i * num_scalars + (try_bitmap ? 5 : 3)), + h_aggregate_tmps.begin() + + (i * num_scalars + (try_bitmap ? 
5 : 3) + (*key_segment_offsets).size())); + } + } + } else { + local_frontier_sizes = std::vector{static_cast( + static_cast(thrust::distance(frontier_key_first, frontier_key_last)))}; + if (key_segment_offsets) { + key_segment_offset_vectors = std::vector>(1); + (*key_segment_offset_vectors)[0] = *key_segment_offsets; + } + } + + // update frontier bitmap (used to reduce broadcast volume) + + bool v_compressible{false}; + std:: + conditional_t>, std::byte /* dummy */> + frontier_bitmap{}; + std:: + conditional_t>, std::byte /* dummy */> + compressed_frontier{}; + if constexpr (try_bitmap) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + auto const minor_comm_rank = minor_comm.get_rank(); + + if constexpr (sizeof(vertex_t) == 8) { + vertex_t local_frontier_max_range_size{0}; + for (int i = 0; i < minor_comm_size; ++i) { + auto range_size = local_frontier_range_lasts[i] - local_frontier_range_firsts[i]; + local_frontier_max_range_size = std::max(range_size, local_frontier_max_range_size); + } + if (local_frontier_max_range_size <= + std::numeric_limits::max()) { // broadcast 32 bit offset values instead of 64 + // bit vertex IDs + v_compressible = true; + } + } + + double avg_fill_ratio{0.0}; + for (int i = 0; i < minor_comm_size; ++i) { + auto num_keys = static_cast(local_frontier_sizes[i]); + auto range_size = local_frontier_range_lasts[i] - local_frontier_range_firsts[i]; + avg_fill_ratio += + (range_size > 0) ? (num_keys / static_cast(range_size)) : double{0.0}; + } + avg_fill_ratio /= static_cast(minor_comm_size); + constexpr double threshold_ratio = + 8.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); + auto avg_frontier_size = + std::reduce(local_frontier_sizes.begin(), local_frontier_sizes.end()) / + static_cast(minor_comm_size); + + if ((avg_fill_ratio > threshold_ratio) && + (static_cast(avg_frontier_size) > + packed_bools_per_word() * + 32 /* tuning parameter, to consider additional kernel launch overhead */)) { + frontier_bitmap = + compute_vertex_list_bitmap_info(frontier_key_first, + frontier_key_last, + local_frontier_range_firsts[minor_comm_rank], + local_frontier_range_lasts[minor_comm_rank], + handle.get_stream()); + } else if (v_compressible) { + rmm::device_uvector tmps(local_frontier_sizes[minor_comm_rank], + handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + frontier_key_first, + frontier_key_last, + tmps.begin(), + cuda::proclaim_return_type( + [range_first = local_frontier_range_firsts[minor_comm_rank]] __device__( + auto v) { return static_cast(v - range_first); })); + compressed_frontier = std::move(tmps); + } + } + }
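// [Editor's sketch, not part of the patch] The broadcast-format decision made
// above, in isolation: a bitmap costs one bit per vertex in the partition
// range while a vertex list costs 8 * sizeof(vertex_t) bits per frontier
// vertex, so a bitmap wins once the fill ratio clears roughly
// 1 / (8 * sizeof(vertex_t)); the patch adds an 8x margin (threshold_ratio)
// and a minimum frontier size to amortize kernel-launch overhead. Constants
// mirror the code above but are tuning parameters.
#include <cstdint>
#include <cstdio>

int main()
{
  using vertex_t = int64_t;
  double const num_keys        = double(1 << 20);  // frontier vertices on this rank
  double const range_size      = double(1 << 24);  // local vertex partition range size
  double const fill_ratio      = num_keys / range_size;
  double const threshold_ratio = 8.0 /* tuning parameter */ / (sizeof(vertex_t) * 8.0);
  if (fill_ratio > threshold_ratio) {
    std::puts("broadcast the frontier as a bitmap");
  } else if (range_size <= double(UINT32_MAX)) {
    std::puts("broadcast 32-bit offsets instead of 64-bit vertex IDs");
  } else {
    std::puts("broadcast raw vertex IDs");
  }
  return 0;
}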
+ + // set-up stream pool + + std::optional> stream_pool_indices{std::nullopt}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + auto max_tmp_buffer_size = + std::reduce(max_tmp_buffer_sizes.begin(), max_tmp_buffer_sizes.end()) / + static_cast(minor_comm_size); + auto approx_tmp_buffer_size_per_loop = + std::reduce(tmp_buffer_size_per_loop_approximations.begin(), + tmp_buffer_size_per_loop_approximations.end()) / + static_cast(minor_comm_size); + size_t num_streams_per_loop{1}; + if (graph_view.local_vertex_partition_segment_offsets() && + (handle.get_stream_pool_size() >= max_segments)) { + num_streams_per_loop = std::max( + std::min(size_t{8} / graph_view.number_of_local_edge_partitions(), max_segments), + size_t{ + 1}); // Note that "CUDA_DEVICE_MAX_CONNECTIONS (default: 8, can be set to [1, 32])" sets + // the number of queues; if the total number of streams exceeds this number, jobs on + // different streams can be sent to one queue, leading to false dependencies. Setting + // num_concurrent_loops above the number of queues has some benefits in NCCL + // communications but creating too many streams just for compute may not help. + } + stream_pool_indices = init_stream_pool_indices(max_tmp_buffer_size, + approx_tmp_buffer_size_per_loop, + graph_view.number_of_local_edge_partitions(), + num_streams_per_loop, + handle.get_stream_pool_size()); + if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; } + } + + size_t num_concurrent_loops{1}; + std::optional> loop_stream_pool_indices{ + std::nullopt}; // first num_concurrent_loops streams from stream_pool_indices + if (stream_pool_indices) { + num_concurrent_loops = + std::min(graph_view.number_of_local_edge_partitions(), (*stream_pool_indices).size()); + loop_stream_pool_indices = std::vector(num_concurrent_loops); + std::iota((*loop_stream_pool_indices).begin(), (*loop_stream_pool_indices).end(), size_t{0}); } + rmm::device_uvector counters(num_concurrent_loops, handle.get_stream()); + + if constexpr (!GraphViewType::is_multi_gpu) { + if (loop_stream_pool_indices) { handle.sync_stream(); } + } + + // 4. fill the buffers + + std::vector> key_buffers{}; + std::vector> value_buffers{}; + key_buffers.reserve(graph_view.number_of_local_edge_partitions()); + value_buffers.reserve(graph_view.number_of_local_edge_partitions()); + auto edge_mask_view = graph_view.edge_mask_view(); - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(i)); - auto edge_partition_e_mask = - edge_mask_view - ? 
thrust::make_optional< - detail::edge_partition_edge_property_device_view_t>( - *edge_mask_view, i) - : thrust::nullopt; - - auto edge_partition_frontier_key_buffer = - allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - vertex_t edge_partition_frontier_size = static_cast(local_frontier_sizes[i]); - auto edge_partition_frontier_key_first = frontier_key_first; - auto edge_partition_frontier_key_last = frontier_key_last; + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i += num_concurrent_loops) { + auto loop_count = + std::min(num_concurrent_loops, graph_view.number_of_local_edge_partitions() - i); + + std::conditional_t< + GraphViewType::is_multi_gpu, + std::conditional_t< + try_bitmap, + std::vector, rmm::device_uvector>>, + std::vector>>, + std::byte /* dummy */> + edge_partition_key_buffers{}; if constexpr (GraphViewType::is_multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + edge_partition_key_buffers.reserve(loop_count); + + std::conditional_t>>, + std::byte /* dummy */> + edge_partition_bitmap_buffers{}; + if constexpr (try_bitmap) { + if (frontier_bitmap) { + edge_partition_bitmap_buffers = std::vector>{}; + (*edge_partition_bitmap_buffers).reserve(loop_count); + } + } - resize_dataframe_buffer( - edge_partition_frontier_key_buffer, edge_partition_frontier_size, handle.get_stream()); - - device_bcast(minor_comm, - frontier_key_first, - get_dataframe_buffer_begin(edge_partition_frontier_key_buffer), - edge_partition_frontier_size, - static_cast(i), - handle.get_stream()); - - edge_partition_frontier_key_first = - get_dataframe_buffer_begin(edge_partition_frontier_key_buffer); - edge_partition_frontier_key_last = - get_dataframe_buffer_end(edge_partition_frontier_key_buffer); - } - - auto edge_partition_frontier_major_first = - thrust_tuple_get_or_identity( - edge_partition_frontier_key_first); - auto edge_partition_frontier_major_last = - thrust_tuple_get_or_identity( - edge_partition_frontier_key_last); - - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - auto max_pushes = max_one_e_per_frontier_key ? 
edge_partition_frontier_size - : edge_partition.compute_number_of_edges( - edge_partition_frontier_major_first, - edge_partition_frontier_major_last, - handle.get_stream()); - - auto new_buffer_size = buffer_idx.value(handle.get_stream()) + max_pushes; - resize_optional_dataframe_buffer( - key_buffer, new_buffer_size, handle.get_stream()); - resize_optional_dataframe_buffer( - value_buffer, new_buffer_size, handle.get_stream()); - - edge_partition_src_input_device_view_t edge_partition_src_value_input{}; - edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; - if constexpr (GraphViewType::is_storage_transposed) { - edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input); - edge_partition_dst_value_input = - edge_partition_dst_input_device_view_t(edge_dst_value_input, i); - } else { - edge_partition_src_value_input = - edge_partition_src_input_device_view_t(edge_src_value_input, i); - edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + bool use_bitmap_buffer = false; + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { + (*edge_partition_bitmap_buffers) + .emplace_back(packed_bool_size(local_frontier_range_lasts[partition_idx] - + local_frontier_range_firsts[partition_idx]), + handle.get_stream()); + use_bitmap_buffer = true; + } + } + if (!use_bitmap_buffer) { + bool allocated{false}; + if constexpr (try_bitmap) { + if (v_compressible) { + edge_partition_key_buffers.push_back(rmm::device_uvector( + local_frontier_sizes[partition_idx], handle.get_stream())); + allocated = true; + } + } + if (!allocated) { + edge_partition_key_buffers.push_back(allocate_dataframe_buffer( + local_frontier_sizes[partition_idx], handle.get_stream())); + } + } + } + + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + if constexpr (try_bitmap) { + if (frontier_bitmap) { + device_bcast(minor_comm, + (*frontier_bitmap).data(), + get_dataframe_buffer_begin((*edge_partition_bitmap_buffers)[j]), + size_dataframe_buffer((*edge_partition_bitmap_buffers)[j]), + static_cast(partition_idx), + handle.get_stream()); + } else if (compressed_frontier) { + device_bcast(minor_comm, + (*compressed_frontier).data(), + get_dataframe_buffer_begin(std::get<0>(edge_partition_key_buffers[j])), + local_frontier_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } else { + device_bcast(minor_comm, + frontier_key_first, + get_dataframe_buffer_begin(std::get<1>(edge_partition_key_buffers[j])), + local_frontier_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } + } else { + device_bcast(minor_comm, + frontier_key_first, + get_dataframe_buffer_begin(edge_partition_key_buffers[j]), + local_frontier_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } + } + device_group_end(minor_comm); + if (loop_stream_pool_indices) { handle.sync_stream(); } + + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + std::variant, rmm::device_uvector> keys = + rmm::device_uvector(0, loop_stream); + if (v_compressible) { + std::get<0>(keys).resize(local_frontier_sizes[partition_idx], loop_stream); + } else { + keys = + rmm::device_uvector(local_frontier_sizes[partition_idx], loop_stream); + } + + auto& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + + auto range_first = local_frontier_range_firsts[partition_idx]; + auto range_last = local_frontier_range_lasts[partition_idx]; + if (keys.index() == 0) { + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + get_dataframe_buffer_begin(std::get<0>(keys)), + raft::device_span(counters.data() + j, + size_t{1}), // dummy, we already know the counts + uint32_t{0}, + static_cast(range_last - range_first), + loop_stream); + } else { + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + get_dataframe_buffer_begin(std::get<1>(keys)), + raft::device_span(counters.data() + j, + size_t{1}), // dummy, we already know the counts + range_first, + range_last, + loop_stream); + } + + edge_partition_key_buffers.push_back(std::move(keys)); + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + (*edge_partition_bitmap_buffers).clear(); + } + } } - auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, i); - if (segment_offsets) { - static_assert(num_sparse_segments_per_vertex_partition == 3); - std::vector h_thresholds(num_sparse_segments_per_vertex_partition + - (graph_view.use_dcs() ? 1 : 0) - 1); - h_thresholds[0] = edge_partition.major_range_first() + (*segment_offsets)[1]; - h_thresholds[1] = edge_partition.major_range_first() + (*segment_offsets)[2]; - if (graph_view.use_dcs()) { - h_thresholds[2] = edge_partition.major_range_first() + (*segment_offsets)[3]; + std::vector> output_key_buffers{}; + output_key_buffers.reserve(loop_count); + std::vector> output_value_buffers{}; + output_value_buffers.reserve(loop_count); + std::vector edge_partition_max_push_counts(loop_count); + + std::optional>> + high_segment_key_local_degree_offset_vectors{std::nullopt}; + std::optional> high_segment_edge_counts{std::nullopt}; + if (key_segment_offset_vectors) { + high_segment_key_local_degree_offset_vectors = std::vector>{}; + (*high_segment_key_local_degree_offset_vectors).reserve(loop_count); + high_segment_edge_counts = std::vector(loop_count); + } + + edge_partition_max_push_counts[0] = local_max_pushes; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? 
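// [Editor's sketch, not part of the patch] What retrieve_vertex_list_from_bitmap
// recovers on the receiving side, modeled on the host: every set bit of the
// broadcast bitmap becomes one vertex ID (or one 32-bit offset on the
// compressed path). Assumes 32-bit packed-bool words, least-significant bit
// first; the real code does this in a kernel with a device-side counter.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
  uint32_t const range_first = 1000;
  std::vector<uint32_t> bitmap{0x00000005u, 0x80000000u};  // bits 0, 2, and 63 set
  std::vector<uint32_t> vertices;
  for (std::size_t word = 0; word < bitmap.size(); ++word) {
    for (int bit = 0; bit < 32; ++bit) {
      if (bitmap[word] & (uint32_t{1} << bit)) {
        vertices.push_back(range_first + static_cast<uint32_t>(word * 32 + bit));
      }
    }
  }
  for (auto v : vertices) { std::printf("%u ", v); }  // prints: 1000 1002 1063
  std::printf("\n");
  return 0;
}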
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + if (static_cast(partition_idx) != minor_comm_rank) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + + auto const& keys = edge_partition_key_buffers[j]; + + bool computed{false}; + if constexpr (try_bitmap) { + if (keys.index() == 0) { + auto major_first = thrust::make_transform_iterator( + std::get<0>(keys).begin(), + cuda::proclaim_return_type( + [range_first = + local_frontier_range_firsts[partition_idx]] __device__(uint32_t v_offset) { + return range_first + static_cast(v_offset); + })); + edge_partition.compute_number_of_edges_async( + major_first, + major_first + std::get<0>(keys).size(), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + computed = true; + } + } + if (!computed) { + dataframe_buffer_const_iterator_type_t key_first{}; + size_t num_keys{}; + if constexpr (try_bitmap) { + assert(keys.index() == 1); + key_first = get_dataframe_buffer_begin(std::get<1>(keys)); + num_keys = std::get<1>(keys).size(); + } else { + key_first = get_dataframe_buffer_begin(keys); + num_keys = size_dataframe_buffer(keys); + } + auto major_first = thrust_tuple_get_or_identity(key_first); + edge_partition.compute_number_of_edges_async( + major_first, + major_first + num_keys, + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + raft::update_host( + edge_partition_max_push_counts.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); + if (static_cast(minor_comm_rank / num_concurrent_loops) == + (i / num_concurrent_loops)) { + edge_partition_max_push_counts[minor_comm_rank % num_concurrent_loops] = local_max_pushes; + } } - rmm::device_uvector d_thresholds(h_thresholds.size(), handle.get_stream()); - raft::update_device( - d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), handle.get_stream()); - rmm::device_uvector d_offsets(d_thresholds.size(), handle.get_stream()); - thrust::lower_bound(handle.get_thrust_policy(), - edge_partition_frontier_major_first, - edge_partition_frontier_major_last, - d_thresholds.begin(), - d_thresholds.end(), - d_offsets.begin()); - std::vector h_offsets(d_offsets.size()); - raft::update_host(h_offsets.data(), d_offsets.data(), d_offsets.size(), handle.get_stream()); - RAFT_CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - h_offsets.push_back(edge_partition_frontier_size); - // FIXME: we may further improve performance by 1) concurrently running kernels on different - // segments; 2) individually tuning block sizes for different segments; and 3) adding one - // more segment for very high degree vertices and running segmented reduction - if (h_offsets[0] > 0) { - raft::grid_1d_block_t update_grid(h_offsets[0], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_high_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first, - edge_partition_frontier_key_first + h_offsets[0], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), - e_op); + } + + if (key_segment_offset_vectors) { + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + 
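/*
 * Each remote partition's frontier edge count is computed asynchronously into
 * counters[j] on that loop's stream; a single raft::update_host() plus one
 * stream sync then replaces what would otherwise be one blocking reduction
 * per partition. The pattern, with hypothetical loop_streams[] / h_counts[]
 * standing in for the per-iteration streams and host buffer:
 *
 *   for (size_t j = 0; j < loop_count; ++j) {  // enqueue only; no host sync here
 *     edge_partition.compute_number_of_edges_async(
 *       major_first,
 *       major_last,
 *       raft::device_span<size_t>(counters.data() + j, size_t{1}),
 *       loop_streams[j]);
 *   }
 *   raft::update_host(h_counts.data(), counters.data(), loop_count, handle.get_stream());
 *   handle.sync_stream();                      // single synchronization point
 */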
j; + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + rmm::device_uvector high_segment_key_local_degree_offsets( + key_segment_offsets[1] + 1, loop_stream); + high_segment_key_local_degree_offsets.set_element_to_zero_async(0, loop_stream); + bool computed{false}; + if constexpr (try_bitmap) { + auto const& keys = edge_partition_key_buffers[j]; + if (keys.index() == 0) { + auto key_local_degree_first = thrust::make_transform_iterator( + std::get<0>(keys).begin(), + cuda::proclaim_return_type( + [edge_partition, + range_first = + local_frontier_range_firsts[partition_idx]] __device__(uint32_t v_offset) { + auto major = range_first + static_cast(v_offset); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + return static_cast(edge_partition.local_degree(major_offset)); + })); + thrust::inclusive_scan(rmm::exec_policy_nosync(loop_stream), + key_local_degree_first, + key_local_degree_first + key_segment_offsets[1], + high_segment_key_local_degree_offsets.begin() + 1); + computed = true; + } + } + if (!computed) { + auto key_first = frontier_key_first; + if constexpr (GraphViewType::is_multi_gpu) { + auto const& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + assert(keys.index() == 1); + key_first = get_dataframe_buffer_begin(std::get<1>(keys)); + } else { + key_first = get_dataframe_buffer_begin(keys); + } + } + auto key_local_degree_first = thrust::make_transform_iterator( + key_first, cuda::proclaim_return_type([edge_partition] __device__(auto key) { + auto major = thrust_tuple_get_or_identity(key); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + return static_cast(edge_partition.local_degree(major_offset)); + })); + thrust::inclusive_scan(rmm::exec_policy_nosync(loop_stream), + key_local_degree_first, + key_local_degree_first + key_segment_offsets[1], + high_segment_key_local_degree_offsets.begin() + 1); + } + raft::update_host((*high_segment_edge_counts).data() + j, + high_segment_key_local_degree_offsets.data() + key_segment_offsets[1], + 1, + loop_stream); + (*high_segment_key_local_degree_offset_vectors) + .push_back(std::move(high_segment_key_local_degree_offsets)); } - if (h_offsets[1] - h_offsets[0] > 0) { - raft::grid_1d_warp_t update_grid(h_offsets[1] - h_offsets[0], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_mid_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first + h_offsets[0], - edge_partition_frontier_key_first + h_offsets[1], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), - e_op); + + // to ensure that *high_segment_edge_counts[] is valid + if (loop_stream_pool_indices) { + handle.sync_stream_pool(*loop_stream_pool_indices); + } else { + handle.sync_stream(); } - if (h_offsets[2] - h_offsets[1] > 0) { - raft::grid_1d_thread_t update_grid(h_offsets[2] - h_offsets[1], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - 
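/*
 * The high-degree segment builds a CSR-style offset array over per-key local
 * degrees: element 0 is zeroed asynchronously, an inclusive scan fills
 * elements 1..n, and the last element is the segment's total edge count
 * (later copied to the host as (*high_segment_edge_counts)[j]). Reduced to
 * its essence:
 *
 *   rmm::device_uvector<size_t> offsets(num_keys + 1, stream);
 *   offsets.set_element_to_zero_async(0, stream);
 *   thrust::inclusive_scan(rmm::exec_policy_nosync(stream),
 *                          degree_first,       // iterator over per-key local degrees
 *                          degree_first + num_keys,
 *                          offsets.begin() + 1);
 *   // offsets[num_keys] == total edge count of the high-degree segment
 */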
extract_transform_v_frontier_e_hypersparse_or_low_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first + h_offsets[1], - edge_partition_frontier_key_first + h_offsets[2], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), - e_op); + } + + for (size_t j = 0; j < loop_count; ++j) { + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + output_key_buffers.push_back(allocate_optional_dataframe_buffer( + edge_partition_max_push_counts[j], loop_stream)); + output_value_buffers.push_back(allocate_optional_dataframe_buffer( + edge_partition_max_push_counts[j], loop_stream)); + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + + thrust::fill( + handle.get_thrust_policy(), counters.begin(), counters.begin() + loop_count, size_t{0}); + if (loop_stream_pool_indices) { handle.sync_stream(); } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, partition_idx) + : thrust::nullopt; + size_t num_streams_per_loop{1}; + if (stream_pool_indices) { + assert((*stream_pool_indices).size() >= num_concurrent_loops); + num_streams_per_loop = (*stream_pool_indices).size() / num_concurrent_loops; } - if (edge_partition.dcs_nzd_vertex_count() && (h_offsets[3] - h_offsets[2] > 0)) { - raft::grid_1d_thread_t update_grid(h_offsets[3] - h_offsets[2], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_hypersparse_or_low_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first + h_offsets[2], - edge_partition_frontier_key_first + h_offsets[3], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), - e_op); + auto edge_partition_stream_pool_indices = + stream_pool_indices + ? 
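/*
 * The stream pool is sliced evenly across the concurrent loop iterations:
 * partition j receives num_streams_per_loop consecutive stream indices as a
 * span, so different edge partitions (and different degree segments within
 * one partition) never contend for the same stream. Sketch of the slicing:
 *
 *   size_t num_streams_per_loop = (*stream_pool_indices).size() / num_concurrent_loops;
 *   auto per_loop_streams = raft::host_span<size_t const>(
 *     (*stream_pool_indices).data() + j * num_streams_per_loop, num_streams_per_loop);
 */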
std::make_optional>( + (*stream_pool_indices).data() + j * num_streams_per_loop, num_streams_per_loop) + : std::nullopt; + + edge_partition_src_input_device_view_t edge_partition_src_value_input{}; + edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; + if constexpr (GraphViewType::is_storage_transposed) { + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input, partition_idx); + } else { + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input, partition_idx); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input); } - } else { - if (edge_partition_frontier_size > 0) { - raft::grid_1d_thread_t update_grid(edge_partition_frontier_size, - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - - extract_transform_v_frontier_e_hypersparse_or_low_degree - <<>>( + auto edge_partition_e_value_input = + edge_partition_e_input_device_view_t(edge_value_input, partition_idx); + + bool computed{false}; + if constexpr (try_bitmap) { + auto const& keys = edge_partition_key_buffers[j]; + if (keys.index() == 0) { + auto edge_partition_frontier_key_first = thrust::make_transform_iterator( + std::get<0>(keys).begin(), + cuda::proclaim_return_type( + [range_first = local_frontier_range_firsts[partition_idx]] __device__( + uint32_t v_offset) { return range_first + static_cast(v_offset); })); + auto edge_partition_frontier_key_last = + edge_partition_frontier_key_first + std::get<0>(keys).size(); + extract_transform_v_frontier_e_edge_partition( + handle, edge_partition, edge_partition_frontier_key_first, edge_partition_frontier_key_last, @@ -899,24 +1452,150 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, edge_partition_dst_value_input, edge_partition_e_value_input, edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), - e_op); + get_optional_dataframe_buffer_begin(output_key_buffers[j]), + get_optional_dataframe_buffer_begin(output_value_buffers[j]), + raft::device_span(counters.data() + j, size_t{1}), + e_op, + high_segment_key_local_degree_offset_vectors + ? std::make_optional>( + (*high_segment_key_local_degree_offset_vectors)[j].data(), + (*high_segment_key_local_degree_offset_vectors)[j].size()) + : std::nullopt, + high_segment_edge_counts ? std::make_optional((*high_segment_edge_counts)[j]) + : std::nullopt, + key_segment_offset_vectors ? 
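/*
 * When vertex_t is 8 bytes but the local vertex partition range fits in 4,
 * the frontier travels as uint32_t offsets from range_first and is
 * decompressed on the fly; the kernels only ever see full vertex ids. The
 * decompression is just a transform iterator:
 *
 *   auto key_first = thrust::make_transform_iterator(
 *     offsets32.begin(),                       // rmm::device_uvector<uint32_t>
 *     cuda::proclaim_return_type<vertex_t>(
 *       [range_first] __device__(uint32_t v_offset) {
 *         return range_first + static_cast<vertex_t>(v_offset);
 *       }));
 */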
std::make_optional>( + (*key_segment_offset_vectors)[partition_idx].data(), + (*key_segment_offset_vectors)[partition_idx].size()) + : std::nullopt, + edge_partition_stream_pool_indices); + computed = true; + } + } + if (!computed) { + auto edge_partition_frontier_key_first = frontier_key_first; + auto edge_partition_frontier_key_last = frontier_key_last; + if constexpr (GraphViewType::is_multi_gpu) { + auto const& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + assert(keys.index() == 1); + edge_partition_frontier_key_first = std::get<1>(keys).begin(); + edge_partition_frontier_key_last = std::get<1>(keys).end(); + } else { + edge_partition_frontier_key_first = get_dataframe_buffer_begin(keys); + edge_partition_frontier_key_last = get_dataframe_buffer_end(keys); + } + } + + extract_transform_v_frontier_e_edge_partition( + handle, + edge_partition, + edge_partition_frontier_key_first, + edge_partition_frontier_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + get_optional_dataframe_buffer_begin(output_key_buffers[j]), + get_optional_dataframe_buffer_begin(output_value_buffers[j]), + raft::device_span(counters.data() + j, size_t{1}), + e_op, + high_segment_key_local_degree_offset_vectors + ? std::make_optional>( + (*high_segment_key_local_degree_offset_vectors)[j].data(), + (*high_segment_key_local_degree_offset_vectors)[j].size()) + : std::nullopt, + high_segment_edge_counts ? std::make_optional((*high_segment_edge_counts)[j]) + : std::nullopt, + key_segment_offset_vectors ? std::make_optional>( + (*key_segment_offset_vectors)[partition_idx].data(), + (*key_segment_offset_vectors)[partition_idx].size()) + : std::nullopt, + edge_partition_stream_pool_indices); } } - } - // 2. resize and return the buffers + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } - auto new_buffer_size = buffer_idx.value(handle.get_stream()); + std::vector h_counts(loop_count); + raft::update_host(h_counts.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); - resize_optional_dataframe_buffer(key_buffer, new_buffer_size, handle.get_stream()); - shrink_to_fit_optional_dataframe_buffer(key_buffer, handle.get_stream()); + for (size_t j = 0; j < loop_count; ++j) { + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); - resize_optional_dataframe_buffer( - value_buffer, new_buffer_size, handle.get_stream()); - shrink_to_fit_optional_dataframe_buffer(value_buffer, handle.get_stream()); + auto tmp_buffer_size = h_counts[j]; + if (tmp_buffer_size > 0) { + auto& tmp_key_buffer = output_key_buffers[j]; + auto& tmp_value_buffer = output_value_buffers[j]; + + resize_optional_dataframe_buffer( + tmp_key_buffer, tmp_buffer_size, loop_stream); + // skip shrink_to_fit before return to cut execution time + + resize_optional_dataframe_buffer( + tmp_value_buffer, tmp_buffer_size, loop_stream); + // skip shrink_to_fit before return to cut execution time + + key_buffers.push_back(std::move(tmp_key_buffer)); + value_buffers.push_back(std::move(tmp_value_buffer)); + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + } + + // 3. 
concatenate and return the buffers + + auto key_buffer = allocate_optional_dataframe_buffer(0, handle.get_stream()); + auto value_buffer = allocate_optional_dataframe_buffer(0, handle.get_stream()); + if (key_buffers.size() == 0) { + /* nothing to do */ + } else if (key_buffers.size() == 1) { + key_buffer = std::move(key_buffers[0]); + value_buffer = std::move(value_buffers[0]); + shrink_to_fit_optional_dataframe_buffer(key_buffer, handle.get_stream()); + shrink_to_fit_optional_dataframe_buffer(value_buffer, handle.get_stream()); + } else { + std::vector buffer_sizes(key_buffers.size()); + static_assert(!std::is_same_v || !std::is_same_v); + for (size_t i = 0; i < key_buffers.size(); ++i) { + if constexpr (!std::is_same_v) { + buffer_sizes[i] = size_optional_dataframe_buffer(key_buffers[i]); + } else { + buffer_sizes[i] = size_optional_dataframe_buffer(value_buffers[i]); + } + } + auto buffer_size = std::reduce(buffer_sizes.begin(), buffer_sizes.end()); + resize_optional_dataframe_buffer(key_buffer, buffer_size, handle.get_stream()); + resize_optional_dataframe_buffer( + value_buffer, buffer_size, handle.get_stream()); + std::vector buffer_displacements(buffer_sizes.size()); + std::exclusive_scan( + buffer_sizes.begin(), buffer_sizes.end(), buffer_displacements.begin(), size_t{0}); + handle.sync_stream(); + for (size_t i = 0; i < key_buffers.size(); ++i) { + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[i]) + : handle.get_stream(); + if constexpr (!std::is_same_v) { + thrust::copy( + rmm::exec_policy_nosync(loop_stream), + get_optional_dataframe_buffer_cbegin(key_buffers[i]), + get_optional_dataframe_buffer_cend(key_buffers[i]), + get_optional_dataframe_buffer_begin(key_buffer) + buffer_displacements[i]); + } + + if constexpr (!std::is_same_v) { + thrust::copy(rmm::exec_policy_nosync(loop_stream), + get_optional_dataframe_buffer_cbegin(value_buffers[i]), + get_optional_dataframe_buffer_cend(value_buffers[i]), + get_optional_dataframe_buffer_begin(value_buffer) + + buffer_displacements[i]); + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + } return std::make_tuple(std::move(key_buffer), std::move(value_buffer)); } diff --git a/cpp/src/prims/detail/multi_stream_utils.cuh b/cpp/src/prims/detail/multi_stream_utils.cuh new file mode 100644 index 00000000000..76ef3fb0de4 --- /dev/null +++ b/cpp/src/prims/detail/multi_stream_utils.cuh @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +#include +#include + +#include +#include +#include + +#include + +#include +#include + +namespace cugraph { + +namespace detail { + +inline std::vector init_stream_pool_indices(size_t max_tmp_buffer_size, + size_t approx_tmp_buffer_size_per_loop, + size_t loop_count, + size_t num_streams_per_loop, + size_t max_streams) +{ + size_t num_streams = std::min(loop_count * num_streams_per_loop, + raft::round_down_safe(max_streams, num_streams_per_loop)); + + auto num_concurrent_loops = + (approx_tmp_buffer_size_per_loop > 0) + ? std::max(max_tmp_buffer_size / approx_tmp_buffer_size_per_loop, size_t{1}) + : loop_count; + num_streams = std::min(num_concurrent_loops * num_streams_per_loop, num_streams); + + std::vector stream_pool_indices(num_streams); + std::iota(stream_pool_indices.begin(), stream_pool_indices.end(), size_t{0}); + + return stream_pool_indices; +} + +// this assumes that the caller already knows how many items will be copied. +template +void copy_if_nosync(InputIterator input_first, + InputIterator input_last, + FlagIterator flag_first, + OutputIterator output_first, + raft::device_span count /* size = 1 */, + rmm::cuda_stream_view stream_view) +{ + CUGRAPH_EXPECTS( + static_cast(thrust::distance(input_first, input_last)) <= + static_cast(std::numeric_limits::max()), + "cugraph::detail::copy_if_nosync relies on cub::DeviceSelect::Flagged which uses int for input " + "size, but thrust::distance(input_first, input_last) exceeds std::numeric_limits::max()."); + + size_t tmp_storage_bytes{0}; + size_t input_size = static_cast(thrust::distance(input_first, input_last)); + + cub::DeviceSelect::Flagged(static_cast(nullptr), + tmp_storage_bytes, + input_first, + flag_first, + output_first, + count.data(), + input_size, + stream_view); + + auto d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, stream_view); + + cub::DeviceSelect::Flagged(d_tmp_storage.data(), + tmp_storage_bytes, + input_first, + flag_first, + output_first, + count.data(), + input_size, + stream_view); +} + +template +void count_nosync(InputIterator input_first, + InputIterator input_last, + raft::device_span count /* size = 1 */, + typename thrust::iterator_traits::value_type value, + rmm::cuda_stream_view stream_view) +{ + CUGRAPH_EXPECTS( + static_cast(thrust::distance(input_first, input_last)) <= + static_cast(std::numeric_limits::max()), + "cugraph::detail::count_nosync relies on cub::DeviceReduce::Sum which uses int for input size, " + "but thrust::distance(input_first, input_last) exceeds std::numeric_limits::max()."); + + size_t tmp_storage_bytes{0}; + size_t input_size = static_cast(thrust::distance(input_first, input_last)); + + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + input_first, + count.data(), + input_size, + stream_view); + + auto d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, stream_view); + + cub::DeviceReduce::Sum( + d_tmp_storage.data(), tmp_storage_bytes, input_first, count.data(), input_size, stream_view); +} + +template +void sum_nosync( + InputIterator input_first, + InputIterator input_last, + raft::device_span::value_type> sum /* size = 1 */, + rmm::cuda_stream_view stream_view) +{ + CUGRAPH_EXPECTS( + static_cast(thrust::distance(input_first, input_last)) <= + static_cast(std::numeric_limits::max()), + "cugraph::detail::count_nosync relies on cub::DeviceReduce::Sum which uses int for input size, " + "but thrust::distance(input_first, input_last) exceeds std::numeric_limits::max()."); + + size_t tmp_storage_bytes{0}; + size_t 
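/*
 * copy_if_nosync(), count_nosync() and sum_nosync() all follow CUB's
 * two-phase calling convention: the first call with a null temp-storage
 * pointer only computes tmp_storage_bytes, the second call does the work, and
 * neither synchronizes the stream (callers read the device-side result only
 * after an explicit sync). Canonical shape of the pattern:
 *
 *   size_t tmp_storage_bytes{0};
 *   cub::DeviceReduce::Sum(nullptr, tmp_storage_bytes,
 *                          input_first, count.data(), input_size, stream_view);  // size query
 *   rmm::device_uvector<std::byte> d_tmp_storage(tmp_storage_bytes, stream_view);
 *   cub::DeviceReduce::Sum(d_tmp_storage.data(), tmp_storage_bytes,
 *                          input_first, count.data(), input_size, stream_view);  // reduction
 */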
input_size = static_cast(thrust::distance(input_first, input_last)); + + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + input_first, + sum.data(), + input_size, + stream_view); + + auto d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, stream_view); + + cub::DeviceReduce::Sum( + d_tmp_storage.data(), tmp_storage_bytes, input_first, sum.data(), input_size, stream_view); +} + +} // namespace detail + +} // namespace cugraph diff --git a/cpp/src/prims/detail/optional_dataframe_buffer.hpp b/cpp/src/prims/detail/optional_dataframe_buffer.hpp index 87c095f8e81..6657b91f13b 100644 --- a/cpp/src/prims/detail/optional_dataframe_buffer.hpp +++ b/cpp/src/prims/detail/optional_dataframe_buffer.hpp @@ -26,152 +26,130 @@ namespace detail { // we cannot use thrust::iterator_traits::value_type if Iterator is void* (reference to // void is not allowed) template -struct optional_dataframe_buffer_value_type_t; +struct optional_dataframe_buffer_iterator_value_type_t; template -struct optional_dataframe_buffer_value_type_t>> { +struct optional_dataframe_buffer_iterator_value_type_t< + Iterator, + std::enable_if_t>> { using value = typename thrust::iterator_traits::value_type; }; template -struct optional_dataframe_buffer_value_type_t>> { +struct optional_dataframe_buffer_iterator_value_type_t< + Iterator, + std::enable_if_t>> { using value = void; }; -template >* = nullptr> -std::byte allocate_optional_dataframe_buffer(size_t size, rmm::cuda_stream_view stream) -{ - return std::byte{0}; // dummy -} - -template >* = nullptr> +template auto allocate_optional_dataframe_buffer(size_t size, rmm::cuda_stream_view stream) { - return allocate_dataframe_buffer(size, stream); + if constexpr (std::is_same_v) { + return std::byte{0}; // dummy + } else { + return allocate_dataframe_buffer(size, stream); + } } -template >* = nullptr> -void* get_optional_dataframe_buffer_begin(std::byte& optional_dataframe_buffer) -{ - return static_cast(nullptr); -} +template +struct optional_dataframe_buffer_type { + using type = decltype(allocate_optional_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{})); +}; -template >* = nullptr> -auto get_optional_dataframe_buffer_begin( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer) -{ - return get_dataframe_buffer_begin(optional_dataframe_buffer); -} +template +using optional_dataframe_buffer_type_t = typename optional_dataframe_buffer_type::type; -template >* = nullptr> -void* get_optional_dataframe_buffer_end(std::byte& optional_dataframe_buffer) +template +auto get_optional_dataframe_buffer_begin( + optional_dataframe_buffer_type_t& optional_dataframe_buffer) { - return static_cast(nullptr); + if constexpr (std::is_same_v) { + return static_cast(nullptr); + } else { + return get_dataframe_buffer_begin(optional_dataframe_buffer); + } } -template >* = nullptr> +template auto get_optional_dataframe_buffer_end( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer) + optional_dataframe_buffer_type_t& optional_dataframe_buffer) { - return get_dataframe_buffer_end(optional_dataframe_buffer); + if constexpr (std::is_same_v) { + return static_cast(nullptr); + } else { + return get_dataframe_buffer_end(optional_dataframe_buffer); + } } -template >* = nullptr> -void const* get_optional_dataframe_buffer_cbegin(std::byte const& optional_dataframe_buffer) -{ - return static_cast(nullptr); -} - -template >* = nullptr> +template auto get_optional_dataframe_buffer_cbegin( - std::decay_t(size_t{0}, 
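/*
 * This refactor collapses each pair of SFINAE-selected overloads (a
 * std::byte dummy for the "no buffer" void case, a real dataframe buffer
 * otherwise) into one function that branches at compile time. The shape of
 * every helper after the change, with the stripped template arguments
 * spelled out:
 *
 *   template <typename T>
 *   auto allocate_optional_dataframe_buffer(size_t size, rmm::cuda_stream_view stream)
 *   {
 *     if constexpr (std::is_same_v<T, void>) {
 *       return std::byte{0};                   // dummy; no buffer is needed
 *     } else {
 *       return allocate_dataframe_buffer<T>(size, stream);
 *     }
 *   }
 */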
rmm::cuda_stream_view{}))> const& - optional_dataframe_buffer) + optional_dataframe_buffer_type_t const& optional_dataframe_buffer) { - return get_dataframe_buffer_cbegin(optional_dataframe_buffer); + if constexpr (std::is_same_v) { + return static_cast(nullptr); + } else { + return get_dataframe_buffer_cbegin(optional_dataframe_buffer); + } } -template >* = nullptr> -void const* get_optional_dataframe_buffer_cend(std::byte const& optional_dataframe_buffer) -{ - return static_cast(nullptr); -} - -template >* = nullptr> +template auto get_optional_dataframe_buffer_cend( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))> const& - optional_dataframe_buffer) -{ - return get_dataframe_buffer_cend(optional_dataframe_buffer); -} - -template >* = nullptr> -void reserve_optional_dataframe_buffer(std::byte& optional_dataframe_buffer, - size_t new_buffer_capacity, - rmm::cuda_stream_view stream_view) + optional_dataframe_buffer_type_t const& optional_dataframe_buffer) { - return; + if constexpr (std::is_same_v) { + return static_cast(nullptr); + } else { + return get_dataframe_buffer_cend(optional_dataframe_buffer); + } } -template >* = nullptr> +template void reserve_optional_dataframe_buffer( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer, + optional_dataframe_buffer_type_t& optional_dataframe_buffer, size_t new_buffer_capacity, rmm::cuda_stream_view stream_view) { - return reserve_dataframe_buffer(optional_dataframe_buffer, new_buffer_capacity, stream_view); -} - -template >* = nullptr> -void resize_optional_dataframe_buffer(std::byte& optional_dataframe_buffer, - size_t new_buffer_size, - rmm::cuda_stream_view stream_view) -{ - return; + if constexpr (std::is_same_v) { + return; + } else { + return reserve_dataframe_buffer(optional_dataframe_buffer, new_buffer_capacity, stream_view); + } } -template >* = nullptr> +template void resize_optional_dataframe_buffer( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer, + optional_dataframe_buffer_type_t& optional_dataframe_buffer, size_t new_buffer_size, rmm::cuda_stream_view stream_view) { - return resize_dataframe_buffer(optional_dataframe_buffer, new_buffer_size, stream_view); + if constexpr (std::is_same_v) { + return; + } else { + return resize_dataframe_buffer(optional_dataframe_buffer, new_buffer_size, stream_view); + } } -template >* = nullptr> -void shrink_to_fit_optional_dataframe_buffer(std::byte& optional_dataframe_buffer, - rmm::cuda_stream_view stream_view) -{ - return; -} - -template >* = nullptr> +template void shrink_to_fit_optional_dataframe_buffer( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer, - rmm::cuda_stream_view stream_view) -{ - return shrink_to_fit_dataframe_buffer(optional_dataframe_buffer, stream_view); -} - -template >* = nullptr> -size_t size_optional_dataframe_buffer(std::byte const& optional_dataframe_buffer) + optional_dataframe_buffer_type_t& optional_dataframe_buffer, rmm::cuda_stream_view stream_view) { - return size_t{0}; + if constexpr (std::is_same_v) { + return; + } else { + return shrink_to_fit_dataframe_buffer(optional_dataframe_buffer, stream_view); + } } -template >* = nullptr> +template size_t size_optional_dataframe_buffer( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))> const& - optional_dataframe_buffer) + optional_dataframe_buffer_type_t& optional_dataframe_buffer) { - return size_dataframe_buffer(optional_dataframe_buffer); + if constexpr (std::is_same_v) { + return size_t{0}; + } else { + return 
size_dataframe_buffer(optional_dataframe_buffer); + } } } // namespace detail diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh new file mode 100644 index 00000000000..311b16e71ec --- /dev/null +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -0,0 +1,4374 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "detail/graph_partition_utils.cuh" +#include "prims/detail/multi_stream_utils.cuh" +#include "prims/detail/optional_dataframe_buffer.hpp" +#include "prims/detail/prim_functors.cuh" +#include "prims/detail/prim_utils.cuh" +#include "prims/fill_edge_src_dst_property.cuh" +#include "prims/property_op_utils.cuh" +#include "prims/reduce_op.cuh" +#include "prims/vertex_frontier.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cugraph { + +namespace detail { + +int32_t constexpr per_v_transform_reduce_e_kernel_block_size = 512; +int32_t constexpr per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size = 128; + +template +struct iterator_value_type_or_default_t; + +template +struct iterator_value_type_or_default_t>> { + using value_type = default_t; // if Iterator is invalid (void*), value_type = default_t +}; + +template +struct iterator_value_type_or_default_t>> { + using value_type = typename thrust::iterator_traits< + Iterator>::value_type; // if iterator is valid, value_type = typename + // thrust::iterator_traits::value_type +}; + +template +__device__ auto init_pred_op( + edge_partition_device_view_t const& edge_partition, + EdgePartitionSrcValueInputWrapper const& edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper const& edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper const& edge_partition_e_value_input, + PredOp const& pred_op, + key_t key, + typename GraphViewType::vertex_type major_offset, + typename GraphViewType::vertex_type const* indices, + typename GraphViewType::edge_type edge_offset) +{ + if constexpr (std::is_same_v< + PredOp, + const_true_e_op_t>) { + return call_const_true_e_op_t{}; + } else { + return call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset}; + } +} + +template +struct transform_and_atomic_reduce_t { + edge_partition_device_view_t const& edge_partition{}; + vertex_t const* indices{nullptr}; + TransformOp const& transform_op{}; + PredOp const& pred_op{}; + ResultValueOutputIteratorOrWrapper& result_value_output{}; + + __device__ void operator()(edge_t i) const + { + if (pred_op(i)) { + auto 
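/*
 * iterator_value_type_or_default_t (defined just above) lets these kernels
 * treat "no input key" (Iterator == void*) and real key iterators uniformly:
 * for void* it reports a caller-supplied default type, otherwise thrust's
 * value_type. With the template arguments spelled out, the trait reads:
 *
 *   template <typename Iterator, typename default_t, typename Enable = void>
 *   struct iterator_value_type_or_default_t;
 *
 *   template <typename Iterator, typename default_t>
 *   struct iterator_value_type_or_default_t<
 *     Iterator, default_t, std::enable_if_t<std::is_same_v<Iterator, void*>>> {
 *     using value_type = default_t;
 *   };
 *
 *   template <typename Iterator, typename default_t>
 *   struct iterator_value_type_or_default_t<
 *     Iterator, default_t, std::enable_if_t<!std::is_same_v<Iterator, void*>>> {
 *     using value_type = typename thrust::iterator_traits<Iterator>::value_type;
 *   };
 */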
e_op_result = transform_op(i); + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + if constexpr (multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } +}; + +template +__device__ void update_result_value_output( + edge_partition_device_view_t const& edge_partition, + vertex_t const* indices, + edge_t local_degree, + TransformOp const& transform_op, + result_t init, + ReduceOp const& reduce_op, + PredOp const& pred_op, + size_t output_idx /* relevent only when update_major === true */, + ResultValueOutputIteratorOrWrapper& result_value_output) +{ + if constexpr (update_major) { + result_t val{}; + if constexpr (std::is_same_v>) { + if constexpr (std::is_same_v>) { // init is selected only when no + // edges return a valid value + val = init; + for (edge_t i = 0; i < local_degree; ++i) { + auto tmp = transform_op(i); + val = tmp; + break; + } + } else { + val = thrust::transform_reduce(thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + transform_op, + init, + reduce_op); + } + } else { + val = init; + for (edge_t i = 0; i < local_degree; ++i) { + if (pred_op(i)) { + auto tmp = transform_op(i); + if constexpr (std::is_same_v>) { // init is selected only when + // no edges return a valid + // value + val = tmp; + break; + } else { + val = reduce_op(val, tmp); + } + } + } + } + *(result_value_output + output_idx) = val; + } else { + thrust::for_each(thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + transform_and_atomic_reduce_t{ + edge_partition, indices, transform_op, pred_op, result_value_output}); + } +} + +template +__global__ static void per_v_transform_reduce_e_hypersparse( + edge_partition_device_view_t edge_partition, + OptionalKeyIterator key_first, + OptionalKeyIterator key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper result_value_output, + EdgeOp e_op, + T init /* relevant only if update_major == true */, + ReduceOp reduce_op, + PredOp pred_op) +{ + constexpr bool use_input_key = !std::is_same_v; + static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< + ReduceOp>); // atomic_reduce is defined only when + // has_compatible_raft_comms_op_t is true + static_assert(update_major || !use_input_key); + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = + typename iterator_value_type_or_default_t::value_type; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto idx = static_cast(tid); + + size_t key_count{}; + if constexpr (use_input_key) { + key_count = static_cast(thrust::distance(key_first, key_last)); + } else { + key_count = *(edge_partition.dcs_nzd_vertex_count()); + } + + while (idx < key_count) { + key_t key{}; + vertex_t major{}; + thrust::optional major_idx{}; + if constexpr (use_input_key) { + key = *(key_first + idx); + major = thrust_tuple_get_or_identity(key); + major_idx = edge_partition.major_idx_from_major_nocheck(major); + } else { + key = *(edge_partition.major_from_major_hypersparse_idx_nocheck(static_cast(idx))); + major = key; + 
auto major_start_offset = static_cast(*(edge_partition.major_hypersparse_first()) - + edge_partition.major_range_first()); + major_idx = major_start_offset + idx; // major_offset != major_idx in the hypersparse region + } + + size_t output_idx = use_input_key ? idx : (major - *(edge_partition).major_hypersparse_first()); + if (major_idx) { + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, edge_offset, local_degree) = + edge_partition.local_edges(static_cast(*major_idx)); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + edge_offset}; + + auto call_pred_op = init_pred_op(edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset); + + if (edge_partition_e_mask) { + update_result_value_output( + edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + [&edge_partition_e_mask, &call_pred_op, edge_offset] __device__(edge_t i) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + return call_pred_op(i); + } else { + return false; + } + }, + output_idx, + result_value_output); + } else { + update_result_value_output(edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + call_pred_op, + output_idx, + result_value_output); + } + } else { + if constexpr (update_major) { *(result_value_output + output_idx) = init; } + } + idx += gridDim.x * blockDim.x; + } +} + +template +__global__ static void per_v_transform_reduce_e_low_degree( + edge_partition_device_view_t edge_partition, + KeyIterator key_first, + KeyIterator key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper result_value_output, + EdgeOp e_op, + T init /* relevant only if update_major == true */, + ReduceOp reduce_op, + PredOp pred_op) +{ + static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< + ReduceOp>); // atomic_reduce is defined only when + // has_compatible_raft_comms_op_t is true + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = typename thrust::iterator_traits::value_type; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto idx = static_cast(tid); + + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + auto major = thrust_tuple_get_or_identity(key); + + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, edge_offset, local_degree) = + edge_partition.local_edges(static_cast(major_offset)); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + edge_offset}; + + auto call_pred_op = init_pred_op(edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, 
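/*
 * Like the other per_v_transform_reduce_e kernels, this one uses a
 * grid-stride loop: each thread starts at its global id and advances by the
 * total thread count, so a single launch of bounded grid size covers any key
 * count. Skeleton of the loop:
 *
 *   auto idx = static_cast<size_t>(threadIdx.x + blockIdx.x * blockDim.x);
 *   while (idx < key_count) {
 *     // ... process key idx ...
 *     idx += static_cast<size_t>(gridDim.x) * blockDim.x;
 *   }
 */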
+ indices, + edge_offset); + + if (edge_partition_e_mask) { + update_result_value_output( + edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + [&edge_partition_e_mask, &call_pred_op, edge_offset] __device__(edge_t i) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + return call_pred_op(i); + } else { + return false; + } + }, + idx, + result_value_output); + } else { + update_result_value_output(edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + call_pred_op, + idx, + result_value_output); + } + idx += gridDim.x * blockDim.x; + } +} + +template +__global__ static void per_v_transform_reduce_e_mid_degree( + edge_partition_device_view_t edge_partition, + KeyIterator key_first, + KeyIterator key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper result_value_output, + EdgeOp e_op, + T init /* relevant only if update_major == true */, + T identity_element /* relevant only if update_major == true */, + ReduceOp reduce_op, + PredOp pred_op) +{ + static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< + ReduceOp>); // atomic_reduce is defined only when + // has_compatible_raft_comms_op_t is true + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using e_op_result_t = T; + using key_t = typename thrust::iterator_traits::value_type; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + static_assert(per_v_transform_reduce_e_kernel_block_size % raft::warp_size() == 0); + auto const lane_id = tid % raft::warp_size(); + auto idx = static_cast(tid / raft::warp_size()); + + using WarpReduce = cub::WarpReduce< + std::conditional_t>, int32_t, e_op_result_t>>; + [[maybe_unused]] __shared__ + std::conditional_t + temp_storage[update_major ? (per_v_transform_reduce_e_kernel_block_size / raft::warp_size()) + : int32_t{1} /* dummy */]; + + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + auto major = thrust_tuple_get_or_identity(key); + + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + edge_offset}; + + auto call_pred_op = init_pred_op(edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset); + + [[maybe_unused]] std::conditional_t + reduced_e_op_result{}; + [[maybe_unused]] std::conditional_t>, + int32_t, + std::byte /* dummy */> + first_valid_lane_id{}; + if constexpr (update_major) { + reduced_e_op_result = + (lane_id == 0) ? 
init : identity_element; // init == identity_element for reduce_op::any + if constexpr (std::is_same_v>) { + first_valid_lane_id = raft::warp_size(); + } + } + + if (edge_partition_e_mask) { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); + for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { + thrust::optional e_op_result{thrust::nullopt}; + if ((i < static_cast(local_degree)) && + (*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(i)) { + e_op_result = call_e_op(i); + } + first_valid_lane_id = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) + .Reduce(e_op_result ? lane_id : raft::warp_size(), cub::Min()); + first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); + if (lane_id == first_valid_lane_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_lane_id != raft::warp_size()) { break; } + } + } else { + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { + if ((*edge_partition_e_mask).get(edge_offset + i) & call_pred_op(i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } else { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); + for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { + thrust::optional e_op_result{thrust::nullopt}; + if (i < static_cast(local_degree) && call_pred_op(i)) { + e_op_result = call_e_op(i); + } + first_valid_lane_id = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) + .Reduce(e_op_result ? lane_id : raft::warp_size(), cub::Min()); + first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); + if (lane_id == first_valid_lane_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_lane_id != raft::warp_size()) { break; } + } + } else { + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { + if (call_pred_op(i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } + + if constexpr (update_major) { + if constexpr (std::is_same_v>) { + if (lane_id == ((first_valid_lane_id == raft::warp_size()) ? 
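/*
 * For reduce_op::any the warp only needs the first edge producing a valid
 * value: each lane proposes its lane id (or warp_size() meaning "nothing"), a
 * warp-wide min picks the winner, lane 0's reduction result is broadcast with
 * __shfl_sync(), and the strided loop exits as soon as any lane succeeds.
 * Core of the idiom:
 *
 *   int candidate = e_op_result ? lane_id : raft::warp_size();
 *   int first     = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()])
 *                     .Reduce(candidate, cub::Min());
 *   first = __shfl_sync(raft::warp_full_mask(), first, 0);  // Reduce() result is valid on lane 0 only
 *   if (lane_id == first) { reduced_e_op_result = *e_op_result; }
 *   if (first != raft::warp_size()) { break; }               // early exit once any lane succeeded
 */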
0 : first_valid_lane_id)) { + *(result_value_output + idx) = reduced_e_op_result; + } + } else { + reduced_e_op_result = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) + .Reduce(reduced_e_op_result, reduce_op); + if (lane_id == 0) { *(result_value_output + idx) = reduced_e_op_result; } + } + } + + idx += gridDim.x * (blockDim.x / raft::warp_size()); + } +} + +template +__global__ static void per_v_transform_reduce_e_high_degree( + edge_partition_device_view_t edge_partition, + KeyIterator key_first, + KeyIterator key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper result_value_output, + EdgeOp e_op, + T init /* relevant only if update_major == true */, + T identity_element /* relevant only if update_major == true */, + ReduceOp reduce_op, + PredOp pred_op) +{ + static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< + ReduceOp>); // atomic_reduce is defined only when + // has_compatible_raft_comms_op_t is true + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using e_op_result_t = T; + using key_t = typename thrust::iterator_traits::value_type; + + auto idx = static_cast(blockIdx.x); + + using BlockReduce = cub::BlockReduce< + std::conditional_t>, int32_t, e_op_result_t>, + std::is_same_v> + ? per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size + : per_v_transform_reduce_e_kernel_block_size>; + [[maybe_unused]] __shared__ + std::conditional_t + temp_storage; + [[maybe_unused]] __shared__ + std::conditional_t>, + int32_t, + std::byte /* dummy */> + output_thread_id; + + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + auto major = thrust_tuple_get_or_identity(key); + + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + edge_offset}; + + auto call_pred_op = init_pred_op(edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset); + + [[maybe_unused]] std::conditional_t + reduced_e_op_result{}; + [[maybe_unused]] std::conditional_t>, + int32_t, + std::byte /* dummy */> + first_valid_thread_id{}; + if constexpr (update_major) { + reduced_e_op_result = threadIdx.x == 0 + ? 
init + : identity_element; // init == identity_element for reduce_op::any + if constexpr (std::is_same_v>) { + first_valid_thread_id = per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size; + } + } + + if (edge_partition_e_mask) { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + + (per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size - 1)) / + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) * + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size; + for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { + thrust::optional e_op_result{thrust::nullopt}; + if ((i < static_cast(local_degree)) && + (*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(i)) { + e_op_result = call_e_op(i); + } + first_valid_thread_id = + BlockReduce(temp_storage) + .Reduce(e_op_result + ? threadIdx.x + : per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size, + cub::Min()); + if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } + __syncthreads(); + first_valid_thread_id = output_thread_id; + if (threadIdx.x == first_valid_thread_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_thread_id != + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) { + break; + } + } + } else { + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + if ((*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } else { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + + (per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size - 1)) / + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) * + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size; + for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { + thrust::optional e_op_result{thrust::nullopt}; + if ((i < static_cast(local_degree)) && call_pred_op(i)) { + e_op_result = call_e_op(i); + } + first_valid_thread_id = + BlockReduce(temp_storage) + .Reduce(e_op_result + ? 
threadIdx.x + : per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size, + cub::Min()); + if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } + __syncthreads(); + first_valid_thread_id = output_thread_id; + if (threadIdx.x == first_valid_thread_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_thread_id != + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) { + break; + } + } + } else { + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + if (call_pred_op(i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } + + if constexpr (update_major) { + if constexpr (std::is_same_v>) { + if (threadIdx.x == ((first_valid_thread_id == + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) + ? 0 + : first_valid_thread_id)) { + *(result_value_output + idx) = reduced_e_op_result; + } + } else { + reduced_e_op_result = BlockReduce(temp_storage).Reduce(reduced_e_op_result, reduce_op); + if (threadIdx.x == 0) { *(result_value_output + idx) = reduced_e_op_result; } + } + } + + idx += gridDim.x; + } +} + +template +void compute_priorities( + raft::comms::comms_t const& comm, + ValueIterator value_first, + raft::device_span priorities, + std::optional, raft::device_span>> + hypersparse_key_offsets, // we may not have values for the entire "range_size" if + // hypersparse_key_offsets.has_value() is true + size_t contiguous_size, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + typename thrust::iterator_traits::value_type init, + bool ignore_local_values, + rmm::cuda_stream_view stream_view) +{ + auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); + + // For each vertex, select a comm_rank among the GPUs with a value other than init (if there are + // more than one, the GPU with (comm_rank == root) has the highest priority, GPUs in the same DGX + // node should be the next) + + if (ignore_local_values) { + thrust::fill(rmm::exec_policy_nosync(stream_view), + priorities.begin(), + priorities.end(), + std::numeric_limits::max()); + } else { + thrust::tabulate( + rmm::exec_policy_nosync(stream_view), + priorities.begin(), + priorities.begin() + contiguous_size, + [value_first, root, subgroup_size, init, comm_rank, comm_size] __device__(auto offset) { + auto val = *(value_first + offset); + return (val != init) + ? 
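/*
 * rank_to_priority() / priority_to_rank() (prim_utils.cuh) encode the
 * selection policy described above: the root rank gets the highest priority,
 * ranks inside the root's subgroup (fast intra-node interconnect) come next,
 * and the remaining ranks are ordered in an offset-dependent fashion so that
 * ties spread across GPUs; std::numeric_limits<priority_t>::max() marks "no
 * valid value on this rank" and is never selected. The exact bit-level
 * encoding lives in prim_utils.cuh and is assumed, not shown, here.
 */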
rank_to_priority( + comm_rank, root, subgroup_size, comm_size, static_cast(offset)) + : std::numeric_limits::max(); // lowest priority + }); + if (hypersparse_key_offsets) { + thrust::fill(rmm::exec_policy_nosync(stream_view), + priorities.begin() + contiguous_size, + priorities.end(), + std::numeric_limits::max()); + if ((*hypersparse_key_offsets).index() == 0) { + auto priority_first = thrust::make_transform_iterator( + std::get<0>(*hypersparse_key_offsets).begin(), + cuda::proclaim_return_type( + [root, subgroup_size, comm_rank, comm_size] __device__(uint32_t offset) { + return rank_to_priority( + comm_rank, root, subgroup_size, comm_size, static_cast(offset)); + })); + thrust::scatter_if( + rmm::exec_policy_nosync(stream_view), + priority_first, + priority_first + std::get<0>(*hypersparse_key_offsets).size(), + std::get<0>(*hypersparse_key_offsets).begin(), + value_first + contiguous_size, + priorities.begin(), + is_not_equal_t::value_type>{init}); + } else { + auto priority_first = thrust::make_transform_iterator( + std::get<1>(*hypersparse_key_offsets).begin(), + cuda::proclaim_return_type( + [root, subgroup_size, comm_rank, comm_size] __device__(size_t offset) { + return rank_to_priority( + comm_rank, root, subgroup_size, comm_size, static_cast(offset)); + })); + thrust::scatter_if( + rmm::exec_policy_nosync(stream_view), + priority_first, + priority_first + std::get<1>(*hypersparse_key_offsets).size(), + std::get<1>(*hypersparse_key_offsets).begin(), + value_first + contiguous_size, + priorities.begin(), + is_not_equal_t::value_type>{init}); + } + } + } +} + +// return selected ranks if root. +// otherwise, it is sufficient to just return bool flags indiciating whether this rank's values are +// selected or not. +template +std::variant, + int, + priority_t>> /* root, store selected ranks */, + std::optional> /* store bitmap */> +compute_selected_ranks_from_priorities( + raft::comms::comms_t const& comm, + raft::device_span priorities, + std::optional, raft::device_span>> + hypersparse_key_offsets, // we may not have values for the entire "range_size" if + // hypersparse_key_offsets.has_value() is true + size_t contiguous_size, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + bool ignore_local_values, + rmm::cuda_stream_view stream_view) +{ + auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); + + using rank_t = std::conditional_t, int, priority_t>; + + if (comm_rank == root) { + rmm::device_uvector selected_ranks(priorities.size(), stream_view); + auto offset_priority_pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); + thrust::transform(rmm::exec_policy_nosync(stream_view), + offset_priority_pair_first, + offset_priority_pair_first + priorities.size(), + selected_ranks.begin(), + [root, subgroup_size, comm_rank, comm_size] __device__(auto pair) { + auto offset = thrust::get<0>(pair); + auto priority = thrust::get<1>(pair); + auto rank = (priority == std::numeric_limits::max()) + ? comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, offset); + return static_cast(rank); + }); + return selected_ranks; + } else { + std::optional> keep_flags{std::nullopt}; + if (!ignore_local_values) { + keep_flags = rmm::device_uvector( + packed_bool_size(hypersparse_key_offsets + ? (contiguous_size + ((*hypersparse_key_offsets).index() == 0 + ? 
std::get<0>(*hypersparse_key_offsets).size() + : std::get<1>(*hypersparse_key_offsets).size())) + : contiguous_size), + stream_view); + thrust::fill(rmm::exec_policy_nosync(stream_view), + (*keep_flags).begin(), + (*keep_flags).end(), + packed_bool_empty_mask()); + auto offset_priority_pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); + thrust::for_each( + rmm::exec_policy_nosync(stream_view), + offset_priority_pair_first, + offset_priority_pair_first + contiguous_size, + [keep_flags = raft::device_span((*keep_flags).data(), (*keep_flags).size()), + root, + subgroup_size, + comm_rank, + comm_size] __device__(auto pair) { + auto offset = thrust::get<0>(pair); + auto priority = thrust::get<1>(pair); + auto rank = (priority == std::numeric_limits::max()) + ? comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, offset); + if (rank == comm_rank) { + cuda::atomic_ref word( + keep_flags[packed_bool_offset(offset)]); + word.fetch_or(packed_bool_mask(offset), cuda::std::memory_order_relaxed); + } + }); + if (hypersparse_key_offsets) { + if ((*hypersparse_key_offsets).index() == 0) { + auto pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(size_t{contiguous_size}), + std::get<0>(*hypersparse_key_offsets).begin()); + thrust::for_each( + rmm::exec_policy_nosync(stream_view), + pair_first, + pair_first + std::get<0>(*hypersparse_key_offsets).size(), + [priorities = raft::device_span(priorities.data(), priorities.size()), + keep_flags = raft::device_span((*keep_flags).data(), (*keep_flags).size()), + root, + subgroup_size, + comm_rank, + comm_size] __device__(auto pair) { + auto offset = thrust::get<1>(pair); + auto priority = priorities[offset]; + auto rank = + (priority == std::numeric_limits::max()) + ? comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, static_cast(offset)); + if (rank == comm_rank) { + cuda::atomic_ref word( + keep_flags[packed_bool_offset(thrust::get<0>(pair))]); + word.fetch_or(packed_bool_mask(thrust::get<0>(pair)), + cuda::std::memory_order_relaxed); + } + }); + } else { + auto pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(size_t{contiguous_size}), + std::get<1>(*hypersparse_key_offsets).begin()); + thrust::for_each( + rmm::exec_policy_nosync(stream_view), + pair_first, + pair_first + std::get<1>(*hypersparse_key_offsets).size(), + [priorities = raft::device_span(priorities.data(), priorities.size()), + keep_flags = raft::device_span((*keep_flags).data(), (*keep_flags).size()), + root, + subgroup_size, + comm_rank, + comm_size] __device__(auto pair) { + auto offset = thrust::get<1>(pair); + auto priority = priorities[offset]; + auto rank = + (priority == std::numeric_limits::max()) + ? 
comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, static_cast(offset)); + if (rank == comm_rank) { + cuda::atomic_ref word( + keep_flags[packed_bool_offset(thrust::get<0>(pair))]); + word.fetch_or(packed_bool_mask(thrust::get<0>(pair)), + cuda::std::memory_order_relaxed); + } + }); + } + } + } + return keep_flags; + } +} + +template +void per_v_transform_reduce_e_edge_partition( + raft::handle_t const& handle, + edge_partition_device_view_t edge_partition, + OptionalKeyIterator edge_partition_key_first, + OptionalKeyIterator edge_partition_key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper output_buffer, + EdgeOp e_op, + T major_init, + T major_identity_element, + ReduceOp reduce_op, + PredOp pred_op, + std::optional> key_segment_offsets, + std::optional> const& edge_partition_stream_pool_indices) +{ + constexpr bool use_input_key = !std::is_same_v; + + using vertex_t = typename GraphViewType::vertex_type; + using segment_key_iterator_t = + std::conditional_t; + + size_t stream_pool_size{0}; + if (edge_partition_stream_pool_indices) { + stream_pool_size = (*edge_partition_stream_pool_indices).size(); + } + if (key_segment_offsets) { + static_assert(detail::num_sparse_segments_per_vertex_partition == 3); + + if (edge_partition.dcs_nzd_vertex_count()) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[0 % stream_pool_size]) + : handle.get_stream(); + + if constexpr (update_major && !use_input_key) { // this is necessary as we don't visit + // every vertex in the hypersparse segment + thrust::fill(rmm::exec_policy_nosync(exec_stream), + output_buffer + (*key_segment_offsets)[3], + output_buffer + (*key_segment_offsets)[4], + major_init); + } + + auto segment_size = use_input_key + ? ((*key_segment_offsets)[4] - (*key_segment_offsets)[3]) + : static_cast(*(edge_partition.dcs_nzd_vertex_count())); + if (segment_size > 0) { + raft::grid_1d_thread_t update_grid(segment_size, + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[3]; } + auto segment_key_first = edge_partition_key_first; + auto segment_key_last = edge_partition_key_last; + if constexpr (use_input_key) { + segment_key_first += (*key_segment_offsets)[3]; + segment_key_last = + segment_key_first + ((*key_segment_offsets)[4] - (*key_segment_offsets)[3]); + } else { + assert(segment_key_first == nullptr); + assert(segment_key_last == nullptr); + } + detail::per_v_transform_reduce_e_hypersparse + <<>>( + edge_partition, + segment_key_first, + segment_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + segment_output_buffer, + e_op, + major_init, + reduce_op, + pred_op); + } + } + if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2]) { + auto exec_stream = edge_partition_stream_pool_indices + ? 
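
On non-root ranks, `compute_selected_ranks_from_priorities` above records selection compactly as one bit per key. cugraph's packed-bool utilities (`packed_bool_offset`, `packed_bool_mask`, `packed_bool_empty_mask`) address one `uint32_t` word per 32 keys, and winning threads set their bit with an atomic OR so threads landing in the same word do not race. A minimal device-side sketch of that pattern (the helper name `set_keep_flag` is hypothetical):

```cpp
#include <cuda/atomic>
#include <cstdint>
#include <cstddef>

// Sketch of the bit-setting pattern used for keep_flags above: 32 flags per
// uint32_t word, set via an atomic fetch_or. Relaxed ordering suffices since
// only the final bit pattern is consumed, after a stream synchronization.
__device__ inline void set_keep_flag(uint32_t* keep_flags, size_t key_offset)
{
  auto word_idx = key_offset / 32;                   // cf. packed_bool_offset(key_offset)
  auto mask     = uint32_t{1} << (key_offset % 32);  // cf. packed_bool_mask(key_offset)
  cuda::atomic_ref<uint32_t, cuda::thread_scope_device> word(keep_flags[word_idx]);
  word.fetch_or(mask, cuda::std::memory_order_relaxed);
}
```
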
handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[1 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_thread_t update_grid((*key_segment_offsets)[3] - (*key_segment_offsets)[2], + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[2]; } + std::optional + segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor + // is a deleted function, segment_key_first should always have a value + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + *segment_key_first += (*key_segment_offsets)[2]; + detail::per_v_transform_reduce_e_low_degree + <<>>( + edge_partition, + *segment_key_first, + *segment_key_first + ((*key_segment_offsets)[3] - (*key_segment_offsets)[2]), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + segment_output_buffer, + e_op, + major_init, + reduce_op, + pred_op); + } + if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[2 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_warp_t update_grid((*key_segment_offsets)[2] - (*key_segment_offsets)[1], + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[1]; } + std::optional + segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor + // is a deleted function, segment_key_first should always have a value + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + *segment_key_first += (*key_segment_offsets)[1]; + detail::per_v_transform_reduce_e_mid_degree + <<>>( + edge_partition, + *segment_key_first, + *segment_key_first + ((*key_segment_offsets)[2] - (*key_segment_offsets)[1]), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + segment_output_buffer, + e_op, + major_init, + major_identity_element, + reduce_op, + pred_op); + } + if ((*key_segment_offsets)[1] > 0) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[3 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_block_t update_grid( + (*key_segment_offsets)[1], + std::is_same_v> + ? 
detail::per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size + : detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + std::optional + segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor + // is a deleted function, segment_key_first should always have a value + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + detail::per_v_transform_reduce_e_high_degree + <<>>( + edge_partition, + *segment_key_first, + *segment_key_first + (*key_segment_offsets)[1], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_buffer, + e_op, + major_init, + major_identity_element, + reduce_op, + pred_op); + } + } else { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[0 % stream_pool_size]) + : handle.get_stream(); + + size_t num_keys{}; + if constexpr (use_input_key) { + num_keys = + static_cast(thrust::distance(edge_partition_key_first, edge_partition_key_last)); + } else { + num_keys = static_cast(edge_partition.major_range_size()); + } + + if (num_keys > size_t{0}) { + raft::grid_1d_thread_t update_grid(num_keys, + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + std::optional + segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor + // is a deleted function, segment_key_first should always have a value + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + detail::per_v_transform_reduce_e_low_degree + <<>>( + edge_partition, + *segment_key_first, + *segment_key_first + num_keys, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_buffer, + e_op, + major_init, + reduce_op, + pred_op); + } + } +} + +template +void per_v_transform_reduce_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + OptionalKeyIterator sorted_unique_key_first, + OptionalKeyIterator sorted_unique_key_last, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first) +{ + constexpr bool update_major = (incoming == GraphViewType::is_storage_transposed); + constexpr bool use_input_key = !std::is_same_v; + static_assert(update_major || !use_input_key); + constexpr bool filter_input_key = + GraphViewType::is_multi_gpu && use_input_key && + std::is_same_v>; // if GraphViewType::is_multi_gpu && update_major && + // std::is_same_v>, for any + // vertex in the frontier, we need to visit only local edges + // if we find any valid local edge (FIXME: this is + // applicable even when use_input_key is false). 
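
For the `reduce_op::any` path referenced in the comment above, a full value reduction is unnecessary: any single valid edge-op result is final, so the high-degree kernel earlier in this section only needs to find the lowest-numbered thread that produced a valid result and let that thread write. A standalone sketch of that "first valid lane wins" block reduction (assuming a 512-thread block; names are illustrative, not the PR's kernel):

```cpp
#include <cub/cub.cuh>

constexpr int block_size = 512;

// "First valid lane wins": threads without a valid result propose block_size
// (i.e. "none"), a block-wide min over thread ids selects the winner, and
// only the winning thread publishes its value.
__global__ void first_valid_wins(int const* values, int num_values, int invalid, int* out)
{
  using BlockReduce = cub::BlockReduce<int, block_size>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  __shared__ int winner;

  int i          = static_cast<int>(threadIdx.x);
  bool has_valid = (i < num_values) && (values[i] != invalid);
  int proposal   = has_valid ? i : block_size;
  int first      = BlockReduce(temp_storage).Reduce(proposal, cub::Min());
  if (threadIdx.x == 0) { winner = first; }  // Reduce() output is valid only on thread 0
  __syncthreads();
  if (winner != block_size && i == winner) { *out = values[i]; }
}
```
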
+ + static_assert( + ReduceOp::pure_function && + ((reduce_op::has_compatible_raft_comms_op_v && + reduce_op::has_identity_element_v) || + (update_major && + std::is_same_v>))); // current restriction, to support general + // reduction, we may need to take a less + // efficient code path + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = + typename iterator_value_type_or_default_t::value_type; + + using edge_partition_src_input_device_view_t = std::conditional_t< + std::is_same_v, + detail::edge_partition_endpoint_dummy_property_device_view_t, + detail::edge_partition_endpoint_property_device_view_t< + vertex_t, + typename EdgeSrcValueInputWrapper::value_iterator, + typename EdgeSrcValueInputWrapper::value_type>>; + using edge_partition_dst_input_device_view_t = std::conditional_t< + std::is_same_v, + detail::edge_partition_endpoint_dummy_property_device_view_t, + detail::edge_partition_endpoint_property_device_view_t< + vertex_t, + typename EdgeDstValueInputWrapper::value_iterator, + typename EdgeDstValueInputWrapper::value_type>>; + using edge_partition_e_input_device_view_t = std::conditional_t< + std::is_same_v, + detail::edge_partition_edge_dummy_property_device_view_t, + detail::edge_partition_edge_property_device_view_t< + edge_t, + typename EdgeValueInputWrapper::value_iterator, + typename EdgeValueInputWrapper::value_type>>; + + static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); + + constexpr bool try_bitmap = + GraphViewType::is_multi_gpu && use_input_key && std::is_same_v; + + [[maybe_unused]] constexpr auto max_segments = + detail::num_sparse_segments_per_vertex_partition + size_t{1}; + + // 1. drop zero degree keys & compute key_segment_offsets + + auto local_vertex_partition_segment_offsets = graph_view.local_vertex_partition_segment_offsets(); + + std::conditional_t>, std::byte /* dummy */> + key_segment_offsets{}; + auto sorted_unique_nzd_key_last = sorted_unique_key_last; + if constexpr (use_input_key) { + if (local_vertex_partition_segment_offsets) { + key_segment_offsets = compute_key_segment_offsets( + sorted_unique_key_first, + sorted_unique_nzd_key_last, + raft::host_span((*local_vertex_partition_segment_offsets).data(), + (*local_vertex_partition_segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); + (*key_segment_offsets).back() = *((*key_segment_offsets).rbegin() + 1); + sorted_unique_nzd_key_last = sorted_unique_key_first + (*key_segment_offsets).back(); + } + } + + // 2. 
initialize vertex value output buffer + + if constexpr (update_major) { // no vertices in the zero degree segment are visited (otherwise, + // no need to initialize) + if constexpr (use_input_key) { + thrust::fill(handle.get_thrust_policy(), + vertex_value_output_first + + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last), + vertex_value_output_first + + thrust::distance(sorted_unique_key_first, sorted_unique_key_last), + init); + } else { + if (local_vertex_partition_segment_offsets) { + thrust::fill( + handle.get_thrust_policy(), + vertex_value_output_first + *((*local_vertex_partition_segment_offsets).rbegin() + 1), + vertex_value_output_first + *((*local_vertex_partition_segment_offsets).rbegin()), + init); + } + } + } else { + if constexpr (GraphViewType::is_multi_gpu) { + /* no need to initialize (we use minor_tmp_buffer) */ + } else { + thrust::fill(handle.get_thrust_policy(), + vertex_value_output_first, + vertex_value_output_first + graph_view.local_vertex_partition_range_size(), + init); + } + } + + // 3. filter input keys & update key_segment_offsets + + auto edge_mask_view = graph_view.edge_mask_view(); + + auto tmp_key_buffer = + allocate_optional_dataframe_buffer>( + 0, handle.get_stream()); + auto tmp_output_indices = + allocate_optional_dataframe_buffer>( + 0, handle.get_stream()); + std::conditional_t, + VertexValueOutputIterator> + tmp_vertex_value_output_first{}; + if constexpr (filter_input_key) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(static_cast(minor_comm_rank))); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, static_cast(minor_comm_rank)) + : thrust::nullopt; + + std::optional> edge_partition_stream_pool_indices{std::nullopt}; + if (local_vertex_partition_segment_offsets && (handle.get_stream_pool_size() >= max_segments)) { + edge_partition_stream_pool_indices = std::vector(max_segments); + std::iota((*edge_partition_stream_pool_indices).begin(), + (*edge_partition_stream_pool_indices).end(), + size_t{0}); + } + + if (edge_partition_stream_pool_indices) { handle.sync_stream(); } + + edge_partition_src_input_device_view_t edge_partition_src_value_input{}; + edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; + if constexpr (GraphViewType::is_storage_transposed) { + edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input); + edge_partition_dst_value_input = edge_partition_dst_input_device_view_t( + edge_dst_value_input, static_cast(minor_comm_rank)); + } else { + edge_partition_src_value_input = edge_partition_src_input_device_view_t( + edge_src_value_input, static_cast(minor_comm_rank)); + edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input); + } + auto edge_partition_e_value_input = + edge_partition_e_input_device_view_t(edge_value_input, static_cast(minor_comm_rank)); + + per_v_transform_reduce_e_edge_partition( + handle, + edge_partition, + sorted_unique_key_first, + sorted_unique_nzd_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + vertex_value_output_first, + e_op, + init, + init, + reduce_op, + pred_op, + key_segment_offsets ? 
std::make_optional>( + (*key_segment_offsets).data(), (*key_segment_offsets).size()) + : std::nullopt, + edge_partition_stream_pool_indices ? std::make_optional>( + (*edge_partition_stream_pool_indices).data(), + (*edge_partition_stream_pool_indices).size()) + : std::nullopt); + + if (edge_partition_stream_pool_indices) { + handle.sync_stream_pool(*edge_partition_stream_pool_indices); + } + + auto num_tmp_keys = thrust::count( + handle.get_thrust_policy(), + vertex_value_output_first, + vertex_value_output_first + + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last), + init); // we allow false positives (some edge operations may actually return init) + + resize_optional_dataframe_buffer(tmp_key_buffer, num_tmp_keys, handle.get_stream()); + resize_optional_dataframe_buffer(tmp_output_indices, num_tmp_keys, handle.get_stream()); + + auto input_first = + thrust::make_zip_iterator(sorted_unique_key_first, thrust::make_counting_iterator(size_t{0})); + thrust::copy_if( + handle.get_thrust_policy(), + input_first, + input_first + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last), + vertex_value_output_first, + thrust::make_zip_iterator(get_optional_dataframe_buffer_begin(tmp_key_buffer), + get_optional_dataframe_buffer_begin(tmp_output_indices)), + is_equal_t{init}); + + sorted_unique_key_first = get_optional_dataframe_buffer_begin(tmp_key_buffer); + sorted_unique_nzd_key_last = get_optional_dataframe_buffer_end(tmp_key_buffer); + tmp_vertex_value_output_first = thrust::make_permutation_iterator( + vertex_value_output_first, get_optional_dataframe_buffer_begin(tmp_output_indices)); + + if (key_segment_offsets) { + key_segment_offsets = compute_key_segment_offsets( + sorted_unique_key_first, + sorted_unique_nzd_key_last, + raft::host_span((*local_vertex_partition_segment_offsets).data(), + (*local_vertex_partition_segment_offsets).size()), + edge_partition.major_range_first(), + handle.get_stream()); + assert((*key_segment_offsets).back() == *((*key_segment_offsets).rbegin() + 1)); + assert(sorted_unique_nzd_key_last == sorted_unique_key_first + (*key_segment_offsets).back()); + } + } else { + tmp_vertex_value_output_first = vertex_value_output_first; + } + + // 4. compute subgroup_size (used to compute priority in device_gatherv) + + [[maybe_unused]] std::conditional_t>, + int, + std::byte /* dummy */> + subgroup_size{}; + if constexpr (GraphViewType::is_multi_gpu && update_major && + std::is_same_v>) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + int num_gpus_per_node{}; + RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); + if (comm_size <= num_gpus_per_node) { + subgroup_size = minor_comm_size; + } else { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + subgroup_size = partition_manager::map_major_comm_to_gpu_row_comm + ? std::max(num_gpus_per_node / major_comm_size, int{1}) + : std::min(minor_comm_size, num_gpus_per_node); + } + } + + // 5.
collect max_tmp_buffer_size, approx_tmp_buffer_size_per_loop, local_key_list_sizes, + // local_v_list_range_firsts, local_v_list_range_lasts, local_key_list_deg1_sizes, + // key_segment_offset_vectors + + std::conditional_t, std::byte /* dummy */> + max_tmp_buffer_sizes{}; + std::conditional_t, std::byte /* dummy */> + tmp_buffer_size_per_loop_approximations{}; + std::conditional_t, std::byte /* dummy */> + local_key_list_sizes{}; + std::conditional_t, std::byte /* dummy */> + local_v_list_range_firsts{}; + std::conditional_t, std::byte /* dummy */> + local_v_list_range_lasts{}; + std::conditional_t>, std::byte /* dummy */> + local_key_list_deg1_sizes{}; // if global degree is 1, any valid local value should be selected + std::conditional_t>>, + std::byte /* dummy */> + key_segment_offset_vectors{}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + auto max_tmp_buffer_size = + static_cast(static_cast(handle.get_device_properties().totalGlobalMem) * 0.2); + size_t approx_tmp_buffer_size_per_loop{0}; + if constexpr (update_major) { + size_t key_size{0}; + if constexpr (use_input_key) { + if constexpr (std::is_arithmetic_v) { + key_size = sizeof(key_t); + } else { + key_size = sum_thrust_tuple_element_sizes(); + } + } + size_t value_size{0}; + if constexpr (std::is_arithmetic_v) { + value_size = sizeof(T); + } else { + value_size = sum_thrust_tuple_element_sizes(); + } + + size_t major_range_size{}; + if constexpr (use_input_key) { + major_range_size = static_cast( + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)); + ; + } else { + major_range_size = graph_view.local_vertex_partition_range_size(); + } + size_t size_per_key{}; + if constexpr (filter_input_key) { + size_per_key = + key_size + + value_size / 2; // to reflect that many keys will be filtered out, note that this is a + // simple approximation, memory requirement in this case is much more + // complex as we store additional temporary variables + + } else { + size_per_key = key_size + value_size; + } + approx_tmp_buffer_size_per_loop = major_range_size * size_per_key; + } + + size_t num_scalars = 2; // max_tmp_buffer_size, approx_tmp_buffer_size_per_loop + size_t num_scalars_less_key_segment_offsets = num_scalars; + if constexpr (use_input_key) { + num_scalars += 1; // local_key_list_size + if constexpr (try_bitmap) { + num_scalars += 2; // local_key_list_range_first, local_key_list_range_last + } + if (filter_input_key && graph_view.use_dcs()) { + num_scalars += 1; // local_key_list_degree_1_size + } + num_scalars_less_key_segment_offsets = num_scalars; + if (key_segment_offsets) { num_scalars += (*key_segment_offsets).size(); } + } + + rmm::device_uvector d_aggregate_tmps(minor_comm_size * num_scalars, + handle.get_stream()); + auto hypersparse_degree_offsets = + graph_view.local_vertex_partition_hypersparse_degree_offsets(); + thrust::tabulate( + handle.get_thrust_policy(), + d_aggregate_tmps.begin() + num_scalars * minor_comm_rank, + d_aggregate_tmps.begin() + num_scalars * minor_comm_rank + + num_scalars_less_key_segment_offsets, + [max_tmp_buffer_size, + approx_tmp_buffer_size_per_loop, + sorted_unique_key_first, + sorted_unique_nzd_key_last, + deg1_v_first = (filter_input_key && graph_view.use_dcs()) + ? 
thrust::make_optional(graph_view.local_vertex_partition_range_first() + + (*local_vertex_partition_segment_offsets)[3] + + *((*hypersparse_degree_offsets).rbegin() + 1)) + : thrust::nullopt, + vertex_partition_range_first = + graph_view.local_vertex_partition_range_first()] __device__(size_t i) { + if (i == 0) { + return max_tmp_buffer_size; + } else if (i == 1) { + return approx_tmp_buffer_size_per_loop; + } + if constexpr (use_input_key) { + auto v_list_size = static_cast( + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)); + if (i == 2) { return v_list_size; } + if constexpr (try_bitmap) { + if (i == 3) { + vertex_t first{}; + if (v_list_size > 0) { + first = *sorted_unique_key_first; + } else { + first = vertex_partition_range_first; + } + assert(static_cast(static_cast(first)) == first); + return static_cast(first); + } else if (i == 4) { + vertex_t last{}; + if (v_list_size > 0) { + last = *(sorted_unique_key_first + (v_list_size - 1)) + 1; + } else { + last = vertex_partition_range_first; + } + assert(static_cast(static_cast(last)) == last); + return static_cast(last); + } else if (i == 5) { + if (deg1_v_first) { + auto sorted_unique_v_first = thrust::make_transform_iterator( + sorted_unique_key_first, + cuda::proclaim_return_type([] __device__(auto key) { + return thrust_tuple_get_or_identity(key); + })); + return v_list_size - static_cast(thrust::distance( + sorted_unique_v_first, + thrust::lower_bound(thrust::seq, + sorted_unique_v_first, + sorted_unique_v_first + v_list_size, + deg1_v_first))); + } + } + } else { + if (i == 3) { + if (deg1_v_first) { + auto sorted_unique_v_first = thrust::make_transform_iterator( + sorted_unique_key_first, + cuda::proclaim_return_type([] __device__(auto key) { + return thrust_tuple_get_or_identity(key); + })); + return v_list_size - static_cast(thrust::distance( + sorted_unique_v_first, + thrust::lower_bound(thrust::seq, + sorted_unique_v_first, + sorted_unique_v_first + v_list_size, + deg1_v_first))); + } + } + } + } + assert(false); + return size_t{0}; + }); + if constexpr (use_input_key) { + if (key_segment_offsets) { + raft::update_device(d_aggregate_tmps.data() + (num_scalars * minor_comm_rank + + num_scalars_less_key_segment_offsets), + (*key_segment_offsets).data(), + (*key_segment_offsets).size(), + handle.get_stream()); + } + } + + if (minor_comm_size > 1) { + device_allgather(minor_comm, + d_aggregate_tmps.data() + minor_comm_rank * num_scalars, + d_aggregate_tmps.data(), + num_scalars, + handle.get_stream()); + } + + std::vector h_aggregate_tmps(d_aggregate_tmps.size()); + raft::update_host(h_aggregate_tmps.data(), + d_aggregate_tmps.data(), + d_aggregate_tmps.size(), + handle.get_stream()); + handle.sync_stream(); + max_tmp_buffer_sizes = std::vector(minor_comm_size); + tmp_buffer_size_per_loop_approximations = std::vector(minor_comm_size); + if constexpr (use_input_key) { + local_key_list_sizes = std::vector(minor_comm_size); + if constexpr (try_bitmap) { + local_v_list_range_firsts = std::vector(minor_comm_size); + local_v_list_range_lasts = std::vector(minor_comm_size); + } + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { + local_key_list_deg1_sizes = std::vector(minor_comm_size); + } + } + if (key_segment_offsets) { + key_segment_offset_vectors = std::vector>{}; + (*key_segment_offset_vectors).reserve(minor_comm_size); + } + } + for (int i = 0; i < minor_comm_size; ++i) { + max_tmp_buffer_sizes[i] = h_aggregate_tmps[i * num_scalars]; + tmp_buffer_size_per_loop_approximations[i] = 
h_aggregate_tmps[i * num_scalars + 1]; + if constexpr (use_input_key) { + local_key_list_sizes[i] = h_aggregate_tmps[i * num_scalars + 2]; + if constexpr (try_bitmap) { + local_v_list_range_firsts[i] = + static_cast(h_aggregate_tmps[i * num_scalars + 3]); + local_v_list_range_lasts[i] = + static_cast(h_aggregate_tmps[i * num_scalars + 4]); + } + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { + (*local_key_list_deg1_sizes)[i] = + static_cast(h_aggregate_tmps[i * num_scalars + (try_bitmap ? 5 : 3)]); + } + } + if (key_segment_offsets) { + (*key_segment_offset_vectors) + .emplace_back( + h_aggregate_tmps.begin() + i * num_scalars + num_scalars_less_key_segment_offsets, + h_aggregate_tmps.begin() + i * num_scalars + num_scalars_less_key_segment_offsets + + (*key_segment_offsets).size()); + } + } + } + } else { + if constexpr (use_input_key) { + local_key_list_sizes = std::vector{ + static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last))}; + if (key_segment_offsets) { + key_segment_offset_vectors = std::vector>(1); + (*key_segment_offset_vectors)[0] = *key_segment_offsets; + } + } + } + + // 6. compute optional bitmap info & compressed vertex list + + bool v_compressible{false}; + std:: + conditional_t>, std::byte /* dummy */> + v_list_bitmap{}; + std:: + conditional_t>, std::byte /* dummy */> + compressed_v_list{}; + if constexpr (try_bitmap) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + auto const minor_comm_rank = minor_comm.get_rank(); + + if constexpr (sizeof(vertex_t) == 8) { + vertex_t local_v_list_max_range_size{0}; + for (int i = 0; i < minor_comm_size; ++i) { + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + local_v_list_max_range_size = std::max(range_size, local_v_list_max_range_size); + } + if (local_v_list_max_range_size <= + std::numeric_limits::max()) { // broadcast 32-bit offset values instead of + // 64-bit vertex IDs + v_compressible = true; + } + } + + double avg_fill_ratio{0.0}; + for (int i = 0; i < minor_comm_size; ++i) { + auto num_keys = static_cast(local_key_list_sizes[i]); + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + avg_fill_ratio += + (range_size > 0) ? (num_keys / static_cast(range_size)) : double{0.0}; + } + avg_fill_ratio /= static_cast(minor_comm_size); + double threshold_ratio = + 2.0 /* tuning parameter (consider that we need to reproduce vertex list from bitmap) */ / + static_cast((v_compressible ?
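
The `threshold_ratio` being computed here has a simple communication-volume interpretation: a bitmap costs one bit per vertex in the broadcast range, while a vertex list costs one word per key (32 or 64 bits depending on compression), so the bitmap pays off once the fill ratio exceeds roughly `1 / bits_per_key`, with the factor of 2 covering the cost of reconstructing the vertex list from the bitmap on the receiver. A back-of-envelope sketch with illustrative numbers (not the PR's code):

```cpp
// Break-even for the bitmap-vs-vertex-list broadcast choice above.
// Per range vertex: list volume = fill_ratio * bits_per_key bits,
// bitmap volume = 1 bit; the bitmap is chosen when
// fill_ratio > 2 / bits_per_key (== threshold_ratio in the code above).
#include <cstdio>

int main()
{
  double const bits_per_key = 32.0;  // compressed (uint32_t) vertex offsets
  double const fill_ratio   = 0.1;   // num_keys / range_size
  std::printf("bitmap %s\n", (fill_ratio > 2.0 / bits_per_key) ? "wins" : "loses");
  return 0;
}
```
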
sizeof(uint32_t) : sizeof(vertex_t)) * 8); + auto avg_key_list_size = + std::reduce(local_key_list_sizes.begin(), local_key_list_sizes.end()) / + static_cast(minor_comm_size); + + if ((avg_fill_ratio > threshold_ratio) && + (static_cast(avg_key_list_size) > + packed_bools_per_word() * + 32 /* tuning parameter, to consider additional kernel launch overhead */)) { + v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_key_first, + sorted_unique_nzd_key_last, + local_v_list_range_firsts[minor_comm_rank], + local_v_list_range_lasts[minor_comm_rank], + handle.get_stream()); + } else if (v_compressible) { + rmm::device_uvector tmps(local_key_list_sizes[minor_comm_rank], + handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + sorted_unique_key_first, + sorted_unique_nzd_key_last, + tmps.begin(), + cuda::proclaim_return_type( + [range_first = local_v_list_range_firsts[minor_comm_rank]] __device__( + auto v) { return static_cast(v - range_first); })); + compressed_v_list = std::move(tmps); + } + } + } + + bool uint32_key_output_offset = false; + if constexpr (GraphViewType::is_multi_gpu && update_major && + std::is_same_v>) { + size_t max_key_offset_size = std::numeric_limits::max(); + if constexpr (filter_input_key) { + max_key_offset_size = std::reduce( + local_key_list_sizes.begin(), local_key_list_sizes.end(), size_t{0}, [](auto l, auto r) { + return std::max(l, r); + }); + } else { + static_assert(!use_input_key); + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(i)); + auto const& segment_offsets = graph_view.local_edge_partition_segment_offsets(i); + + auto output_range_size = + segment_offsets ? *((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ + : edge_partition.major_range_size(); + + max_key_offset_size = std::max(static_cast(output_range_size), max_key_offset_size); + } + } + uint32_key_output_offset = + (max_key_offset_size <= static_cast(std::numeric_limits::max())); + } + + // 7. set-up stream pool & events + + std::optional> stream_pool_indices{std::nullopt}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + auto max_tmp_buffer_size = + std::reduce(max_tmp_buffer_sizes.begin(), max_tmp_buffer_sizes.end()) / + static_cast(minor_comm_size); + auto approx_tmp_buffer_size_per_loop = + std::reduce(tmp_buffer_size_per_loop_approximations.begin(), + tmp_buffer_size_per_loop_approximations.end()) / + static_cast(minor_comm_size); + size_t num_streams_per_loop{1}; + if (local_vertex_partition_segment_offsets && (handle.get_stream_pool_size() >= max_segments)) { + num_streams_per_loop = std::max( + std::min(size_t{8} / graph_view.number_of_local_edge_partitions(), max_segments), + size_t{ + 1}); // Note that "CUDA_DEVICE_MAX_CONNECTIONS (default: 8, can be set to [1, 32])" sets + // the number of queues; if the total number of streams exceeds this number, jobs on + // different streams can be sent to one queue, leading to false dependency. Setting + // num_concurrent_loops above the number of queues has some benefits in NCCL + // communications but creating too many streams just for compute may not help.
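
As the comment above notes, the stream-pool sizing is bounded by the number of hardware queues the driver exposes. `CUDA_DEVICE_MAX_CONNECTIONS` can only be changed via the environment before the process makes its first CUDA call; a minimal sketch (illustrative value, POSIX `setenv`):

```cpp
#include <cstdlib>
#include <cuda_runtime.h>

int main()
{
  // Must happen before the first CUDA runtime call in the process; once a
  // context exists, the number of device connections (queues) is fixed.
  setenv("CUDA_DEVICE_MAX_CONNECTIONS", "16", 1 /* overwrite */);

  cudaFree(nullptr);  // force context creation with the requested queue count
  // ... create a stream pool and launch independent per-segment kernels ...
  return 0;
}
```

With more streams than queues, kernels on distinct streams can land in the same queue and serialize even though they have no true dependency, which is exactly the false dependency the comment above warns about.
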
+ } + stream_pool_indices = init_stream_pool_indices(max_tmp_buffer_size, + approx_tmp_buffer_size_per_loop, + graph_view.number_of_local_edge_partitions(), + num_streams_per_loop, + handle.get_stream_pool_size()); + if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; } + } + + size_t num_concurrent_loops{1}; + std::optional> loop_stream_pool_indices{ + std::nullopt}; // first num_concurrent_loops streams from stream_pool_indices + if (stream_pool_indices) { + num_concurrent_loops = + std::min(graph_view.number_of_local_edge_partitions(), (*stream_pool_indices).size()); + loop_stream_pool_indices = std::vector(num_concurrent_loops); + std::iota((*loop_stream_pool_indices).begin(), (*loop_stream_pool_indices).end(), size_t{0}); + } + + // 8. set-up temporary buffers + + using minor_tmp_buffer_type = std::conditional_t, + edge_dst_property_t>; + [[maybe_unused]] std::unique_ptr minor_tmp_buffer{}; + if constexpr (GraphViewType::is_multi_gpu && !update_major) { + minor_tmp_buffer = std::make_unique(handle, graph_view); + auto minor_init = init; + auto view = minor_tmp_buffer->view(); + if (view.keys()) { // defer applying the initial value to the end as minor_tmp_buffer may not + // store values for the entire minor range + minor_init = ReduceOp::identity_element; + } else { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + minor_init = (major_comm_rank == 0) ? init : ReduceOp::identity_element; + } + fill_edge_minor_property(handle, graph_view, minor_tmp_buffer->mutable_view(), minor_init); + } + + using edge_partition_minor_output_device_view_t = + std::conditional_tmutable_view().value_first())>, + void /* dummy */>; + + auto counters = allocate_optional_dataframe_buffer< + std::conditional_t>( + num_concurrent_loops, handle.get_stream()); + + if constexpr (!GraphViewType::is_multi_gpu || !use_input_key) { + if (loop_stream_pool_indices) { handle.sync_stream(); } + } + + // 9.
process local edge partitions + + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i += num_concurrent_loops) { + auto loop_count = + std::min(num_concurrent_loops, graph_view.number_of_local_edge_partitions() - i); + + std::conditional_t< + GraphViewType::is_multi_gpu && use_input_key, + std::conditional_t< + try_bitmap, + std::vector, rmm::device_uvector>>, + std::vector>>, + std::byte /* dummy */> + edge_partition_key_buffers{}; + std::conditional_t, rmm::device_uvector>>>, + std::byte /* dummy */> + edge_partition_hypersparse_key_offset_vectors{}; // drop zero local degree keys in the + // hypersparse region + std::conditional_t>, std::byte /* dummy */> + edge_partition_deg1_hypersparse_key_offset_counts{}; + std::vector process_local_edges(loop_count, true); + + if constexpr (GraphViewType::is_multi_gpu && use_input_key) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + auto const minor_comm_rank = minor_comm.get_rank(); + + edge_partition_key_buffers.reserve(loop_count); + + std::conditional_t>>, + std::byte /* dummy */> + edge_partition_bitmap_buffers{std::nullopt}; + if constexpr (try_bitmap) { + if (v_list_bitmap) { + edge_partition_bitmap_buffers = std::vector>{}; + (*edge_partition_bitmap_buffers).reserve(loop_count); + } + } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + bool use_bitmap_buffer = false; + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { + (*edge_partition_bitmap_buffers) + .emplace_back(packed_bool_size(local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]), + handle.get_stream()); + use_bitmap_buffer = true; + } + } + if (!use_bitmap_buffer) { + bool allocated{false}; + if constexpr (try_bitmap) { + if (v_compressible) { + edge_partition_key_buffers.push_back(rmm::device_uvector( + local_key_list_sizes[partition_idx], handle.get_stream())); + allocated = true; + } + } + if (!allocated) { + edge_partition_key_buffers.push_back(allocate_dataframe_buffer( + local_key_list_sizes[partition_idx], handle.get_stream())); + } + } + + if constexpr (filter_input_key) { + if (static_cast(partition_idx) == minor_comm_rank) { + process_local_edges[j] = false; + } + } + } + + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + if constexpr (try_bitmap) { + if (v_list_bitmap) { + device_bcast(minor_comm, + (*v_list_bitmap).data(), + get_dataframe_buffer_begin((*edge_partition_bitmap_buffers)[j]), + size_dataframe_buffer((*edge_partition_bitmap_buffers)[j]), + static_cast(partition_idx), + handle.get_stream()); + } else if (compressed_v_list) { + device_bcast(minor_comm, + (*compressed_v_list).data(), + get_dataframe_buffer_begin(std::get<0>(edge_partition_key_buffers[j])), + local_key_list_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } else { + device_bcast(minor_comm, + sorted_unique_key_first, + get_dataframe_buffer_begin(std::get<1>(edge_partition_key_buffers[j])), + local_key_list_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } + } else { + device_bcast(minor_comm, + sorted_unique_key_first, + get_dataframe_buffer_begin(edge_partition_key_buffers[j]), + local_key_list_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } + } + device_group_end(minor_comm); + if (loop_stream_pool_indices) { handle.sync_stream(); } + + if constexpr
(try_bitmap) { + if (edge_partition_bitmap_buffers) { + // copy keys from temporary bitmap buffers to key buffers (copy only the sparse segments + // if filter_input_key is true) + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + std::variant, rmm::device_uvector> keys = + rmm::device_uvector(0, loop_stream); + if (v_compressible) { + std::get<0>(keys).resize( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + } else { + keys = rmm::device_uvector( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + } + + auto& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + if (process_local_edges[j]) { + auto range_first = local_v_list_range_firsts[partition_idx]; + auto range_last = local_v_list_range_lasts[partition_idx]; + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { // skip copying the hypersparse segment + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + range_last = std::min(range_last, *(edge_partition.major_hypersparse_first())); + } + } + if (range_first < range_last) { + if (keys.index() == 0) { + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + get_dataframe_buffer_begin(std::get<0>(keys)), + raft::device_span( + counters.data() + j, + size_t{1}), // dummy, we already know the counts (i.e. + // (*key_segment_offset_vectors)[partition_idx][3]) + uint32_t{0}, + static_cast(range_last - range_first), + loop_stream); + } else { + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + get_dataframe_buffer_begin(std::get<1>(keys)), + raft::device_span( + counters.data() + j, + size_t{1}), // dummy, we already know the counts (i.e. + // (*key_segment_offset_vectors)[partition_idx][3]) + range_first, + range_last, + loop_stream); + } + } + } else { + rx_bitmap.resize(0, loop_stream); + rx_bitmap.shrink_to_fit(loop_stream); + } + edge_partition_key_buffers.push_back(std::move(keys)); + } + } + } + + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { + edge_partition_hypersparse_key_offset_vectors = + std::vector, rmm::device_uvector>>{}; + (*edge_partition_hypersparse_key_offset_vectors).reserve(loop_count); + edge_partition_deg1_hypersparse_key_offset_counts = std::vector(loop_count, 0); + + std::conditional_t, + rmm::device_uvector>>, + std::vector>>>, + std::byte /* dummy */> + edge_partition_new_key_buffers{}; + bool allocate_new_key_buffer{true}; + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { allocate_new_key_buffer = false; } + } + if (allocate_new_key_buffer) { // allocate new key buffers and copy the sparse segment + // keys to the new key buffers + if constexpr (try_bitmap) { + edge_partition_new_key_buffers = std::vector< + std::variant, rmm::device_uvector>>{}; + } else { + edge_partition_new_key_buffers = std::vector>{}; + } + (*edge_partition_new_key_buffers).reserve(loop_count); + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + if constexpr (try_bitmap) { + if (v_compressible) { + auto new_key_buffer = rmm::device_uvector( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + if (process_local_edges[j]) { + thrust::copy( + rmm::exec_policy_nosync(loop_stream), + get_dataframe_buffer_begin(std::get<0>(edge_partition_key_buffers[j])), + get_dataframe_buffer_begin(std::get<0>(edge_partition_key_buffers[j])) + + key_segment_offsets[3], + get_dataframe_buffer_begin(new_key_buffer)); + } else { + std::get<0>(edge_partition_key_buffers[j]).resize(0, loop_stream); + std::get<0>(edge_partition_key_buffers[j]).shrink_to_fit(loop_stream); + } + (*edge_partition_new_key_buffers).push_back(std::move(new_key_buffer)); + } else { + auto new_key_buffer = rmm::device_uvector( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + if (process_local_edges[j]) { + thrust::copy( + rmm::exec_policy_nosync(loop_stream), + get_dataframe_buffer_begin(std::get<1>(edge_partition_key_buffers[j])), + get_dataframe_buffer_begin(std::get<1>(edge_partition_key_buffers[j])) + + key_segment_offsets[3], + get_dataframe_buffer_begin(new_key_buffer)); + } else { + std::get<1>(edge_partition_key_buffers[j]).resize(0, loop_stream); + std::get<1>(edge_partition_key_buffers[j]).shrink_to_fit(loop_stream); + } + (*edge_partition_new_key_buffers).push_back(std::move(new_key_buffer)); + } + } else { + auto new_key_buffer = allocate_dataframe_buffer( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + if (process_local_edges[j]) { + thrust::copy(rmm::exec_policy_nosync(loop_stream), + get_dataframe_buffer_begin(edge_partition_key_buffers[j]), + get_dataframe_buffer_begin(edge_partition_key_buffers[j]) + + key_segment_offsets[3], + get_dataframe_buffer_begin(new_key_buffer)); + } else { + edge_partition_key_buffers[j].resize(0, loop_stream); + edge_partition_key_buffers[j].shrink_to_fit(loop_stream); + } + (*edge_partition_new_key_buffers).push_back(std::move(new_key_buffer)); + } + } + } + + if constexpr (try_bitmap) { // if we are using a bitmap buffer + if (v_list_bitmap) { + std::vector> input_count_offset_vectors{}; + input_count_offset_vectors.reserve(loop_count); + + std::vector> filtered_bitmap_vectors{}; + std::vector> output_count_offset_vectors{}; + filtered_bitmap_vectors.reserve(loop_count); + output_count_offset_vectors.reserve(loop_count); + + std::vector range_offset_firsts(loop_count, 0); + std::vector range_offset_lasts(loop_count, 0); + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + rmm::device_uvector input_count_offsets(0, loop_stream); + if (process_local_edges[j]) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); + + auto range_offset_first = + std::min((edge_partition.major_range_first() + (*segment_offsets)[3] > + local_v_list_range_firsts[partition_idx]) + ? 
((edge_partition.major_range_first() + (*segment_offsets)[3]) - + local_v_list_range_firsts[partition_idx]) + : vertex_t{0}, + local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]); + auto range_offset_last = + std::min(((edge_partition.major_range_first() + (*segment_offsets)[4]) > + local_v_list_range_firsts[partition_idx]) + ? ((edge_partition.major_range_first() + (*segment_offsets)[4]) - + local_v_list_range_firsts[partition_idx]) + : vertex_t{0}, + local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]); + if (range_offset_first < range_offset_last) { + auto const& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + auto input_count_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(packed_bool_offset(range_offset_first)), + cuda::proclaim_return_type( + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + range_offset_first] __device__(size_t i) { + auto word = range_bitmap[i]; + if (i == packed_bool_offset(range_offset_first)) { + word &= ~packed_bool_partial_mask( + range_offset_first % + packed_bools_per_word()); // clear the bits in the sparse region + } + return static_cast(__popc(word)); + })); + input_count_offsets.resize( + (rx_bitmap.size() - packed_bool_offset(range_offset_first)) + 1, loop_stream); + input_count_offsets.set_element_to_zero_async(0, loop_stream); + thrust::inclusive_scan( + rmm::exec_policy_nosync(loop_stream), + input_count_first, + input_count_first + + (rx_bitmap.size() - packed_bool_offset(range_offset_first)), + input_count_offsets.begin() + 1); + } + range_offset_firsts[j] = range_offset_first; + range_offset_lasts[j] = range_offset_last; + } + input_count_offset_vectors.push_back(std::move(input_count_offsets)); + } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + rmm::device_uvector filtered_bitmap(0, loop_stream); + rmm::device_uvector output_count_offsets(0, loop_stream); + if (process_local_edges[j]) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + + auto segment_bitmap = *(edge_partition.dcs_nzd_range_bitmap()); + + auto range_offset_first = range_offset_firsts[j]; + auto range_offset_last = range_offset_lasts[j]; + if (range_offset_first < range_offset_last) { + auto const& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + filtered_bitmap.resize( + rx_bitmap.size() - packed_bool_offset(range_offset_first), loop_stream); + thrust::tabulate( + rmm::exec_policy_nosync(loop_stream), + filtered_bitmap.begin(), + filtered_bitmap.end(), + cuda::proclaim_return_type( + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + segment_bitmap = raft::device_span(segment_bitmap.data(), + segment_bitmap.size()), + range_first = local_v_list_range_firsts[partition_idx], + range_offset_first, + range_offset_last, + major_hypersparse_first = + *(edge_partition.major_hypersparse_first())] __device__(size_t i) { + auto this_word_range_offset_first = cuda::std::max( + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()), + range_offset_first); + auto this_word_range_offset_last = + cuda::std::min(static_cast( + (packed_bool_offset(range_offset_first) + (i + 1)) * + packed_bools_per_word()), + range_offset_last); + auto range_lead_bits = static_cast(this_word_range_offset_first % + packed_bools_per_word()); + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask(range_offset_first % + packed_bools_per_word()); + } + auto this_word_hypersparse_offset_first = + (range_first + this_word_range_offset_first) - major_hypersparse_first; + auto num_bits = static_cast(this_word_range_offset_last - + this_word_range_offset_first); + auto hypersparse_lead_bits = + static_cast(this_word_hypersparse_offset_first) % + packed_bools_per_word(); + auto segment_bitmap_word = ((segment_bitmap[packed_bool_offset( + this_word_hypersparse_offset_first)] >> + hypersparse_lead_bits)) + << range_lead_bits; + auto remaining_bits = + (num_bits > (packed_bools_per_word() - hypersparse_lead_bits)) + ? 
(num_bits - (packed_bools_per_word() - hypersparse_lead_bits)) + : size_t{0}; + if (remaining_bits > 0) { + segment_bitmap_word |= + ((segment_bitmap + [packed_bool_offset(this_word_hypersparse_offset_first) + 1] & + packed_bool_partial_mask(remaining_bits)) + << ((packed_bools_per_word() - hypersparse_lead_bits) + + range_lead_bits)); + } + return range_bitmap_word & segment_bitmap_word; + })); + auto output_count_first = thrust::make_transform_iterator( + filtered_bitmap.begin(), + cuda::proclaim_return_type([] __device__(uint32_t word) { + return static_cast(__popc(word)); + })); + output_count_offsets.resize(filtered_bitmap.size() + 1, loop_stream); + output_count_offsets.set_element_to_zero_async(0, loop_stream); + thrust::inclusive_scan(rmm::exec_policy_nosync(loop_stream), + output_count_first, + output_count_first + filtered_bitmap.size(), + output_count_offsets.begin() + 1); + } + } + filtered_bitmap_vectors.push_back(std::move(filtered_bitmap)); + output_count_offset_vectors.push_back(std::move(output_count_offsets)); + } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + auto& keys = edge_partition_key_buffers[j]; + std::variant, rmm::device_uvector> offsets = + rmm::device_uvector(0, loop_stream); + if (uint32_key_output_offset) { + std::get<0>(offsets).resize(process_local_edges[j] + ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } else { + offsets = rmm::device_uvector( + process_local_edges[j] ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } + + if (process_local_edges[j]) { + auto range_offset_first = range_offset_firsts[j]; + auto range_offset_last = range_offset_lasts[j]; + if (range_offset_first < range_offset_last) { + auto const& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + auto const& input_count_offsets = input_count_offset_vectors[j]; + auto const& filtered_bitmap = filtered_bitmap_vectors[j]; + auto const& output_count_offsets = output_count_offset_vectors[j]; + + if (keys.index() == 0) { + if (offsets.index() == 0) { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(filtered_bitmap.size()), + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + filtered_bitmap = raft::device_span( + filtered_bitmap.data(), filtered_bitmap.size()), + input_count_offsets = raft::device_span( + input_count_offsets.data(), input_count_offsets.size()), + output_count_offsets = raft::device_span( + output_count_offsets.data(), output_count_offsets.size()), + output_key_first = + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + output_offset_first = std::get<0>(offsets).begin(), + range_offset_first, + start_key_offset = key_segment_offsets[3]] __device__(size_t i) { + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask( + range_offset_first % packed_bools_per_word()); + } + auto filtered_bitmap_word = filtered_bitmap[i]; + auto lead_bits = (i == 0) + ? 
static_cast(range_offset_first % + packed_bools_per_word()) + : static_cast(0); + auto this_word_start_v_offset = + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()); + auto this_word_start_key_offset = + static_cast(start_key_offset + input_count_offsets[i]); + auto this_word_output_start_offset = output_count_offsets[i]; + for (int j = 0; j < __popc(filtered_bitmap_word); ++j) { + auto jth_set_bit_pos = static_cast( + __fns(filtered_bitmap_word, lead_bits, j + 1)); + *(output_key_first + (this_word_output_start_offset + j)) = + this_word_start_v_offset + jth_set_bit_pos; + *(output_offset_first + (this_word_output_start_offset + j)) = + this_word_start_key_offset + + static_cast(__popc( + range_bitmap_word & packed_bool_partial_mask(jth_set_bit_pos))); + } + }); + } else { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(filtered_bitmap.size()), + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + filtered_bitmap = raft::device_span( + filtered_bitmap.data(), filtered_bitmap.size()), + input_count_offsets = raft::device_span( + input_count_offsets.data(), input_count_offsets.size()), + output_count_offsets = raft::device_span( + output_count_offsets.data(), output_count_offsets.size()), + output_key_first = + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + output_offset_first = std::get<1>(offsets).begin(), + range_offset_first, + start_key_offset = key_segment_offsets[3]] __device__(size_t i) { + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask( + range_offset_first % packed_bools_per_word()); + } + auto filtered_bitmap_word = filtered_bitmap[i]; + auto lead_bits = (i == 0) + ? 
static_cast(range_offset_first % + packed_bools_per_word()) + : static_cast(0); + auto this_word_start_v_offset = + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()); + auto this_word_start_key_offset = + static_cast(start_key_offset + input_count_offsets[i]); + auto this_word_output_start_offset = output_count_offsets[i]; + for (int j = 0; j < __popc(filtered_bitmap_word); ++j) { + auto jth_set_bit_pos = static_cast( + __fns(filtered_bitmap_word, lead_bits, j + 1)); + *(output_key_first + (this_word_output_start_offset + j)) = + this_word_start_v_offset + jth_set_bit_pos; + *(output_offset_first + (this_word_output_start_offset + j)) = + this_word_start_key_offset + + static_cast(__popc( + range_bitmap_word & packed_bool_partial_mask(jth_set_bit_pos))); + } + }); + } + } else { + if (offsets.index() == 0) { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(filtered_bitmap.size()), + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + filtered_bitmap = raft::device_span( + filtered_bitmap.data(), filtered_bitmap.size()), + input_count_offsets = raft::device_span( + input_count_offsets.data(), input_count_offsets.size()), + output_count_offsets = raft::device_span( + output_count_offsets.data(), output_count_offsets.size()), + output_key_first = + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + output_offset_first = std::get<0>(offsets).begin(), + range_first = local_v_list_range_firsts[partition_idx], + range_offset_first, + start_key_offset = key_segment_offsets[3]] __device__(size_t i) { + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask( + range_offset_first % packed_bools_per_word()); + } + auto filtered_bitmap_word = filtered_bitmap[i]; + auto lead_bits = (i == 0) + ? 
static_cast(range_offset_first % + packed_bools_per_word()) + : static_cast(0); + auto this_word_start_v = + range_first + + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()); + auto this_word_start_key_offset = + static_cast(start_key_offset + input_count_offsets[i]); + auto this_word_output_start_offset = output_count_offsets[i]; + for (int j = 0; j < __popc(filtered_bitmap_word); ++j) { + auto jth_set_bit_pos = static_cast( + __fns(filtered_bitmap_word, lead_bits, j + 1)); + *(output_key_first + (this_word_output_start_offset + j)) = + this_word_start_v + jth_set_bit_pos; + *(output_offset_first + (this_word_output_start_offset + j)) = + this_word_start_key_offset + + static_cast(__popc( + range_bitmap_word & packed_bool_partial_mask(jth_set_bit_pos))); + } + }); + } else { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(filtered_bitmap.size()), + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + filtered_bitmap = raft::device_span( + filtered_bitmap.data(), filtered_bitmap.size()), + input_count_offsets = raft::device_span( + input_count_offsets.data(), input_count_offsets.size()), + output_count_offsets = raft::device_span( + output_count_offsets.data(), output_count_offsets.size()), + output_key_first = + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + output_offset_first = std::get<1>(offsets).begin(), + range_first = local_v_list_range_firsts[partition_idx], + range_offset_first, + start_key_offset = key_segment_offsets[3]] __device__(size_t i) { + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask( + range_offset_first % packed_bools_per_word()); + } + auto filtered_bitmap_word = filtered_bitmap[i]; + auto lead_bits = (i == 0) + ? static_cast(range_offset_first % + packed_bools_per_word()) + : static_cast(0); + auto this_word_start_v = + range_first + + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()); + auto this_word_start_key_offset = + static_cast(start_key_offset + input_count_offsets[i]); + auto this_word_output_start_offset = output_count_offsets[i]; + for (int j = 0; j < __popc(filtered_bitmap_word); ++j) { + auto jth_set_bit_pos = static_cast( + __fns(filtered_bitmap_word, lead_bits, j + 1)); + *(output_key_first + (this_word_output_start_offset + j)) = + this_word_start_v + jth_set_bit_pos; + *(output_offset_first + (this_word_output_start_offset + j)) = + this_word_start_key_offset + + static_cast(__popc( + range_bitmap_word & packed_bool_partial_mask(jth_set_bit_pos))); + } + }); + } + } + thrust::transform( + rmm::exec_policy_nosync(loop_stream), + output_count_offsets.begin() + (output_count_offsets.size() - 1), + output_count_offsets.end(), + counters.data() + j, + typecast_t{}); + } else { + thrust::fill(rmm::exec_policy_nosync(loop_stream), + counters.data() + j, + counters.data() + (j + 1), + size_t{0}); + } + } + + (*edge_partition_hypersparse_key_offset_vectors).push_back(std::move(offsets)); + } + } + } + if (edge_partition_new_key_buffers) { // if there is no bitmap buffer + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + auto& keys = edge_partition_key_buffers[j]; + std::variant, rmm::device_uvector> offsets = + rmm::device_uvector(0, loop_stream); + if (uint32_key_output_offset) { + std::get<0>(offsets).resize(process_local_edges[j] + ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } else { + offsets = rmm::device_uvector( + process_local_edges[j] ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } + + if (process_local_edges[j]) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); + + auto segment_bitmap = *(edge_partition.dcs_nzd_range_bitmap()); + + auto& new_keys = (*edge_partition_new_key_buffers)[j]; + if constexpr (try_bitmap) { + assert(!v_list_bitmap); + if (keys.index() == 0) { + auto flag_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + cuda::proclaim_return_type( + [segment_bitmap = raft::device_span(segment_bitmap.data(), + segment_bitmap.size()), + range_first = local_v_list_range_firsts[partition_idx], + major_hypersparse_first = + *(edge_partition + .major_hypersparse_first())] __device__(uint32_t v_offset) { + auto v = range_first + static_cast(v_offset); + auto segment_offset = v - major_hypersparse_first; + return ((segment_bitmap[packed_bool_offset(segment_offset)] & + packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); + })); + if (offsets.index() == 0) { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(std::get<0>(keys)), + thrust::make_counting_iterator(uint32_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<0>(new_keys)) + + key_segment_offsets[3], + std::get<0>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(std::get<0>(keys)), + thrust::make_counting_iterator(size_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<0>(new_keys)) + + key_segment_offsets[3], + std::get<1>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } else { + auto flag_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(std::get<1>(keys)) + key_segment_offsets[3], + cuda::proclaim_return_type( + [segment_bitmap = raft::device_span(segment_bitmap.data(), + segment_bitmap.size()), + major_hypersparse_first = + *(edge_partition.major_hypersparse_first())] __device__(vertex_t v) { + auto segment_offset = v - major_hypersparse_first; + return ((segment_bitmap[packed_bool_offset(segment_offset)] & + packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); + })); + if (offsets.index() == 0) { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(std::get<1>(keys)), + thrust::make_counting_iterator(uint32_t{0})) + + 
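+            // A sketch of the membership test computed by flag_first above, assuming cugraph's
+            // 32-bit packed-bool helpers: packed_bool_offset(o) == o / 32 and
+            // packed_bool_mask(o) == (uint32_t{1} << (o % 32)), so the bit-AND is non-zero
+            // exactly when vertex major_hypersparse_first + o has a non-zero local degree,
+            // letting the copy_if below drop locally zero-degree keys in the hypersparse region.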
key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<1>(new_keys)) + + key_segment_offsets[3], + std::get<0>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(std::get<1>(keys)), + thrust::make_counting_iterator(size_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<1>(new_keys)) + + key_segment_offsets[3], + std::get<1>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } + } else { + auto flag_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(keys) + key_segment_offsets[3], + cuda::proclaim_return_type( + [segment_bitmap = raft::device_span(segment_bitmap.data(), + segment_bitmap.size()), + major_hypersparse_first = + *(edge_partition.major_hypersparse_first())] __device__(auto key) { + auto segment_offset = + thrust_tuple_get_or_identity(key) - major_hypersparse_first; + return ((segment_bitmap[packed_bool_offset(segment_offset)] & + packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); + })); + if (offsets.index() == 0) { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(keys), + thrust::make_counting_iterator(uint32_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(new_keys) + key_segment_offsets[3], + std::get<0>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(keys), + thrust::make_counting_iterator(size_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(new_keys) + key_segment_offsets[3], + std::get<1>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } + } + + (*edge_partition_hypersparse_key_offset_vectors).push_back(std::move(offsets)); + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + if (edge_partition_new_key_buffers) { + for (size_t j = 0; j < loop_count; ++j) { + edge_partition_key_buffers[j] = std::move((*edge_partition_new_key_buffers)[j]); + } + } + if (edge_partition_bitmap_buffers) { (*edge_partition_bitmap_buffers).clear(); } + + std::vector h_counts(loop_count); + raft::update_host(h_counts.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + if (process_local_edges[j]) { + auto& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + auto& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + if (keys.index() == 0) { + resize_dataframe_buffer( + std::get<0>(keys), key_segment_offsets[3] + h_counts[j], loop_stream); + } else { + resize_dataframe_buffer( + std::get<1>(keys), key_segment_offsets[3] + h_counts[j], loop_stream); + } + } else { + resize_dataframe_buffer(keys, key_segment_offsets[3] + h_counts[j], loop_stream); + } + // skip shrink_to_fit to cut execution time + + auto& offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; + if (offsets.index() == 0) { + std::get<0>(offsets).resize(h_counts[j], loop_stream); + } else { + std::get<1>(offsets).resize(h_counts[j], loop_stream); + } + // skip shrink_to_fit to cut execution time + } + } + + { // update edge_partition_deg1_hypersparse_key_offset_counts + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + + std::vector h_ptrs( + loop_count); // pointers to hypersparse key offset vectors + std::vector h_scalars( + loop_count * 2); // (key offset vector sizes, start degree 1 key offset) + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + if (process_local_edges[j]) { + auto const& offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; + if (offsets.index() == 0) { + h_ptrs[j] = static_cast(std::get<0>(offsets).data()); + h_scalars[j * 2] = std::get<0>(offsets).size(); + } else { + h_ptrs[j] = static_cast(std::get<1>(offsets).data()); + h_scalars[j * 2] = std::get<1>(offsets).size(); + } + h_scalars[j * 2 + 1] = + local_key_list_sizes[partition_idx] - (*local_key_list_deg1_sizes)[partition_idx]; + } else { + h_ptrs[j] = static_cast(nullptr); + h_scalars[j * 2] = size_t{0}; + h_scalars[j * 2 + 1] = size_t{0}; + } + } + rmm::device_uvector d_ptrs(h_ptrs.size(), handle.get_stream()); + rmm::device_uvector d_scalars(h_scalars.size(), handle.get_stream()); + raft::update_device(d_ptrs.data(), h_ptrs.data(), h_ptrs.size(), handle.get_stream()); + raft::update_device( + d_scalars.data(), h_scalars.data(), h_scalars.size(), handle.get_stream()); + rmm::device_uvector d_counts(loop_count, handle.get_stream()); + thrust::transform( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(loop_count), + d_counts.begin(), + cuda::proclaim_return_type( + [d_ptrs = raft::device_span(d_ptrs.data(), d_ptrs.size()), + d_scalars = raft::device_span(d_scalars.data(), d_scalars.size()), + uint32_key_output_offset] __device__(auto i) { + auto first = d_ptrs[i]; + if (first != static_cast(nullptr)) { + auto size = d_scalars[i * 2]; + auto start_offset = d_scalars[i * 2 + 1]; + if (uint32_key_output_offset) { + auto casted_first = static_cast(first); + return size - static_cast(thrust::distance( + casted_first, + thrust::lower_bound(thrust::seq, + casted_first, + casted_first + size, + static_cast(start_offset)))); + } else { + auto casted_first = static_cast(first); + return size - + static_cast(thrust::distance( + casted_first, + thrust::lower_bound( + thrust::seq, casted_first, casted_first + size, start_offset))); + } + } else { + return size_t{0}; + } + })); + raft::update_host((*edge_partition_deg1_hypersparse_key_offset_counts).data(), + d_counts.data(), + d_counts.size(), + handle.get_stream()); + handle.sync_stream(); + } + } + } + } + + 
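+      // Degree-1 count sketch (illustrative numbers): each hypersparse key offset vector is
+      // sorted, and offsets >= start_offset (the first global degree-1 key offset) belong to
+      // degree-1 keys; e.g. for offsets {0, 3, 7, 9} and start_offset == 7, lower_bound lands
+      // at index 2 and the count is 4 - 2 == 2. Such keys have exactly one neighbor overall,
+      // so the later inter-GPU reduction can be skipped for them.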
std::conditional_t>, + std::byte /* dummy */> + edge_partition_major_output_buffers{}; + if constexpr (GraphViewType::is_multi_gpu && update_major) { + edge_partition_major_output_buffers.reserve(loop_count); + } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + if constexpr (GraphViewType::is_multi_gpu && update_major) { + size_t buffer_size{0}; + if (process_local_edges[j]) { + if constexpr (use_input_key) { + auto const& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + if (keys.index() == 0) { + buffer_size = size_dataframe_buffer(std::get<0>(keys)); + } else { + buffer_size = size_dataframe_buffer(std::get<1>(keys)); + } + } else { + buffer_size = size_dataframe_buffer(keys); + } + } else { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); + + buffer_size = + segment_offsets + ? *((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ + : edge_partition.major_range_size(); + } + } + edge_partition_major_output_buffers.push_back( + allocate_dataframe_buffer(buffer_size, loop_stream)); + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + + for (size_t j = 0; j < loop_count; ++j) { + if (process_local_edges[j]) { + auto partition_idx = i + j; + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, partition_idx) + : thrust::nullopt; + size_t num_streams_per_loop{1}; + if (stream_pool_indices) { + assert((*stream_pool_indices).size() >= num_concurrent_loops); + num_streams_per_loop = (*stream_pool_indices).size() / num_concurrent_loops; + } + auto edge_partition_stream_pool_indices = + stream_pool_indices + ? std::make_optional>( + (*stream_pool_indices).data() + j * num_streams_per_loop, num_streams_per_loop) + : std::nullopt; + + T major_init{}; + T major_identity_element{}; + if constexpr (update_major) { + if constexpr (std::is_same_v>) { // if any edge has a non-init value, + // one of the non-init values will + // be selected. + major_init = init; + major_identity_element = init; + } else { + major_init = ReduceOp::identity_element; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + major_init = (static_cast(partition_idx) == minor_comm_rank) + ? 
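+          // Only the rank owning this edge partition seeds the reduction with init; every other
+          // rank starts from ReduceOp::identity_element so init is applied exactly once after
+          // the cross-rank reduction. Sketch with reduce_op::plus<int> and init == 10 (values
+          // illustrative): three ranks holding partials {1, 2, 3} reduce to 10 + 1 + 2 + 3 == 16
+          // rather than counting init once per rank.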
init + : ReduceOp::identity_element; + } else { + major_init = init; + } + major_identity_element = ReduceOp::identity_element; + } + } + + std::optional> key_segment_offsets{std::nullopt}; + if constexpr (use_input_key) { + if (key_segment_offset_vectors) { + key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + if constexpr (filter_input_key) { + if (edge_partition_hypersparse_key_offset_vectors) { + (*key_segment_offsets).back() = + size_dataframe_buffer(edge_partition_major_output_buffers[j]); + *((*key_segment_offsets).rbegin() + 1) = (*key_segment_offsets).back(); + } + } + } + } else { + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); + if (segment_offsets) { + key_segment_offsets = std::vector((*segment_offsets).size()); + std::transform((*segment_offsets).begin(), + (*segment_offsets).end(), + (*key_segment_offsets).begin(), + [](vertex_t offset) { return static_cast(offset); }); + } + } + + edge_partition_src_input_device_view_t edge_partition_src_value_input{}; + edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; + if constexpr (GraphViewType::is_storage_transposed) { + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input, partition_idx); + } else { + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input, partition_idx); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input); + } + auto edge_partition_e_value_input = + edge_partition_e_input_device_view_t(edge_value_input, partition_idx); + + std::conditional_t, + edge_partition_minor_output_device_view_t>, + VertexValueOutputIterator> + output_buffer{}; + if constexpr (GraphViewType::is_multi_gpu) { + if constexpr (update_major) { + output_buffer = get_dataframe_buffer_begin(edge_partition_major_output_buffers[j]); + } else { + output_buffer = + edge_partition_minor_output_device_view_t(minor_tmp_buffer->mutable_view()); + } + } else { + output_buffer = tmp_vertex_value_output_first; + } + + bool processed{false}; + if constexpr (try_bitmap) { + auto const& keys = edge_partition_key_buffers[j]; + if (keys.index() == 0) { + auto edge_partition_key_first = thrust::make_transform_iterator( + std::get<0>(keys).begin(), + cuda::proclaim_return_type( + [range_first = local_v_list_range_firsts[partition_idx]] __device__( + uint32_t v_offset) { return range_first + static_cast(v_offset); })); + per_v_transform_reduce_e_edge_partition( + handle, + edge_partition, + edge_partition_key_first, + edge_partition_key_first + std::get<0>(keys).size(), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_buffer, + e_op, + major_init, + major_identity_element, + reduce_op, + pred_op, + key_segment_offsets ? 
std::make_optional<raft::host_span<size_t const>>(
+                                (*key_segment_offsets).data(), (*key_segment_offsets).size())
+                            : std::nullopt,
+          edge_partition_stream_pool_indices);
+        processed = true;
+      }
+    }
+    if (!processed) {
+      auto edge_partition_key_first = sorted_unique_key_first;
+      auto edge_partition_key_last  = sorted_unique_nzd_key_last;
+      if constexpr (GraphViewType::is_multi_gpu && use_input_key) {
+        auto const& keys = edge_partition_key_buffers[j];
+        if constexpr (try_bitmap) {
+          edge_partition_key_first = get_dataframe_buffer_begin(std::get<1>(keys));
+          edge_partition_key_last  = get_dataframe_buffer_end(std::get<1>(keys));
+        } else {
+          edge_partition_key_first = get_dataframe_buffer_begin(keys);
+          edge_partition_key_last  = get_dataframe_buffer_end(keys);
+        }
+      }
+
+      per_v_transform_reduce_e_edge_partition(
+        handle,
+        edge_partition,
+        edge_partition_key_first,
+        edge_partition_key_last,
+        edge_partition_src_value_input,
+        edge_partition_dst_value_input,
+        edge_partition_e_value_input,
+        edge_partition_e_mask,
+        output_buffer,
+        e_op,
+        major_init,
+        major_identity_element,
+        reduce_op,
+        pred_op,
+        key_segment_offsets ? std::make_optional<raft::host_span<size_t const>>(
+                                (*key_segment_offsets).data(), (*key_segment_offsets).size())
+                            : std::nullopt,
+        edge_partition_stream_pool_indices);
+    }
+  }
+}
+if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); }
+
+    if constexpr (GraphViewType::is_multi_gpu && update_major) {
+      auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+      auto const minor_comm_rank = minor_comm.get_rank();
+      auto const minor_comm_size = minor_comm.get_size();
+
+      if constexpr (use_input_key) {
+        edge_partition_key_buffers.clear();
+        edge_partition_key_buffers.shrink_to_fit();
+      }
+
+      if constexpr (std::is_same_v<ReduceOp, reduce_op::any<T>>) {
+        std::conditional_t<
+          filter_input_key,
+          std::optional<std::vector<
+            std::variant<raft::device_span<uint32_t const>, raft::device_span<size_t const>>>>,
+          std::byte /* dummy */>
+          edge_partition_hypersparse_non_deg1_key_offset_spans{};
+        if constexpr (filter_input_key) {
+          if (edge_partition_hypersparse_key_offset_vectors) {
+            edge_partition_hypersparse_non_deg1_key_offset_spans = std::vector<
+              std::variant<raft::device_span<uint32_t const>, raft::device_span<size_t const>>>(
+              loop_count);
+          }
+        }
+
+        std::vector<size_t> edge_partition_allreduce_sizes(loop_count);
+        std::vector<size_t> edge_partition_allreduce_displacements(loop_count);
+        std::vector<size_t> edge_partition_contiguous_sizes(loop_count);
+
+        for (size_t j = 0; j < loop_count; ++j) {
+          auto partition_idx = i + j;
+          auto const& output_buffer = edge_partition_major_output_buffers[j];
+
+          size_t allreduce_size{};
+          size_t contiguous_size{};
+          if constexpr (filter_input_key) {
+            allreduce_size = local_key_list_sizes[partition_idx];
+            if (local_key_list_deg1_sizes) {
+              allreduce_size -= (*local_key_list_deg1_sizes)[partition_idx];
+            }
+            if (key_segment_offset_vectors) {
+              auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx];
+              contiguous_size = key_segment_offsets[3];
+            } else {
+              contiguous_size = local_key_list_sizes[partition_idx];
+            }
+          } else {
+            static_assert(!use_input_key);
+            auto hypersparse_degree_offsets =
+              graph_view.local_edge_partition_hypersparse_degree_offsets(partition_idx);
+            allreduce_size = size_dataframe_buffer(output_buffer);
+            if (hypersparse_degree_offsets) {
+              allreduce_size -= *((*hypersparse_degree_offsets).rbegin()) -
+                                *((*hypersparse_degree_offsets).rbegin() + 1);
+            }
+            contiguous_size = size_dataframe_buffer(output_buffer);
+          }
+          edge_partition_allreduce_sizes[j]  = allreduce_size;
+          edge_partition_contiguous_sizes[j] = contiguous_size;
+        }
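+        // Aggregate priority buffer layout, with illustrative sizes: for per-partition
+        // allreduce sizes {5, 3, 4}, the exclusive scan below yields displacements {0, 5, 8},
+        // so partition j's priorities occupy [displacements[j], displacements[j] + sizes[j])
+        // and all partitions share a single MIN-allreduce call instead of one call per
+        // partition.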
std::exclusive_scan(edge_partition_allreduce_sizes.begin(), + edge_partition_allreduce_sizes.end(), + edge_partition_allreduce_displacements.begin(), + size_t{0}); + std::variant, rmm::device_uvector> + aggregate_priorities = rmm::device_uvector(0, handle.get_stream()); + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + std::get<0>(aggregate_priorities) + .resize( + edge_partition_allreduce_displacements.back() + edge_partition_allreduce_sizes.back(), + handle.get_stream()); + } else { // priority == uint32_t + aggregate_priorities = rmm::device_uvector( + edge_partition_allreduce_displacements.back() + edge_partition_allreduce_sizes.back(), + handle.get_stream()); + } + if (loop_stream_pool_indices) { handle.sync_stream(); } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + std::optional< + std::variant, raft::device_span>> + hypersparse_non_deg1_key_offsets{std::nullopt}; + if constexpr (filter_input_key) { + if (edge_partition_hypersparse_key_offset_vectors) { + auto const& offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; + + if (offsets.index() == 0) { + hypersparse_non_deg1_key_offsets = raft::device_span( + std::get<0>(offsets).data(), + std::get<0>(offsets).size() - + (edge_partition_deg1_hypersparse_key_offset_counts + ? (*edge_partition_deg1_hypersparse_key_offset_counts)[j] + : size_t{0})); + } else { + hypersparse_non_deg1_key_offsets = raft::device_span( + std::get<1>(offsets).data(), + std::get<1>(offsets).size() - + (edge_partition_deg1_hypersparse_key_offset_counts + ? (*edge_partition_deg1_hypersparse_key_offset_counts)[j] + : size_t{0})); + } + (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j] = + *hypersparse_non_deg1_key_offsets; + } + } + + auto const& output_buffer = edge_partition_major_output_buffers[j]; + + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + compute_priorities( + minor_comm, + get_dataframe_buffer_begin(output_buffer), + raft::device_span(std::get<0>(aggregate_priorities).data() + + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]), + hypersparse_non_deg1_key_offsets, + edge_partition_contiguous_sizes[j], + static_cast(partition_idx), + subgroup_size, + init, + process_local_edges[j] ? false : true /* ignore_local_values */, + loop_stream); + } else { // priority == uint32_t + compute_priorities( + minor_comm, + get_dataframe_buffer_begin(output_buffer), + raft::device_span(std::get<1>(aggregate_priorities).data() + + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]), + hypersparse_non_deg1_key_offsets, + edge_partition_contiguous_sizes[j], + static_cast(partition_idx), + subgroup_size, + init, + process_local_edges[j] ? 
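+            // Priority-based any-reduce sketch: every rank writes one priority per key (derived
+            // from its rank via rank_to_priority(), which favors ranks in the root's subgroup),
+            // and the MIN-allreduce that follows picks a single winning rank per key. Ranks that
+            // do not process this partition pass ignore_local_values == true here so their
+            // entries can never win.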
false : true /* ignore_local_values */, + loop_stream); + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + device_allreduce(minor_comm, + std::get<0>(aggregate_priorities).data(), + std::get<0>(aggregate_priorities).data(), + std::get<0>(aggregate_priorities).size(), + raft::comms::op_t::MIN, + handle.get_stream()); + } else { // priority == uint32_t + device_allreduce(minor_comm, + std::get<1>(aggregate_priorities).data(), + std::get<1>(aggregate_priorities).data(), + std::get<1>(aggregate_priorities).size(), + raft::comms::op_t::MIN, + handle.get_stream()); + } + if (loop_stream_pool_indices) { handle.sync_stream(); } + + std::vector< + std::variant, rmm::device_uvector>, + std::optional>>> + edge_partition_selected_ranks_or_flags{}; + edge_partition_selected_ranks_or_flags.reserve(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto const& output_buffer = edge_partition_major_output_buffers[j]; + std::optional< + std::variant, raft::device_span>> + hypersparse_non_deg1_key_offsets{std::nullopt}; + if constexpr (filter_input_key) { + if (edge_partition_hypersparse_key_offset_vectors) { + hypersparse_non_deg1_key_offsets = + (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j]; + } + } + + auto contiguous_size = edge_partition_contiguous_sizes[j]; + + std::variant, rmm::device_uvector>, + std::optional>> + selected_ranks_or_flags = + std::variant, rmm::device_uvector>( + rmm::device_uvector(0, loop_stream)); + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + auto priorities = raft::device_span( + std::get<0>(aggregate_priorities).data() + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]); + auto tmp = compute_selected_ranks_from_priorities( + minor_comm, + priorities, + hypersparse_non_deg1_key_offsets, + contiguous_size, + static_cast(partition_idx), + subgroup_size, + process_local_edges[j] ? false : true /* ignore_local_values */, + loop_stream); + if (tmp.index() == 0) { + selected_ranks_or_flags = + std::variant, rmm::device_uvector>( + std::move(std::get<0>(tmp))); + } else { + selected_ranks_or_flags = std::move(std::get<1>(tmp)); + } + } else { // priority_t == uint32_t + auto priorities = raft::device_span( + std::get<1>(aggregate_priorities).data() + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]); + auto tmp = compute_selected_ranks_from_priorities( + minor_comm, + priorities, + hypersparse_non_deg1_key_offsets, + contiguous_size, + static_cast(partition_idx), + subgroup_size, + process_local_edges[j] ? 
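+            // The result is a variant: on the rank that owns the partition it holds the winning
+            // rank for every key (needed to place all final values), while on other ranks it
+            // holds only packed-bool flags marking the keys this rank won, which keeps the
+            // follow-up copy_if input compact.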
false : true /* ignore_local_values */, + loop_stream); + if (tmp.index() == 0) { + selected_ranks_or_flags = + std::variant, rmm::device_uvector>( + std::move(std::get<0>(tmp))); + } else { + selected_ranks_or_flags = std::move(std::get<1>(tmp)); + } + } + edge_partition_selected_ranks_or_flags.push_back(std::move(selected_ranks_or_flags)); + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + std::get<0>(aggregate_priorities).resize(0, handle.get_stream()); + std::get<0>(aggregate_priorities).shrink_to_fit(handle.get_stream()); + } else { + std::get<1>(aggregate_priorities).resize(0, handle.get_stream()); + std::get<1>(aggregate_priorities).shrink_to_fit(handle.get_stream()); + } + if (loop_stream_pool_indices) { handle.sync_stream(); } + + std::vector> edge_partition_values{}; + edge_partition_values.reserve(loop_count); + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto& output_buffer = edge_partition_major_output_buffers[j]; + + auto values = allocate_dataframe_buffer( + process_local_edges[j] ? size_dataframe_buffer(output_buffer) : size_t{0}, loop_stream); + if (process_local_edges[j]) { + if (minor_comm_rank == static_cast(partition_idx)) { + assert(!use_input_key); + assert(edge_partition_selected_ranks_or_flags[j].index() == 0); + auto const& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); + if (selected_ranks.index() == 0) { + copy_if_nosync( + get_dataframe_buffer_begin(output_buffer), + get_dataframe_buffer_begin(output_buffer) + edge_partition_allreduce_sizes[j], + thrust::make_transform_iterator( + std::get<0>(selected_ranks).begin(), + cuda::proclaim_return_type([minor_comm_rank] __device__(auto rank) { + return static_cast(rank) == minor_comm_rank; + })), + get_dataframe_buffer_begin(values), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + copy_if_nosync( + get_dataframe_buffer_begin(output_buffer), + get_dataframe_buffer_begin(output_buffer) + edge_partition_allreduce_sizes[j], + thrust::make_transform_iterator( + std::get<1>(selected_ranks).begin(), + cuda::proclaim_return_type( + [minor_comm_rank] __device__(auto rank) { return rank == minor_comm_rank; })), + get_dataframe_buffer_begin(values), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } else { + assert(edge_partition_selected_ranks_or_flags[j].index() == 1); + auto& keep_flags = std::get<1>(edge_partition_selected_ranks_or_flags[j]); + size_t input_end_offset{}; + if constexpr (filter_input_key) { + input_end_offset = edge_partition_contiguous_sizes[j]; + if (edge_partition_hypersparse_non_deg1_key_offset_spans) { + auto const& span = (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j]; + if (span.index() == 0) { + input_end_offset += std::get<0>(span).size(); + } else { + input_end_offset += std::get<1>(span).size(); + } + } + } else { + input_end_offset = edge_partition_allreduce_sizes[j]; + } + copy_if_nosync( + get_dataframe_buffer_begin(output_buffer), + get_dataframe_buffer_begin(output_buffer) + input_end_offset, + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [keep_flags = raft::device_span( + (*keep_flags).data(), (*keep_flags).size())] __device__(size_t offset) 
{ + auto word = keep_flags[packed_bool_offset(offset)]; + return ((word & packed_bool_mask(offset)) != packed_bool_empty_mask()); + })), + get_dataframe_buffer_begin(values), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + (*keep_flags).resize(0, loop_stream); + (*keep_flags).shrink_to_fit(loop_stream); + } + } + + edge_partition_values.push_back(std::move(values)); + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + + std::vector copy_sizes(loop_count); + raft::update_host(copy_sizes.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); + + std::optional< + std::vector, rmm::device_uvector>>> + edge_partition_deg1_hypersparse_output_offset_vectors{}; + + if (graph_view.use_dcs()) { + edge_partition_deg1_hypersparse_output_offset_vectors = + std::vector, rmm::device_uvector>>{}; + (*edge_partition_deg1_hypersparse_output_offset_vectors).reserve(loop_count); + + for (size_t j = 0; j < loop_count; ++j) { + auto loop_stream = + loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto& output_buffer = edge_partition_major_output_buffers[j]; + std::variant, rmm::device_uvector> + output_offsets = rmm::device_uvector(0, loop_stream); + if (!uint32_key_output_offset) { + output_offsets = rmm::device_uvector(0, loop_stream); + } + + if (process_local_edges[j]) { + auto& values = edge_partition_values[j]; + + size_t output_offset_buf_size{0}; + if constexpr (filter_input_key) { + output_offset_buf_size = (*edge_partition_deg1_hypersparse_key_offset_counts)[j]; + } else { + assert(!use_input_key); + output_offset_buf_size = + size_dataframe_buffer(output_buffer) - edge_partition_allreduce_sizes[j]; + } + + if (output_offsets.index() == 0) { + std::get<0>(output_offsets).resize(output_offset_buf_size, loop_stream); + } else { + output_offsets = rmm::device_uvector(output_offset_buf_size, loop_stream); + } + + size_t input_start_offset{}; + if constexpr (filter_input_key) { + auto span = (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j]; + input_start_offset = + edge_partition_contiguous_sizes[j] + + (span.index() == 0 ? 
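+            // Degree-1 tail sketch: keys past the non-deg1 offset span are global degree-1
+            // keys; their reduced value is simply the single local partial, so they bypass the
+            // allreduce and are appended below via copy_if on val != init together with their
+            // output offsets.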
std::get<0>(span).size() : std::get<1>(span).size()); + } else { + static_assert(!use_input_key); + input_start_offset = edge_partition_allreduce_sizes[j]; + } + auto flag_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(output_buffer) + input_start_offset, + cuda::proclaim_return_type( + [init] __device__(auto val) { return val != init; })); + + if constexpr (filter_input_key) { + auto& hypersparse_key_offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; + auto span = (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j]; + if (hypersparse_key_offsets.index() == 0) { + assert(output_offsets.index() == 0); + auto input_pair_first = thrust::make_zip_iterator( + get_dataframe_buffer_begin(output_buffer) + input_start_offset, + std::get<0>(hypersparse_key_offsets).begin() + std::get<0>(span).size()); + copy_if_nosync( + input_pair_first, + input_pair_first + (*edge_partition_deg1_hypersparse_key_offset_counts)[j], + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<0>(output_offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + std::get<0>(hypersparse_key_offsets).resize(0, loop_stream); + std::get<0>(hypersparse_key_offsets).shrink_to_fit(loop_stream); + } else { + assert(output_offsets.index() == 1); + auto input_pair_first = thrust::make_zip_iterator( + get_dataframe_buffer_begin(output_buffer) + input_start_offset, + std::get<1>(hypersparse_key_offsets).begin() + std::get<1>(span).size()); + copy_if_nosync( + input_pair_first, + input_pair_first + (*edge_partition_deg1_hypersparse_key_offset_counts)[j], + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<1>(output_offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + std::get<1>(hypersparse_key_offsets).resize(0, loop_stream); + std::get<1>(hypersparse_key_offsets).shrink_to_fit(loop_stream); + } + } else { + static_assert(!use_input_key); + assert(process_local_edges[j]); + if (output_offsets.index() == 0) { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(output_buffer), + thrust::make_counting_iterator(uint32_t{0})); + copy_if_nosync( + input_pair_first + input_start_offset, + input_pair_first + size_dataframe_buffer(output_buffer), + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<0>(output_offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(output_buffer), + thrust::make_counting_iterator(size_t{0})); + copy_if_nosync( + input_pair_first + input_start_offset, + input_pair_first + size_dataframe_buffer(output_buffer), + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<1>(output_offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } + } + + (*edge_partition_deg1_hypersparse_output_offset_vectors) + .push_back(std::move(output_offsets)); + + resize_dataframe_buffer(output_buffer, 0, loop_stream); + shrink_to_fit_dataframe_buffer(output_buffer, loop_stream); + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + + std::vector deg1_copy_sizes(loop_count); + raft::update_host( + deg1_copy_sizes.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); + + for 
(size_t j = 0; j < loop_count; ++j) { + if (process_local_edges[j]) { + copy_sizes[j] += deg1_copy_sizes[j]; + auto& offsets = (*edge_partition_deg1_hypersparse_output_offset_vectors)[j]; + if (offsets.index() == 0) { + std::get<0>(offsets).resize(deg1_copy_sizes[j], handle.get_stream()); + } else { + assert(offsets.index() == 1); + std::get<1>(offsets).resize(deg1_copy_sizes[j], handle.get_stream()); + } + // skip shrink_to_fit() to cut execution time + } + } + } + + for (size_t j = 0; j < loop_count; ++j) { + if (process_local_edges[j]) { + resize_dataframe_buffer(edge_partition_values[j], copy_sizes[j], handle.get_stream()); + // skip shrink_to_fit() to cut execution time + } + } + + size_t min_element_size{cache_line_size}; + if constexpr (std::is_arithmetic_v) { + min_element_size = std::min(sizeof(T), min_element_size); + } else { + static_assert(is_thrust_tuple_of_arithmetic::value); + min_element_size = + std::min(cugraph::min_thrust_tuple_element_sizes(), min_element_size); + } + assert((cache_line_size % min_element_size) == 0); + size_t value_alignment = cache_line_size / min_element_size; + + size_t offset_alignment = 1; + if (graph_view.use_dcs()) { + static_assert(((cache_line_size % sizeof(uint32_t)) == 0) && + ((cache_line_size % sizeof(size_t)) == 0)); + offset_alignment = + cache_line_size / (uint32_key_output_offset ? sizeof(uint32_t) : sizeof(size_t)); + } + + std::optional> rx_value_sizes{}; + std::optional> rx_value_displs{}; + std::optional> rx_values{}; + + std::optional> rx_offset_sizes{}; + std::optional> rx_offset_displs{}; + std::optional, rmm::device_uvector>> + rx_offsets{}; + { + auto size_per_rank = + loop_count * (graph_view.use_dcs() ? 2 /* value buffer size, offset buffer size */ + : 1 /* value buffer size */); + rmm::device_uvector d_aggregate_buffer_sizes(minor_comm_size * size_per_rank, + handle.get_stream()); + std::vector h_buffer_sizes(size_per_rank); + for (size_t j = 0; j < loop_count; ++j) { + h_buffer_sizes[j] = size_dataframe_buffer(edge_partition_values[j]); + if (graph_view.use_dcs()) { + auto const& offsets = (*edge_partition_deg1_hypersparse_output_offset_vectors)[j]; + if (offsets.index() == 0) { + h_buffer_sizes[loop_count + j] = std::get<0>(offsets).size(); + } else { + assert(offsets.index() == 1); + h_buffer_sizes[loop_count + j] = std::get<1>(offsets).size(); + } + } + } + raft::update_device(d_aggregate_buffer_sizes.data() + minor_comm_rank * size_per_rank, + h_buffer_sizes.data(), + h_buffer_sizes.size(), + handle.get_stream()); + device_allgather(minor_comm, + d_aggregate_buffer_sizes.data() + minor_comm_rank * size_per_rank, + d_aggregate_buffer_sizes.data(), + size_per_rank, + handle.get_stream()); + if (static_cast(minor_comm_rank / num_concurrent_loops) == + (i / num_concurrent_loops)) { + std::vector h_aggregate_buffer_sizes(d_aggregate_buffer_sizes.size()); + raft::update_host(h_aggregate_buffer_sizes.data(), + d_aggregate_buffer_sizes.data(), + d_aggregate_buffer_sizes.size(), + handle.get_stream()); + handle.sync_stream(); + auto j = static_cast(minor_comm_rank % num_concurrent_loops); + rx_value_sizes = std::vector(minor_comm_size); + rx_value_displs = std::vector(minor_comm_size); + if (graph_view.use_dcs()) { + rx_offset_sizes = std::vector(minor_comm_size); + rx_offset_displs = std::vector(minor_comm_size); + } + for (int k = 0; k < minor_comm_size; ++k) { + (*rx_value_sizes)[k] = h_aggregate_buffer_sizes[k * size_per_rank + j]; + if (graph_view.use_dcs()) { + (*rx_offset_sizes)[k] = + h_aggregate_buffer_sizes[k * 
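+            // Single-allgather size exchange (merging what would otherwise be several host
+            // scalar exchanges): rank k publishes size_per_rank entries, the first loop_count
+            // holding value-buffer sizes and, when use_dcs() holds, the next loop_count holding
+            // offset-buffer sizes; hence the k * size_per_rank + loop_count + j indexing here.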
size_per_rank + loop_count + j]; + } + } + + std::vector aligned_sizes(minor_comm_size); + for (int k = 0; k < minor_comm_size; ++k) { + if (k == (minor_comm_size - 1)) { + aligned_sizes[k] = (*rx_value_sizes)[k]; + } else { + aligned_sizes[k] = raft::round_up_safe((*rx_value_sizes)[k], value_alignment); + } + } + std::exclusive_scan( + aligned_sizes.begin(), aligned_sizes.end(), (*rx_value_displs).begin(), size_t{0}); + + if (graph_view.use_dcs()) { + for (int k = 0; k < minor_comm_size; ++k) { + if (k == (minor_comm_size - 1)) { + aligned_sizes[k] = (*rx_offset_sizes)[k]; + } else { + aligned_sizes[k] = raft::round_up_safe((*rx_offset_sizes)[k], offset_alignment); + } + } + std::exclusive_scan( + aligned_sizes.begin(), aligned_sizes.end(), (*rx_offset_displs).begin(), size_t{0}); + } + + rx_values = allocate_dataframe_buffer( + (*rx_value_displs).back() + (*rx_value_sizes).back(), handle.get_stream()); + if (graph_view.use_dcs()) { + if (uint32_key_output_offset) { + rx_offsets = rmm::device_uvector( + (*rx_offset_displs).back() + (*rx_offset_sizes).back(), handle.get_stream()); + } else { + rx_offsets = rmm::device_uvector( + (*rx_offset_displs).back() + (*rx_offset_sizes).back(), handle.get_stream()); + } + } + } + } + + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto& values = edge_partition_values[j]; + + if (minor_comm_rank == static_cast(partition_idx)) { + device_gatherv(minor_comm, + get_dataframe_buffer_begin(values), + get_dataframe_buffer_begin(*rx_values), + values.size(), + *rx_value_sizes, + *rx_value_displs, + static_cast(partition_idx), + handle.get_stream()); + } else { + device_gatherv(minor_comm, + get_dataframe_buffer_begin(values), + dataframe_buffer_iterator_type_t{}, + values.size(), + std::vector{}, + std::vector{}, + static_cast(partition_idx), + handle.get_stream()); + } + } + device_group_end(minor_comm); + if (graph_view.use_dcs()) { + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto& values = edge_partition_values[j]; + + auto const& offsets = (*edge_partition_deg1_hypersparse_output_offset_vectors)[j]; + if (offsets.index() == 0) { + if (minor_comm_rank == static_cast(partition_idx)) { + device_gatherv(minor_comm, + std::get<0>(offsets).data(), + std::get<0>(*rx_offsets).data(), + std::get<0>(offsets).size(), + *rx_offset_sizes, + *rx_offset_displs, + static_cast(partition_idx), + handle.get_stream()); + } else { + device_gatherv(minor_comm, + std::get<0>(offsets).data(), + static_cast(nullptr), + std::get<0>(offsets).size(), + std::vector{}, + std::vector{}, + static_cast(partition_idx), + handle.get_stream()); + } + } else { + assert(offsets.index() == 1); + if (minor_comm_rank == static_cast(partition_idx)) { + device_gatherv(minor_comm, + std::get<1>(offsets).data(), + std::get<1>(*rx_offsets).data(), + std::get<1>(offsets).size(), + *rx_offset_sizes, + *rx_offset_displs, + static_cast(partition_idx), + handle.get_stream()); + } else { + device_gatherv(minor_comm, + std::get<1>(offsets).data(), + static_cast(nullptr), + std::get<1>(offsets).size(), + std::vector{}, + std::vector{}, + static_cast(partition_idx), + handle.get_stream()); + } + } + } + device_group_end(minor_comm); + } + handle.sync_stream(); // this is required before edge_partition_values.clear(); + edge_partition_values.clear(); + if (loop_stream_pool_indices) { + handle.sync_stream_pool(*loop_stream_pool_indices); + } // to ensure that memory is freed + + if 
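+      // Each rank's chunk was gathered at a cache-line-aligned displacement (NCCL throughput is
+      // strongly alignment sensitive), so the received buffers may contain padding between
+      // chunks; the passes below mark the padding elements in a bitmap, compact them away with
+      // remove_if, and then rebuild the displacements without gaps.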
(rx_values && (size_dataframe_buffer(*rx_values) > 0)) { + auto j = static_cast(minor_comm_rank % num_concurrent_loops); + auto partition_idx = i + j; + + { // remove gaps introduced to enforce alignment + rmm::device_uvector bitmap( + packed_bool_size(size_dataframe_buffer(*rx_values)), handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); + rmm::device_uvector d_displs((*rx_value_displs).size(), handle.get_stream()); + rmm::device_uvector d_sizes((*rx_value_sizes).size(), handle.get_stream()); + raft::update_device(d_displs.data(), + (*rx_value_displs).data(), + (*rx_value_displs).size(), + handle.get_stream()); + raft::update_device(d_sizes.data(), + (*rx_value_sizes).data(), + (*rx_value_sizes).size(), + handle.get_stream()); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(static_cast(minor_comm_size - 1) * + value_alignment), + [bitmap = raft::device_span(bitmap.data(), bitmap.size()), + displs = raft::device_span(d_displs.data(), d_displs.size()), + sizes = raft::device_span(d_sizes.data(), d_sizes.size()), + alignment = value_alignment] __device__(size_t i) { + auto rank = static_cast(i / alignment); + auto first = displs[rank] + sizes[rank]; + auto last = displs[rank + 1]; + if ((i % alignment) < (last - first)) { + auto offset = first + (i % alignment); + cuda::atomic_ref word( + bitmap[packed_bool_offset(offset)]); + word.fetch_or(packed_bool_mask(offset), cuda::std::memory_order_relaxed); + } + }); + resize_dataframe_buffer( + *rx_values, + thrust::distance( + get_dataframe_buffer_begin(*rx_values), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(*rx_values), + get_dataframe_buffer_end(*rx_values), + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [bitmap = raft::device_span( + bitmap.data(), bitmap.size())] __device__(size_t i) { + return (bitmap[packed_bool_offset(i)] & + packed_bool_mask(i)) == packed_bool_mask(i); + })), + thrust::identity{})), + handle.get_stream()); + // skip shrink_to_fit() to cut execution time + std::exclusive_scan((*rx_value_sizes).begin(), + (*rx_value_sizes).end(), + (*rx_value_displs).begin(), + size_t{0}); // now gaps are removed + + if (rx_offsets) { + size_t num_offsets = ((*rx_offsets).index() == 0) + ? 
size_dataframe_buffer(std::get<0>(*rx_offsets)) + : size_dataframe_buffer(std::get<1>(*rx_offsets)); + bitmap.resize(packed_bool_size(num_offsets), handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); + d_displs.resize((*rx_offset_displs).size(), handle.get_stream()); + d_sizes.resize((*rx_offset_sizes).size(), handle.get_stream()); + raft::update_device(d_displs.data(), + (*rx_offset_displs).data(), + (*rx_offset_displs).size(), + handle.get_stream()); + raft::update_device(d_sizes.data(), + (*rx_offset_sizes).data(), + (*rx_offset_sizes).size(), + handle.get_stream()); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(static_cast(minor_comm_size - 1) * + offset_alignment), + [bitmap = raft::device_span(bitmap.data(), bitmap.size()), + displs = raft::device_span(d_displs.data(), d_displs.size()), + sizes = raft::device_span(d_sizes.data(), d_sizes.size()), + alignment = offset_alignment] __device__(size_t i) { + auto rank = static_cast(i / alignment); + auto first = displs[rank] + sizes[rank]; + auto last = displs[rank + 1]; + if ((i % alignment) < (last - first)) { + auto offset = first + (i % alignment); + cuda::atomic_ref word( + bitmap[packed_bool_offset(offset)]); + word.fetch_or(packed_bool_mask(offset), cuda::std::memory_order_relaxed); + } + }); + if ((*rx_offsets).index() == 0) { + resize_dataframe_buffer( + std::get<0>(*rx_offsets), + thrust::distance( + get_dataframe_buffer_begin(std::get<0>(*rx_offsets)), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(std::get<0>(*rx_offsets)), + get_dataframe_buffer_end(std::get<0>(*rx_offsets)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [bitmap = raft::device_span( + bitmap.data(), bitmap.size())] __device__(size_t i) { + return (bitmap[packed_bool_offset(i)] & + packed_bool_mask(i)) == packed_bool_mask(i); + })), + thrust::identity{})), + handle.get_stream()); + // skip shrink_to_fit() to cut execution time + } else { + resize_dataframe_buffer( + std::get<1>(*rx_offsets), + thrust::distance( + get_dataframe_buffer_begin(std::get<1>(*rx_offsets)), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(std::get<1>(*rx_offsets)), + get_dataframe_buffer_end(std::get<1>(*rx_offsets)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [bitmap = raft::device_span( + bitmap.data(), bitmap.size())] __device__(size_t i) { + return (bitmap[packed_bool_offset(i)] & + packed_bool_mask(i)) == packed_bool_mask(i); + })), + thrust::identity{})), + handle.get_stream()); + // skip shrink_to_fit() to cut execution time + } + std::exclusive_scan((*rx_offset_sizes).begin(), + (*rx_offset_sizes).end(), + (*rx_offset_displs).begin(), + size_t{0}); // now gaps are removed + } + } + + size_t output_range_size{}; + if constexpr (filter_input_key) { + output_range_size = local_key_list_sizes[partition_idx]; + } else { + auto const& segment_offsets = graph_view.local_vertex_partition_segment_offsets(); + output_range_size = + segment_offsets + ? 
*((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ + : graph_view.local_vertex_partition_range_size(); + } + auto& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); + if (selected_ranks.index() == 0) { + auto old_size = std::get<0>(selected_ranks).size(); + std::get<0>(selected_ranks).resize(output_range_size, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + std::get<0>(selected_ranks).begin() + old_size, + std::get<0>(selected_ranks).end(), + static_cast(minor_comm_size)); + } else { + assert(selected_ranks.index() == 1); + auto old_size = std::get<1>(selected_ranks).size(); + std::get<1>(selected_ranks).resize(output_range_size, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + std::get<1>(selected_ranks).begin() + old_size, + std::get<1>(selected_ranks).end(), + minor_comm_size); + } + if (rx_offsets) { + rmm::device_uvector lasts((*rx_offset_displs).size(), handle.get_stream()); + raft::update_device(lasts.data(), + (*rx_offset_displs).data() + 1, + (*rx_offset_displs).size() - 1, + handle.get_stream()); + auto num_elements = (*rx_offset_displs).back() + (*rx_offset_sizes).back(); + lasts.set_element_async(lasts.size() - 1, num_elements, handle.get_stream()); + handle.sync_stream(); // this is necessary before num_elements becomes out-of-scope + + if ((*rx_offsets).index() == 0) { + auto& offsets = std::get<0>(*rx_offsets); + if (selected_ranks.index() == 0) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + std::get<0>(selected_ranks).data(), + std::get<0>(selected_ranks).size())] __device__(auto i) { + auto minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + } else { + assert(selected_ranks.index() == 1); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + std::get<1>(selected_ranks).data(), + std::get<1>(selected_ranks).size())] __device__(auto i) { + auto minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + } + offsets.resize(0, handle.get_stream()); + offsets.shrink_to_fit(handle.get_stream()); + } else { + assert((*rx_offsets).index() == 1); + auto& offsets = std::get<1>(*rx_offsets); + if (selected_ranks.index() == 0) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + std::get<0>(selected_ranks).data(), + std::get<0>(selected_ranks).size())] __device__(auto i) { + auto minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + } else { + 
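+            // Same mapping as in the branch above, sketched with illustrative numbers: lasts
+            // holds the exclusive end of each rank's gathered offsets (e.g. {3, 5, 9} for three
+            // ranks), so upper_bound(lasts, i) recovers the source rank of element i (i == 4
+            // maps to rank 1), and selected_ranks[offsets[i]] records which rank produced the
+            // value for each degree-1 key.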
assert(selected_ranks.index() == 1); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + std::get<1>(selected_ranks).data(), + std::get<1>(selected_ranks).size())] __device__(auto i) { + auto minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + } + offsets.resize(0, handle.get_stream()); + offsets.shrink_to_fit(handle.get_stream()); + } + } + + size_t num_positions = (selected_ranks.index() == 0) ? std::get<0>(selected_ranks).size() + : std::get<1>(selected_ranks).size(); + if (num_positions <= static_cast(std::numeric_limits::max())) { + rmm::device_uvector rx_positions(num_positions, handle.get_stream()); + thrust::sequence( + handle.get_thrust_policy(), rx_positions.begin(), rx_positions.end(), uint32_t{0}); + if (selected_ranks.index() == 0) { + thrust::stable_sort_by_key(handle.get_thrust_policy(), + std::get<0>(selected_ranks).begin(), + std::get<0>(selected_ranks).end(), + rx_positions.begin()); + } else { + assert(selected_ranks.index() == 1); + thrust::stable_sort_by_key(handle.get_thrust_policy(), + std::get<1>(selected_ranks).begin(), + std::get<1>(selected_ranks).end(), + rx_positions.begin()); + } + // selected_ranks[] == minor_comm_size if no GPU in minor_comm has a non-init value + rx_positions.resize((*rx_value_displs).back() + (*rx_value_sizes).back(), + handle.get_stream()); + thrust::scatter(handle.get_thrust_policy(), + get_dataframe_buffer_begin(*rx_values), + get_dataframe_buffer_end(*rx_values), + rx_positions.begin(), + tmp_vertex_value_output_first); + } else { + rmm::device_uvector rx_positions(num_positions, handle.get_stream()); + thrust::sequence( + handle.get_thrust_policy(), rx_positions.begin(), rx_positions.end(), size_t{0}); + if (selected_ranks.index() == 0) { + thrust::stable_sort_by_key(handle.get_thrust_policy(), + std::get<0>(selected_ranks).begin(), + std::get<0>(selected_ranks).end(), + rx_positions.begin()); + } else { + assert(selected_ranks.index() == 1); + thrust::stable_sort_by_key(handle.get_thrust_policy(), + std::get<1>(selected_ranks).begin(), + std::get<1>(selected_ranks).end(), + rx_positions.begin()); + } + // selected_ranks[] == minor_comm_size if no GPU in minor_comm has a non-init value + rx_positions.resize((*rx_value_displs).back() + (*rx_value_sizes).back(), + handle.get_stream()); + thrust::scatter(handle.get_thrust_policy(), + get_dataframe_buffer_begin(*rx_values), + get_dataframe_buffer_end(*rx_values), + rx_positions.begin(), + tmp_vertex_value_output_first); + } + } + handle.sync_stream(); + } else { + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + device_reduce(minor_comm, + get_dataframe_buffer_begin(edge_partition_major_output_buffers[j]), + tmp_vertex_value_output_first, + size_dataframe_buffer(edge_partition_major_output_buffers[j]), + ReduceOp::compatible_raft_comms_op, + static_cast(partition_idx), + handle.get_stream()); + } + device_group_end(minor_comm); + if (loop_stream_pool_indices) { handle.sync_stream(); } + } + } + } + + // 10. 
communication + + if constexpr (GraphViewType::is_multi_gpu && !update_major) { + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + auto view = minor_tmp_buffer->view(); + if (view.keys()) { // applying the initial value is deferred to here + vertex_t max_vertex_partition_size{0}; + for (int i = 0; i < major_comm_size; ++i) { + auto this_segment_vertex_partition_id = + compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + max_vertex_partition_size = + std::max(max_vertex_partition_size, + graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)); + } + auto tx_buffer = allocate_dataframe_buffer(max_vertex_partition_size, handle.get_stream()); + auto tx_buffer_first = get_dataframe_buffer_begin(tx_buffer); + std::optional> minor_key_offsets{}; + if constexpr (GraphViewType::is_storage_transposed) { + minor_key_offsets = graph_view.local_sorted_unique_edge_src_vertex_partition_offsets(); + } else { + minor_key_offsets = graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets(); + } + for (int i = 0; i < major_comm_size; ++i) { + auto minor_init = (major_comm_rank == i) ? init : ReduceOp::identity_element; + auto this_segment_vertex_partition_id = + compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + thrust::fill(handle.get_thrust_policy(), + tx_buffer_first, + tx_buffer_first + + graph_view.vertex_partition_range_size(this_segment_vertex_partition_id), + minor_init); + auto value_first = thrust::make_transform_iterator( + view.value_first(), + cuda::proclaim_return_type( + [reduce_op, minor_init] __device__(auto val) { return reduce_op(val, minor_init); })); + thrust::scatter(handle.get_thrust_policy(), + value_first + (*minor_key_offsets)[i], + value_first + (*minor_key_offsets)[i + 1], + thrust::make_transform_iterator( + (*(view.keys())).begin() + (*minor_key_offsets)[i], + cuda::proclaim_return_type( + [key_first = graph_view.vertex_partition_range_first( + this_segment_vertex_partition_id)] __device__(auto key) { + return key - key_first; + })), + tx_buffer_first); + device_reduce(major_comm, + tx_buffer_first, + tmp_vertex_value_output_first, + static_cast( + graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)), + ReduceOp::compatible_raft_comms_op, + i, + handle.get_stream()); + } + } else { + auto first_segment_vertex_partition_id = + compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(0); + vertex_t minor_range_first = + graph_view.vertex_partition_range_first(first_segment_vertex_partition_id); + for (int i = 0; i < major_comm_size; ++i) { + auto this_segment_vertex_partition_id = + compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + auto offset = graph_view.vertex_partition_range_first(this_segment_vertex_partition_id) - + minor_range_first; + device_reduce(major_comm, + 
view.value_first() + offset,
+                    tmp_vertex_value_output_first,
+                    static_cast<size_t>(
+                      graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)),
+                    ReduceOp::compatible_raft_comms_op,
+                    i,
+                    handle.get_stream());
+      }
+    }
+  }
+}
+
+}  // namespace detail
+
+}  // namespace cugraph
diff --git a/cpp/src/prims/detail/prim_functors.cuh b/cpp/src/prims/detail/prim_functors.cuh
index f426cd993ea..a166f37906a 100644
--- a/cpp/src/prims/detail/prim_functors.cuh
+++ b/cpp/src/prims/detail/prim_functors.cuh
@@ -21,6 +21,23 @@ namespace cugraph {
 
 namespace detail {
 
+template <typename key_t,
+          typename vertex_t,
+          typename src_value_t,
+          typename dst_value_t,
+          typename e_value_t,
+          bool key_on_src>
+struct const_true_e_op_t {
+  __device__ auto operator()(std::conditional_t<key_on_src, key_t, vertex_t> key_or_src,
+                             std::conditional_t<key_on_src, vertex_t, key_t> key_or_dst,
+                             src_value_t,
+                             dst_value_t,
+                             e_value_t) const
+  {
+    return true;
+  }
+};
+
+template <typename edge_t>
+struct call_const_true_e_op_t {
+  __device__ auto operator()(edge_t i) const { return true; }
+};
+
 template
diff --git a/cpp/src/prims/detail/prim_utils.cuh b/cpp/src/prims/detail/prim_utils.cuh
new file mode 100644
--- /dev/null
+++ b/cpp/src/prims/detail/prim_utils.cuh
@@ -0,0 +1,104 @@
+#include <cstdint>
+#include <type_traits>
+
+namespace cugraph {
+
+namespace detail {
+
+template <typename priority_t, typename vertex_t>
+__host__ __device__ priority_t
+rank_to_priority(int rank,
+                 int root,
+                 int subgroup_size /* faster interconnect within a subgroup */,
+                 int comm_size,
+                 vertex_t offset /* to evenly distribute traffic */)
+{
+  static_assert(sizeof(priority_t) == 1 || sizeof(priority_t) == 2 || sizeof(priority_t) == 4);
+  using cast_t = std::conditional_t<
+    sizeof(priority_t) == 1,
+    int16_t,
+    std::conditional_t<sizeof(priority_t) == 2, int32_t, int64_t>>;  // to prevent overflow
+
+  if (rank == root) {
+    return priority_t{0};
+  } else if (rank / subgroup_size ==
+             root / subgroup_size) {  // intra-subgroup communication is sufficient (priorities in
+                                      // [1, subgroup_size)
+    auto rank_dist =
+      static_cast<int>(((static_cast<cast_t>(rank) + subgroup_size) - root) % subgroup_size);
+    int modulo = subgroup_size - 1;
+    return static_cast<priority_t>(1 + (static_cast<cast_t>(rank_dist - 1) + (offset % modulo)) %
+                                         modulo);
+  } else {  // inter-subgroup communication is necessary (priorities in [subgroup_size, comm_size)
+    auto subgroup_dist =
+      static_cast<int>(((static_cast<cast_t>(rank / subgroup_size) + (comm_size / subgroup_size)) -
+                        (root / subgroup_size)) %
+                       (comm_size / subgroup_size));
+    auto intra_subgroup_rank_dist = static_cast<int>(
+      ((static_cast<cast_t>(rank % subgroup_size) + subgroup_size) - (root % subgroup_size)) %
+      subgroup_size);
+    auto rank_dist = subgroup_dist * subgroup_size + intra_subgroup_rank_dist;
+    int modulo     = comm_size - subgroup_size;
+    return static_cast<priority_t>(
+      subgroup_size +
+      (static_cast<cast_t>(rank_dist - subgroup_size) + (offset % modulo)) % modulo);
+  }
+}
+
+template <typename priority_t, typename vertex_t>
+__host__ __device__ int priority_to_rank(
+  priority_t priority,
+  int root,
+  int subgroup_size /* faster interconnect within a subgroup */,
+  int comm_size,
+  vertex_t offset /* to evenly distribute traffic */)
+{
+  static_assert(sizeof(priority_t) == 1 || sizeof(priority_t) == 2 || sizeof(priority_t) == 4);
+  using cast_t = std::conditional_t<
+    sizeof(priority_t) == 1,
+    int16_t,
+    std::conditional_t<sizeof(priority_t) == 2, int32_t, int64_t>>;  // to prevent overflow
+
+  if (priority == priority_t{0}) {
+    return root;
+  } else if (priority < static_cast<priority_t>(subgroup_size)) {
+    int modulo     = subgroup_size - 1;
+    auto rank_dist = static_cast<int>(
+      1 + ((static_cast<cast_t>(priority - 1) + modulo) - (offset % modulo)) % modulo);
+    return static_cast<int>((root - (root % subgroup_size)) +
+                            ((static_cast<cast_t>(root) + rank_dist) % subgroup_size));
+  } else {
+    int modulo     = comm_size - subgroup_size;
+    auto rank_dist = static_cast<int>(
+      subgroup_size +
+      ((static_cast<cast_t>(priority) - subgroup_size) + (modulo - (offset % modulo))) % modulo);
+    auto subgroup_dist            = rank_dist / subgroup_size;
+    auto intra_subgroup_rank_dist = rank_dist % subgroup_size;
+    return static_cast<int>(
+      ((static_cast<cast_t>((root / subgroup_size) * subgroup_size) +
+        subgroup_dist * subgroup_size) +
+       (static_cast<cast_t>(root) + intra_subgroup_rank_dist) % subgroup_size) %
+      comm_size);
+  }
+}
+
+}  // namespace detail
+
+}  // namespace cugraph
diff --git a/cpp/src/prims/extract_transform_e.cuh b/cpp/src/prims/extract_transform_e.cuh
index d51e03628e1..5741c98d90e 100644
--- a/cpp/src/prims/extract_transform_e.cuh
+++ b/cpp/src/prims/extract_transform_e.cuh
@@ -116,8 +116,8 @@ extract_transform_e(raft::handle_t const& handle,
     thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last()));
 
   auto value_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream());
-  std::tie(std::ignore, value_buffer) = detail::
-    extract_transform_v_frontier_e(
+  std::tie(std::ignore, value_buffer) =
+    detail::extract_transform_v_frontier_e(
       handle,
       graph_view,
       frontier,
diff --git a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh
index 7ad033b93c2..ba227b263bc 100644
--- a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh
+++ b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh
@@ -64,13 +64,13 @@ namespace cugraph {
  * @return Dataframe buffer object storing extracted and accumulated valid @p e_op return values.
  */
 template
 decltype(allocate_dataframe_buffer<
-         typename detail::edge_op_result_type(size_t{0}, handle.get_stream());
   std::tie(std::ignore, value_buffer) =
-    detail::extract_transform_v_frontier_e(handle,
-                                           graph_view,
-                                           frontier,
-                                           edge_src_value_input,
-                                           edge_dst_value_input,
-                                           edge_value_input,
-                                           e_op,
-                                           do_expensive_check);
+    detail::extract_transform_v_frontier_e(handle,
+                                           graph_view,
+                                           frontier,
+                                           edge_src_value_input,
+                                           edge_dst_value_input,
+                                           edge_value_input,
+                                           e_op,
+                                           do_expensive_check);
   return value_buffer;
 }
diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh
index 58dbf7e74a0..a36cf332eb4 100644
--- a/cpp/src/prims/fill_edge_src_dst_property.cuh
+++ b/cpp/src/prims/fill_edge_src_dst_property.cuh
@@ -15,6 +15,8 @@
  */
 #pragma once
 
+#include "prims/vertex_frontier.cuh"
+
 #include
 #include
 #include
@@ -129,8 +131,8 @@ template
 void fill_edge_major_property(raft::handle_t const& handle,
                               GraphViewType const& graph_view,
-                              VertexIterator vertex_first,
-                              VertexIterator vertex_last,
+                              VertexIterator sorted_unique_vertex_first,
+                              VertexIterator sorted_unique_vertex_last,
                               EdgeMajorPropertyOutputWrapper edge_major_property_output,
                               T input)
 {
@@ -153,12 +155,12 @@ void fill_edge_major_property(raft::handle_t const& handle,
     auto const minor_comm_rank = minor_comm.get_rank();
     auto const minor_comm_size = minor_comm.get_size();
 
-    auto rx_counts =
-      host_scalar_allgather(minor_comm,
-                            static_cast<size_t>(thrust::distance(vertex_first, vertex_last)),
-                            handle.get_stream());
-    auto max_rx_size =
-      std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) {
+    auto local_v_list_sizes = host_scalar_allgather(
+      minor_comm,
+      static_cast<size_t>(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)),
+      handle.get_stream());
+    auto max_rx_size = std::reduce(
+      local_v_list_sizes.begin(), local_v_list_sizes.end(), size_t{0}, [](auto lhs, auto rhs) {
        return std::max(lhs, rhs);
       });
     rmm::device_uvector<vertex_t> rx_vertices(max_rx_size, handle.get_stream());
@@ -169,14 +171,18 @@ void fill_edge_major_property(raft::handle_t const& handle,
         edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
           graph_view.local_edge_partition_view(i));
-
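
rank_to_priority() and priority_to_rank() above form a bijection between communicator ranks and per-vertex priorities: the root gets priority 0, ranks in the root's subgroup (the faster interconnect) get [1, subgroup_size), and all remaining ranks get [subgroup_size, comm_size), so taking the minimum priority prefers cheap links. A simplified host-only model (offset fixed to 0, plain int priorities, comm_size assumed to be a multiple of subgroup_size; the real functions also rotate by offset to spread traffic) that verifies the round trip:

    #include <cassert>
    #include <vector>

    // priority 0: root; [1, subgroup_size): root's subgroup; [subgroup_size, comm_size): the rest
    int rank_to_priority(int rank, int root, int subgroup_size, int comm_size) {
      if (rank == root) { return 0; }
      if (rank / subgroup_size == root / subgroup_size) {
        return ((rank + subgroup_size) - root) % subgroup_size;  // in [1, subgroup_size)
      }
      int num_subgroups = comm_size / subgroup_size;
      int subgroup_dist =
        (((rank / subgroup_size) + num_subgroups) - (root / subgroup_size)) % num_subgroups;
      int intra_dist =
        (((rank % subgroup_size) + subgroup_size) - (root % subgroup_size)) % subgroup_size;
      return subgroup_dist * subgroup_size + intra_dist;  // in [subgroup_size, comm_size)
    }

    int priority_to_rank(int priority, int root, int subgroup_size, int comm_size) {
      if (priority == 0) { return root; }
      if (priority < subgroup_size) {
        return (root - (root % subgroup_size)) + ((root + priority) % subgroup_size);
      }
      int subgroup_dist = priority / subgroup_size;
      int intra_dist    = priority % subgroup_size;
      return (((root / subgroup_size) + subgroup_dist) * subgroup_size +
              ((root % subgroup_size) + intra_dist) % subgroup_size) %
             comm_size;
    }

    int main() {
      int const subgroup_size = 4;  // e.g. GPUs sharing NVLink
      int const comm_size     = 16;
      for (int root = 0; root < comm_size; ++root) {
        std::vector<bool> seen(comm_size, false);
        for (int rank = 0; rank < comm_size; ++rank) {
          int p = rank_to_priority(rank, root, subgroup_size, comm_size);
          assert((0 <= p) && (p < comm_size) && !seen[p]);  // priorities form a permutation
          seen[p] = true;
          assert(priority_to_rank(p, root, subgroup_size, comm_size) == rank);
        }
      }
      return 0;
    }
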
device_bcast( - minor_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(minor_comm, + sorted_unique_vertex_first, + rx_vertices.begin(), + local_v_list_sizes[i], + i, + handle.get_stream()); if (edge_partition_keys) { thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_counts[i]), + thrust::make_counting_iterator(local_v_list_sizes[i]), [rx_vertex_first = rx_vertices.begin(), input, edge_partition_key_first = ((*edge_partition_keys)[i]).begin(), @@ -199,7 +205,7 @@ void fill_edge_major_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(rx_counts[i])), + thrust::make_counting_iterator(static_cast(local_v_list_sizes[i])), [edge_partition, rx_vertex_first = rx_vertices.begin(), input, @@ -219,7 +225,7 @@ void fill_edge_major_property(raft::handle_t const& handle, // directly scatters from the internal buffer) thrust::scatter(handle.get_thrust_policy(), val_first, - val_first + rx_counts[i], + val_first + local_v_list_sizes[i], map_first, edge_partition_value_firsts[i]); } @@ -232,17 +238,18 @@ void fill_edge_major_property(raft::handle_t const& handle, assert(edge_partition_value_firsts.size() == size_t{1}); if constexpr (contains_packed_bool_element) { thrust::for_each(handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [input, output_value_first = edge_partition_value_firsts[0]] __device__( auto v) { packed_bool_atomic_set(output_value_first, v, input); }); } else { auto val_first = thrust::make_constant_iterator(input); - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + thrust::distance(vertex_first, vertex_last), - vertex_first, - edge_partition_value_firsts[0]); + thrust::scatter( + handle.get_thrust_policy(), + val_first, + val_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + sorted_unique_vertex_first, + edge_partition_value_firsts[0]); } } } @@ -286,8 +293,8 @@ template void fill_edge_minor_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, EdgeMinorPropertyOutputWrapper edge_minor_property_output, T input) { @@ -300,22 +307,269 @@ void fill_edge_minor_property(raft::handle_t const& handle, using edge_t = typename GraphViewType::edge_type; auto edge_partition_value_first = edge_minor_property_output.value_first(); + vertex_t minor_range_first{}; + if constexpr (GraphViewType::is_storage_transposed) { + minor_range_first = graph_view.local_edge_partition_src_range_first(); + } else { + minor_range_first = graph_view.local_edge_partition_dst_range_first(); + } + if constexpr (GraphViewType::is_multi_gpu) { auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); auto const major_comm_rank = major_comm.get_rank(); auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); - auto rx_counts = - host_scalar_allgather(major_comm, - 
static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); - auto max_rx_size = - std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { - return std::max(lhs, rhs); - }); - rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); + constexpr size_t packed_bool_word_bcast_alignment = + 128 / + sizeof( + uint32_t); // 128B cache line alignment (unaligned ncclBroadcast operations are slower) + + std::vector max_tmp_buffer_sizes{}; + std::vector local_v_list_sizes{}; + std::vector local_v_list_range_firsts{}; + std::vector local_v_list_range_lasts{}; + { + auto v_list_size = static_cast( + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)); + rmm::device_uvector d_aggregate_tmps(major_comm_size * size_t{4}, + handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + d_aggregate_tmps.begin() + major_comm_rank * size_t{4}, + d_aggregate_tmps.begin() + (major_comm_rank + 1) * size_t{4}, + [max_tmp_buffer_size = static_cast( + static_cast(handle.get_device_properties().totalGlobalMem) * 0.05), + sorted_unique_vertex_first, + v_list_size, + vertex_partition_range_first = + graph_view.local_vertex_partition_range_first()] __device__(size_t i) { + if (i == 0) { + return max_tmp_buffer_size; + } else if (i == 1) { + return static_cast(v_list_size); + } else if (i == 2) { + vertex_t first{}; + if (v_list_size > 0) { + first = *sorted_unique_vertex_first; + } else { + first = vertex_partition_range_first; + } + assert(static_cast(static_cast(first)) == first); + return static_cast(first); + } else { + vertex_t last{}; + if (v_list_size > 0) { + last = *(sorted_unique_vertex_first + (v_list_size - 1)) + 1; + } else { + last = vertex_partition_range_first; + } + assert(static_cast(static_cast(last)) == last); + return static_cast(last); + } + }); + + if (major_comm_size > 1) { // allgather max_tmp_buffer_size, v_list_size, v_list_range_first + // (inclusive), v_list_range_last (exclusive) + device_allgather(major_comm, + d_aggregate_tmps.data() + major_comm_rank * size_t{4}, + d_aggregate_tmps.data(), + size_t{4}, + handle.get_stream()); + } + + std::vector h_aggregate_tmps(d_aggregate_tmps.size()); + raft::update_host(h_aggregate_tmps.data(), + d_aggregate_tmps.data(), + d_aggregate_tmps.size(), + handle.get_stream()); + handle.sync_stream(); + max_tmp_buffer_sizes = std::vector(major_comm_size); + local_v_list_sizes = std::vector(major_comm_size); + local_v_list_range_firsts = std::vector(major_comm_size); + local_v_list_range_lasts = std::vector(major_comm_size); + for (int i = 0; i < major_comm_size; ++i) { + max_tmp_buffer_sizes[i] = h_aggregate_tmps[i * size_t{4}]; + local_v_list_sizes[i] = static_cast(h_aggregate_tmps[i * size_t{4} + 1]); + local_v_list_range_firsts[i] = static_cast(h_aggregate_tmps[i * size_t{4} + 2]); + local_v_list_range_lasts[i] = static_cast(h_aggregate_tmps[i * size_t{4} + 3]); + } + } + + auto edge_partition_keys = edge_minor_property_output.keys(); + + std::optional> v_list_bitmap{std::nullopt}; + std::optional> compressed_v_list{std::nullopt}; + if (major_comm_size > 1) { + bool v_compressible{false}; + if constexpr (sizeof(vertex_t) > sizeof(uint32_t)) { + vertex_t local_v_list_max_range_size{0}; + for (int i = 0; i < major_comm_size; ++i) { + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + local_v_list_max_range_size = std::max(range_size, local_v_list_max_range_size); + } + if (local_v_list_max_range_size <= + std::numeric_limits::max()) { 
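
The branch entered here enables the compressed broadcast described in the PR summary: when each rank's vertex-list range spans fewer than 2^32 vertices (the comparison above is against the uint32_t maximum), 8-byte vertex IDs travel as 4-byte offsets from the sender's range_first and are re-expanded on arrival, halving broadcast volume. A host-side sketch of the round trip, with illustrative compress/expand helpers and std::vector standing in for device buffers:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    using vertex_t = int64_t;

    // 8B vertex IDs -> 4B offsets relative to the sender's range start (valid only when
    // range_last - range_first fits in uint32_t, the condition tested above)
    std::vector<uint32_t> compress(std::vector<vertex_t> const& vertices, vertex_t range_first) {
      std::vector<uint32_t> offsets(vertices.size());
      for (std::size_t i = 0; i < vertices.size(); ++i) {
        offsets[i] = static_cast<uint32_t>(vertices[i] - range_first);
      }
      return offsets;  // half the bytes on the wire
    }

    std::vector<vertex_t> expand(std::vector<uint32_t> const& offsets, vertex_t range_first) {
      std::vector<vertex_t> vertices(offsets.size());
      for (std::size_t i = 0; i < offsets.size(); ++i) {
        vertices[i] = range_first + static_cast<vertex_t>(offsets[i]);
      }
      return vertices;
    }

    int main() {
      std::vector<vertex_t> vs{1'000'000'000'000, 1'000'000'000'007, 1'000'000'002'000};
      assert(expand(compress(vs, vs.front()), vs.front()) == vs);
      return 0;
    }
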
// broadcast 32bit offset values instead of 64 + // bit vertex IDs + v_compressible = true; + } + } + + double avg_fill_ratio{0.0}; + for (int i = 0; i < major_comm_size; ++i) { + auto num_keys = static_cast(local_v_list_sizes[i]); + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + avg_fill_ratio += (range_size > 0) + ? (static_cast(num_keys) / static_cast(range_size)) + : double{0.0}; + } + avg_fill_ratio /= static_cast(major_comm_size); + double threshold_ratio = + 1.0 / static_cast((v_compressible ? sizeof(uint32_t) : sizeof(vertex_t)) * 8); + auto avg_v_list_size = std::reduce(local_v_list_sizes.begin(), local_v_list_sizes.end()) / + static_cast(major_comm_size); + + if ((avg_fill_ratio > threshold_ratio) && + (static_cast(avg_v_list_size) > packed_bool_word_bcast_alignment)) { + if (is_packed_bool() && + !edge_partition_keys) { // directly update edge_minor_property_output (with special + // care for unaligned boundaries) + rmm::device_uvector boundary_words( + packed_bool_word_bcast_alignment, + handle.get_stream()); // for unaligned boundaries + auto leading_boundary_words = + (packed_bool_word_bcast_alignment - + packed_bool_offset(local_v_list_range_firsts[major_comm_rank] - minor_range_first) % + packed_bool_word_bcast_alignment) % + packed_bool_word_bcast_alignment; + if ((leading_boundary_words == 0) && + (packed_bool_offset(local_v_list_range_firsts[major_comm_rank] - minor_range_first) == + packed_bool_offset(graph_view.local_vertex_partition_range_first() - + minor_range_first)) && + (((local_v_list_range_firsts[major_comm_rank] - minor_range_first) % + packed_bools_per_word()) != + 0)) { // there are unaligned bits (fewer than packed_bools_per_word()) in the vertex + // partition boundary + leading_boundary_words = packed_bool_word_bcast_alignment; + } + thrust::fill(handle.get_thrust_policy(), + boundary_words.begin(), + boundary_words.begin() + leading_boundary_words, + packed_bool_empty_mask()); + if (local_v_list_range_firsts[major_comm_rank] < + local_v_list_range_lasts[major_comm_rank]) { + auto word_offset_first = + packed_bool_offset(local_v_list_range_firsts[major_comm_rank] - minor_range_first); + auto word_offset_last = + packed_bool_offset((local_v_list_range_lasts[major_comm_rank] - 1) - + minor_range_first) + + 1; + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(word_offset_first), + thrust::make_counting_iterator(word_offset_last), + [sorted_unique_vertex_first, + sorted_unique_vertex_last, + input, + minor_range_first, + leading_boundary_words, + word_offset_first, + vertex_partition_range_last = graph_view.local_vertex_partition_range_last(), + output_value_first = edge_partition_value_first, + boundary_words = raft::device_span( + boundary_words.data(), boundary_words.size())] __device__(auto i) { + auto& word = ((i - word_offset_first) < leading_boundary_words) + ? boundary_words[i - word_offset_first] + : *(output_value_first + i); + auto word_v_first = + minor_range_first + static_cast(i * packed_bools_per_word()); + auto word_v_last = + ((vertex_partition_range_last - word_v_first) <= packed_bools_per_word()) + ? 
vertex_partition_range_last
+                  : (word_v_first + static_cast<vertex_t>(packed_bools_per_word()));
+              auto it = thrust::lower_bound(
+                thrust::seq, sorted_unique_vertex_first, sorted_unique_vertex_last, word_v_first);
+              while ((it != sorted_unique_vertex_last) && (*it < word_v_last)) {
+                auto v_offset = *it - minor_range_first;
+                if (input) {
+                  word |= packed_bool_mask(v_offset);
+                } else {
+                  word &= ~packed_bool_mask(v_offset);
+                }
+                ++it;
+              }
+            });
+        }
+        rmm::device_uvector<uint32_t> aggregate_boundary_words(
+          major_comm_size * packed_bool_word_bcast_alignment, handle.get_stream());
+        device_allgather(major_comm,
+                         boundary_words.data(),
+                         aggregate_boundary_words.data(),
+                         packed_bool_word_bcast_alignment,
+                         handle.get_stream());
+        v_list_bitmap = std::move(aggregate_boundary_words);
+      } else {
+        v_list_bitmap =
+          compute_vertex_list_bitmap_info(sorted_unique_vertex_first,
+                                          sorted_unique_vertex_last,
+                                          local_v_list_range_firsts[major_comm_rank],
+                                          local_v_list_range_lasts[major_comm_rank],
+                                          handle.get_stream());
+      }
+    } else if (v_compressible) {
+      rmm::device_uvector<uint32_t> tmps(local_v_list_sizes[major_comm_rank],
+                                         handle.get_stream());
+      thrust::transform(handle.get_thrust_policy(),
+                        sorted_unique_vertex_first,
+                        sorted_unique_vertex_last,
+                        tmps.begin(),
+                        cuda::proclaim_return_type<uint32_t>(
+                          [range_first = local_v_list_range_firsts[major_comm_rank]] __device__(
+                            auto v) { return static_cast<uint32_t>(v - range_first); }));
+      compressed_v_list = std::move(tmps);
+    }
+  }
+
+  std::optional<std::vector<size_t>> stream_pool_indices{std::nullopt};
+  size_t num_concurrent_bcasts{1};
+  {
+    size_t tmp_buffer_size_per_loop{};
+    for (int i = 0; i < major_comm_size; ++i) {
+      if (is_packed_bool() &&
+          !edge_partition_keys && v_list_bitmap) {
+        tmp_buffer_size_per_loop += 0;
+      } else if (v_list_bitmap) {
+        tmp_buffer_size_per_loop +=
+          packed_bool_size(local_v_list_range_lasts[i] - local_v_list_range_firsts[i]) *
+            sizeof(uint32_t) +
+          static_cast<size_t>(local_v_list_sizes[i]) * sizeof(vertex_t);
+      } else {
+        tmp_buffer_size_per_loop += static_cast<size_t>(local_v_list_sizes[i]) * sizeof(vertex_t);
+      }
+    }
+    tmp_buffer_size_per_loop /= major_comm_size;
+    size_t max_streams =
+      static_cast<size_t>(major_comm_size);  // to allow setting num_concurrent_bcasts above
+                                             // handle.get_stream_pool_size()
+    stream_pool_indices = init_stream_pool_indices(
+      std::reduce(max_tmp_buffer_sizes.begin(), max_tmp_buffer_sizes.end()) /
+        static_cast<size_t>(major_comm_size),
+      tmp_buffer_size_per_loop,
+      major_comm_size,
+      1,
+      max_streams);
+    num_concurrent_bcasts = (*stream_pool_indices).size();
+    if ((*stream_pool_indices).size() > handle.get_stream_pool_size()) {
+      (*stream_pool_indices).resize(handle.get_stream_pool_size());
+    }
+    if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; }
+  }
 
   std::optional<raft::host_span<vertex_t const>> key_offsets{};
   if constexpr (GraphViewType::is_storage_transposed) {
@@ -324,88 +578,417 @@ void fill_edge_minor_property(raft::handle_t const& handle,
     key_offsets = graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets();
   }
 
-  auto edge_partition =
-    edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
-      graph_view.local_edge_partition_view(size_t{0}));
-  auto edge_partition_keys = edge_minor_property_output.keys();
-  for (int i = 0; i < major_comm_size; ++i) {
-    // FIXME: these broadcast operations can be placed between ncclGroupStart() and
-    // ncclGroupEnd()
-    device_bcast(
-      major_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream());
+  for (size_t i = 0; i < static_cast<size_t>(major_comm_size); i += num_concurrent_bcasts) {
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
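
init_stream_pool_indices() above derives the number of concurrent broadcasts from a temporary-memory budget before clamping it to the handle's stream pool. A host-only sketch of that sizing rule (num_concurrent_loops is a hypothetical helper, not cugraph's actual implementation):

    #include <algorithm>
    #include <cassert>
    #include <cstddef>

    inline std::size_t num_concurrent_loops(std::size_t max_tmp_buffer_size,       // memory budget
                                            std::size_t tmp_buffer_size_per_loop,  // avg footprint
                                            std::size_t num_iterations,
                                            std::size_t stream_pool_size) {
      std::size_t n = (tmp_buffer_size_per_loop > 0)
                        ? std::max(max_tmp_buffer_size / tmp_buffer_size_per_loop, std::size_t{1})
                        : num_iterations;
      return std::min({n, num_iterations, stream_pool_size});  // <= 1: just use the default stream
    }

    int main() {
      // 1 GiB budget / 200 MiB per loop -> 5 concurrent loops (8 iterations, 16 pool streams)
      assert(num_concurrent_loops(std::size_t{1} << 30, std::size_t{200} << 20, 8, 16) == 5);
      return 0;
    }
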
+ auto sub0 = std::chrono::steady_clock::now(); + auto loop_count = std::min(num_concurrent_bcasts, static_cast(major_comm_size) - i); + + if (is_packed_bool() && + !edge_partition_keys && v_list_bitmap) { + std::vector leading_boundary_word_counts(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto leading_boundary_words = + (packed_bool_word_bcast_alignment - + packed_bool_offset(local_v_list_range_firsts[partition_idx] - minor_range_first) % + packed_bool_word_bcast_alignment) % + packed_bool_word_bcast_alignment; + auto vertex_partition_id = + partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks( + major_comm_size, minor_comm_size, partition_idx, minor_comm_rank); + if ((leading_boundary_words == 0) && + (packed_bool_offset(local_v_list_range_firsts[partition_idx] - minor_range_first) == + packed_bool_offset(graph_view.vertex_partition_range_first(vertex_partition_id) - + minor_range_first)) && + (((local_v_list_range_firsts[partition_idx] - minor_range_first) % + packed_bools_per_word()) != 0)) { + leading_boundary_words = packed_bool_word_bcast_alignment; + } + leading_boundary_word_counts[j] = leading_boundary_words; + } + device_group_start(major_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + size_t bcast_size{0}; + vertex_t packed_bool_offset_first{0}; + if (local_v_list_range_firsts[partition_idx] < local_v_list_range_lasts[partition_idx]) { + auto leading_boundary_words = leading_boundary_word_counts[j]; + packed_bool_offset_first = + packed_bool_offset(local_v_list_range_firsts[partition_idx] - minor_range_first) + + static_cast(leading_boundary_words); + auto packed_bool_offset_last = + packed_bool_offset(local_v_list_range_lasts[partition_idx] - 1 - minor_range_first); + if (packed_bool_offset_first <= packed_bool_offset_last) { + bcast_size = (packed_bool_offset_last - packed_bool_offset_first) + 1; + } + } + + device_bcast(major_comm, + edge_partition_value_first + packed_bool_offset_first, + edge_partition_value_first + packed_bool_offset_first, + bcast_size, + static_cast(partition_idx), + handle.get_stream()); + } + device_group_end(major_comm); + + rmm::device_uvector d_leading_boundary_word_counts( + leading_boundary_word_counts.size(), handle.get_stream()); + raft::update_device(d_leading_boundary_word_counts.data(), + leading_boundary_word_counts.data(), + leading_boundary_word_counts.size(), + handle.get_stream()); + + rmm::device_uvector d_local_v_list_range_firsts(loop_count, handle.get_stream()); + raft::update_device(d_local_v_list_range_firsts.data(), + local_v_list_range_firsts.data() + i, + loop_count, + handle.get_stream()); - if (edge_partition_keys) { thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_counts[i]), - [rx_vertex_first = rx_vertices.begin(), - input, - subrange_key_first = (*edge_partition_keys).begin() + (*key_offsets)[i], - subrange_key_last = (*edge_partition_keys).begin() + (*key_offsets)[i + 1], - edge_partition_value_first = edge_partition_value_first, - subrange_start_offset = (*key_offsets)[i]] __device__(auto i) { - auto minor = *(rx_vertex_first + i); - auto it = - thrust::lower_bound(thrust::seq, subrange_key_first, subrange_key_last, minor); - if ((it != subrange_key_last) && (*it == minor)) { - auto subrange_offset = thrust::distance(subrange_key_first, it); - if constexpr (contains_packed_bool_element) { - fill_scalar_or_thrust_tuple( - 
edge_partition_value_first, subrange_start_offset + subrange_offset, input); - } else { - *(edge_partition_value_first + subrange_start_offset + subrange_offset) = input; + thrust::make_counting_iterator(loop_count * packed_bool_word_bcast_alignment), + [input, + minor_range_first, + leading_boundary_word_counts = raft::device_span( + d_leading_boundary_word_counts.data(), d_leading_boundary_word_counts.size()), + local_v_list_range_firsts = raft::device_span( + d_local_v_list_range_firsts.data(), d_local_v_list_range_firsts.size()), + aggregate_boundary_words = raft::device_span( + (*v_list_bitmap).data() + i * packed_bool_word_bcast_alignment, + loop_count * packed_bool_word_bcast_alignment), + output_value_first = edge_partition_value_first] __device__(size_t i) { + auto j = i / packed_bool_word_bcast_alignment; + auto leading_boundary_words = leading_boundary_word_counts[j]; + if ((i % packed_bool_word_bcast_alignment) < leading_boundary_words) { + auto boundary_word = aggregate_boundary_words[i]; + if (boundary_word != packed_bool_empty_mask()) { + auto word_offset = + packed_bool_offset(local_v_list_range_firsts[j] - minor_range_first) + + (i % packed_bool_word_bcast_alignment); + cuda::atomic_ref word( + *(output_value_first + word_offset)); + if (input) { + word.fetch_or(aggregate_boundary_words[i], cuda::std::memory_order_relaxed); + } else { + word.fetch_and(~aggregate_boundary_words[i], cuda::std::memory_order_relaxed); + } } } }); } else { - if constexpr (contains_packed_bool_element) { - thrust::for_each(handle.get_thrust_policy(), - thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(rx_counts[i])), - [edge_partition, - rx_vertex_first = rx_vertices.begin(), - input, - output_value_first = edge_partition_value_first] __device__(auto i) { - auto rx_vertex = *(rx_vertex_first + i); - auto minor_offset = - edge_partition.minor_offset_from_minor_nocheck(rx_vertex); - fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); - }); - } else { - auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), - cuda::proclaim_return_type([edge_partition] __device__(auto v) { - return edge_partition.minor_offset_from_minor_nocheck(v); - })); - auto val_first = thrust::make_constant_iterator(input); - // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and - // directly scatters from the internal buffer) - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + rx_counts[i], - map_first, - edge_partition_value_first); + std::vector, rmm::device_uvector>> + edge_partition_v_buffers{}; + edge_partition_v_buffers.reserve(loop_count); + rmm::device_uvector dummy_counters(loop_count, handle.get_stream()); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + std::variant, rmm::device_uvector> v_buffer = + rmm::device_uvector(0, handle.get_stream()); + if (v_list_bitmap) { + v_buffer = rmm::device_uvector( + packed_bool_size(local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]), + handle.get_stream()); + } else if (compressed_v_list) { + v_buffer = + rmm::device_uvector(local_v_list_sizes[partition_idx], handle.get_stream()); + } else { + std::get<0>(v_buffer).resize(local_v_list_sizes[partition_idx], handle.get_stream()); + } + edge_partition_v_buffers.push_back(std::move(v_buffer)); + } + + device_group_start(major_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + auto& v_buffer = 
edge_partition_v_buffers[j];
+        if (v_list_bitmap) {
+          device_bcast(major_comm,
+                       (*v_list_bitmap).data(),
+                       std::get<1>(v_buffer).data(),
+                       std::get<1>(v_buffer).size(),
+                       static_cast<int>(partition_idx),
+                       handle.get_stream());
+        } else if (compressed_v_list) {
+          device_bcast(major_comm,
+                       (*compressed_v_list).data(),
+                       std::get<1>(v_buffer).data(),
+                       std::get<1>(v_buffer).size(),
+                       static_cast<int>(partition_idx),
+                       handle.get_stream());
+        } else {
+          device_bcast(major_comm,
+                       (static_cast<int>(partition_idx) == major_comm_rank)
+                         ? sorted_unique_vertex_first
+                         : static_cast(nullptr),
+                       std::get<0>(v_buffer).data(),
+                       std::get<0>(v_buffer).size(),
+                       static_cast<int>(partition_idx),
+                       handle.get_stream());
+        }
+      }
+      device_group_end(major_comm);
+
+      bool kernel_fusion =
+        !edge_partition_keys && !v_list_bitmap && (loop_count > 1) &&
+        (static_cast<size_t>(std::reduce(local_v_list_sizes.begin() + i,
+                                         local_v_list_sizes.begin() + (i + loop_count))) <
+         size_t{256 * 1024} /* tuning parameter (binary search vs kernel launch overhead) */ *
+           loop_count);  // FIXME: kernel fusion can be useful even when
+                         // edge_partition_keys.has_value() is true
+
+      if (!kernel_fusion) {
+        if (stream_pool_indices) { handle.sync_stream(); }
+      }
+
+      if (!kernel_fusion) {
+        size_t stream_pool_size{0};
+        if (stream_pool_indices) { stream_pool_size = (*stream_pool_indices).size(); }
+        for (size_t j = 0; j < loop_count; ++j) {
+          auto partition_idx = i + j;
+          auto loop_stream =
+            stream_pool_indices
+              ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j % stream_pool_size])
+              : handle.get_stream();
+
+          if (v_list_bitmap) {
+            auto const& rx_bitmap = std::get<1>(edge_partition_v_buffers[j]);
+            rmm::device_uvector<vertex_t> rx_vertices(local_v_list_sizes[partition_idx],
+                                                      loop_stream);
+            retrieve_vertex_list_from_bitmap(
+              raft::device_span<uint32_t const>(rx_bitmap.data(), rx_bitmap.size()),
+              rx_vertices.begin(),
+              raft::device_span<size_t>(dummy_counters.data() + j, size_t{1}),
+              local_v_list_range_firsts[partition_idx],
+              local_v_list_range_lasts[partition_idx],
+              loop_stream);
+            edge_partition_v_buffers[j] = std::move(rx_vertices);
+          }
+
+          if (edge_partition_keys) {
+            thrust::for_each(
+              rmm::exec_policy_nosync(loop_stream),
+              thrust::make_counting_iterator(vertex_t{0}),
+              thrust::make_counting_iterator(local_v_list_sizes[partition_idx]),
+              [rx_vertex_first = compressed_v_list
+                 ? static_cast<vertex_t*>(nullptr)
+                 : std::get<0>(edge_partition_v_buffers[j]).data(),
+               rx_compressed_vertex_first = compressed_v_list
+                 ?
std::get<1>(edge_partition_v_buffers[j]).data() + : static_cast(nullptr), + range_first = local_v_list_range_firsts[partition_idx], + input, + subrange_key_first = + (*edge_partition_keys).begin() + (*key_offsets)[partition_idx], + subrange_key_last = + (*edge_partition_keys).begin() + (*key_offsets)[partition_idx + 1], + edge_partition_value_first = edge_partition_value_first, + subrange_start_offset = (*key_offsets)[partition_idx]] __device__(auto i) { + vertex_t minor{}; + if (rx_vertex_first != nullptr) { + minor = *(rx_vertex_first + i); + } else { + minor = range_first + *(rx_compressed_vertex_first + i); + } + auto it = + thrust::lower_bound(thrust::seq, subrange_key_first, subrange_key_last, minor); + if ((it != subrange_key_last) && (*it == minor)) { + auto subrange_offset = thrust::distance(subrange_key_first, it); + if constexpr (contains_packed_bool_element) { + fill_scalar_or_thrust_tuple( + edge_partition_value_first, subrange_start_offset + subrange_offset, input); + } else { + *(edge_partition_value_first + subrange_start_offset + subrange_offset) = + input; + } + } + }); + } else { + if constexpr (contains_packed_bool_element) { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(local_v_list_sizes[partition_idx]), + [minor_range_first, + rx_vertex_first = compressed_v_list + ? static_cast(nullptr) + : std::get<0>(edge_partition_v_buffers[j]).data(), + rx_compressed_vertex_first = compressed_v_list + ? std::get<1>(edge_partition_v_buffers[j]).data() + : static_cast(nullptr), + range_first = local_v_list_range_firsts[partition_idx], + input, + output_value_first = edge_partition_value_first] __device__(auto i) { + vertex_t minor{}; + if (rx_vertex_first != nullptr) { + minor = *(rx_vertex_first + i); + } else { + minor = range_first + *(rx_compressed_vertex_first + i); + } + auto minor_offset = minor - minor_range_first; + fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); + }); + } else { + if (compressed_v_list) { + auto map_first = thrust::make_transform_iterator( + std::get<1>(edge_partition_v_buffers[j]).begin(), + cuda::proclaim_return_type( + [minor_range_first, + range_first = + local_v_list_range_firsts[partition_idx]] __device__(auto v_offset) { + return static_cast(v_offset + (range_first - minor_range_first)); + })); + auto val_first = thrust::make_constant_iterator(input); + thrust::scatter(rmm::exec_policy_nosync(loop_stream), + val_first, + val_first + local_v_list_sizes[partition_idx], + map_first, + edge_partition_value_first); + } else { + auto map_first = thrust::make_transform_iterator( + std::get<0>(edge_partition_v_buffers[j]).begin(), + cuda::proclaim_return_type( + [minor_range_first] __device__(auto v) { return v - minor_range_first; })); + auto val_first = thrust::make_constant_iterator(input); + thrust::scatter(rmm::exec_policy_nosync(loop_stream), + val_first, + val_first + local_v_list_sizes[partition_idx], + map_first, + edge_partition_value_first); + } + } + } + } + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + } else { // kernel fusion + std::vector h_vertex_vars(loop_count /* range_first values */ + + (loop_count + 1) /* loop offsets */); + std::copy(local_v_list_range_firsts.begin() + i, + local_v_list_range_firsts.begin() + (i + loop_count), + h_vertex_vars.begin()); + h_vertex_vars[loop_count] = 0; + std::inclusive_scan(local_v_list_sizes.begin() + i, + local_v_list_sizes.begin() + (i + loop_count), + 
h_vertex_vars.begin() + (loop_count + 1)); + std::vector h_ptrs(loop_count); + if (compressed_v_list) { + for (size_t j = 0; j < loop_count; ++j) { + h_ptrs[j] = static_cast(std::get<1>(edge_partition_v_buffers[j]).data()); + } + } else { + for (size_t j = 0; j < loop_count; ++j) { + h_ptrs[j] = static_cast(std::get<0>(edge_partition_v_buffers[j]).data()); + } + } + rmm::device_uvector d_vertex_vars(h_vertex_vars.size(), handle.get_stream()); + rmm::device_uvector d_ptrs(h_ptrs.size(), handle.get_stream()); + raft::update_device( + d_vertex_vars.data(), h_vertex_vars.data(), h_vertex_vars.size(), handle.get_stream()); + raft::update_device(d_ptrs.data(), h_ptrs.data(), h_ptrs.size(), handle.get_stream()); + + raft::device_span range_firsts(d_vertex_vars.data(), loop_count); + raft::device_span loop_offsets(d_vertex_vars.data() + loop_count, + loop_count + 1); + if constexpr (contains_packed_bool_element) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(h_vertex_vars.back()), + [range_firsts, + loop_offsets, + minor_range_first, + input, + rx_firsts = raft::device_span(d_ptrs.data(), d_ptrs.size()), + output_value_first = edge_partition_value_first, + compressed = compressed_v_list.has_value()] __device__(auto i) { + auto loop_idx = + thrust::distance(loop_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, loop_offsets.begin() + 1, loop_offsets.end(), i)); + auto rx_first = rx_firsts[loop_idx]; + vertex_t minor{}; + if (compressed) { + minor = range_firsts[loop_idx] + + *(static_cast(rx_first) + (i - loop_offsets[loop_idx])); + } else { + minor = *(static_cast(rx_first) + (i - loop_offsets[loop_idx])); + } + auto minor_offset = minor - minor_range_first; + fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); + }); + } else { + auto val_first = thrust::make_constant_iterator(input); + if (compressed_v_list) { + auto map_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type( + [range_firsts, + loop_offsets, + rx_firsts = raft::device_span(d_ptrs.data(), d_ptrs.size()), + minor_range_first] __device__(auto i) { + auto loop_idx = thrust::distance( + loop_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, loop_offsets.begin() + 1, loop_offsets.end(), i)); + auto minor = + range_firsts[loop_idx] + *(static_cast(rx_firsts[loop_idx]) + + (i - loop_offsets[loop_idx])); + return minor - minor_range_first; + })); + thrust::scatter(handle.get_thrust_policy(), + val_first, + val_first + h_vertex_vars.back(), + map_first, + edge_partition_value_first); + } else { + auto map_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type( + [loop_offsets, + rx_firsts = raft::device_span(d_ptrs.data(), d_ptrs.size()), + minor_range_first] __device__(auto i) { + auto loop_idx = thrust::distance( + loop_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, loop_offsets.begin() + 1, loop_offsets.end(), i)); + auto minor = *(static_cast(rx_firsts[loop_idx]) + + (i - loop_offsets[loop_idx])); + return minor - minor_range_first; + })); + thrust::scatter(handle.get_thrust_policy(), + val_first, + val_first + h_vertex_vars.back(), + map_first, + edge_partition_value_first); + } + } } } } } else { assert(graph_view.local_vertex_partition_range_size() == - graph_view.local_edge_partition_src_range_size()); + (GraphViewType::is_storage_transposed + ? 
graph_view.local_edge_partition_src_range_size()
+              : graph_view.local_edge_partition_dst_range_size()));
     if constexpr (contains_packed_bool_element) {
       thrust::for_each(handle.get_thrust_policy(),
-                       vertex_first,
-                       vertex_last,
+                       sorted_unique_vertex_first,
+                       sorted_unique_vertex_last,
                        [input, output_value_first = edge_partition_value_first] __device__(
                          auto v) { fill_scalar_or_thrust_tuple(output_value_first, v, input); });
     } else {
       auto val_first = thrust::make_constant_iterator(input);
-      thrust::scatter(handle.get_thrust_policy(),
-                      val_first,
-                      val_first + thrust::distance(vertex_first, vertex_last),
-                      vertex_first,
-                      edge_partition_value_first);
+      thrust::scatter(
+        handle.get_thrust_policy(),
+        val_first,
+        val_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last),
+        sorted_unique_vertex_first,
+        edge_partition_value_first);
     }
   }
 }
@@ -451,8 +1034,8 @@ void fill_edge_src_property(raft::handle_t const& handle,
 
 /**
  * @brief Fill graph edge source property values to the input value.
  *
- * This version fills only a subset of graph edge source property values. [@p vertex_first,
- * @p vertex_last) specifies the vertices to be filled.
+ * This version fills only a subset of graph edge source property values. [@p
+ * sorted_unique_vertex_first, @p sorted_unique_vertex_last) specifies the vertices to be filled.
  *
  * @tparam GraphViewType Type of the passed non-owning graph object.
  * @tparam VertexIterator Type of the iterator for vertex identifiers.
@@ -461,10 +1044,12 @@ void fill_edge_src_property(raft::handle_t const& handle,
  * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
  * handles to various CUDA libraries) to run graph algorithms.
  * @param graph_view Non-owning graph object.
- * @param vertex_first Iterator pointing to the first (inclusive) vertex with a value to be filled.
- * v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex partition
- * assigned to this process in multi-GPU), otherwise undefined behavior.
- * @param vertex_last Iterator pointing to the last (exclusive) vertex with a value to be filled.
+ * @param sorted_unique_vertex_first Iterator pointing to the first (inclusive) vertex with a value
+ * to be filled. v in [sorted_unique_vertex_first, sorted_unique_vertex_last) should be sorted &
+ * distinct (and should belong to the vertex partition assigned to this process in multi-GPU),
+ * otherwise undefined behavior.
+ * @param sorted_unique_vertex_last Iterator pointing to the last (exclusive) vertex with a value to
+ * be filled.
  * @param edge_src_property_output edge_src_property_view_t class object to store source property
  * values (for the edge source assigned to this process in multi-GPU).
  * @param input Edge source property values will be set to @p input.
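
Because the documented [sorted_unique_vertex_first, sorted_unique_vertex_last) range must now be sorted and deduplicated, a caller holding an arbitrary vertex list needs a normalization step first. A caller-side sketch (sort_and_unique is an illustrative helper; it assumes an existing raft::handle_t, after which the iterators can be passed to fill_edge_src_property() as usual):

    #include <thrust/distance.h>
    #include <thrust/sort.h>
    #include <thrust/unique.h>

    #include <raft/core/handle.hpp>
    #include <rmm/device_uvector.hpp>

    // Normalize a local vertex list so it satisfies the sorted & unique precondition.
    template <typename vertex_t>
    void sort_and_unique(raft::handle_t const& handle, rmm::device_uvector<vertex_t>& vertices) {
      thrust::sort(handle.get_thrust_policy(), vertices.begin(), vertices.end());
      vertices.resize(
        thrust::distance(
          vertices.begin(),
          thrust::unique(handle.get_thrust_policy(), vertices.begin(), vertices.end())),
        handle.get_stream());
      // now: fill_edge_src_property(handle, graph_view, vertices.begin(), vertices.end(), ...)
    }
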
@@ -476,8 +1061,8 @@ template void fill_edge_src_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, EdgeSrcValueOutputWrapper edge_src_property_output, T input, bool do_expensive_check = false) @@ -486,8 +1071,8 @@ void fill_edge_src_property(raft::handle_t const& handle, if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), local_vertex_partition_range_last = graph_view.local_vertex_partition_range_last()] __device__(auto v) { @@ -498,17 +1083,25 @@ void fill_edge_src_property(raft::handle_t const& handle, num_invalids = host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); } - CUGRAPH_EXPECTS( - num_invalids == 0, - "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in " + "[sorted_unique_vertex_first, sorted_unique_vertex_last)."); } if constexpr (GraphViewType::is_storage_transposed) { - detail::fill_edge_minor_property( - handle, graph_view, vertex_first, vertex_last, edge_src_property_output, input); + detail::fill_edge_minor_property(handle, + graph_view, + sorted_unique_vertex_first, + sorted_unique_vertex_last, + edge_src_property_output, + input); } else { - detail::fill_edge_major_property( - handle, graph_view, vertex_first, vertex_last, edge_src_property_output, input); + detail::fill_edge_major_property(handle, + graph_view, + sorted_unique_vertex_first, + sorted_unique_vertex_last, + edge_src_property_output, + input); } } @@ -552,8 +1145,8 @@ void fill_edge_dst_property(raft::handle_t const& handle, /** * @brief Fill graph edge destination property values to the input value. * - * This version fills only a subset of graph edge destination property values. [@p vertex_first, - * @p vertex_last) specifies the vertices to be filled. + * This version fills only a subset of graph edge destination property values. [@p + * sorted_unique_vertex_first, @p sorted_unique_vertex_last) specifies the vertices to be filled. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. @@ -563,10 +1156,12 @@ void fill_edge_dst_property(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param vertex_first Iterator pointing to the first (inclusive) vertex with a value to be filled. - * v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex partition - * assigned to this process in multi-GPU), otherwise undefined behavior. - * @param vertex_last Iterator pointing to the last (exclusive) vertex with a value to be filled. + * @param sorted_unique_vertex_first Iterator pointing to the first (inclusive) vertex with a value + * to be filled. v in [sorted_unique_vertex_first, sorted_unique_vertex_last) should be sorted & + * distinct (and should belong to the vertex partition assigned to this process in multi-GPU), + * otherwise undefined behavior. 
+ * @param sorted_unique_vertex_last Iterator pointing to the last (exclusive) vertex with a value to + * be filled. * @param edge_dst_property_output edge_dst_property_view_t class object to store destination * property values (for the edge destinations assigned to this process in multi-GPU). * @param input Edge destination property values will be set to @p input. @@ -578,8 +1173,8 @@ template void fill_edge_dst_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, EdgeDstValueOutputWrapper edge_dst_property_output, T input, bool do_expensive_check = false) @@ -588,8 +1183,8 @@ void fill_edge_dst_property(raft::handle_t const& handle, if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), local_vertex_partition_range_last = graph_view.local_vertex_partition_range_last()] __device__(auto v) { @@ -600,17 +1195,25 @@ void fill_edge_dst_property(raft::handle_t const& handle, num_invalids = host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); } - CUGRAPH_EXPECTS( - num_invalids == 0, - "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in " + "[sorted_unique_vertex_first, sorted_unique_vertex_last)."); } if constexpr (GraphViewType::is_storage_transposed) { - detail::fill_edge_major_property( - handle, graph_view, vertex_first, vertex_last, edge_dst_property_output, input); + detail::fill_edge_major_property(handle, + graph_view, + sorted_unique_vertex_first, + sorted_unique_vertex_last, + edge_dst_property_output, + input); } else { - detail::fill_edge_minor_property( - handle, graph_view, vertex_first, vertex_last, edge_dst_property_output, input); + detail::fill_edge_minor_property(handle, + graph_view, + sorted_unique_vertex_first, + sorted_unique_vertex_last, + edge_dst_property_output, + input); } } diff --git a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh index ce5e5d3e8cf..f03e8f54fb2 100644 --- a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh +++ b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh @@ -250,11 +250,14 @@ void per_v_pair_transform_dst_nbr_intersection( } auto num_input_pairs = static_cast(thrust::distance(vertex_pair_first, vertex_pair_last)); - std::optional> unique_vertices{std::nullopt}; + std::optional> sorted_unique_vertices{std::nullopt}; std::optional(size_t{0}, rmm::cuda_stream_view{}))> - property_buffer_for_unique_vertices{std::nullopt}; + property_buffer_for_sorted_unique_vertices{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { - unique_vertices = rmm::device_uvector(num_input_pairs * 2, handle.get_stream()); + auto& comm = handle.get_comms(); + + sorted_unique_vertices = + rmm::device_uvector(num_input_pairs * 2, handle.get_stream()); auto elem0_first = thrust::make_transform_iterator( vertex_pair_first, cugraph::thrust_tuple_get::value_type, @@ -262,7 +265,7 @@ void per_v_pair_transform_dst_nbr_intersection( thrust::copy(handle.get_thrust_policy(), elem0_first, elem0_first + num_input_pairs, - 
(*unique_vertices).begin()); + (*sorted_unique_vertices).begin()); auto elem1_first = thrust::make_transform_iterator( vertex_pair_first, cugraph::thrust_tuple_get::value_type, @@ -270,25 +273,25 @@ void per_v_pair_transform_dst_nbr_intersection( thrust::copy(handle.get_thrust_policy(), elem1_first, elem1_first + num_input_pairs, - (*unique_vertices).begin() + num_input_pairs); - thrust::sort(handle.get_thrust_policy(), (*unique_vertices).begin(), (*unique_vertices).end()); - (*unique_vertices) - .resize(thrust::distance((*unique_vertices).begin(), + (*sorted_unique_vertices).begin() + num_input_pairs); + thrust::sort(handle.get_thrust_policy(), + (*sorted_unique_vertices).begin(), + (*sorted_unique_vertices).end()); + (*sorted_unique_vertices) + .resize(thrust::distance((*sorted_unique_vertices).begin(), thrust::unique(handle.get_thrust_policy(), - (*unique_vertices).begin(), - (*unique_vertices).end())), + (*sorted_unique_vertices).begin(), + (*sorted_unique_vertices).end())), handle.get_stream()); - std::tie(unique_vertices, property_buffer_for_unique_vertices) = - collect_values_for_unique_int_vertices(handle, - std::move(*unique_vertices), - vertex_value_input_first, - graph_view.vertex_partition_range_lasts()); - thrust::sort_by_key( - handle.get_thrust_policy(), - (*unique_vertices).begin(), - (*unique_vertices).end(), - (*property_buffer_for_unique_vertices).begin()); // necessary for binary search + property_buffer_for_sorted_unique_vertices = collect_values_for_sorted_unique_int_vertices( + comm, + raft::device_span((*sorted_unique_vertices).data(), + (*sorted_unique_vertices).size()), + vertex_value_input_first, + graph_view.vertex_partition_range_lasts(), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); } rmm::device_uvector vertex_pair_indices(num_input_pairs, handle.get_stream()); @@ -412,32 +415,32 @@ void per_v_pair_transform_dst_nbr_intersection( do_expensive_check); } - if (unique_vertices) { - auto vertex_value_input_for_unique_vertices_first = - get_dataframe_buffer_begin(*property_buffer_for_unique_vertices); - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(this_chunk_size), - detail::call_intersection_op_t< - GraphViewType, - decltype(vertex_value_input_for_unique_vertices_first), - typename decltype(r_nbr_intersection_property_values0)::const_pointer, - IntersectionOp, - decltype(chunk_vertex_pair_index_first), - VertexPairIterator, - VertexPairValueOutputIterator>{edge_partition, - thrust::make_optional>( - (*unique_vertices).data(), (*unique_vertices).size()), - vertex_value_input_for_unique_vertices_first, - intersection_op, - intersection_offsets.data(), - intersection_indices.data(), - r_nbr_intersection_property_values0.data(), - r_nbr_intersection_property_values1.data(), - chunk_vertex_pair_index_first, - vertex_pair_first, - vertex_pair_value_output_first}); + if (sorted_unique_vertices) { + auto vertex_value_input_for_sorted_unique_vertices_first = + get_dataframe_buffer_begin(*property_buffer_for_sorted_unique_vertices); + thrust::for_each(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(this_chunk_size), + detail::call_intersection_op_t< + GraphViewType, + decltype(vertex_value_input_for_sorted_unique_vertices_first), + typename decltype(r_nbr_intersection_property_values0)::const_pointer, + IntersectionOp, + decltype(chunk_vertex_pair_index_first), + VertexPairIterator, + 
VertexPairValueOutputIterator>{ + edge_partition, + thrust::make_optional>( + (*sorted_unique_vertices).data(), (*sorted_unique_vertices).size()), + vertex_value_input_for_sorted_unique_vertices_first, + intersection_op, + intersection_offsets.data(), + intersection_indices.data(), + r_nbr_intersection_property_values0.data(), + r_nbr_intersection_property_values1.data(), + chunk_vertex_pair_index_first, + vertex_pair_first, + vertex_pair_value_output_first}); } else { thrust::for_each(handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index dd34d4b06ab..30706632ad2 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -206,7 +206,7 @@ struct return_value_compute_offset_t { template >, decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> per_v_random_select_transform_e(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& key_list, EdgeBiasSrcValueInputWrapper edge_bias_src_value_input, EdgeBiasDstValueInputWrapper edge_bias_dst_value_input, EdgeBiasValueInputWrapper edge_bias_value_input, @@ -237,7 +237,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, { using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; - using key_t = typename VertexFrontierBucketType::key_type; + using key_t = typename KeyBucketType::key_type; using key_buffer_t = dataframe_buffer_type_t; using edge_partition_src_input_device_view_t = std::conditional_t< @@ -286,15 +286,15 @@ per_v_random_select_transform_e(raft::handle_t const& handle, if (do_expensive_check) { // FIXME: better re-factor this check function? 
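
collect_values_for_sorted_unique_int_vertices() returns the gathered property buffer in the same order as the sorted, deduplicated vertex array, which is what lets the intersection functor locate a vertex's value by binary search (the device code uses thrust::lower_bound with thrust::seq). A host-side sketch of that lookup, with illustrative names:

    #include <algorithm>
    #include <cassert>
    #include <iterator>
    #include <vector>

    template <typename vertex_t, typename prop_t>
    prop_t lookup(std::vector<vertex_t> const& sorted_unique_vertices,
                  std::vector<prop_t> const& properties,  // gathered in the same order
                  vertex_t v) {
      auto it = std::lower_bound(sorted_unique_vertices.begin(), sorted_unique_vertices.end(), v);
      assert(it != sorted_unique_vertices.end() && *it == v);
      return properties[std::distance(sorted_unique_vertices.begin(), it)];
    }

    int main() {
      std::vector<long> vertices{2, 5, 9};  // sorted & unique
      std::vector<float> props{0.2f, 0.5f, 0.9f};
      assert(lookup(vertices, props, 5L) == 0.5f);
      return 0;
    }
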
- auto frontier_vertex_first = - thrust_tuple_get_or_identity(frontier.begin()); - auto frontier_vertex_last = - thrust_tuple_get_or_identity(frontier.end()); + auto key_list_vertex_first = + thrust_tuple_get_or_identity(key_list.begin()); + auto key_list_vertex_last = + thrust_tuple_get_or_identity(key_list.end()); auto num_invalid_keys = - frontier.size() - + key_list.size() - thrust::count_if(handle.get_thrust_policy(), - frontier_vertex_first, - frontier_vertex_last, + key_list_vertex_first, + key_list_vertex_last, check_in_range_t{graph_view.local_vertex_partition_range_first(), graph_view.local_vertex_partition_range_last()}); if constexpr (GraphViewType::is_multi_gpu) { @@ -302,35 +302,35 @@ per_v_random_select_transform_e(raft::handle_t const& handle, handle.get_comms(), num_invalid_keys, raft::comms::op_t::SUM, handle.get_stream()); } CUGRAPH_EXPECTS(num_invalid_keys == size_t{0}, - "Invalid input argument: frontier includes out-of-range keys."); + "Invalid input argument: key_list includes out-of-range keys."); } - std::vector local_frontier_sizes{}; + std::vector local_key_list_sizes{}; if (minor_comm_size > 1) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - local_frontier_sizes = host_scalar_allgather(minor_comm, frontier.size(), handle.get_stream()); + local_key_list_sizes = host_scalar_allgather(minor_comm, key_list.size(), handle.get_stream()); } else { - local_frontier_sizes = std::vector{frontier.size()}; + local_key_list_sizes = std::vector{key_list.size()}; } - std::vector local_frontier_displacements(local_frontier_sizes.size()); - std::exclusive_scan(local_frontier_sizes.begin(), - local_frontier_sizes.end(), - local_frontier_displacements.begin(), + std::vector local_key_list_displacements(local_key_list_sizes.size()); + std::exclusive_scan(local_key_list_sizes.begin(), + local_key_list_sizes.end(), + local_key_list_displacements.begin(), size_t{0}); - // 1. aggregate frontier + // 1. aggregate key_list - std::optional aggregate_local_frontier{std::nullopt}; + std::optional aggregate_local_key_list{std::nullopt}; if (minor_comm_size > 1) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - aggregate_local_frontier = allocate_dataframe_buffer( - local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream()); + aggregate_local_key_list = allocate_dataframe_buffer( + local_key_list_displacements.back() + local_key_list_sizes.back(), handle.get_stream()); device_allgatherv(minor_comm, - frontier.begin(), - get_dataframe_buffer_begin(*aggregate_local_frontier), - local_frontier_sizes, - local_frontier_displacements, + key_list.begin(), + get_dataframe_buffer_begin(*aggregate_local_key_list), + local_key_list_sizes, + local_key_list_displacements, handle.get_stream()); } @@ -339,66 +339,66 @@ per_v_random_select_transform_e(raft::handle_t const& handle, rmm::device_uvector sample_local_nbr_indices(0, handle.get_stream()); std::optional> sample_key_indices{std::nullopt}; - std::vector local_frontier_sample_offsets{}; + std::vector local_key_list_sample_offsets{}; if constexpr (std::is_same_v>) { - std::tie(sample_local_nbr_indices, sample_key_indices, local_frontier_sample_offsets) = + std::tie(sample_local_nbr_indices, sample_key_indices, local_key_list_sample_offsets) = uniform_sample_and_compute_local_nbr_indices( handle, graph_view, - (minor_comm_size > 1) ? 
get_dataframe_buffer_cbegin(*aggregate_local_frontier) - : frontier.begin(), - local_frontier_displacements, - local_frontier_sizes, + (minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_key_list) + : key_list.begin(), + local_key_list_displacements, + local_key_list_sizes, rng_state, K, with_replacement); } else { - std::tie(sample_local_nbr_indices, sample_key_indices, local_frontier_sample_offsets) = + std::tie(sample_local_nbr_indices, sample_key_indices, local_key_list_sample_offsets) = biased_sample_and_compute_local_nbr_indices( handle, graph_view, - (minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_frontier) - : frontier.begin(), + (minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_key_list) + : key_list.begin(), edge_bias_src_value_input, edge_bias_dst_value_input, edge_bias_value_input, e_bias_op, - local_frontier_displacements, - local_frontier_sizes, + local_key_list_displacements, + local_key_list_sizes, rng_state, K, with_replacement, do_expensive_check); } - std::vector local_frontier_sample_counts(minor_comm_size); - std::adjacent_difference(local_frontier_sample_offsets.begin() + 1, - local_frontier_sample_offsets.end(), - local_frontier_sample_counts.begin()); + std::vector local_key_list_sample_counts(minor_comm_size); + std::adjacent_difference(local_key_list_sample_offsets.begin() + 1, + local_key_list_sample_offsets.end(), + local_key_list_sample_counts.begin()); // 3. transform auto sample_e_op_results = - allocate_dataframe_buffer(local_frontier_sample_offsets.back(), handle.get_stream()); + allocate_dataframe_buffer(local_key_list_sample_offsets.back(), handle.get_stream()); for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { auto edge_partition = edge_partition_device_view_t( graph_view.local_edge_partition_view(i)); - auto edge_partition_frontier_key_first = - ((minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_frontier) - : frontier.begin()) + - local_frontier_displacements[i]; + auto edge_partition_key_list_first = + ((minor_comm_size > 1) ? 
get_dataframe_buffer_cbegin(*aggregate_local_key_list) + : key_list.begin()) + + local_key_list_displacements[i]; auto edge_partition_sample_local_nbr_index_first = - sample_local_nbr_indices.begin() + local_frontier_sample_offsets[i]; + sample_local_nbr_indices.begin() + local_key_list_sample_offsets[i]; auto edge_partition_sample_e_op_result_first = - get_dataframe_buffer_begin(sample_e_op_results) + local_frontier_sample_offsets[i]; + get_dataframe_buffer_begin(sample_e_op_results) + local_key_list_sample_offsets[i]; edge_partition_src_input_device_view_t edge_partition_src_value_input{}; edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; @@ -415,14 +415,14 @@ per_v_random_select_transform_e(raft::handle_t const& handle, if (sample_key_indices) { auto edge_partition_sample_key_index_first = - (*sample_key_indices).begin() + local_frontier_sample_offsets[i]; + (*sample_key_indices).begin() + local_key_list_sample_offsets[i]; thrust::transform( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(local_frontier_sample_counts[i]), + thrust::make_counting_iterator(local_key_list_sample_counts[i]), edge_partition_sample_e_op_result_first, transform_local_nbr_indices_t{ edge_partition, thrust::make_optional(edge_partition_sample_key_index_first), - edge_partition_frontier_key_first, + edge_partition_key_list_first, edge_partition_sample_local_nbr_index_first, edge_partition_src_value_input, edge_partition_dst_value_input, @@ -444,10 +444,10 @@ per_v_random_select_transform_e(raft::handle_t const& handle, thrust::transform( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(frontier.size() * K), + thrust::make_counting_iterator(key_list.size() * K), edge_partition_sample_e_op_result_first, transform_local_nbr_indices_t{edge_partition, thrust::nullopt, - edge_partition_frontier_key_first, + edge_partition_key_list_first, edge_partition_sample_local_nbr_index_first, edge_partition_src_value_input, edge_partition_dst_value_input, @@ -466,13 +466,13 @@ per_v_random_select_transform_e(raft::handle_t const& handle, K}); } } - aggregate_local_frontier = std::nullopt; + aggregate_local_key_list = std::nullopt; // 4. shuffle randomly selected & transformed results and update sample_offsets auto sample_offsets = invalid_value ? 
std::nullopt : std::make_optional>( - frontier.size() + 1, handle.get_stream()); + key_list.size() + 1, handle.get_stream()); assert(K <= std::numeric_limits::max()); if (minor_comm_size > 1) { sample_local_nbr_indices.resize(0, handle.get_stream()); @@ -483,12 +483,12 @@ per_v_random_select_transform_e(raft::handle_t const& handle, std::tie(sample_e_op_results, std::ignore) = shuffle_values(minor_comm, get_dataframe_buffer_begin(sample_e_op_results), - local_frontier_sample_counts, + local_key_list_sample_counts, handle.get_stream()); std::tie(sample_key_indices, std::ignore) = shuffle_values( - minor_comm, (*sample_key_indices).begin(), local_frontier_sample_counts, handle.get_stream()); + minor_comm, (*sample_key_indices).begin(), local_key_list_sample_counts, handle.get_stream()); - rmm::device_uvector sample_counts(frontier.size(), handle.get_stream()); + rmm::device_uvector sample_counts(key_list.size(), handle.get_stream()); thrust::fill( handle.get_thrust_policy(), sample_counts.begin(), sample_counts.end(), int32_t{0}); auto sample_intra_partition_displacements = @@ -504,7 +504,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, sample_counts.resize(0, handle.get_stream()); sample_counts.shrink_to_fit(handle.get_stream()); - resize_dataframe_buffer(tmp_sample_e_op_results, frontier.size() * K, handle.get_stream()); + resize_dataframe_buffer(tmp_sample_e_op_results, key_list.size() * K, handle.get_stream()); thrust::fill(handle.get_thrust_policy(), get_dataframe_buffer_begin(tmp_sample_e_op_results), get_dataframe_buffer_end(tmp_sample_e_op_results), @@ -553,7 +553,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, sample_e_op_results = std::move(tmp_sample_e_op_results); } else { if (!invalid_value) { - rmm::device_uvector sample_counts(frontier.size(), handle.get_stream()); + rmm::device_uvector sample_counts(key_list.size(), handle.get_stream()); thrust::tabulate( handle.get_thrust_policy(), sample_counts.begin(), @@ -597,8 +597,8 @@ per_v_random_select_transform_e(raft::handle_t const& handle, * @brief Randomly select and transform the input (tagged-)vertices' outgoing edges with biases. * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam VertexFrontierBucketType Type of the vertex frontier bucket class which abstracts the - * current (tagged-)vertex frontier. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. @@ -609,8 +609,8 @@ per_v_random_select_transform_e(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param frontier VertexFrontierBucketType class object to store the (tagged-)vertex list to sample - * outgoing edges. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to sample outgoing + * edges. * @param edge_src_value_input Wrapper used to access source input property values (for the edge * sources assigned to this process in multi-GPU). 
Use either cugraph::edge_src_property_t::view() * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() @@ -647,11 +647,11 @@ per_v_random_select_transform_e(raft::handle_t const& handle, * @return std::tuple Tuple of an optional offset vector of type * std::optional> and a dataframe buffer storing the output values of * type @p T from the selected edges. If @p invalid_value is std::nullopt, the offset vector is - * valid and has the size of @p frontier.size() + 1. If @p invalid_value.has_value() is true, - * std::nullopt is returned (the dataframe buffer will store @p frontier.size() * @p K elements). + * valid and has the size of @p key_list.size() + 1. If @p invalid_value.has_value() is true, + * std::nullopt is returned (the dataframe buffer will store @p key_list.size() * @p K elements). */ template >, decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& key_list, EdgeBiasSrcValueInputWrapper edge_bias_src_value_input, EdgeBiasDstValueInputWrapper edge_bias_dst_value_input, EdgeBiasValueInputWrapper edge_bias_value_input, @@ -682,7 +682,7 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, { return detail::per_v_random_select_transform_e(handle, graph_view, - frontier, + key_list, edge_bias_src_value_input, edge_bias_dst_value_input, edge_bias_value_input, @@ -705,8 +705,8 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, * (uniform neighbor sampling). * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam VertexFrontierBucketType Type of the vertex frontier bucket class which abstracts the - * current (tagged-)vertex frontier. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. @@ -715,8 +715,8 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param frontier VertexFrontierBucketType class object to store the (tagged-)vertex list to sample - * outgoing edges. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to sample outgoing + * edges. * @param edge_src_value_input Wrapper used to access source input property values (for the edge * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() @@ -747,11 +747,11 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, * @return std::tuple Tuple of an optional offset vector of type * std::optional> and a dataframe buffer storing the output values of * type @p T from the selected edges. If @p invalid_value is std::nullopt, the offset vector is - * valid and has the size of @p frontier.size() + 1. 
If @p invalid_value.has_value() is true, - * std::nullopt is returned (the dataframe buffer will store @p frontier.size() * @p K elements). + * valid and has the size of @p key_list.size() + 1. If @p invalid_value.has_value() is true, + * std::nullopt is returned (the dataframe buffer will store @p key_list.size() * @p K elements). */ template >, decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& key_list, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, @@ -775,7 +775,7 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, return detail::per_v_random_select_transform_e( handle, graph_view, - frontier, + key_list, edge_src_dummy_property_t{}.view(), edge_dst_dummy_property_t{}.view(), edge_dummy_property_t{}.view(), @@ -783,7 +783,7 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, detail::edge_endpoint_dummy_property_view_t, detail::edge_endpoint_dummy_property_view_t, edge_dummy_property_view_t, - typename VertexFrontierBucketType::key_type>{}, + typename KeyBucketType::key_type>{}, edge_src_value_input, edge_dst_value_input, edge_value_input, diff --git a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh index 5a5e9332094..c13816242bc 100644 --- a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh @@ -924,11 +924,12 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( auto values_for_unique_keys = allocate_dataframe_buffer(0, handle.get_stream()); std::tie(unique_minor_keys, values_for_unique_keys) = - collect_values_for_unique_keys(handle, + collect_values_for_unique_keys(comm, kv_store_view, std::move(unique_minor_keys), cugraph::detail::compute_gpu_id_from_ext_vertex_t{ - comm_size, major_comm_size, minor_comm_size}); + comm_size, major_comm_size, minor_comm_size}, + handle.get_stream()); if constexpr (KVStoreViewType::binary_search) { multi_gpu_minor_key_value_map_ptr = diff --git a/cpp/src/prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh new file mode 100644 index 00000000000..1e0d366429e --- /dev/null +++ b/cpp/src/prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "prims/detail/per_v_transform_reduce_e.cuh" +#include "prims/vertex_frontier.cuh" + +#include +#include +#include + +#include + +#include +#include + +namespace cugraph { + +/** + * @brief Iterate over every vertex's incoming edges to update vertex properties. 
+ * + * This function is inspired by thrust::transform_reduce(). In addition, this function excludes the + * edges that return false when the predicate @p pred_op is applied. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam PredOp Type of the quinary predicate operator. + * @tparam T Type of the initial value for per-vertex reduction. + * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to + * fill the wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. + * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator that takes edge source, edge destination, property values for the source, + * destination, and edge and returns a value to be reduced. + * @param init Initial value to be added to the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 incoming edges. + * @param reduce_op Binary operator that takes two input arguments and reduces the two values to one. + * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @param pred_op Quinary operator that takes edge source, edge destination, property values for the + * source, destination, and edge and returns whether this edge should be included (if true is + * returned) or excluded.
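+ * As an illustrative, hypothetical predicate (the visited flag below is an assumption for the + * example, not part of this header): with a boolean visited flag exposed through @p + * edge_src_value_input, `[] __device__(auto src, auto dst, bool src_visited, auto, auto) { return + * src_visited; }` restricts the reduction to the incoming edges whose source vertex has already + * been visited.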
+ * @param vertex_value_output_first Iterator pointing to the vertex property variables for the first + * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last` + * (exclusive) is deduced as @p vertex_value_output_first + @p + * graph_view.local_vertex_partition_range_size(). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void per_v_transform_reduce_if_incoming_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = true; + + detail::per_v_transform_reduce_e(handle, + graph_view, + static_cast(nullptr), + static_cast(nullptr), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + pred_op, + vertex_value_output_first); +} + +/** + * @brief For each (tagged-)vertex in the input (tagged-)vertex list, iterate over the incoming + * edges to update (tagged-)vertex properties. + * + * This function is inspired by thrust::transform_reduce(). In addition, this function excludes the + * edges that return false when the predicate @p pred_op is applied. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam PredOp Type of the quinary predicate operator. + * @tparam T Type of the initial value for per-vertex reduction. + * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to update + * (tagged-)vertex properties. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. 
+ * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator that takes edge source, edge destination, property values for the source, + * destination, and edge and returns a value to be reduced. + * @param init Initial value to be reduced with the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 incoming edges. + * @param reduce_op Binary operator that takes two input arguments and reduces the two values to one. + * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @param pred_op Quinary operator that takes edge source, edge destination, property values for the + * source, destination, and edge and returns whether this edge should be included (if true is + * returned) or excluded. + * @param vertex_value_output_first Iterator pointing to the (tagged-)vertex property variables for + * the first (inclusive) (tagged-)vertex in @p key_list. `vertex_value_output_last` (exclusive) is + * deduced as @p vertex_value_output_first + @p key_list.size(). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void per_v_transform_reduce_if_incoming_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& key_list, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + static_assert(GraphViewType::is_storage_transposed); + + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = true; + + detail::per_v_transform_reduce_e(handle, + graph_view, + key_list.begin(), + key_list.end(), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + pred_op, + vertex_value_output_first); +} + +/** + * @brief Iterate over every vertex's outgoing edges to update vertex properties. + * + * This function is inspired by thrust::transform_reduce(). In addition, this function excludes the + * edges that return false when the predicate @p pred_op is applied. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam PredOp Type of the quinary predicate operator.
+ * @tparam T Type of the initial value for per-vertex reduction. + * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. + * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator that takes edge source, edge destination, property values for the source, + * destination, and edge and returns a value to be reduced. + * @param init Initial value to be added to the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 outgoing edges. + * @param reduce_op Binary operator that takes two input arguments and reduces the two values to one. + * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @param pred_op Quinary operator that takes edge source, edge destination, property values for the + * source, destination, and edge and returns whether this edge should be included (if true is + * returned) or excluded. + * @param vertex_value_output_first Iterator pointing to the vertex property variables for the + * first (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last` + * (exclusive) is deduced as @p vertex_value_output_first + @p + * graph_view.local_vertex_partition_range_size(). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
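+ *
+ * A minimal usage sketch (illustrative only; it assumes a pre-built single-GPU graph_view with
+ * int32_t vertex IDs, a valid raft::handle_t named handle, and <limits> included; the lambdas,
+ * the sentinel init value, and the output buffer are hypothetical, not part of this header):
+ * @code
+ * // For every vertex, find the smallest destination ID among its outgoing edges to
+ * // larger-numbered vertices; vertices with no such edge keep the max() sentinel.
+ * rmm::device_uvector<int32_t> min_larger_nbrs(graph_view.local_vertex_partition_range_size(),
+ *                                              handle.get_stream());
+ * cugraph::per_v_transform_reduce_if_outgoing_e(
+ *   handle,
+ *   graph_view,
+ *   cugraph::edge_src_dummy_property_t{}.view(),
+ *   cugraph::edge_dst_dummy_property_t{}.view(),
+ *   cugraph::edge_dummy_property_t{}.view(),
+ *   [] __device__(auto src, auto dst, auto, auto, auto) { return dst; },        // e_op
+ *   std::numeric_limits<int32_t>::max(),                                        // init (sentinel)
+ *   cugraph::reduce_op::minimum<int32_t>{},                                     // reduce_op
+ *   [] __device__(auto src, auto dst, auto, auto, auto) { return dst > src; },  // pred_op
+ *   min_larger_nbrs.begin());
+ * @endcode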
+ */ +template +void per_v_transform_reduce_if_outgoing_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = false; + + detail::per_v_transform_reduce_e(handle, + graph_view, + static_cast(nullptr), + static_cast(nullptr), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + pred_op, + vertex_value_output_first); +} + +/** + * @brief For each (tagged-)vertex in the input (tagged-)vertex list, iterate over the outgoing + * edges to update (tagged-)vertex properties. + * + * This function is inspired by thrust::transform_reduce(). In addition, this function excludes the + * edges that return false when the predicate @p pred_op is applied. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam PredOp Type of the quinary predicate operator. + * @tparam T Type of the initial value for per-vertex reduction. + * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to update + * (tagged-)vertex properties. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. + * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator that takes edge source, edge destination, property values for the source, + * destination, and edge and returns a value to be reduced.
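+ * As an illustrative, hypothetical operator (assuming edge weights are exposed through @p + * edge_value_input): `[] __device__(auto src, auto dst, auto, auto, auto w) { return + * thrust::make_tuple(src, w); }` emits the source vertex and the edge weight as the value to be + * reduced.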
+ * @param init Initial value to be reduced with the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 outgoing edges. + * @param reduce_op Binary operator that takes two input arguments and reduces the two values to one. + * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @param pred_op Quinary operator that takes edge source, edge destination, property values for the + * source, destination, and edge and returns whether this edge should be included (if true is + * returned) or excluded. + * @param vertex_value_output_first Iterator pointing to the (tagged-)vertex property variables for + * the first (inclusive) (tagged-)vertex in @p key_list. `vertex_value_output_last` (exclusive) is + * deduced as @p vertex_value_output_first + @p key_list.size(). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void per_v_transform_reduce_if_outgoing_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& key_list, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + static_assert(!GraphViewType::is_storage_transposed); + static_assert(KeyBucketType::is_sorted_unique); + + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = false; + + detail::per_v_transform_reduce_e(handle, + graph_view, + key_list.begin(), + key_list.end(), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + pred_op, + vertex_value_output_first); +} + +} // namespace cugraph diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index 027ef1f662d..5ba7edec894 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -15,558 +15,165 @@ */ #pragma once -#include "detail/graph_partition_utils.cuh" -#include "prims/detail/prim_functors.cuh" -#include "prims/fill_edge_src_dst_property.cuh" -#include "prims/property_op_utils.cuh" -#include "prims/reduce_op.cuh" +#include "prims/detail/per_v_transform_reduce_e.cuh" +#include "prims/vertex_frontier.cuh" -#include -#include -#include #include #include -#include -#include -#include #include -#include -#include #include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include #include namespace cugraph { -namespace detail { - -int32_t constexpr per_v_transform_reduce_e_kernel_block_size = 512; - -template -struct transform_and_atomic_reduce_t { - edge_partition_device_view_t const& edge_partition{}; - result_t identity_element{}; - vertex_t const* indices{nullptr};
- TransformOp const& transform_op{}; - ResultValueOutputIteratorOrWrapper& result_value_output{}; - - __device__ void operator()(edge_t i) const - { - auto e_op_result = transform_op(i); - if (e_op_result != identity_element) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - if constexpr (multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } -}; - -template -__device__ void update_result_value_output( - edge_partition_device_view_t const& edge_partition, - vertex_t const* indices, - edge_t local_degree, - TransformOp const& transform_op, - result_t init, - ReduceOp const& reduce_op, - size_t output_idx /* relevent only when update_major === true */, - result_t identity_element, - ResultValueOutputIteratorOrWrapper& result_value_output) -{ - if constexpr (update_major) { - *(result_value_output + output_idx) = - thrust::transform_reduce(thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - transform_op, - init, - reduce_op); - } else { - thrust::for_each( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - transform_and_atomic_reduce_t{ - edge_partition, identity_element, indices, transform_op, result_value_output}); - } -} - -template -__global__ static void per_v_transform_reduce_e_hypersparse( - edge_partition_device_view_t edge_partition, - EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, - EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, - ResultValueOutputIteratorOrWrapper result_value_output, - EdgeOp e_op, - T init /* relevant only if update_major == true */, - T identity_element /* relevant only if update_major == true */, - ReduceOp reduce_op) -{ - static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< - ReduceOp>); // atomic_reduce is defined only when - // has_compatible_raft_comms_op_t is true - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - auto major_start_offset = static_cast(*(edge_partition.major_hypersparse_first()) - - edge_partition.major_range_first()); - auto idx = static_cast(tid); - - auto dcs_nzd_vertex_count = *(edge_partition.dcs_nzd_vertex_count()); - - while (idx < static_cast(dcs_nzd_vertex_count)) { - auto major = - *(edge_partition.major_from_major_hypersparse_idx_nocheck(static_cast(idx))); - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - auto major_idx = - major_start_offset + idx; // major_offset != major_idx in the hypersparse region - vertex_t const* indices{nullptr}; - edge_t edge_offset{}; - edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = - edge_partition.local_edges(static_cast(major_idx)); - - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - major, - major_offset, - indices, - edge_offset}; - - if (edge_partition_e_mask) { - auto transform_op = - [&edge_partition_e_mask, &call_e_op, identity_element, edge_offset] __device__(auto i) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - return call_e_op(i); - 
} else { - return identity_element; - } - }; - - update_result_value_output(edge_partition, - indices, - local_degree, - transform_op, - init, - reduce_op, - major - *(edge_partition).major_hypersparse_first(), - identity_element, - result_value_output); - } else { - update_result_value_output(edge_partition, - indices, - local_degree, - call_e_op, - init, - reduce_op, - major - *(edge_partition).major_hypersparse_first(), - identity_element, - result_value_output); - } - idx += gridDim.x * blockDim.x; - } -} - -template -__global__ static void per_v_transform_reduce_e_low_degree( - edge_partition_device_view_t edge_partition, - typename GraphViewType::vertex_type major_range_first, - typename GraphViewType::vertex_type major_range_last, - EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, - EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, - ResultValueOutputIteratorOrWrapper result_value_output, - EdgeOp e_op, - T init /* relevant only if update_major == true */, - T identity_element /* relevant only if update_major == true */, - ReduceOp reduce_op) -{ - static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< - ReduceOp>); // atomic_reduce is defined only when - // has_compatible_raft_comms_op_t is true - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - auto major_start_offset = - static_cast(major_range_first - edge_partition.major_range_first()); - auto idx = static_cast(tid); - - while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = static_cast(major_start_offset + idx); - auto major = edge_partition.major_from_major_offset_nocheck(major_offset); - vertex_t const* indices{nullptr}; - edge_t edge_offset{}; - edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = - edge_partition.local_edges(static_cast(major_offset)); - - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - major, - major_offset, - indices, - edge_offset}; - - if (edge_partition_e_mask) { - auto transform_op = - [&edge_partition_e_mask, &call_e_op, identity_element, edge_offset] __device__(auto i) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - return call_e_op(i); - } else { - return identity_element; - } - }; - - update_result_value_output(edge_partition, - indices, - local_degree, - transform_op, - init, - reduce_op, - idx, - identity_element, - result_value_output); - } else { - update_result_value_output(edge_partition, - indices, - local_degree, - call_e_op, - init, - reduce_op, - idx, - identity_element, - result_value_output); - } - idx += gridDim.x * blockDim.x; - } -} - -template -__global__ static void per_v_transform_reduce_e_mid_degree( - edge_partition_device_view_t edge_partition, - typename GraphViewType::vertex_type major_range_first, - typename GraphViewType::vertex_type major_range_last, - EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, - EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, - ResultValueOutputIteratorOrWrapper result_value_output, - EdgeOp e_op, - T init /* relevant only if update_major == true */, - T 
identity_element /* relevant only if update_major == true */, - ReduceOp reduce_op) + typename T, + typename VertexValueOutputIterator> +void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) { - static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< - ReduceOp>); // atomic_reduce is defined only when - // has_compatible_raft_comms_op_t is true - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using e_op_result_t = T; - - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - static_assert(per_v_transform_reduce_e_kernel_block_size % raft::warp_size() == 0); - auto const lane_id = tid % raft::warp_size(); - auto major_start_offset = - static_cast(major_range_first - edge_partition.major_range_first()); - auto idx = static_cast(tid / raft::warp_size()); - - using WarpReduce = cub::WarpReduce; - [[maybe_unused]] __shared__ typename WarpReduce::TempStorage - temp_storage[per_v_transform_reduce_e_kernel_block_size / - raft::warp_size()]; // relevant only if update_major == true - - while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = static_cast(major_start_offset + idx); - auto major = edge_partition.major_from_major_offset_nocheck(major_offset); - vertex_t const* indices{nullptr}; - edge_t edge_offset{}; - edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); - - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - major, - major_offset, - indices, - edge_offset}; - - [[maybe_unused]] auto reduced_e_op_result = - lane_id == 0 ? 
init : identity_element; // relevant only if update_major == true - if (edge_partition_e_mask) { - for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } - } - } else { - for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } - } - - if constexpr (update_major) { - reduced_e_op_result = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) - .Reduce(reduced_e_op_result, reduce_op); - if (lane_id == 0) { *(result_value_output + idx) = reduced_e_op_result; } - } - - idx += gridDim.x * (blockDim.x / raft::warp_size()); + if (do_expensive_check) { + // currently, nothing to do } -} - -template -__global__ static void per_v_transform_reduce_e_high_degree( - edge_partition_device_view_t edge_partition, - typename GraphViewType::vertex_type major_range_first, - typename GraphViewType::vertex_type major_range_last, - EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, - EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, - ResultValueOutputIteratorOrWrapper result_value_output, - EdgeOp e_op, - T init /* relevant only if update_major == true */, - T identity_element /* relevant only if update_major == true */, - ReduceOp reduce_op) -{ - static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< - ReduceOp>); // atomic_reduce is defined only when - // has_compatible_raft_comms_op_t is true - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using e_op_result_t = T; - auto major_start_offset = - static_cast(major_range_first - edge_partition.major_range_first()); - auto idx = static_cast(blockIdx.x); - - using BlockReduce = cub::BlockReduce; - [[maybe_unused]] __shared__ - typename BlockReduce::TempStorage temp_storage; // relevant only if update_major == true - - while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = static_cast(major_start_offset + idx); - auto major = edge_partition.major_from_major_offset_nocheck(major_offset); - vertex_t const* indices{nullptr}; - edge_t edge_offset{}; - edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); - - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - major, - major_offset, - indices, - edge_offset}; - - [[maybe_unused]] auto reduced_e_op_result = - threadIdx.x == 0 ? 
init : identity_element; // relevant only if update_major == true - if (edge_partition_e_mask) { - for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } - } - } else { - for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } - } - - if constexpr (update_major) { - reduced_e_op_result = BlockReduce(temp_storage).Reduce(reduced_e_op_result, reduce_op); - if (threadIdx.x == 0) { *(result_value_output + idx) = reduced_e_op_result; } - } - - idx += gridDim.x; - } + constexpr bool incoming = true; + + detail::per_v_transform_reduce_e( + handle, + graph_view, + static_cast(nullptr), + static_cast(nullptr), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + detail::const_true_e_op_t{}, + vertex_value_output_first); } -template -void per_v_transform_reduce_e(raft::handle_t const& handle, - GraphViewType const& graph_view, - EdgeSrcValueInputWrapper edge_src_value_input, - EdgeDstValueInputWrapper edge_dst_value_input, - EdgeValueInputWrapper edge_value_input, - EdgeOp e_op, - T init, - ReduceOp reduce_op, - VertexValueOutputIterator vertex_value_output_first) +void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& key_list, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) { - static_assert(ReduceOp::pure_function && reduce_op::has_compatible_raft_comms_op_v && - reduce_op::has_identity_element_v); // current restriction, to support - // general reduction, we may need to - // take a less efficient code path - - constexpr auto update_major = (incoming == GraphViewType::is_storage_transposed); - [[maybe_unused]] constexpr auto max_segments = - detail::num_sparse_segments_per_vertex_partition + size_t{1}; - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; + static_assert(GraphViewType::is_storage_transposed); - using edge_partition_src_input_device_view_t = std::conditional_t< - std::is_same_v, - detail::edge_partition_endpoint_dummy_property_device_view_t, - detail::edge_partition_endpoint_property_device_view_t< - vertex_t, - typename EdgeSrcValueInputWrapper::value_iterator, - typename EdgeSrcValueInputWrapper::value_type>>; - using edge_partition_dst_input_device_view_t = std::conditional_t< - std::is_same_v, - 
detail::edge_partition_endpoint_dummy_property_device_view_t, - detail::edge_partition_endpoint_property_device_view_t< - vertex_t, - typename EdgeDstValueInputWrapper::value_iterator, - typename EdgeDstValueInputWrapper::value_type>>; - using edge_partition_e_input_device_view_t = std::conditional_t< - std::is_same_v, - detail::edge_partition_edge_dummy_property_device_view_t, - detail::edge_partition_edge_property_device_view_t< - edge_t, - typename EdgeValueInputWrapper::value_iterator, - typename EdgeValueInputWrapper::value_type>>; - - static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - - using minor_tmp_buffer_type = std::conditional_t, - edge_dst_property_t>; - [[maybe_unused]] std::unique_ptr minor_tmp_buffer{}; - if constexpr (GraphViewType::is_multi_gpu && !update_major) { - minor_tmp_buffer = std::make_unique(handle, graph_view); - } - - using edge_partition_minor_output_device_view_t = - std::conditional_tmutable_view().value_first())>, - void /* dummy */>; - - if constexpr (update_major) { - size_t partition_idx = 0; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - partition_idx = static_cast(minor_comm_rank); - } - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); - if (segment_offsets) { // no vertices in the zero degree segment are visited - thrust::fill(handle.get_thrust_policy(), - vertex_value_output_first + *((*segment_offsets).rbegin() + 1), - vertex_value_output_first + *((*segment_offsets).rbegin()), - init); - } - } else { - if constexpr (GraphViewType::is_multi_gpu) { - auto minor_init = init; - auto view = minor_tmp_buffer->view(); - if (view.keys()) { // defer applying the initial value to the end as minor_tmp_buffer may not - // store values for the entire minor range - minor_init = ReduceOp::identity_element; - } else { - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_rank = major_comm.get_rank(); - minor_init = (major_comm_rank == 0) ? init : ReduceOp::identity_element; - } - fill_edge_minor_property(handle, graph_view, minor_tmp_buffer->mutable_view(), minor_init); - } else { - thrust::fill(handle.get_thrust_policy(), - vertex_value_output_first, - vertex_value_output_first + graph_view.local_vertex_partition_range_size(), - init); - } - } - - std::optional> stream_pool_indices{std::nullopt}; - if constexpr (GraphViewType::is_multi_gpu) { - if ((graph_view.local_edge_partition_segment_offsets(0)) && - (handle.get_stream_pool_size() >= max_segments)) { - for (size_t i = 1; i < graph_view.number_of_local_edge_partitions(); ++i) { - assert(graph_view.local_edge_partition_segment_offsets(i)); - } - - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - - // memory footprint vs parallelism trade-off - // peak memory requirement per loop is - // update_major ? 
V / comm_size * sizeof(T) : 0 - // and limit memory requirement to (E / comm_size) * sizeof(vertex_t) - - size_t num_streams = - std::min(static_cast(minor_comm_size) * max_segments, - raft::round_down_safe(handle.get_stream_pool_size(), max_segments)); - if constexpr (update_major) { - size_t value_size{0}; - if constexpr (is_thrust_tuple_of_arithmetic::value) { - auto elem_sizes = compute_thrust_tuple_element_sizes{}(); - value_size = std::reduce(elem_sizes.begin(), elem_sizes.end()); - } else { - value_size = sizeof(T); - } - - auto avg_vertex_degree = - graph_view.number_of_vertices() > 0 - ? (static_cast(graph_view.compute_number_of_edges(handle)) / - static_cast(graph_view.number_of_vertices())) - : double{0.0}; - - num_streams = - std::min(static_cast(avg_vertex_degree * (static_cast(sizeof(vertex_t)) / - static_cast(value_size))) * - max_segments, - num_streams); - } - - if (num_streams >= max_segments) { - stream_pool_indices = std::vector(num_streams); - std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); - handle.sync_stream(); - } - } - } - - std::vector(0, rmm::cuda_stream_view{}))> - major_tmp_buffers{}; - if constexpr (GraphViewType::is_multi_gpu && update_major) { - std::vector major_tmp_buffer_sizes(graph_view.number_of_local_edge_partitions(), - size_t{0}); - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - if (segment_offsets) { - major_tmp_buffer_sizes[i] = - *((*segment_offsets).rbegin() + 1); // exclude the zero degree segment - } else { - if constexpr (GraphViewType::is_storage_transposed) { - major_tmp_buffer_sizes[i] = graph_view.local_edge_partition_dst_range_size(i); - } else { - major_tmp_buffer_sizes[i] = graph_view.local_edge_partition_src_range_size(i); - } - } - } - if (stream_pool_indices) { - auto num_concurrent_loops = (*stream_pool_indices).size() / max_segments; - major_tmp_buffers.reserve(num_concurrent_loops); - for (size_t i = 0; i < num_concurrent_loops; ++i) { - size_t max_size{0}; - for (size_t j = i; j < graph_view.number_of_local_edge_partitions(); - j += num_concurrent_loops) { - max_size = std::max(major_tmp_buffer_sizes[j], max_size); - } - major_tmp_buffers.push_back(allocate_dataframe_buffer(max_size, handle.get_stream())); - } - } else { - major_tmp_buffers.reserve(1); - major_tmp_buffers.push_back(allocate_dataframe_buffer( - *std::max_element(major_tmp_buffer_sizes.begin(), major_tmp_buffer_sizes.end()), - handle.get_stream())); - } - } else { // dummy - major_tmp_buffers.reserve(1); - major_tmp_buffers.push_back(allocate_dataframe_buffer(size_t{0}, handle.get_stream())); - } - - if (stream_pool_indices) { handle.sync_stream(); } - - auto edge_mask_view = graph_view.edge_mask_view(); - - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(i)); - auto edge_partition_e_mask = - edge_mask_view - ? thrust::make_optional< - detail::edge_partition_edge_property_device_view_t>( - *edge_mask_view, i) - : thrust::nullopt; - - auto major_init = ReduceOp::identity_element; - if constexpr (update_major) { - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - major_init = (static_cast(i) == minor_comm_rank) ? 
init : ReduceOp::identity_element; - } else { - major_init = init; - } - } - - edge_partition_src_input_device_view_t edge_partition_src_value_input{}; - edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; - if constexpr (GraphViewType::is_storage_transposed) { - edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input); - edge_partition_dst_value_input = - edge_partition_dst_input_device_view_t(edge_dst_value_input, i); - } else { - edge_partition_src_value_input = - edge_partition_src_input_device_view_t(edge_src_value_input, i); - edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input); - } - auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, i); - - auto major_buffer_first = - get_dataframe_buffer_begin(major_tmp_buffers[i % major_tmp_buffers.size()]); - - std::conditional_t, - VertexValueOutputIterator> - output_buffer{}; - if constexpr (GraphViewType::is_multi_gpu) { - if constexpr (update_major) { - output_buffer = major_buffer_first; - } else { - output_buffer = edge_partition_minor_output_device_view_t(minor_tmp_buffer->mutable_view()); - } - } else { - output_buffer = vertex_value_output_first; - } - - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - if (segment_offsets) { - static_assert(detail::num_sparse_segments_per_vertex_partition == 3); - - // FIXME: we may further improve performance by 1) individually tuning block sizes for - // different segments; and 2) adding one more segment for very high degree vertices and - // running segmented reduction - if (edge_partition.dcs_nzd_vertex_count()) { - auto exec_stream = - stream_pool_indices - ? handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) - : handle.get_stream(); - - if constexpr (update_major) { // this is necessary as we don't visit every vertex in the - // hypersparse segment - thrust::fill(rmm::exec_policy(exec_stream), - output_buffer + (*segment_offsets)[3], - output_buffer + (*segment_offsets)[4], - major_init); - } - - if (*(edge_partition.dcs_nzd_vertex_count()) > 0) { - raft::grid_1d_thread_t update_grid(*(edge_partition.dcs_nzd_vertex_count()), - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[3]; } - detail::per_v_transform_reduce_e_hypersparse - <<>>( - edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - segment_output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op); - } - } - if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { - auto exec_stream = stream_pool_indices - ? 
handle.get_stream_from_stream_pool((i * max_segments + 1) % - (*stream_pool_indices).size()) - : handle.get_stream(); - raft::grid_1d_thread_t update_grid((*segment_offsets)[3] - (*segment_offsets)[2], - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[2]; } - detail::per_v_transform_reduce_e_low_degree - <<>>( - edge_partition, - edge_partition.major_range_first() + (*segment_offsets)[2], - edge_partition.major_range_first() + (*segment_offsets)[3], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - segment_output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op); - } - if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { - auto exec_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((i * max_segments + 2) % - (*stream_pool_indices).size()) - : handle.get_stream(); - raft::grid_1d_warp_t update_grid((*segment_offsets)[2] - (*segment_offsets)[1], - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[1]; } - detail::per_v_transform_reduce_e_mid_degree - <<>>( - edge_partition, - edge_partition.major_range_first() + (*segment_offsets)[1], - edge_partition.major_range_first() + (*segment_offsets)[2], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - segment_output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op); - } - if ((*segment_offsets)[1] > 0) { - auto exec_stream = stream_pool_indices - ? 
handle.get_stream_from_stream_pool((i * max_segments + 3) % - (*stream_pool_indices).size()) - : handle.get_stream(); - raft::grid_1d_block_t update_grid((*segment_offsets)[1], - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - detail::per_v_transform_reduce_e_high_degree - <<>>( - edge_partition, - edge_partition.major_range_first(), - edge_partition.major_range_first() + (*segment_offsets)[1], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op); - } - } else { - if (edge_partition.major_range_size() > 0) { - raft::grid_1d_thread_t update_grid(edge_partition.major_range_size(), - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - detail::per_v_transform_reduce_e_low_degree - <<>>( - edge_partition, - edge_partition.major_range_first(), - edge_partition.major_range_last(), - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op); - } - } - - if constexpr (GraphViewType::is_multi_gpu && update_major) { - auto& comm = handle.get_comms(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); - - if (segment_offsets && stream_pool_indices) { - if (edge_partition.dcs_nzd_vertex_count()) { - device_reduce( - minor_comm, - major_buffer_first + (*segment_offsets)[3], - vertex_value_output_first + (*segment_offsets)[3], - (*segment_offsets)[4] - (*segment_offsets)[3], - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size())); - } - if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { - device_reduce(minor_comm, - major_buffer_first + (*segment_offsets)[2], - vertex_value_output_first + (*segment_offsets)[2], - (*segment_offsets)[3] - (*segment_offsets)[2], - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream_from_stream_pool((i * max_segments + 1) % - (*stream_pool_indices).size())); - } - if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { - device_reduce(minor_comm, - major_buffer_first + (*segment_offsets)[1], - vertex_value_output_first + (*segment_offsets)[1], - (*segment_offsets)[2] - (*segment_offsets)[1], - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream_from_stream_pool((i * max_segments + 2) % - (*stream_pool_indices).size())); - } - if ((*segment_offsets)[1] > 0) { - device_reduce(minor_comm, - major_buffer_first, - vertex_value_output_first, - (*segment_offsets)[1], - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream_from_stream_pool((i * max_segments + 3) % - (*stream_pool_indices).size())); - } - } else { - size_t reduction_size = static_cast( - segment_offsets ? 
*((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ - : edge_partition.major_range_size()); - device_reduce(minor_comm, - major_buffer_first, - vertex_value_output_first, - reduction_size, - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream()); - } - } - - if (stream_pool_indices && ((i + 1) % major_tmp_buffers.size() == 0)) { - handle.sync_stream_pool( - *stream_pool_indices); // to prevent buffer over-write (this can happen as *segment_offsets - // do not necessarily coincide in different edge partitions). - } + if (do_expensive_check) { + // currently, nothing to do } - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } - - if constexpr (GraphViewType::is_multi_gpu && !update_major) { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_rank = major_comm.get_rank(); - auto const major_comm_size = major_comm.get_size(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); - - auto view = minor_tmp_buffer->view(); - if (view.keys()) { // applying the initial value is deferred to here - vertex_t max_vertex_partition_size{0}; - for (int i = 0; i < major_comm_size; ++i) { - auto this_segment_vertex_partition_id = - compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - max_vertex_partition_size = - std::max(max_vertex_partition_size, - graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)); - } - auto tx_buffer = allocate_dataframe_buffer(max_vertex_partition_size, handle.get_stream()); - auto tx_buffer_first = get_dataframe_buffer_begin(tx_buffer); - std::optional> minor_key_offsets{}; - if constexpr (GraphViewType::is_storage_transposed) { - minor_key_offsets = graph_view.local_sorted_unique_edge_src_vertex_partition_offsets(); - } else { - minor_key_offsets = graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets(); - } - for (int i = 0; i < major_comm_size; ++i) { - auto minor_init = (major_comm_rank == i) ? 
init : ReduceOp::identity_element; - auto this_segment_vertex_partition_id = - compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - thrust::fill(handle.get_thrust_policy(), - tx_buffer_first, - tx_buffer_first + - graph_view.vertex_partition_range_size(this_segment_vertex_partition_id), - minor_init); - auto value_first = thrust::make_transform_iterator( - view.value_first(), - cuda::proclaim_return_type( - [reduce_op, minor_init] __device__(auto val) { return reduce_op(val, minor_init); })); - thrust::scatter(handle.get_thrust_policy(), - value_first + (*minor_key_offsets)[i], - value_first + (*minor_key_offsets)[i + 1], - thrust::make_transform_iterator( - (*(view.keys())).begin() + (*minor_key_offsets)[i], - cuda::proclaim_return_type( - [key_first = graph_view.vertex_partition_range_first( - this_segment_vertex_partition_id)] __device__(auto key) { - return key - key_first; - })), - tx_buffer_first); - device_reduce(major_comm, - tx_buffer_first, - vertex_value_output_first, - static_cast( - graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)), - ReduceOp::compatible_raft_comms_op, - i, - handle.get_stream()); - } - } else { - auto first_segment_vertex_partition_id = - compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(0); - vertex_t minor_range_first = - graph_view.vertex_partition_range_first(first_segment_vertex_partition_id); - for (int i = 0; i < major_comm_size; ++i) { - auto this_segment_vertex_partition_id = - compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - auto offset = graph_view.vertex_partition_range_first(this_segment_vertex_partition_id) - - minor_range_first; - device_reduce(major_comm, - view.value_first() + offset, - vertex_value_output_first, - static_cast( - graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)), - ReduceOp::compatible_raft_comms_op, - i, - handle.get_stream()); - } - } - } + constexpr bool incoming = true; + + detail::per_v_transform_reduce_e( + handle, + graph_view, + key_list.begin(), + key_list.end(), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + detail::const_true_e_op_t{}, + vertex_value_output_first); } -} // namespace detail - /** - * @brief Iterate over every vertex's incoming edges to update vertex properties. + * @brief Iterate over every vertex's outgoing edges to update vertex properties. * - * This function is inspired by thrust::transform_reduce. + * This function is inspired by thrust::transform_reduce(). * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. @@ -1131,8 +240,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, * @param edge_src_value_input Wrapper used to access source input property values (for the edge * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() - * (if @p e_op does not access source property values). Use update_edge_src_property to - * fill the wrapper. + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. 
 * @param edge_dst_value_input Wrapper used to access destination input property values (for the
 * edge destinations assigned to this process in multi-GPU). Use either
 * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or
 * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property
 * values). Use update_edge_dst_property to fill the wrapper.
@@ -1145,14 +254,16 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
 * @param e_op Quinary operator takes edge source, edge destination, property values for the source,
 * destination, and edge and returns a value to be reduced.
 * @param init Initial value to be added to the reduced @p e_op return values for each vertex.
+ * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the
+ * (tagged-)vertices with 0 outgoing edges.
 * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one.
 * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is
 * recommended to use the pre-defined reduction operators whenever possible as the current (and
 * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has
 * known member variables) to take a more optimized code path. See the documentation in the
 * reduce_op.cuh file for instructions on writing custom reduction operators.
- * @param vertex_value_output_first Iterator pointing to the vertex property variables for the first
- * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last`
+ * @param vertex_value_output_first Iterator pointing to the vertex property variables for the
+ * first (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last`
 * (exclusive) is deduced as @p vertex_value_output_first + @p
 * graph_view.local_vertex_partition_range_size().
 * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
@@ -1165,7 +276,7 @@ template
-void per_v_transform_reduce_incoming_e(raft::handle_t const& handle,
+void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle,
 GraphViewType const& graph_view,
 EdgeSrcValueInputWrapper edge_src_value_input,
 EdgeDstValueInputWrapper edge_dst_value_input,
@@ -1180,23 +291,37 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle,
 // currently, nothing to do
 }
- detail::per_v_transform_reduce_e(handle,
- graph_view,
- edge_src_value_input,
- edge_dst_value_input,
- edge_value_input,
- e_op,
- init,
- reduce_op,
- vertex_value_output_first);
+ constexpr bool incoming = false;
+
+ detail::per_v_transform_reduce_e(
+ handle,
+ graph_view,
+ static_cast(nullptr),
+ static_cast(nullptr),
+ edge_src_value_input,
+ edge_dst_value_input,
+ edge_value_input,
+ e_op,
+ init,
+ reduce_op,
+ detail::const_true_e_op_t{},
+ vertex_value_output_first);
 }
 
 /**
- * @brief Iterate over every vertex's outgoing edges to update vertex properties.
+ * @brief For each (tagged-)vertex in the input (tagged-)vertex list, iterate over the outgoing
+ * edges to update (tagged-)vertex properties.
 *
 * This function is inspired by thrust::transform_reduce().
 *
 * @tparam GraphViewType Type of the passed non-owning graph object.
+ * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex
+ * list.
 * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values.
 * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values.
 * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values.
@@ -1207,6 +332,8 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle,
 * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
 * handles to various CUDA libraries) to run graph algorithms.
 * @param graph_view Non-owning graph object.
+ * @param key_list KeyBucketType class object to store the (tagged-)vertex list to update
+ * (tagged-)vertex properties.
 * @param edge_src_value_input Wrapper used to access source input property values (for the edge
 * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view()
 * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view()
@@ -1223,20 +350,22 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle,
 * access edge property values).
 * @param e_op Quinary operator takes edge source, edge destination, property values for the source,
 * destination, and edge and returns a value to be reduced.
- * @param init Initial value to be added to the reduced @p e_op return values for each vertex.
+ * @param init Initial value to be reduced with the reduced @p e_op return values for each vertex.
+ * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the
+ * (tagged-)vertices with 0 outgoing edges.
 * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one.
 * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is
 * recommended to use the pre-defined reduction operators whenever possible as the current (and
 * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has
 * known member variables) to take a more optimized code path. See the documentation in the
 * reduce_op.cuh file for instructions on writing custom reduction operators.
- * @param vertex_value_output_first Iterator pointing to the vertex property variables for the
- * first (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last`
- * (exclusive) is deduced as @p vertex_value_output_first + @p
- * graph_view.local_vertex_partition_range_size().
+ * @param vertex_value_output_first Iterator pointing to the (tagged-)vertex property variables for
+ * the first (inclusive) (tagged-)vertex in @p key_list. `vertex_value_output_last` (exclusive) is
+ * deduced as @p vertex_value_output_first + @p key_list.size().
 * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
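+ *
+ * A minimal usage sketch of this keyed overload (illustrative only; it assumes a non-transposed
+ * graph view `graph_view`, a sorted-unique vertex bucket obtained from a cugraph::vertex_frontier_t
+ * named `frontier`, and a preallocated output buffer `per_key_results` of size
+ * frontier.bucket(0).size(); every name other than the primitive and the dummy property views is
+ * hypothetical):
+ *
+ *   per_v_transform_reduce_outgoing_e(
+ *     handle,
+ *     graph_view,
+ *     frontier.bucket(0),
+ *     cugraph::edge_src_dummy_property_t{}.view(),
+ *     cugraph::edge_dst_dummy_property_t{}.view(),
+ *     cugraph::edge_dummy_property_t{}.view(),
+ *     [] __device__(auto src, auto dst, auto, auto, auto) {
+ *       return int32_t{1};  // each outgoing edge contributes 1
+ *     },
+ *     int32_t{0},  // init
+ *     cugraph::reduce_op::plus<int32_t>{},
+ *     per_key_results.begin());  // yields the out-degree of each frontier vertex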
*/ template void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle, GraphViewType const& graph_view, + KeyBucketType const& key_list, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, @@ -1255,19 +385,33 @@ void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle, VertexValueOutputIterator vertex_value_output_first, bool do_expensive_check = false) { + static_assert(!GraphViewType::is_storage_transposed); + static_assert(KeyBucketType::is_sorted_unique); + if (do_expensive_check) { // currently, nothing to do } - detail::per_v_transform_reduce_e(handle, - graph_view, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - init, - reduce_op, - vertex_value_output_first); + constexpr bool incoming = false; + + detail::per_v_transform_reduce_e( + handle, + graph_view, + key_list.begin(), + key_list.end(), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + detail::const_true_e_op_t{}, + vertex_value_output_first); } } // namespace cugraph diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh new file mode 100644 index 00000000000..87f590f571f --- /dev/null +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh @@ -0,0 +1,1196 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "detail/graph_partition_utils.cuh" +#include "prims/detail/extract_transform_v_frontier_e.cuh" +#include "prims/detail/prim_utils.cuh" +#include "prims/property_op_utils.cuh" +#include "prims/reduce_op.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace cugraph { + +namespace detail { + +int32_t constexpr update_v_frontier_from_outgoing_e_kernel_block_size = 512; + +template +struct transform_reduce_v_frontier_call_e_op_t { + EdgeOp e_op{}; + + __device__ thrust::optional< + std::conditional_t && !std::is_same_v, + thrust::tuple, + std::conditional_t, key_t, payload_t>>> + operator()(key_t key, vertex_t dst, src_value_t sv, dst_value_t dv, e_value_t ev) const + { + auto e_op_result = e_op(key, dst, sv, dv, ev); + if (e_op_result.has_value()) { + auto reduce_by = dst; + if constexpr (std::is_same_v && std::is_same_v) { + return reduce_by; + } else if constexpr (std::is_same_v && !std::is_same_v) { + return thrust::make_tuple(reduce_by, *e_op_result); + } else if constexpr (!std::is_same_v && std::is_same_v) { + return thrust::make_tuple(reduce_by, *e_op_result); + } else { + return thrust::make_tuple(thrust::make_tuple(reduce_by, thrust::get<0>(*e_op_result)), + thrust::get<1>(*e_op_result)); + } + } else { + return thrust::nullopt; + } + } +}; + +template +struct update_keep_flag_t { + using input_key_t = + typename thrust::iterator_traits::value_type; // uint32_t (compressed) or + // key_t (i.e. 
vertex_t) + + raft::device_span bitmap{}; + raft::device_span keep_flags{}; + key_t v_range_first{}; + InputKeyIterator input_key_first{}; + thrust::optional invalid_input_key{}; + + __device__ void operator()(size_t i) const + { + auto v = *(input_key_first + i); + if (invalid_input_key && (v == *invalid_input_key)) { + return; // just discard + } + input_key_t v_offset{}; + if constexpr ((sizeof(key_t) == 8) && std::is_same_v) { + v_offset = v; + } else { + v_offset = v - v_range_first; + } + cuda::atomic_ref bitmap_word( + bitmap[packed_bool_offset(v_offset)]); + auto old = bitmap_word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + if ((old & packed_bool_mask(v_offset)) == packed_bool_empty_mask()) { + cuda::atomic_ref keep_flag_word( + keep_flags[packed_bool_offset(i)]); + keep_flag_word.fetch_or(packed_bool_mask(i), cuda::std::memory_order_relaxed); + } + } +}; + +template +std::tuple, optional_dataframe_buffer_type_t> +filter_buffer_elements( + raft::handle_t const& handle, + rmm::device_uvector&& + unique_v_buffer, // assumes that buffer elements are locally reduced first and unique + optional_dataframe_buffer_type_t&& payload_buffer, + raft::device_span vertex_partition_range_offsets, // size = major_comm_size + 1 + vertex_t allreduce_count_per_rank, + int subgroup_size) +{ + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + auto const major_comm_size = major_comm.get_size(); + + rmm::device_uvector priorities(allreduce_count_per_rank * major_comm_size, + handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + priorities.begin(), + priorities.end(), + std::numeric_limits::max()); + thrust::for_each( + handle.get_thrust_policy(), + unique_v_buffer.begin(), + unique_v_buffer.end(), + [offsets = vertex_partition_range_offsets, + priorities = raft::device_span(priorities.data(), priorities.size()), + allreduce_count_per_rank, + subgroup_size, + major_comm_rank, + major_comm_size] __device__(auto v) { + auto root = + thrust::distance(offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), v)); + auto v_offset = v - offsets[root]; + if (v_offset < allreduce_count_per_rank) { + priorities[allreduce_count_per_rank * root + v_offset] = + rank_to_priority( + major_comm_rank, root, subgroup_size, major_comm_size, v_offset); + } + }); + device_allreduce(major_comm, + priorities.data(), + priorities.data(), + priorities.size(), + raft::comms::op_t::MIN, + handle.get_stream()); + if constexpr (std::is_same_v) { + unique_v_buffer.resize( + thrust::distance( + unique_v_buffer.begin(), + thrust::remove_if( + handle.get_thrust_policy(), + unique_v_buffer.begin(), + unique_v_buffer.end(), + unique_v_buffer.begin(), + [offsets = vertex_partition_range_offsets, + priorities = raft::device_span(priorities.data(), priorities.size()), + allreduce_count_per_rank, + subgroup_size, + major_comm_rank, + major_comm_size] __device__(auto v) { + auto root = thrust::distance( + offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), v)); + auto v_offset = v - offsets[root]; + if (v_offset < allreduce_count_per_rank) { + auto selected_rank = priority_to_rank( + priorities[allreduce_count_per_rank * root + v_offset], + root, + subgroup_size, + major_comm_size, + v_offset); + return major_comm_rank != selected_rank; + } else { + return false; + } + })), + handle.get_stream()); + } else { + auto kv_pair_first = 
thrust::make_zip_iterator(unique_v_buffer.begin(), + get_dataframe_buffer_begin(payload_buffer)); + unique_v_buffer.resize( + thrust::distance( + kv_pair_first, + thrust::remove_if( + handle.get_thrust_policy(), + kv_pair_first, + kv_pair_first + unique_v_buffer.size(), + unique_v_buffer.begin(), + [offsets = vertex_partition_range_offsets, + priorities = raft::device_span(priorities.data(), priorities.size()), + allreduce_count_per_rank, + subgroup_size, + major_comm_rank, + major_comm_size] __device__(auto v) { + auto root = thrust::distance( + offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), v)); + auto v_offset = v - offsets[root]; + if (v_offset < allreduce_count_per_rank) { + auto selected_rank = priority_to_rank( + priorities[allreduce_count_per_rank * root + v_offset], + root, + subgroup_size, + major_comm_size, + v_offset); + return major_comm_rank != selected_rank; + } else { + return false; + } + })), + handle.get_stream()); + resize_dataframe_buffer(payload_buffer, unique_v_buffer.size(), handle.get_stream()); + } + + return std::make_tuple(std::move(unique_v_buffer), std::move(payload_buffer)); +} + +template +std::tuple, optional_dataframe_buffer_type_t> +sort_and_reduce_buffer_elements( + raft::handle_t const& handle, + dataframe_buffer_type_t&& key_buffer, + optional_dataframe_buffer_type_t&& payload_buffer, + ReduceOp reduce_op, + std::conditional_t, std::tuple, std::byte /* dummy */> + vertex_range, + std::optional invalid_key /* drop (key, (payload)) pairs with invalid key */) +{ + constexpr bool compressed = + std::is_integral_v && (sizeof(key_t) == 8) && + std::is_same_v; // we currently compress only when key_t is an integral + // type (i.e. vertex_t) + static_assert(compressed || std::is_same_v); + + if constexpr (std::is_integral_v && + (std::is_same_v || + std::is_same_v>)) { // try to use + // bitmap for + // filtering + key_t range_size = std::get<1>(vertex_range) - std::get<0>(vertex_range); + if (static_cast(size_dataframe_buffer(key_buffer)) >= + static_cast(range_size) * + 0.125 /* tuning parameter */) { // use bitmap for filtering + rmm::device_uvector bitmap(packed_bool_size(range_size), handle.get_stream()); + rmm::device_uvector keep_flags(packed_bool_size(size_dataframe_buffer(key_buffer)), + handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); + thrust::fill( + handle.get_thrust_policy(), keep_flags.begin(), keep_flags.end(), packed_bool_empty_mask()); + thrust::for_each(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(size_dataframe_buffer(key_buffer)), + update_keep_flag_t{ + raft::device_span(bitmap.data(), bitmap.size()), + raft::device_span(keep_flags.data(), keep_flags.size()), + std::get<0>(vertex_range), + get_dataframe_buffer_begin(key_buffer), + to_thrust_optional(invalid_key)}); + auto stencil_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [keep_flags = raft::device_span(keep_flags.data(), + keep_flags.size())] __device__(size_t i) { + return (keep_flags[packed_bool_offset(i)] & packed_bool_mask(i)) != + packed_bool_empty_mask(); + })); + if constexpr (std::is_same_v) { + resize_dataframe_buffer( + key_buffer, + thrust::distance(get_dataframe_buffer_begin(key_buffer), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + 
stencil_first, + is_not_equal_t{true})), + handle.get_stream()); + shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); + thrust::sort(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer)); + } else { + static_assert(std::is_same_v>); + auto pair_first = thrust::make_zip_iterator(get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_begin(payload_buffer)); + resize_dataframe_buffer( + key_buffer, + thrust::distance(pair_first, + thrust::remove_if(handle.get_thrust_policy(), + pair_first, + pair_first + size_dataframe_buffer(key_buffer), + stencil_first, + is_not_equal_t{true})), + handle.get_stream()); + resize_dataframe_buffer( + payload_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); + shrink_to_fit_dataframe_buffer(payload_buffer, handle.get_stream()); + thrust::sort_by_key(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + get_dataframe_buffer_begin(payload_buffer)); + } + + if constexpr (compressed) { + rmm::device_uvector output_key_buffer(key_buffer.size(), handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + key_buffer.begin(), + key_buffer.end(), + output_key_buffer.begin(), + cuda::proclaim_return_type( + [v_first = std::get<0>(vertex_range)] __device__(uint32_t v_offset) { + return static_cast(v_first + v_offset); + })); + return std::make_tuple(std::move(output_key_buffer), std::move(payload_buffer)); + } else { + return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); + } + } + } + + if constexpr (std::is_same_v) { + thrust::sort(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer)); + } else { + thrust::sort_by_key(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + get_optional_dataframe_buffer_begin(payload_buffer)); + } + + auto output_key_buffer = allocate_dataframe_buffer(0, handle.get_stream()); + if constexpr (std::is_same_v) { + if constexpr (compressed) { + resize_dataframe_buffer( + output_key_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + auto input_key_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(key_buffer), + cuda::proclaim_return_type( + [v_first = std::get<0>(vertex_range)] __device__(auto v_offset) { + return static_cast(v_first + v_offset); + })); + resize_dataframe_buffer( + output_key_buffer, + thrust::distance( + get_dataframe_buffer_begin(output_key_buffer), + thrust::copy_if(handle.get_thrust_policy(), + input_key_first, + input_key_first + size_dataframe_buffer(key_buffer), + thrust::make_counting_iterator(size_t{0}), + get_dataframe_buffer_begin(output_key_buffer), + cuda::proclaim_return_type( + [key_first = get_dataframe_buffer_begin(key_buffer), + invalid_key = to_thrust_optional(invalid_key)] __device__(size_t i) { + auto key = *(key_first + i); + if (invalid_key && (key == *invalid_key)) { + return false; + } else if ((i != 0) && (key == *(key_first + (i - 1)))) { + return false; + } else { + return true; + } + }))), + handle.get_stream()); + } else { + resize_dataframe_buffer( + key_buffer, + thrust::distance( + get_dataframe_buffer_begin(key_buffer), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + thrust::make_counting_iterator(size_t{0}), + 
cuda::proclaim_return_type( + [key_first = get_dataframe_buffer_begin(key_buffer), + invalid_key = to_thrust_optional(invalid_key)] __device__(size_t i) { + auto key = *(key_first + i); + if (invalid_key && (key == *invalid_key)) { + return true; + } else if ((i != 0) && (key == *(key_first + (i - 1)))) { + return true; + } else { + return false; + } + }))), + handle.get_stream()); + output_key_buffer = std::move(key_buffer); + } + shrink_to_fit_dataframe_buffer(output_key_buffer, handle.get_stream()); + } else if constexpr (std::is_same_v>) { + if constexpr (compressed) { + resize_dataframe_buffer( + output_key_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + auto input_key_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(key_buffer), + cuda::proclaim_return_type( + [v_first = std::get<0>(vertex_range)] __device__(auto v_offset) { + return static_cast(v_first + v_offset); + })); + auto tmp_payload_buffer = allocate_dataframe_buffer( + size_dataframe_buffer(payload_buffer), handle.get_stream()); + auto input_pair_first = + thrust::make_zip_iterator(input_key_first, get_dataframe_buffer_begin(payload_buffer)); + auto output_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(output_key_buffer), + get_dataframe_buffer_begin(tmp_payload_buffer)); + resize_dataframe_buffer( + output_key_buffer, + thrust::distance( + output_pair_first, + thrust::copy_if(handle.get_thrust_policy(), + input_pair_first, + input_pair_first + size_dataframe_buffer(key_buffer), + thrust::make_counting_iterator(size_t{0}), + output_pair_first, + cuda::proclaim_return_type( + [key_first = get_dataframe_buffer_begin(key_buffer), + invalid_key = to_thrust_optional(invalid_key)] __device__(size_t i) { + auto key = *(key_first + i); + if (invalid_key && (key == *invalid_key)) { + return false; + } else if ((i != 0) && (key == *(key_first + (i - 1)))) { + return false; + } else { + return true; + } + }))), + handle.get_stream()); + resize_dataframe_buffer( + tmp_payload_buffer, size_dataframe_buffer(output_key_buffer), handle.get_stream()); + payload_buffer = std::move(tmp_payload_buffer); + } else { + auto pair_first = thrust::make_zip_iterator(get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_begin(payload_buffer)); + resize_dataframe_buffer( + key_buffer, + thrust::distance( + pair_first, + thrust::remove_if(handle.get_thrust_policy(), + pair_first, + pair_first + size_dataframe_buffer(key_buffer), + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [key_first = get_dataframe_buffer_begin(key_buffer), + invalid_key = to_thrust_optional(invalid_key)] __device__(size_t i) { + auto key = *(key_first + i); + if (invalid_key && (key == *invalid_key)) { + return true; + } else if ((i != 0) && (key == *(key_first + (i - 1)))) { + return true; + } else { + return false; + } + }))), + handle.get_stream()); + resize_dataframe_buffer( + payload_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + output_key_buffer = std::move(key_buffer); + } + shrink_to_fit_dataframe_buffer(output_key_buffer, handle.get_stream()); + shrink_to_fit_dataframe_buffer(payload_buffer, handle.get_stream()); + } else { + if (invalid_key) { + auto pair_first = thrust::make_zip_iterator(get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_begin(payload_buffer)); + resize_dataframe_buffer( + key_buffer, + thrust::distance(pair_first, + thrust::remove_if(handle.get_thrust_policy(), + pair_first, + pair_first + size_dataframe_buffer(key_buffer), 
+ cuda::proclaim_return_type( + [invalid_key = *invalid_key] __device__(auto kv) { + auto key = thrust::get<0>(kv); + return key == invalid_key; + }))), + handle.get_stream()); + resize_dataframe_buffer( + payload_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + } + auto num_uniques = + thrust::count_if(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(size_dataframe_buffer(key_buffer)), + is_first_in_run_t{ + get_dataframe_buffer_begin(key_buffer)}); + + auto new_key_buffer = allocate_dataframe_buffer(num_uniques, handle.get_stream()); + auto new_payload_buffer = + allocate_dataframe_buffer(num_uniques, handle.get_stream()); + + if constexpr (compressed) { + auto input_key_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(key_buffer), + cuda::proclaim_return_type( + [v_first = std::get<0>(vertex_range)] __device__(auto v_offset) { + return static_cast(v_first + v_offset); + })); + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + size_dataframe_buffer(key_buffer), + get_optional_dataframe_buffer_begin(payload_buffer), + get_dataframe_buffer_begin(new_key_buffer), + get_dataframe_buffer_begin(new_payload_buffer), + thrust::equal_to(), + reduce_op); + } else { + thrust::reduce_by_key(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + get_optional_dataframe_buffer_begin(payload_buffer), + get_dataframe_buffer_begin(new_key_buffer), + get_dataframe_buffer_begin(new_payload_buffer), + thrust::equal_to(), + reduce_op); + } + + output_key_buffer = std::move(new_key_buffer); + payload_buffer = std::move(new_payload_buffer); + } + + return std::make_tuple(std::move(output_key_buffer), std::move(payload_buffer)); +} + +template +std::conditional_t< + !std::is_same_v, + std::tuple( + 0, rmm::cuda_stream_view{})), + decltype(detail::allocate_optional_dataframe_buffer( + 0, rmm::cuda_stream_view{}))>, + decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{}))> +transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& frontier, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + ReduceOp reduce_op, + bool do_expensive_check = false) +{ + static_assert(!GraphViewType::is_storage_transposed, + "GraphViewType should support the push model."); + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = typename KeyBucketType::key_type; + using payload_t = typename ReduceOp::value_type; + + if (do_expensive_check) { + // currently, nothing to do + } + + // 1. fill the buffer + + detail::transform_reduce_v_frontier_call_e_op_t + e_op_wrapper{e_op}; + + auto [key_buffer, payload_buffer] = + detail::extract_transform_v_frontier_e(handle, + graph_view, + frontier, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op_wrapper, + do_expensive_check); + // 2. 
reduce the buffer + + std::vector vertex_partition_range_offsets{}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + vertex_partition_range_offsets = std::vector(major_comm_size + 1); + for (int i = 0; i < major_comm_size; ++i) { + auto vertex_partition_id = + detail::compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + vertex_partition_range_offsets[i] = + graph_view.vertex_partition_range_first(vertex_partition_id); + } + vertex_partition_range_offsets.back() = graph_view.local_edge_partition_dst_range_last(); + } else { + vertex_partition_range_offsets = + std::vector{graph_view.local_edge_partition_dst_range_first(), + graph_view.local_edge_partition_dst_range_last()}; + } + std::conditional_t, std::tuple, std::byte /* dummy */> + vertex_range{}; + if constexpr (std::is_integral_v) { + vertex_range = std::make_tuple(vertex_partition_range_offsets.front(), + vertex_partition_range_offsets.back()); + } + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + reduce_op, + vertex_range, + std::nullopt); + if constexpr (GraphViewType::is_multi_gpu) { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + if (major_comm_size > 1) { + size_t local_key_buffer_size = size_dataframe_buffer(key_buffer); + auto avg_key_buffer_size = + host_scalar_allreduce( + major_comm, local_key_buffer_size, raft::comms::op_t::SUM, handle.get_stream()) / + major_comm_size; + + rmm::device_uvector d_vertex_partition_range_offsets( + vertex_partition_range_offsets.size(), handle.get_stream()); + raft::update_device(d_vertex_partition_range_offsets.data(), + vertex_partition_range_offsets.data(), + vertex_partition_range_offsets.size(), + handle.get_stream()); + + constexpr bool try_compression = (sizeof(vertex_t) == 8) && std::is_same_v; + std::conditional_t + max_vertex_partition_size{}; + if constexpr (try_compression) { + for (int i = 0; i < major_comm_size; ++i) { + max_vertex_partition_size = + std::max(vertex_partition_range_offsets[i + 1] - vertex_partition_range_offsets[i], + max_vertex_partition_size); + } + } + + if constexpr (std::is_same_v && + std::is_same_v>) { + vertex_t min_vertex_partition_size = std::numeric_limits::max(); + for (int i = 0; i < major_comm_size; ++i) { + min_vertex_partition_size = + std::min(vertex_partition_range_offsets[i + 1] - vertex_partition_range_offsets[i], + min_vertex_partition_size); + } + + auto segment_offsets = graph_view.local_vertex_partition_segment_offsets(); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + if (segment_offsets && + (static_cast(avg_key_buffer_size) > + static_cast(graph_view.number_of_vertices() / comm_size) * + double{0.2})) { // duplicates expected for high in-degree vertices (and we assume + // correlation between in-degrees & out-degrees) // FIXME: we need + // a better criterion + size_t key_size{0}; + size_t payload_size{0}; + if constexpr (try_compression) { + 
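+            // vertex IDs are 8 bytes whenever try_compression is true; if every destination
+            // offset within a major_comm partition also fits in 32 bits, keys are shuffled as
+            // uint32_t offsets, so the communication-volume estimate below uses the compressed
+            // key size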
if (max_vertex_partition_size <= std::numeric_limits::max()) { + key_size = sizeof(uint32_t); + } else { + key_size = sizeof(key_t); + } + } else { + if constexpr (std::is_arithmetic_v) { + key_size = sizeof(key_t); + } else { + key_size = sum_thrust_tuple_element_sizes(); + } + } + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + payload_size = sizeof(payload_t); + } else { + payload_size = sum_thrust_tuple_element_sizes(); + } + } + + int subgroup_size{}; + int num_gpus_per_node{}; + RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); + if (comm_size <= num_gpus_per_node) { + subgroup_size = major_comm_size; + } else { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + subgroup_size = partition_manager::map_major_comm_to_gpu_row_comm + ? std::min(major_comm_size, num_gpus_per_node) + : std::max(num_gpus_per_node / minor_comm_size, int{1}); + } + + auto p2p_size_per_rank = avg_key_buffer_size * (key_size + payload_size); + auto p2p_size_per_node = p2p_size_per_rank * std::min(num_gpus_per_node, comm_size); + auto allreduce_size_per_node = p2p_size_per_node / 16 /* tuning parameter */; + auto allreduce_size_per_rank = + allreduce_size_per_node / (major_comm_size * (num_gpus_per_node / subgroup_size)); + + if (major_comm_size <= std::numeric_limits::max()) { // priority = uint8_t + std::tie(key_buffer, payload_buffer) = + filter_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + raft::device_span(d_vertex_partition_range_offsets.data(), + d_vertex_partition_range_offsets.size()), + std::min(static_cast(allreduce_size_per_rank / sizeof(uint8_t)), + min_vertex_partition_size), + subgroup_size); + } else { // priority = uint32_t + std::tie(key_buffer, payload_buffer) = + filter_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + raft::device_span(d_vertex_partition_range_offsets.data(), + d_vertex_partition_range_offsets.size()), + std::min(static_cast(allreduce_size_per_rank / sizeof(uint32_t)), + min_vertex_partition_size), + subgroup_size); + } + } + } + + rmm::device_uvector d_tx_buffer_last_boundaries(major_comm_size, handle.get_stream()); + auto key_v_first = + thrust_tuple_get_or_identity( + get_dataframe_buffer_begin(key_buffer)); + thrust::lower_bound(handle.get_thrust_policy(), + key_v_first, + key_v_first + size_dataframe_buffer(key_buffer), + d_vertex_partition_range_offsets.begin() + 1, + d_vertex_partition_range_offsets.end(), + d_tx_buffer_last_boundaries.begin()); + std::conditional_t>, + std::byte /* dummy */> + compressed_v_buffer{}; + if constexpr (try_compression) { + if (max_vertex_partition_size <= std::numeric_limits::max()) { + compressed_v_buffer = + rmm::device_uvector(size_dataframe_buffer(key_buffer), handle.get_stream()); + thrust::transform( + handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + (*compressed_v_buffer).begin(), + cuda::proclaim_return_type( + [firsts = raft::device_span(d_vertex_partition_range_offsets.data(), + static_cast(major_comm_size)), + lasts = raft::device_span( + d_vertex_partition_range_offsets.data() + 1, + static_cast(major_comm_size))] __device__(auto v) { + auto major_comm_rank = thrust::distance( + lasts.begin(), thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), v)); + return static_cast(v - firsts[major_comm_rank]); + })); + resize_dataframe_buffer(key_buffer, 0, 
handle.get_stream()); + shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); + } + } + std::vector h_tx_buffer_last_boundaries(d_tx_buffer_last_boundaries.size()); + raft::update_host(h_tx_buffer_last_boundaries.data(), + d_tx_buffer_last_boundaries.data(), + d_tx_buffer_last_boundaries.size(), + handle.get_stream()); + handle.sync_stream(); + std::vector tx_counts(h_tx_buffer_last_boundaries.size()); + std::adjacent_difference( + h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); + + size_t min_element_size{cache_line_size}; + if constexpr (std::is_same_v) { + if constexpr (try_compression) { + if (compressed_v_buffer) { + min_element_size = std::min(sizeof(uint32_t), min_element_size); + } else { + min_element_size = std::min(sizeof(key_t), min_element_size); + } + } else { + min_element_size = std::min(sizeof(key_t), min_element_size); + } + } else { + static_assert(is_thrust_tuple_of_arithmetic::value); + min_element_size = + std::min(cugraph::min_thrust_tuple_element_sizes(), min_element_size); + } + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + min_element_size = std::min(sizeof(payload_t), min_element_size); + } else { + static_assert(is_thrust_tuple_of_arithmetic::value); + min_element_size = + std::min(cugraph::min_thrust_tuple_element_sizes(), min_element_size); + } + } + assert((cache_line_size % min_element_size) == 0); + auto alignment = cache_line_size / min_element_size; + std::optional, key_t>> + invalid_key{std::nullopt}; + + if (avg_key_buffer_size >= alignment * size_t{128} /* 128 tuning parameter */) { + if constexpr (std::is_same_v) { + if constexpr (try_compression) { + if (compressed_v_buffer) { + invalid_key = std::numeric_limits::max(); + } else { + invalid_key = invalid_vertex_id_v; + } + } else { + invalid_key = invalid_vertex_id_v; + } + } else { + invalid_key = key_t{}; + thrust::get<0>(*invalid_key) = invalid_vertex_id_v; + } + + if constexpr (try_compression) { + if (compressed_v_buffer) { + auto rx_compressed_v_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_compressed_v_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = shuffle_values(major_comm, + get_dataframe_buffer_begin(*compressed_v_buffer), + tx_counts, + alignment, + std::make_optional(std::get<1>(*invalid_key)), + handle.get_stream()); + compressed_v_buffer = std::move(rx_compressed_v_buffer); + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = shuffle_values(major_comm, + get_dataframe_buffer_begin(key_buffer), + tx_counts, + alignment, + std::make_optional(std::get<0>(*invalid_key)), + handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + } + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = shuffle_values(major_comm, + get_dataframe_buffer_begin(key_buffer), + tx_counts, + alignment, + invalid_key, + handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + } + if constexpr (!std::is_same_v) { + auto rx_payload_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_payload_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = 
shuffle_values(major_comm, + get_dataframe_buffer_begin(payload_buffer), + tx_counts, + alignment, + std::nullopt, + handle.get_stream()); + payload_buffer = std::move(rx_payload_buffer); + } + } else { + if constexpr (try_compression) { + if (compressed_v_buffer) { + auto rx_compressed_v_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_compressed_v_buffer, std::ignore) = + shuffle_values(major_comm, + get_dataframe_buffer_begin(*compressed_v_buffer), + tx_counts, + handle.get_stream()); + compressed_v_buffer = std::move(rx_compressed_v_buffer); + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, std::ignore) = shuffle_values( + major_comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + } + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, std::ignore) = shuffle_values( + major_comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + } + + if constexpr (!std::is_same_v) { + auto rx_payload_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_payload_buffer, std::ignore) = shuffle_values( + major_comm, get_dataframe_buffer_begin(payload_buffer), tx_counts, handle.get_stream()); + payload_buffer = std::move(rx_payload_buffer); + } + } + + if constexpr (std::is_integral_v) { + vertex_range = std::make_tuple(graph_view.local_vertex_partition_range_first(), + graph_view.local_vertex_partition_range_last()); + } + if constexpr (try_compression) { + if (compressed_v_buffer) { + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, + std::move(*compressed_v_buffer), + std::move(payload_buffer), + reduce_op, + vertex_range, + invalid_key ? std::make_optional(std::get<1>(*invalid_key)) : std::nullopt); + } else { + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + reduce_op, + vertex_range, + invalid_key ? 
std::make_optional(std::get<0>(*invalid_key)) : std::nullopt); + } + } else { + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + reduce_op, + vertex_range, + invalid_key); + } + } + } + + if constexpr (!std::is_same_v) { + return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); + } else { + return std::move(key_buffer); + } +} + +} // namespace detail + +template +size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& frontier) +{ + static_assert(!GraphViewType::is_storage_transposed, + "GraphViewType should support the push model."); + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = typename KeyBucketType::key_type; + + size_t ret{0}; + + auto local_frontier_vertex_first = + thrust_tuple_get_or_identity(frontier.begin()); + + std::vector local_frontier_sizes{}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + local_frontier_sizes = host_scalar_allgather(minor_comm, frontier.size(), handle.get_stream()); + } else { + local_frontier_sizes = std::vector{static_cast(frontier.size())}; + } + + auto edge_mask_view = graph_view.edge_mask_view(); + + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(i)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, i) + : thrust::nullopt; + + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + + rmm::device_uvector edge_partition_frontier_vertices(local_frontier_sizes[i], + handle.get_stream()); + device_bcast(minor_comm, + local_frontier_vertex_first, + edge_partition_frontier_vertices.data(), + local_frontier_sizes[i], + static_cast(i), + handle.get_stream()); + + if (edge_partition_e_mask) { + ret += + edge_partition.compute_number_of_edges_with_mask((*edge_partition_e_mask).value_first(), + edge_partition_frontier_vertices.begin(), + edge_partition_frontier_vertices.end(), + handle.get_stream()); + } else { + ret += edge_partition.compute_number_of_edges(edge_partition_frontier_vertices.begin(), + edge_partition_frontier_vertices.end(), + handle.get_stream()); + } + } else { + assert(i == 0); + if (edge_partition_e_mask) { + ret += edge_partition.compute_number_of_edges_with_mask( + (*edge_partition_e_mask).value_first(), + local_frontier_vertex_first, + local_frontier_vertex_first + frontier.size(), + handle.get_stream()); + } else { + ret += edge_partition.compute_number_of_edges(local_frontier_vertex_first, + local_frontier_vertex_first + frontier.size(), + handle.get_stream()); + } + } + } + + return ret; +} + +/** + * @brief Iterate over outgoing edges from the current vertex frontier and reduce valid edge functor + * outputs by (tagged-)destination ID. + * + * Edge functor outputs are thrust::optional objects and invalid if thrust::nullopt. Vertices are + * assumed to be tagged if KeyBucketType::key_type is a tuple of a vertex type and a tag + * type (KeyBucketType::key_type is identical to a vertex type otherwise). 
+ * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam KeyBucketType Type of the vertex frontier bucket class which abstracts the + * current (tagged-)vertex frontier. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param frontier KeyBucketType class object for the current vertex frontier. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. + * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator that takes edge (tagged-)source, edge destination, property values for + * the source, destination, and edge and returns 1) thrust::nullopt (if invalid and to be + * discarded); 2) dummy (but valid) thrust::optional object (e.g. + * thrust::optional<std::byte>{std::byte{0}}, if vertices are not tagged and ReduceOp::value_type is + * void); 3) a tag (if vertices are tagged and ReduceOp::value_type is void); 4) a value to be + * reduced using the @p reduce_op (if vertices are not tagged and ReduceOp::value_type is not void); + * or 5) a tuple of a tag and a value to be reduced (if vertices are tagged and ReduceOp::value_type + * is not void). + * @param reduce_op Binary operator that takes two input arguments and reduces the two values to one. + * There are pre-defined reduction operators in prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @return Tuple of key values and payload values (if ReduceOp::value_type is not void) or just key + * values (if ReduceOp::value_type is void). Keys in the return values are sorted in ascending order + * using a vertex ID as the primary key and a tag (if relevant) as the secondary key. 
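Concretely, a BFS-style call site for this primitive might look like the following sketch (hypothetical names: handle, graph_view, frontier_bucket, and the return_predecessor_e_op_t functor sketched earlier; reduce_op::any<vertex_t> keeps an arbitrary predecessor per destination):

```cpp
// Sketch only: the e_op reads no properties, so dummy property views are passed.
auto [new_frontier_vertices, predecessors] =
  cugraph::transform_reduce_v_frontier_outgoing_e_by_dst(
    handle,
    graph_view,
    frontier_bucket,
    cugraph::edge_src_dummy_property_t{}.view(),
    cugraph::edge_dst_dummy_property_t{}.view(),
    cugraph::edge_dummy_property_t{}.view(),
    return_predecessor_e_op_t<vertex_t>{},
    cugraph::reduce_op::any<vertex_t>{});
```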
+ */ +template +std::conditional_t< + !std::is_same_v, + std::tuple( + 0, rmm::cuda_stream_view{})), + decltype(detail::allocate_optional_dataframe_buffer( + 0, rmm::cuda_stream_view{}))>, + decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{}))> +transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& frontier, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + ReduceOp reduce_op, + bool do_expensive_check = false) +{ + return detail::transform_reduce_v_frontier_outgoing_e_by_dst(handle, + graph_view, + frontier, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + reduce_op, + do_expensive_check); +} + +} // namespace cugraph diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh deleted file mode 100644 index e58ab08fa97..00000000000 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh +++ /dev/null @@ -1,585 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include "detail/graph_partition_utils.cuh" -#include "prims/detail/extract_transform_v_frontier_e.cuh" -#include "prims/property_op_utils.cuh" -#include "prims/reduce_op.cuh" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -namespace cugraph { - -namespace detail { - -int32_t constexpr update_v_frontier_from_outgoing_e_kernel_block_size = 512; - -template -struct transform_reduce_v_frontier_call_e_op_t { - EdgeOp e_op{}; - - __device__ thrust::optional< - std::conditional_t && !std::is_same_v, - thrust::tuple, - std::conditional_t, key_t, payload_t>>> - operator()(key_t key, vertex_t dst, src_value_t sv, dst_value_t dv, e_value_t ev) const - { - auto e_op_result = e_op(key, dst, sv, dv, ev); - if (e_op_result.has_value()) { - auto reduce_by = reduce_by_src ? 
thrust_tuple_get_or_identity(key) : dst; - if constexpr (std::is_same_v && std::is_same_v) { - return reduce_by; - } else if constexpr (std::is_same_v && !std::is_same_v) { - return thrust::make_tuple(reduce_by, *e_op_result); - } else if constexpr (!std::is_same_v && std::is_same_v) { - return thrust::make_tuple(reduce_by, *e_op_result); - } else { - return thrust::make_tuple(thrust::make_tuple(reduce_by, thrust::get<0>(*e_op_result)), - thrust::get<1>(*e_op_result)); - } - } else { - return thrust::nullopt; - } - } -}; - -template -auto sort_and_reduce_buffer_elements( - raft::handle_t const& handle, - decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{}))&& key_buffer, - decltype(allocate_optional_dataframe_buffer(0, - rmm::cuda_stream_view{}))&& payload_buffer, - ReduceOp reduce_op) -{ - if constexpr (std::is_same_v) { - thrust::sort(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer)); - } else { - thrust::sort_by_key(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer), - get_optional_dataframe_buffer_begin(payload_buffer)); - } - - if constexpr (std::is_same_v) { - auto it = thrust::unique(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer)); - resize_dataframe_buffer( - key_buffer, - static_cast(thrust::distance(get_dataframe_buffer_begin(key_buffer), it)), - handle.get_stream()); - shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); - } else if constexpr (std::is_same_v>) { - auto it = thrust::unique_by_key(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer), - get_optional_dataframe_buffer_begin(payload_buffer)); - resize_dataframe_buffer(key_buffer, - static_cast(thrust::distance( - get_dataframe_buffer_begin(key_buffer), thrust::get<0>(it))), - handle.get_stream()); - resize_dataframe_buffer(payload_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); - shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); - shrink_to_fit_dataframe_buffer(payload_buffer, handle.get_stream()); - } else { - auto num_uniques = - thrust::count_if(handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(size_dataframe_buffer(key_buffer)), - is_first_in_run_t{ - get_dataframe_buffer_begin(key_buffer)}); - - auto new_key_buffer = allocate_dataframe_buffer(num_uniques, handle.get_stream()); - auto new_payload_buffer = - allocate_dataframe_buffer(num_uniques, handle.get_stream()); - - thrust::reduce_by_key(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer), - get_optional_dataframe_buffer_begin(payload_buffer), - get_dataframe_buffer_begin(new_key_buffer), - get_dataframe_buffer_begin(new_payload_buffer), - thrust::equal_to(), - reduce_op); - - key_buffer = std::move(new_key_buffer); - payload_buffer = std::move(new_payload_buffer); - } - - return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); -} - -template -std::conditional_t< - !std::is_same_v, - std::tuple( - 0, rmm::cuda_stream_view{})), - decltype(detail::allocate_optional_dataframe_buffer( - 0, rmm::cuda_stream_view{}))>, - decltype(allocate_dataframe_buffer( - 0, rmm::cuda_stream_view{}))> -transform_reduce_v_frontier_outgoing_e_by_src_dst(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, - 
EdgeSrcValueInputWrapper edge_src_value_input, - EdgeDstValueInputWrapper edge_dst_value_input, - EdgeValueInputWrapper edge_value_input, - EdgeOp e_op, - ReduceOp reduce_op, - bool do_expensive_check = false) -{ - static_assert(!GraphViewType::is_storage_transposed, - "GraphViewType should support the push model."); - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using key_t = typename VertexFrontierBucketType::key_type; - using payload_t = typename ReduceOp::value_type; - - if (do_expensive_check) { - // currently, nothing to do - } - - // 1. fill the buffer - - detail::transform_reduce_v_frontier_call_e_op_t - e_op_wrapper{e_op}; - - bool constexpr max_one_e_per_frontier_key = - reduce_by_src && std::is_same_v>; - auto [key_buffer, payload_buffer] = - detail::extract_transform_v_frontier_e( - handle, - graph_view, - frontier, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op_wrapper, - do_expensive_check); - - // 2. reduce the buffer - - std::tie(key_buffer, payload_buffer) = - detail::sort_and_reduce_buffer_elements( - handle, std::move(key_buffer), std::move(payload_buffer), reduce_op); - if constexpr (GraphViewType::is_multi_gpu) { - // FIXME: this step is unnecessary if major_comm_size== 1 - auto& comm = handle.get_comms(); - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_rank = major_comm.get_rank(); - auto const major_comm_size = major_comm.get_size(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); - - std::vector h_vertex_lasts(reduce_by_src ? minor_comm_size : major_comm_size); - for (size_t i = 0; i < h_vertex_lasts.size(); ++i) { - auto vertex_partition_id = - reduce_by_src - ? detail::compute_local_edge_partition_major_range_vertex_partition_id_t{major_comm_size, - minor_comm_size, - major_comm_rank, - minor_comm_rank}( - i) - : detail::compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - h_vertex_lasts[i] = graph_view.vertex_partition_range_last(vertex_partition_id); - } - - rmm::device_uvector d_vertex_lasts(h_vertex_lasts.size(), handle.get_stream()); - raft::update_device( - d_vertex_lasts.data(), h_vertex_lasts.data(), h_vertex_lasts.size(), handle.get_stream()); - rmm::device_uvector d_tx_buffer_last_boundaries(d_vertex_lasts.size(), - handle.get_stream()); - auto reduce_by_first = - thrust_tuple_get_or_identity( - get_dataframe_buffer_begin(key_buffer)); - thrust::lower_bound(handle.get_thrust_policy(), - reduce_by_first, - reduce_by_first + size_dataframe_buffer(key_buffer), - d_vertex_lasts.begin(), - d_vertex_lasts.end(), - d_tx_buffer_last_boundaries.begin()); - std::vector h_tx_buffer_last_boundaries(d_tx_buffer_last_boundaries.size()); - raft::update_host(h_tx_buffer_last_boundaries.data(), - d_tx_buffer_last_boundaries.data(), - d_tx_buffer_last_boundaries.size(), - handle.get_stream()); - handle.sync_stream(); - std::vector tx_counts(h_tx_buffer_last_boundaries.size()); - std::adjacent_difference( - h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); - - auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(rx_key_buffer, std::ignore) = shuffle_values(reduce_by_src ? 
minor_comm : major_comm, - get_dataframe_buffer_begin(key_buffer), - tx_counts, - handle.get_stream()); - key_buffer = std::move(rx_key_buffer); - - if constexpr (!std::is_same_v) { - auto rx_payload_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(rx_payload_buffer, std::ignore) = - shuffle_values(reduce_by_src ? minor_comm : major_comm, - get_dataframe_buffer_begin(payload_buffer), - tx_counts, - handle.get_stream()); - payload_buffer = std::move(rx_payload_buffer); - } - - std::tie(key_buffer, payload_buffer) = - detail::sort_and_reduce_buffer_elements( - handle, std::move(key_buffer), std::move(payload_buffer), reduce_op); - } - - if constexpr (!std::is_same_v) { - return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); - } else { - return std::move(key_buffer); - } -} - -} // namespace detail - -template -size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier) -{ - static_assert(!GraphViewType::is_storage_transposed, - "GraphViewType should support the push model."); - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using key_t = typename VertexFrontierBucketType::key_type; - - size_t ret{0}; - - auto local_frontier_vertex_first = - thrust_tuple_get_or_identity(frontier.begin()); - - std::vector local_frontier_sizes{}; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - local_frontier_sizes = host_scalar_allgather(minor_comm, frontier.size(), handle.get_stream()); - } else { - local_frontier_sizes = std::vector{static_cast(frontier.size())}; - } - - auto edge_mask_view = graph_view.edge_mask_view(); - - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(i)); - auto edge_partition_e_mask = - edge_mask_view - ? 
thrust::make_optional< - detail::edge_partition_edge_property_device_view_t>( - *edge_mask_view, i) - : thrust::nullopt; - - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - - rmm::device_uvector edge_partition_frontier_vertices(local_frontier_sizes[i], - handle.get_stream()); - device_bcast(minor_comm, - local_frontier_vertex_first, - edge_partition_frontier_vertices.data(), - local_frontier_sizes[i], - static_cast(i), - handle.get_stream()); - - if (edge_partition_e_mask) { - ret += - edge_partition.compute_number_of_edges_with_mask((*edge_partition_e_mask).value_first(), - edge_partition_frontier_vertices.begin(), - edge_partition_frontier_vertices.end(), - handle.get_stream()); - } else { - ret += edge_partition.compute_number_of_edges(edge_partition_frontier_vertices.begin(), - edge_partition_frontier_vertices.end(), - handle.get_stream()); - } - } else { - assert(i == 0); - if (edge_partition_e_mask) { - ret += edge_partition.compute_number_of_edges_with_mask( - (*edge_partition_e_mask).value_first(), - local_frontier_vertex_first, - local_frontier_vertex_first + frontier.size(), - handle.get_stream()); - } else { - ret += edge_partition.compute_number_of_edges(local_frontier_vertex_first, - local_frontier_vertex_first + frontier.size(), - handle.get_stream()); - } - } - } - - return ret; -} - -/** - * @brief Iterate over outgoing edges from the current vertex frontier and reduce valid edge functor - * outputs by (tagged-)source ID. - * - * Edge functor outputs are thrust::optional objects and invalid if thrust::nullopt. Vertices are - * assumed to be tagged if VertexFrontierBucketType::key_type is a tuple of a vertex type and a tag - * type (VertexFrontierBucketType::key_type is identical to a vertex type otherwise). - * - * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam VertexFrontierBucketType Type of the vertex frontier bucket class which abstracts the - * current (tagged-)vertex frontier. - * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. - * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. - * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. - * @tparam EdgeOp Type of the quinary edge operator. - * @tparam ReduceOp Type of the binary reduction operator. - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param graph_view Non-owning graph object. - * @param frontier VertexFrontierBucketType class object for the current vertex frontier. - * @param edge_src_value_input Wrapper used to access source input property values (for the edge - * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() - * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() - * (if @p e_op does not access source property values). Use update_edge_src_property to fill the - * wrapper. - * @param edge_dst_value_input Wrapper used to access destination input property values (for the - * edge destinations assigned to this process in multi-GPU). 
Use either - * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or - * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property - * values). Use update_edge_dst_property to fill the wrapper. - * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned - * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to - * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not - * access edge property values). - * @param e_op Quinary operator takes edge (tagged-)source, edge destination, property values for - * the source, destination, and edge and returns 1) thrust::nullopt (if invalid and to be - * discarded); 2) dummy (but valid) thrust::optional object (e.g. - * thrust::optional{std::byte{0}}, if vertices are not tagged and ReduceOp::value_type is - * void); 3) a tag (if vertices are tagged and ReduceOp::value_type is void); 4) a value to be - * reduced using the @p reduce_op (if vertices are not tagged and ReduceOp::value_type is not void); - * or 5) a tuple of a tag and a value to be reduced (if vertices are tagged and ReduceOp::value_type - * is not void). - * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. - * There are pre-defined reduction operators in prims/reduce_op.cuh. It is - * recommended to use the pre-defined reduction operators whenever possible as the current (and - * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has - * known member variables) to take a more optimized code path. See the documentation in the - * reduce_op.cuh file for instructions on writing custom reduction operators. - * @return Tuple of key values and payload values (if ReduceOp::value_type is not void) or just key - * values (if ReduceOp::value_type is void). Keys in the return values are sorted in ascending order - * using a vertex ID as the primary key and a tag (if relevant) as the secondary key. - */ -template -std::conditional_t< - !std::is_same_v, - std::tuple( - 0, rmm::cuda_stream_view{})), - decltype(detail::allocate_optional_dataframe_buffer( - 0, rmm::cuda_stream_view{}))>, - decltype(allocate_dataframe_buffer( - 0, rmm::cuda_stream_view{}))> -transform_reduce_v_frontier_outgoing_e_by_src(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, - EdgeSrcValueInputWrapper edge_src_value_input, - EdgeDstValueInputWrapper edge_dst_value_input, - EdgeValueInputWrapper edge_value_input, - EdgeOp e_op, - ReduceOp reduce_op, - bool do_expensive_check = false) -{ - return detail::transform_reduce_v_frontier_outgoing_e_by_src_dst(handle, - graph_view, - frontier, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - reduce_op, - do_expensive_check); -} - -/** - * @brief Iterate over outgoing edges from the current vertex frontier and reduce valid edge functor - * outputs by (tagged-)destination ID. - * - * Edge functor outputs are thrust::optional objects and invalid if thrust::nullopt. Vertices are - * assumed to be tagged if VertexFrontierBucketType::key_type is a tuple of a vertex type and a tag - * type (VertexFrontierBucketType::key_type is identical to a vertex type otherwise). - * - * @tparam GraphViewType Type of the passed non-owning graph object. 
- * @tparam VertexFrontierBucketType Type of the vertex frontier bucket class which abstracts the - * current (tagged-)vertex frontier. - * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. - * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. - * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. - * @tparam EdgeOp Type of the quinary edge operator. - * @tparam ReduceOp Type of the binary reduction operator. - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param graph_view Non-owning graph object. - * @param frontier VertexFrontierBucketType class object for the current vertex frontier. - * @param edge_src_value_input Wrapper used to access source input property values (for the edge - * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() - * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() - * (if @p e_op does not access source property values). Use update_edge_src_property to fill the - * wrapper. - * @param edge_dst_value_input Wrapper used to access destination input property values (for the - * edge destinations assigned to this process in multi-GPU). Use either - * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or - * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property - * values). Use update_edge_dst_property to fill the wrapper. - * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned - * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to - * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not - * access edge property values). - * @param e_op Quinary operator takes edge (tagged-)source, edge destination, property values for - * the source, destination, and edge and returns 1) thrust::nullopt (if invalid and to be - * discarded); 2) dummy (but valid) thrust::optional object (e.g. - * thrust::optional{std::byte{0}}, if vertices are not tagged and ReduceOp::value_type is - * void); 3) a tag (if vertices are tagged and ReduceOp::value_type is void); 4) a value to be - * reduced using the @p reduce_op (if vertices are not tagged and ReduceOp::value_type is not void); - * or 5) a tuple of a tag and a value to be reduced (if vertices are tagged and ReduceOp::value_type - * is not void). - * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. - * There are pre-defined reduction operators in prims/reduce_op.cuh. It is - * recommended to use the pre-defined reduction operators whenever possible as the current (and - * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has - * known member variables) to take a more optimized code path. See the documentation in the - * reduce_op.cuh file for instructions on writing custom reduction operators. - * @return Tuple of key values and payload values (if ReduceOp::value_type is not void) or just key - * values (if ReduceOp::value_type is void). Keys in the return values are sorted in ascending order - * using a vertex ID as the primary key and a tag (if relevant) as the secondary key. 
- */ -template -std::conditional_t< - !std::is_same_v, - std::tuple( - 0, rmm::cuda_stream_view{})), - decltype(detail::allocate_optional_dataframe_buffer( - 0, rmm::cuda_stream_view{}))>, - decltype(allocate_dataframe_buffer( - 0, rmm::cuda_stream_view{}))> -transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, - EdgeSrcValueInputWrapper edge_src_value_input, - EdgeDstValueInputWrapper edge_dst_value_input, - EdgeValueInputWrapper edge_value_input, - EdgeOp e_op, - ReduceOp reduce_op, - bool do_expensive_check = false) -{ - return detail::transform_reduce_v_frontier_outgoing_e_by_src_dst(handle, - graph_view, - frontier, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - reduce_op, - do_expensive_check); -} - -} // namespace cugraph diff --git a/cpp/src/prims/update_edge_src_dst_property.cuh b/cpp/src/prims/update_edge_src_dst_property.cuh index 1bfdc23c66d..2f842f710ca 100644 --- a/cpp/src/prims/update_edge_src_dst_property.cuh +++ b/cpp/src/prims/update_edge_src_dst_property.cuh @@ -16,6 +16,7 @@ #pragma once #include "detail/graph_partition_utils.cuh" +#include "prims/vertex_frontier.cuh" #include #include @@ -265,8 +266,8 @@ template void update_edge_major_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, VertexPropertyInputIterator vertex_property_input_first, EdgeMajorPropertyOutputWrapper edge_major_property_output) { @@ -288,12 +289,12 @@ void update_edge_major_property(raft::handle_t const& handle, auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); - auto rx_counts = - host_scalar_allgather(minor_comm, - static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); - auto max_rx_size = - std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { + auto local_v_list_sizes = host_scalar_allgather( + minor_comm, + static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)), + handle.get_stream()); + auto max_rx_size = std::reduce( + local_v_list_sizes.begin(), local_v_list_sizes.end(), size_t{0}, [](auto lhs, auto rhs) { return std::max(lhs, rhs); }); rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); @@ -317,7 +318,7 @@ void update_edge_major_property(raft::handle_t const& handle, graph_view.local_vertex_partition_view()); if constexpr (contains_packed_bool_element) { auto bool_first = thrust::make_transform_iterator( - vertex_first, + sorted_unique_vertex_first, cuda::proclaim_return_type([vertex_property_input_first, vertex_partition] __device__(auto v) { auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); @@ -325,34 +326,41 @@ void update_edge_major_property(raft::handle_t const& handle, *(vertex_property_input_first + packed_bool_offset(v_offset)) & packed_bool_mask(v_offset)); })); - pack_bools(handle, - bool_first, - bool_first + thrust::distance(vertex_first, vertex_last), - rx_value_first); + pack_bools( + handle, + bool_first, + bool_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + rx_value_first); } else { auto map_first = thrust::make_transform_iterator( - vertex_first, + sorted_unique_vertex_first, cuda::proclaim_return_type([vertex_partition] __device__(auto v) { return 
vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); })); // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a // permutation iterator (and directly gathers to the internal buffer) - thrust::gather(handle.get_thrust_policy(), - map_first, - map_first + thrust::distance(vertex_first, vertex_last), - vertex_property_input_first, - rx_value_first); + thrust::gather( + handle.get_thrust_policy(), + map_first, + map_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + vertex_property_input_first, + rx_value_first); } } // FIXME: these broadcast operations can be placed between ncclGroupStart() and // ncclGroupEnd() - device_bcast( - minor_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(minor_comm, + sorted_unique_vertex_first, + rx_vertices.begin(), + local_v_list_sizes[i], + i, + handle.get_stream()); device_bcast(minor_comm, rx_value_first, rx_value_first, - contains_packed_bool_element ? packed_bool_size(rx_counts[i]) : rx_counts[i], + contains_packed_bool_element ? packed_bool_size(local_v_list_sizes[i]) + : local_v_list_sizes[i], i, handle.get_stream()); @@ -360,7 +368,7 @@ void update_edge_major_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_counts[i]), + thrust::make_counting_iterator(local_v_list_sizes[i]), [rx_vertex_first = rx_vertices.begin(), rx_value_first, edge_partition_key_first = ((*edge_partition_keys)[i]).begin(), @@ -386,7 +394,7 @@ void update_edge_major_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(rx_counts[i])), + thrust::make_counting_iterator(static_cast(local_v_list_sizes[i])), [edge_partition, rx_vertex_first = rx_vertices.begin(), rx_value_first, @@ -407,7 +415,7 @@ void update_edge_major_property(raft::handle_t const& handle, // directly scatters from the internal buffer) thrust::scatter(handle.get_thrust_policy(), rx_value_first, - rx_value_first + rx_counts[i], + rx_value_first + local_v_list_sizes[i], map_first, edge_partition_value_firsts[i]); } @@ -420,20 +428,22 @@ void update_edge_major_property(raft::handle_t const& handle, assert(edge_partition_value_firsts.size() == size_t{1}); if constexpr (contains_packed_bool_element) { thrust::for_each(handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [vertex_property_input_first, output_value_first = edge_partition_value_firsts[0]] __device__(auto v) { bool val = static_cast(*(vertex_property_input_first + v)); packed_bool_atomic_set(output_value_first, v, val); }); } else { - auto val_first = thrust::make_permutation_iterator(vertex_property_input_first, vertex_first); - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + thrust::distance(vertex_first, vertex_last), - vertex_first, - edge_partition_value_firsts[0]); + auto val_first = + thrust::make_permutation_iterator(vertex_property_input_first, sorted_unique_vertex_first); + thrust::scatter( + handle.get_thrust_policy(), + val_first, + val_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + sorted_unique_vertex_first, + edge_partition_value_firsts[0]); } } } @@ -455,13 +465,11 @@ void update_edge_minor_property(raft::handle_t const& handle, auto edge_partition_value_first = 
edge_minor_property_output.value_first(); if constexpr (GraphViewType::is_multi_gpu) { - using vertex_t = typename GraphViewType::vertex_type; - using bcast_buffer_type = - decltype(allocate_dataframe_buffer< - std::conditional_t>( - size_t{0}, handle.get_stream())); + using vertex_t = typename GraphViewType::vertex_type; + using bcast_buffer_type = dataframe_buffer_type_t< + std::conditional_t>; auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); @@ -487,8 +495,8 @@ void update_edge_minor_property(raft::handle_t const& handle, (static_cast(graph_view.compute_number_of_edges(handle) / comm_size) * sizeof(vertex_t)) / std::max(bcast_size, size_t{1}); - num_concurrent_bcasts = std::max(num_concurrent_bcasts, size_t{1}); - num_concurrent_bcasts = std::min(num_concurrent_bcasts, static_cast(major_comm_size)); + num_concurrent_bcasts = + std::min(std::max(num_concurrent_bcasts, size_t{1}), static_cast(major_comm_size)); auto num_rounds = (static_cast(major_comm_size) + num_concurrent_bcasts - size_t{1}) / num_concurrent_bcasts; @@ -532,15 +540,17 @@ void update_edge_minor_property(raft::handle_t const& handle, *(graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets()); } } else { - std::vector rx_counts(major_comm_size, size_t{0}); + std::vector local_v_list_sizes(major_comm_size, size_t{0}); for (int i = 0; i < major_comm_size; ++i) { auto minor_range_vertex_partition_id = compute_local_edge_partition_minor_range_vertex_partition_id_t{ major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - rx_counts[i] = graph_view.vertex_partition_range_size(minor_range_vertex_partition_id); + local_v_list_sizes[i] = + graph_view.vertex_partition_range_size(minor_range_vertex_partition_id); } std::vector rx_displacements(major_comm_size, size_t{0}); - std::exclusive_scan(rx_counts.begin(), rx_counts.end(), rx_displacements.begin(), size_t{0}); + std::exclusive_scan( + local_v_list_sizes.begin(), local_v_list_sizes.end(), rx_displacements.begin(), size_t{0}); key_offsets_or_rx_displacements = std::move(rx_displacements); } @@ -683,8 +693,8 @@ template void update_edge_minor_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, VertexPropertyInputIterator vertex_property_input_first, EdgeMinorPropertyOutputWrapper edge_minor_property_output) { @@ -706,22 +716,49 @@ void update_edge_minor_property(raft::handle_t const& handle, auto const major_comm_rank = major_comm.get_rank(); auto const major_comm_size = major_comm.get_size(); - auto rx_counts = - host_scalar_allgather(major_comm, - static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); - auto max_rx_size = - std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { - return std::max(lhs, rhs); - }); - rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); - auto rx_tmp_buffer = allocate_dataframe_buffer< - std::conditional_t>( - contains_packed_bool_element ? 
packed_bool_size(max_rx_size) : max_rx_size, - handle.get_stream()); - auto rx_value_first = get_dataframe_buffer_begin(rx_tmp_buffer); + auto v_list_size = + static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)); + std::array v_list_range = {vertex_t{0}, vertex_t{0}}; + if (v_list_size > 0) { + rmm::device_uvector tmps(2, handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + tmps.begin(), + tmps.end(), + [sorted_unique_vertex_first, v_list_size] __device__(size_t i) { + return (i == 0) ? *sorted_unique_vertex_first + : (*(sorted_unique_vertex_first + (v_list_size - 1)) + 1); + }); + raft::update_host(v_list_range.data(), tmps.data(), 2, handle.get_stream()); + handle.sync_stream(); + } + + auto local_v_list_sizes = host_scalar_allgather(major_comm, v_list_size, handle.get_stream()); + auto local_v_list_range_firsts = + host_scalar_allgather(major_comm, v_list_range[0], handle.get_stream()); + auto local_v_list_range_lasts = + host_scalar_allgather(major_comm, v_list_range[1], handle.get_stream()); + + std::optional> v_list_bitmap{std::nullopt}; + if (major_comm_size > 1) { + double avg_fill_ratio{0.0}; + for (int i = 0; i < major_comm_size; ++i) { + auto num_keys = static_cast(local_v_list_sizes[i]); + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + avg_fill_ratio += + (range_size > 0) ? (num_keys / static_cast(range_size)) : double{0.0}; + } + avg_fill_ratio /= static_cast(major_comm_size); + + constexpr double threshold_ratio = + 0.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); + if (avg_fill_ratio > threshold_ratio) { + v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_vertex_first, + sorted_unique_vertex_last, + local_v_list_range_firsts[major_comm_rank], + local_v_list_range_lasts[major_comm_rank], + handle.get_stream()); + } + } std::optional> key_offsets{}; if constexpr (GraphViewType::is_storage_transposed) { @@ -735,13 +772,23 @@ void update_edge_minor_property(raft::handle_t const& handle, graph_view.local_edge_partition_view(size_t{0})); auto edge_partition_keys = edge_minor_property_output.keys(); for (int i = 0; i < major_comm_size; ++i) { + rmm::device_uvector rx_vertices(local_v_list_sizes[i], handle.get_stream()); + auto rx_tmp_buffer = allocate_dataframe_buffer< + std::conditional_t>( + contains_packed_bool_element ? 
packed_bool_size(local_v_list_sizes[i]) + : local_v_list_sizes[i], + handle.get_stream()); + auto rx_value_first = get_dataframe_buffer_begin(rx_tmp_buffer); + if (i == major_comm_rank) { auto vertex_partition = vertex_partition_device_view_t( graph_view.local_vertex_partition_view()); if constexpr (contains_packed_bool_element) { auto bool_first = thrust::make_transform_iterator( - vertex_first, + sorted_unique_vertex_first, cuda::proclaim_return_type([vertex_property_input_first, vertex_partition] __device__(auto v) { auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); @@ -749,34 +796,53 @@ void update_edge_minor_property(raft::handle_t const& handle, *(vertex_property_input_first + packed_bool_offset(v_offset)) & packed_bool_mask(v_offset)); })); - pack_bools(handle, - bool_first, - bool_first + thrust::distance(vertex_first, vertex_last), - rx_value_first); + pack_bools( + handle, + bool_first, + bool_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + rx_value_first); } else { auto map_first = thrust::make_transform_iterator( - vertex_first, + sorted_unique_vertex_first, cuda::proclaim_return_type([vertex_partition] __device__(auto v) { return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); })); // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a // permutation iterator (and directly gathers to the internal buffer) - thrust::gather(handle.get_thrust_policy(), - map_first, - map_first + thrust::distance(vertex_first, vertex_last), - vertex_property_input_first, - rx_value_first); + thrust::gather( + handle.get_thrust_policy(), + map_first, + map_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + vertex_property_input_first, + rx_value_first); } } // FIXME: these broadcast operations can be placed between ncclGroupStart() and // ncclGroupEnd() - device_bcast( - major_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + std::variant, decltype(sorted_unique_vertex_first)> + v_list{}; + if (v_list_bitmap) { + v_list = + (i == major_comm_rank) + ? raft::device_span((*v_list_bitmap).data(), (*v_list_bitmap).size()) + : raft::device_span(static_cast(nullptr), size_t{0}); + } else { + v_list = sorted_unique_vertex_first; + } + device_bcast_vertex_list(major_comm, + v_list, + rx_vertices.begin(), + local_v_list_range_firsts[i], + local_v_list_range_lasts[i], + local_v_list_sizes[i], + i, + handle.get_stream()); device_bcast(major_comm, rx_value_first, rx_value_first, - contains_packed_bool_element ? packed_bool_size(rx_counts[i]) : rx_counts[i], + contains_packed_bool_element ? 
packed_bool_size(local_v_list_sizes[i]) + : local_v_list_sizes[i], i, handle.get_stream()); @@ -784,7 +850,7 @@ void update_edge_minor_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_counts[i]), + thrust::make_counting_iterator(local_v_list_sizes[i]), [rx_vertex_first = rx_vertices.begin(), rx_value_first, subrange_key_first = (*edge_partition_keys).begin() + (*key_offsets)[i], @@ -812,7 +878,7 @@ void update_edge_minor_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(rx_counts[i])), + thrust::make_counting_iterator(static_cast(local_v_list_sizes[i])), [edge_partition, rx_vertex_first = rx_vertices.begin(), rx_value_first, @@ -833,7 +899,7 @@ void update_edge_minor_property(raft::handle_t const& handle, // directly scatters from the internal buffer) thrust::scatter(handle.get_thrust_policy(), rx_value_first, - rx_value_first + rx_counts[i], + rx_value_first + local_v_list_sizes[i], map_first, edge_partition_value_first); } @@ -844,20 +910,22 @@ void update_edge_minor_property(raft::handle_t const& handle, graph_view.local_edge_partition_src_range_size()); if constexpr (contains_packed_bool_element) { thrust::for_each(handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [vertex_property_input_first, output_value_first = edge_partition_value_first] __device__(auto v) { bool val = static_cast(*(vertex_property_input_first + v)); packed_bool_atomic_set(output_value_first, v, val); }); } else { - auto val_first = thrust::make_permutation_iterator(vertex_property_input_first, vertex_first); - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + thrust::distance(vertex_first, vertex_last), - vertex_first, - edge_partition_value_first); + auto val_first = + thrust::make_permutation_iterator(vertex_property_input_first, sorted_unique_vertex_first); + thrust::scatter( + handle.get_thrust_policy(), + val_first, + val_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + sorted_unique_vertex_first, + edge_partition_value_first); } } } @@ -909,8 +977,9 @@ void update_edge_src_property(raft::handle_t const& handle, /** * @brief Update graph edge source property values from the input vertex property values. * - * This version updates only a subset of graph edge source property values. [@p vertex_first, @p - * vertex_last) specifies the vertices with new property values to be updated. + * This version updates only a subset of graph edge source property values. [@p + * sorted_unique_vertex_first, @p sorted_unique_vertex_last) specifies the vertices with new + * property values to be updated. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. @@ -919,10 +988,12 @@ void update_edge_src_property(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param vertex_first Iterator pointing to the first (inclusive) vertex with a new value to be - * updated. 
v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex - * partition assigned to this process in multi-GPU), otherwise undefined behavior. - * @param vertex_last Iterator pointing to the last (exclusive) vertex with a new value. + * @param sorted_unique_vertex_first Iterator pointing to the first (inclusive) vertex with a new + * value to be updated. v in [sorted_unique_vertex_first, sorted_unique_vertex_last) should be + * sorted & distinct (and should belong to the vertex partition assigned to this process in + * multi-GPU), otherwise undefined behavior. + * @param sorted_unique_vertex_last Iterator pointing to the last (exclusive) vertex with a new + * value. * @param vertex_property_input_first Iterator pointing to the vertex property value for the first * (inclusive) vertex (of the vertex partition assigned to this process in multi-GPU). * `vertex_property_input_last` (exclusive) is deduced as @p vertex_property_input_first + @p @@ -937,8 +1008,8 @@ template void update_edge_src_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, VertexPropertyInputIterator vertex_property_input_first, EdgeSrcValueOutputWrapper edge_src_property_output, bool do_expensive_check = false) @@ -946,8 +1017,8 @@ void update_edge_src_property(raft::handle_t const& handle, if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), local_vertex_partition_range_last = graph_view.local_vertex_partition_range_last()] __device__(auto v) { @@ -958,23 +1029,23 @@ void update_edge_src_property(raft::handle_t const& handle, num_invalids = host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); } - CUGRAPH_EXPECTS( - num_invalids == 0, - "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in " + "[sorted_unique_vertex_first, sorted_unique_vertex_last)."); } if constexpr (GraphViewType::is_storage_transposed) { detail::update_edge_minor_property(handle, graph_view, - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, vertex_property_input_first, edge_src_property_output); } else { detail::update_edge_major_property(handle, graph_view, - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, vertex_property_input_first, edge_src_property_output); } @@ -1026,8 +1097,9 @@ void update_edge_dst_property(raft::handle_t const& handle, /** * @brief Update graph edge destination property values from the input vertex property values. * - * This version updates only a subset of graph edge destination property values. [@p vertex_first, - * @p vertex_last) specifies the vertices with new property values to be updated. + * This version updates only a subset of graph edge destination property values. [@p + * sorted_unique_vertex_first, @p sorted_unique_vertex_last) specifies the vertices with new + * property values to be updated. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. 
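Both subset variants now require the input vertex range to be sorted and unique, as the renamed iterator parameters advertise. A call-site sketch (hypothetical names; vertices is a local rmm::device_uvector<vertex_t>, vertex_values holds the per-vertex input values, and edge_dst_values.mutable_view() stands in for whatever output wrapper the caller uses):

```cpp
// Sketch only: enforce the sorted & distinct precondition, then update the
// edge destination property values for just this subset of local vertices.
thrust::sort(handle.get_thrust_policy(), vertices.begin(), vertices.end());
vertices.resize(
  thrust::distance(
    vertices.begin(),
    thrust::unique(handle.get_thrust_policy(), vertices.begin(), vertices.end())),
  handle.get_stream());
cugraph::update_edge_dst_property(handle,
                                  graph_view,
                                  vertices.begin(),
                                  vertices.end(),
                                  vertex_values.begin(),
                                  edge_dst_values.mutable_view());
```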
@@ -1037,10 +1109,12 @@ void update_edge_dst_property(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param vertex_first Iterator pointing to the first (inclusive) vertex with a new value to be - * updated. v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex - * partition assigned to this process in multi-GPU), otherwise undefined behavior. - * @param vertex_last Iterator pointing to the last (exclusive) vertex with a new value. + * @param sorted_unique_vertex_first Iterator pointing to the first (inclusive) vertex with a new + * value to be updated. v in [sorted_unique_vertex_first, sorted_unique_vertex_last) should be + * sorted & distinct (and should belong to the vertex partition assigned to this process in + * multi-GPU), otherwise undefined behavior. + * @param sorted_unique_vertex_last Iterator pointing to the last (exclusive) vertex with a new + * value. * @param vertex_property_input_first Iterator pointing to the vertex property value for the first * (inclusive) vertex (of the vertex partition assigned to this process in multi-GPU). * `vertex_property_input_last` (exclusive) is deduced as @p vertex_property_input_first + @p @@ -1055,8 +1129,8 @@ template void update_edge_dst_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, VertexPropertyInputIterator vertex_property_input_first, EdgeDstValueOutputWrapper edge_dst_property_output, bool do_expensive_check = false) @@ -1064,8 +1138,8 @@ void update_edge_dst_property(raft::handle_t const& handle, if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), local_vertex_partition_range_last = graph_view.local_vertex_partition_range_last()] __device__(auto v) { @@ -1076,23 +1150,23 @@ void update_edge_dst_property(raft::handle_t const& handle, num_invalids = host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); } - CUGRAPH_EXPECTS( - num_invalids == 0, - "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in " + "[sorted_unique_vertex_first, sorted_unique_vertex_last)."); } if constexpr (GraphViewType::is_storage_transposed) { detail::update_edge_major_property(handle, graph_view, - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, vertex_property_input_first, edge_dst_property_output); } else { detail::update_edge_minor_property(handle, graph_view, - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, vertex_property_input_first, edge_dst_property_output); } diff --git a/cpp/src/prims/vertex_frontier.cuh b/cpp/src/prims/vertex_frontier.cuh index b13e6bfd458..6e7d8515beb 100644 --- a/cpp/src/prims/vertex_frontier.cuh +++ b/cpp/src/prims/vertex_frontier.cuh @@ -15,15 +15,24 @@ */ #pragma once +#include "prims/detail/multi_stream_utils.cuh" + +#include #include #include +#include +#include #include +#include 
#include #include #include +#include +#include +#include #include #include #include @@ -48,6 +57,191 @@ namespace cugraph { +template +KeyIterator compute_key_lower_bound(KeyIterator sorted_unique_key_first, + KeyIterator sorted_unique_key_last, + vertex_t v_threshold, + rmm::cuda_stream_view stream_view) +{ + using key_t = typename thrust::iterator_traits::value_type; + + if constexpr (std::is_same_v) { + return thrust::lower_bound( + rmm::exec_policy(stream_view), sorted_unique_key_first, sorted_unique_key_last, v_threshold); + } else { + key_t k_threshold{}; + thrust::get<0>(k_threshold) = v_threshold; + return thrust::lower_bound( + rmm::exec_policy(stream_view), + sorted_unique_key_first, + sorted_unique_key_last, + k_threshold, + [] __device__(auto lhs, auto rhs) { return thrust::get<0>(lhs) < thrust::get<0>(rhs); }); + } +} + +template +std::vector compute_key_segment_offsets(KeyIterator sorted_key_first, + KeyIterator sorted_key_last, + raft::host_span segment_offsets, + vertex_t vertex_range_first, + rmm::cuda_stream_view stream_view) +{ + using key_t = typename thrust::iterator_traits::value_type; + + std::vector h_thresholds(segment_offsets.size() - 2); + for (size_t i = 0; i < h_thresholds.size(); ++i) { + h_thresholds[i] = vertex_range_first + segment_offsets[i + 1]; + } + + rmm::device_uvector d_thresholds(h_thresholds.size(), stream_view); + raft::update_device(d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), stream_view); + + rmm::device_uvector d_offsets(d_thresholds.size(), stream_view); + if constexpr (std::is_same_v) { + thrust::lower_bound(rmm::exec_policy_nosync(stream_view), + sorted_key_first, + sorted_key_last, + d_thresholds.begin(), + d_thresholds.end(), + d_offsets.begin()); + } else { + auto sorted_vertex_first = + thrust::make_transform_iterator(sorted_key_first, thrust_tuple_get{}); + thrust::lower_bound(rmm::exec_policy_nosync(stream_view), + sorted_vertex_first, + sorted_vertex_first + thrust::distance(sorted_key_first, sorted_key_last), + d_thresholds.begin(), + d_thresholds.end(), + d_offsets.begin()); + } + + std::vector h_offsets(d_offsets.size() + 2); + raft::update_host(h_offsets.data() + 1, d_offsets.data(), d_offsets.size(), stream_view); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view)); + h_offsets[0] = size_t{0}; + h_offsets.back() = static_cast(thrust::distance(sorted_key_first, sorted_key_last)); + + return h_offsets; +} + +template +rmm::device_uvector compute_vertex_list_bitmap_info( + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, + typename thrust::iterator_traits::value_type vertex_range_first, + typename thrust::iterator_traits::value_type vertex_range_last, + rmm::cuda_stream_view stream_view) +{ + using vertex_t = typename thrust::iterator_traits::value_type; + + auto bitmap = rmm::device_uvector( + packed_bool_size(vertex_range_last - vertex_range_first), stream_view); + rmm::device_uvector lasts(bitmap.size(), stream_view); + auto bdry_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{1}), + cuda::proclaim_return_type( + [vertex_range_first, + vertex_range_size = vertex_range_last - vertex_range_first] __device__(vertex_t i) { + return vertex_range_first + + static_cast( + std::min(packed_bools_per_word() * i, static_cast(vertex_range_size))); + })); + thrust::lower_bound(rmm::exec_policy_nosync(stream_view), + sorted_unique_vertex_first, + sorted_unique_vertex_last, + bdry_first, + bdry_first + bitmap.size(), + lasts.begin()); + thrust::tabulate( 
+ rmm::exec_policy_nosync(stream_view), + bitmap.begin(), + bitmap.end(), + cuda::proclaim_return_type<uint32_t>( + [sorted_unique_vertex_first, + vertex_range_first, + lasts = raft::device_span<vertex_t const>(lasts.data(), lasts.size())] __device__(size_t i) { + auto offset_first = (i != 0) ? lasts[i - 1] : vertex_t{0}; + auto offset_last = lasts[i]; + auto ret = packed_bool_empty_mask(); + for (auto j = offset_first; j < offset_last; ++j) { + auto v_offset = *(sorted_unique_vertex_first + j) - vertex_range_first; + ret |= packed_bool_mask(v_offset); + } + return ret; + })); + + return bitmap; +} + +template <typename InputVertexIterator, typename OutputVertexIterator> +void device_bcast_vertex_list( + raft::comms::comms_t const& comm, + std::variant<raft::device_span<uint32_t const>, InputVertexIterator> v_list, + OutputVertexIterator output_v_first, + typename thrust::iterator_traits<InputVertexIterator>::value_type vertex_range_first, + typename thrust::iterator_traits<InputVertexIterator>::value_type vertex_range_last, + size_t v_list_size, + int root, + rmm::cuda_stream_view stream_view) +{ + using vertex_t = typename thrust::iterator_traits<InputVertexIterator>::value_type; + + static_assert( + std::is_same_v<typename thrust::iterator_traits<OutputVertexIterator>::value_type, vertex_t>); + + if (v_list.index() == 0) { // bitmap + rmm::device_uvector<uint32_t> tmp_bitmap( + packed_bool_size(vertex_range_last - vertex_range_first), stream_view); + assert((comm.get_rank() != root) || (std::get<0>(v_list).size() == tmp_bitmap.size())); + device_bcast( + comm, std::get<0>(v_list).data(), tmp_bitmap.data(), tmp_bitmap.size(), root, stream_view); + rmm::device_scalar<size_t> dummy(size_t{0}, stream_view); // we already know the count + detail::copy_if_nosync( + thrust::make_counting_iterator(vertex_range_first), + thrust::make_counting_iterator(vertex_range_last), + thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type<bool>( + [bitmap = raft::device_span<uint32_t const>( + tmp_bitmap.data(), tmp_bitmap.size())] __device__(vertex_t v_offset) { + return ((bitmap[packed_bool_offset(v_offset)] & packed_bool_mask(v_offset)) != + packed_bool_empty_mask()); + })), + output_v_first, + raft::device_span<size_t>(dummy.data(), size_t{1}), + stream_view); + } else { + device_bcast(comm, std::get<1>(v_list), output_v_first, v_list_size, root, stream_view); + } +} + +template <typename OutputVertexIterator> +void retrieve_vertex_list_from_bitmap( + raft::device_span<uint32_t const> bitmap, + OutputVertexIterator output_v_first, + raft::device_span<size_t> count /* size = 1 */, + typename thrust::iterator_traits<OutputVertexIterator>::value_type vertex_range_first, + typename thrust::iterator_traits<OutputVertexIterator>::value_type vertex_range_last, + rmm::cuda_stream_view stream_view) +{ + using vertex_t = typename thrust::iterator_traits<OutputVertexIterator>::value_type; + + assert(bitmap.size() >= packed_bool_size(vertex_range_last - vertex_range_first)); + detail::copy_if_nosync(thrust::make_counting_iterator(vertex_range_first), + thrust::make_counting_iterator(vertex_range_last), + thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type<bool>([bitmap] __device__(vertex_t v_offset) { + return ((bitmap[packed_bool_offset(v_offset)] & + packed_bool_mask(v_offset)) != packed_bool_empty_mask()); + })), + output_v_first, + count, + stream_view); +} + // key type is either vertex_t (tag_t == void) or thrust::tuple<vertex_t, tag_t> (tag_t != void) // if sorted_unique is true, stores unique key objects in the sorted (non-descending) order. // if false, there can be duplicates and the elements may not be sorted. @@ -328,20 +522,6 @@ class key_bucket_t { } } - auto const begin() const - { - if constexpr (std::is_same_v<tag_t, void>) { - return vertices_.index() == 0 ? 
std::get<0>(vertices_).begin() - : std::get<1>(vertices_).begin(); - } else { - return vertices_.index() == 0 - ? thrust::make_zip_iterator( - thrust::make_tuple(std::get<0>(vertices_).begin(), std::get<0>(tags_).begin())) - : thrust::make_zip_iterator( - thrust::make_tuple(std::get<1>(vertices_).begin(), std::get<1>(tags_).begin())); - } - } - auto begin() { CUGRAPH_EXPECTS( @@ -355,12 +535,22 @@ class key_bucket_t { } } - auto const end() const + auto const cbegin() const { - return begin() + - (vertices_.index() == 0 ? std::get<0>(vertices_).size() : std::get<1>(vertices_).size()); + if constexpr (std::is_same_v) { + return vertices_.index() == 0 ? std::get<0>(vertices_).begin() + : std::get<1>(vertices_).begin(); + } else { + return vertices_.index() == 0 + ? thrust::make_zip_iterator( + thrust::make_tuple(std::get<0>(vertices_).begin(), std::get<0>(tags_).begin())) + : thrust::make_zip_iterator( + thrust::make_tuple(std::get<1>(vertices_).begin(), std::get<1>(tags_).begin())); + } } + auto const begin() const { return cbegin(); } + auto end() { CUGRAPH_EXPECTS( @@ -369,15 +559,13 @@ class key_bucket_t { return begin() + std::get<0>(vertices_).size(); } - auto const vertex_begin() const + auto const cend() const { - return vertices_.index() == 0 ? std::get<0>(vertices_).begin() : std::get<1>(vertices_).begin(); + return begin() + + (vertices_.index() == 0 ? std::get<0>(vertices_).size() : std::get<1>(vertices_).size()); } - auto const vertex_end() const - { - return vertices_.index() == 0 ? std::get<0>(vertices_).end() : std::get<1>(vertices_).end(); - } + auto const end() const { return cend(); } auto vertex_begin() { @@ -387,6 +575,13 @@ class key_bucket_t { return std::get<0>(vertices_).begin(); } + auto const vertex_cbegin() const + { + return vertices_.index() == 0 ? std::get<0>(vertices_).begin() : std::get<1>(vertices_).begin(); + } + + auto const vertex_begin() const { return vertex_cbegin(); } + auto vertex_end() { CUGRAPH_EXPECTS( @@ -395,6 +590,13 @@ class key_bucket_t { return std::get<0>(vertices_).end(); } + auto const vertex_cend() const + { + return vertices_.index() == 0 ? 
std::get<0>(vertices_).end() : std::get<1>(vertices_).end(); + } + + auto const vertex_end() const { return vertex_cend(); } + bool is_owning() { return (vertices_.index() == 0); } private: diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index 9796ddd60a1..cd98db31654 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -44,6 +44,7 @@ #include #include +#include namespace cugraph { @@ -299,6 +300,121 @@ bool check_no_parallel_edge(raft::handle_t const& handle, (org_edge_first + edgelist_srcs.size()); } +template +std::vector> +split_edge_chunk_compressed_elements_to_local_edge_partitions( + raft::handle_t const& handle, + std::vector>&& edgelist_compressed_elements, + std::vector> const& edgelist_edge_offset_vectors, + std::vector const& edge_partition_edge_counts, + std::vector> const& edge_partition_intra_partition_segment_offset_vectors, + std::vector> const& + edge_partition_intra_segment_copy_output_displacement_vectors, + size_t element_size) +{ + auto num_chunks = edgelist_compressed_elements.size(); + auto num_edge_partitions = edge_partition_edge_counts.size(); + auto num_segments = edge_partition_intra_partition_segment_offset_vectors[0].size() - 1; + for (size_t i = 0; i < edge_partition_intra_partition_segment_offset_vectors.size(); ++i) { + assert(edge_partition_intra_partition_segment_offset_vectors[i].size() == (num_segments + 1)); + } + + std::vector> edge_partition_compressed_elements{}; + edge_partition_compressed_elements.reserve(num_edge_partitions); + for (size_t i = 0; i < num_edge_partitions; ++i) { + edge_partition_compressed_elements.push_back(rmm::device_uvector( + edge_partition_edge_counts[i] * element_size, handle.get_stream())); + } + + for (size_t i = 0; i < num_edge_partitions; ++i) { + for (size_t j = 0; j < num_segments; ++j) { + for (size_t k = 0; k < num_chunks; ++k) { + auto segment_offset = edgelist_edge_offset_vectors[k][i * num_segments + j]; + auto segment_size = edgelist_edge_offset_vectors[k][i * num_segments + j + 1] - + edgelist_edge_offset_vectors[k][i * num_segments + j]; + auto output_offset = + edge_partition_intra_partition_segment_offset_vectors[i][j] + + edge_partition_intra_segment_copy_output_displacement_vectors[i][j * num_chunks + k]; + thrust::copy( + handle.get_thrust_policy(), + edgelist_compressed_elements[k].begin() + segment_offset * element_size, + edgelist_compressed_elements[k].begin() + (segment_offset + segment_size) * element_size, + edge_partition_compressed_elements[i].begin() + output_offset * element_size); + } + } + } + edgelist_compressed_elements.clear(); + + return edge_partition_compressed_elements; +} + +template +std::vector> split_edge_chunk_elements_to_local_edge_partitions( + raft::handle_t const& handle, + std::vector>&& edgelist_elements, + std::vector> const& edgelist_edge_offset_vectors, + std::vector const& edge_partition_edge_counts, + std::vector> const& edge_partition_intra_partition_segment_offset_vectors, + std::vector> const& + edge_partition_intra_segment_copy_output_displacement_vectors) +{ + static_assert(std::is_arithmetic_v); // otherwise, unimplemented. 
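
The compressed split above never interprets the bytes it moves; each (segment, chunk) slice is addressed purely in element_size-byte strides. The addressing in isolation, on host containers with std::copy standing in for thrust::copy (the helper name is illustrative):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Copy `count` elements of `element_size` bytes each, from element offset src_off in src
// to element offset dst_off in dst (dst must already be sized).
void copy_compressed_elements(std::vector<uint8_t> const& src,
                              size_t src_off,
                              std::vector<uint8_t>& dst,
                              size_t dst_off,
                              size_t count,
                              size_t element_size)
{
  std::copy(src.begin() + src_off * element_size,
            src.begin() + (src_off + count) * element_size,
            dst.begin() + dst_off * element_size);
}
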
+ auto num_chunks = edgelist_elements.size(); + auto num_edge_partitions = edge_partition_edge_counts.size(); + auto num_segments = edge_partition_intra_partition_segment_offset_vectors[0].size() - 1; + for (size_t i = 0; i < edge_partition_intra_partition_segment_offset_vectors.size(); ++i) { + assert(edge_partition_intra_partition_segment_offset_vectors[i].size() == (num_segments + 1)); + } + + std::vector> edge_partition_elements{}; + edge_partition_elements.reserve(num_edge_partitions); + for (size_t i = 0; i < num_edge_partitions; ++i) { + edge_partition_elements.push_back( + rmm::device_uvector(edge_partition_edge_counts[i], handle.get_stream())); + } + + for (size_t i = 0; i < num_edge_partitions; ++i) { + for (size_t j = 0; j < num_segments; ++j) { + for (size_t k = 0; k < num_chunks; ++k) { + auto segment_offset = edgelist_edge_offset_vectors[k][i * num_segments + j]; + auto segment_size = edgelist_edge_offset_vectors[k][i * num_segments + j + 1] - + edgelist_edge_offset_vectors[k][i * num_segments + j]; + auto output_offset = + edge_partition_intra_partition_segment_offset_vectors[i][j] + + edge_partition_intra_segment_copy_output_displacement_vectors[i][j * num_chunks + k]; + thrust::copy(handle.get_thrust_policy(), + edgelist_elements[k].begin() + segment_offset, + edgelist_elements[k].begin() + (segment_offset + segment_size), + edge_partition_elements[i].begin() + output_offset); + } + } + } + edgelist_elements.clear(); + + return edge_partition_elements; +} + +template +void decompress_vertices(raft::handle_t const& handle, + raft::device_span compressed_vertices, + raft::device_span vertices, + size_t compressed_v_size) +{ + auto input_v_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [byte_first = compressed_vertices.begin(), compressed_v_size] __device__(size_t i) { + uint64_t v{0}; + for (size_t j = 0; j < compressed_v_size; ++j) { + auto b = *(byte_first + i * compressed_v_size + j); + v |= static_cast(b) << (8 * j); + } + return static_cast(v); + })); + thrust::copy( + handle.get_thrust_policy(), input_v_first, input_v_first + vertices.size(), vertices.begin()); +} + template >>&& edge_partition_edgelist_weights, std::optional>>&& edge_partition_edgelist_edge_ids, std::optional>>&& edge_partition_edgelist_edge_types, - std::vector> const& edgelist_intra_partition_segment_offsets, + std::vector> const& edgelist_intra_partition_segment_offset_vectors, graph_properties_t graph_properties, bool renumber) { @@ -347,14 +463,14 @@ create_graph_from_partitioned_edgelist( src_ptrs[i] = edge_partition_edgelist_srcs[i].begin(); dst_ptrs[i] = edge_partition_edgelist_dsts[i].begin(); } - auto [renumber_map_labels, meta] = - cugraph::renumber_edgelist(handle, - std::move(local_vertices), - src_ptrs, - dst_ptrs, - edgelist_edge_counts, - edgelist_intra_partition_segment_offsets, - store_transposed); + auto [renumber_map_labels, meta] = cugraph::renumber_edgelist( + handle, + std::move(local_vertices), + src_ptrs, + dst_ptrs, + edgelist_edge_counts, + edgelist_intra_partition_segment_offset_vectors, + store_transposed); auto num_segments_per_vertex_partition = static_cast(meta.edge_partition_segment_offsets.size() / minor_comm_size); @@ -369,7 +485,7 @@ create_graph_from_partitioned_edgelist( if (edge_partition_edgelist_edge_ids) { element_size += sizeof(edge_id_t); } if (edge_partition_edgelist_edge_types) { element_size += sizeof(edge_type_t); } auto constexpr mem_frugal_ratio = - 0.25; // if the expected 
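
decompress_vertices above reassembles each ID from compressed_v_size little-endian bytes; the matching compression loop appears further below, in step 3. A host-side round trip of the same encoding (helper names are illustrative):

#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint8_t> compress_id(uint64_t v, size_t k /* bytes per ID */)
{
  std::vector<uint8_t> bytes(k);
  for (size_t j = 0; j < k; ++j) { bytes[j] = static_cast<uint8_t>((v >> (8 * j)) & 0xff); }
  return bytes;
}

uint64_t decompress_id(std::vector<uint8_t> const& bytes)
{
  uint64_t v{0};
  for (size_t j = 0; j < bytes.size(); ++j) { v |= static_cast<uint64_t>(bytes[j]) << (8 * j); }
  return v;
}
// e.g. decompress_id(compress_id(uint64_t{0x0123456789}, 5)) == uint64_t{0x0123456789}
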
temporary buffer size exceeds the mem_frugal_ratio of the + 0.05; // if the expected temporary buffer size exceeds the mem_frugal_ratio of the // total_global_mem, switch to the memory frugal approach auto mem_frugal_threshold = static_cast(static_cast(total_global_mem / element_size) * mem_frugal_ratio); @@ -684,11 +800,13 @@ create_graph_from_partitioned_edgelist( std::move(edge_partition_offsets), std::move(edge_partition_indices), std::move(edge_partition_dcs_nzd_vertices), - cugraph::graph_meta_t{meta.number_of_vertices, - meta.number_of_edges, - graph_properties, - meta.partition, - meta.edge_partition_segment_offsets}), + cugraph::graph_meta_t{ + meta.number_of_vertices, + meta.number_of_edges, + graph_properties, + meta.partition, + meta.edge_partition_segment_offsets, + meta.edge_partition_hypersparse_degree_offsets}), std::move(edge_weights), std::move(edge_ids), std::move(edge_types), @@ -790,7 +908,7 @@ create_graph_from_edgelist_impl( handle.sync_stream(); std::vector edgelist_edge_counts(minor_comm_size, edge_t{0}); - auto edgelist_intra_partition_segment_offsets = std::vector>( + auto edgelist_intra_partition_segment_offset_vectors = std::vector>( minor_comm_size, std::vector(major_comm_size + 1, edge_t{0})); for (int i = 0; i < minor_comm_size; ++i) { edgelist_edge_counts[i] = std::accumulate(h_edge_counts.begin() + major_comm_size * i, @@ -798,7 +916,7 @@ create_graph_from_edgelist_impl( edge_t{0}); std::partial_sum(h_edge_counts.begin() + major_comm_size * i, h_edge_counts.begin() + major_comm_size * (i + 1), - edgelist_intra_partition_segment_offsets[i].begin() + 1); + edgelist_intra_partition_segment_offset_vectors[i].begin() + 1); } std::vector edgelist_displacements(minor_comm_size, edge_t{0}); std::partial_sum(edgelist_edge_counts.begin(), @@ -898,7 +1016,7 @@ create_graph_from_edgelist_impl( std::move(edge_partition_edgelist_weights), std::move(edge_partition_edgelist_edge_ids), std::move(edge_partition_edgelist_edge_types), - edgelist_intra_partition_segment_offsets, + edgelist_intra_partition_segment_offset_vectors, graph_properties, renumber); } @@ -1021,30 +1139,66 @@ create_graph_from_edgelist_impl( } } - // 1. groupby each edge chunks to their target local adjacency matrix partition (and further + auto num_chunks = edgelist_srcs.size(); + + // 1. 
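
With the ratio lowered from 0.25 to 0.05, the memory-frugal path now engages once the expected temporary buffer would exceed 5% of device memory. The threshold arithmetic, counted in edges (a sketch; the function wrapper is illustrative):

#include <cstddef>

// Edge count above which the memory-frugal code path is taken.
size_t mem_frugal_threshold(size_t total_global_mem, size_t element_size, double ratio)
{
  return static_cast<size_t>(static_cast<double>(total_global_mem / element_size) * ratio);
}
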
set whether to temporarily compress vertex IDs or not in splitting edge chunks + + size_t compressed_v_size = + sizeof(vertex_t); // if set to a value smaller than sizeof(vertex_t), temporarily store vertex + // IDs in compressed_v_size byte variables + + static_assert((sizeof(vertex_t) == 4) || (sizeof(vertex_t) == 8)); + if constexpr (sizeof(vertex_t) == 8) { // 64 bit vertex ID + static_assert(std::is_signed_v); // __clzll takes a signed integer + + auto total_global_mem = handle.get_device_properties().totalGlobalMem; + size_t element_size = sizeof(vertex_t) * 2; + if (edgelist_weights) { element_size += sizeof(weight_t); } + if (edgelist_edge_ids) { element_size += sizeof(edge_id_t); } + if (edgelist_edge_types) { element_size += sizeof(edge_type_t); } + edge_t num_edges{0}; + for (size_t i = 0; i < edgelist_srcs.size(); ++i) { + num_edges += edgelist_srcs[i].size(); + } + bool compress{false}; + if (static_cast(num_edges) * element_size > + static_cast(total_global_mem * 0.5 /* tuning parameter */)) { + compress = true; + } + + if (compress) { + size_t min_clz{sizeof(vertex_t) * 8}; + for (size_t i = 0; i < num_chunks; ++i) { + min_clz = + thrust::transform_reduce(handle.get_thrust_policy(), + edgelist_srcs[i].begin(), + edgelist_srcs[i].end(), + cuda::proclaim_return_type([] __device__(auto v) { + return static_cast(__clzll(v)); + }), + min_clz, + thrust::minimum{}); + min_clz = + thrust::transform_reduce(handle.get_thrust_policy(), + edgelist_dsts[i].begin(), + edgelist_dsts[i].end(), + cuda::proclaim_return_type([] __device__(auto v) { + return static_cast(__clzll(v)); + }), + min_clz, + thrust::minimum{}); + } + compressed_v_size = sizeof(vertex_t) - (min_clz / 8); + compressed_v_size = std::max(compressed_v_size, size_t{1}); + } + } + + // 2. groupby each edge chunks to their target local adjacency matrix partition (and further // groupby within the local partition by applying the compute_gpu_id_from_vertex_t to minor vertex // IDs). - std::vector>> edgelist_partitioned_srcs( - edgelist_srcs.size()); - std::vector>> edgelist_partitioned_dsts( - edgelist_srcs.size()); - auto edgelist_partitioned_weights = - edgelist_weights ? std::make_optional>>>( - edgelist_srcs.size()) - : std::nullopt; - auto edgelist_partitioned_edge_ids = - edgelist_edge_ids - ? std::make_optional>>>( - edgelist_srcs.size()) - : std::nullopt; - auto edgelist_partitioned_edge_types = - edgelist_edge_types - ? 
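
The compression width above falls out of the minimum leading-zero count across all endpoint IDs: any byte that is zero in every ID can be dropped. A host-side sketch, with GCC/Clang's __builtin_clzll standing in for the CUDA device intrinsic __clzll (the helper name is illustrative):

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Bytes needed to represent every ID in [0, max_v]; at least one byte is always kept.
size_t compressed_byte_width(int64_t max_v)
{
  size_t min_clz =
    (max_v > 0) ? static_cast<size_t>(__builtin_clzll(static_cast<uint64_t>(max_v))) : 64;
  return std::max(sizeof(int64_t) - min_clz / 8, size_t{1});
}
// e.g. max_v = 100'000'000 -> clz = 37 -> 8 - 37 / 8 = 4 bytes per vertex ID
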
std::make_optional>>>( - edgelist_srcs.size()) - : std::nullopt; - - for (size_t i = 0; i < edgelist_srcs.size(); ++i) { // iterate over input edge chunks + std::vector> edgelist_edge_offset_vectors(num_chunks); + for (size_t i = 0; i < num_chunks; ++i) { // iterate over input edge chunks std::optional> this_chunk_weights{std::nullopt}; if (edgelist_weights) { this_chunk_weights = std::move((*edgelist_weights)[i]); } std::optional> this_chunk_edge_ids{std::nullopt}; @@ -1060,6 +1214,9 @@ create_graph_from_edgelist_impl( this_chunk_edge_ids, this_chunk_edge_types, true); + if (this_chunk_weights) { (*edgelist_weights)[i] = std::move(*this_chunk_weights); } + if (this_chunk_edge_ids) { (*edgelist_edge_ids)[i] = std::move(*this_chunk_edge_ids); } + if (this_chunk_edge_types) { (*edgelist_edge_types)[i] = std::move(*this_chunk_edge_types); } std::vector h_this_chunk_edge_counts(d_this_chunk_edge_counts.size()); raft::update_host(h_this_chunk_edge_counts.data(), @@ -1067,132 +1224,84 @@ create_graph_from_edgelist_impl( d_this_chunk_edge_counts.size(), handle.get_stream()); handle.sync_stream(); - std::vector h_this_chunk_edge_displacements(h_this_chunk_edge_counts.size()); - std::exclusive_scan(h_this_chunk_edge_counts.begin(), + std::vector h_this_chunk_edge_offsets( + h_this_chunk_edge_counts.size() + 1, + 0); // size = minor_comm_size (# local edge partitions) * major_comm_size (# segments in the + // local minor range) + std::inclusive_scan(h_this_chunk_edge_counts.begin(), h_this_chunk_edge_counts.end(), - h_this_chunk_edge_displacements.begin(), - size_t{0}); - - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_srcs(h_this_chunk_edge_counts[j], handle.get_stream()); - auto input_first = edgelist_srcs[i].begin() + h_this_chunk_edge_displacements[j]; - thrust::copy( - handle.get_thrust_policy(), input_first, input_first + tmp_srcs.size(), tmp_srcs.begin()); - edgelist_partitioned_srcs[i].push_back(std::move(tmp_srcs)); - } - edgelist_srcs[i].resize(0, handle.get_stream()); - edgelist_srcs[i].shrink_to_fit(handle.get_stream()); - - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_dsts(h_this_chunk_edge_counts[j], handle.get_stream()); - auto input_first = edgelist_dsts[i].begin() + h_this_chunk_edge_displacements[j]; - thrust::copy( - handle.get_thrust_policy(), input_first, input_first + tmp_dsts.size(), tmp_dsts.begin()); - edgelist_partitioned_dsts[i].push_back(std::move(tmp_dsts)); - } - edgelist_dsts[i].resize(0, handle.get_stream()); - edgelist_dsts[i].shrink_to_fit(handle.get_stream()); - - if (this_chunk_weights) { - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_weights(h_this_chunk_edge_counts[j], handle.get_stream()); - auto input_first = (*this_chunk_weights).begin() + h_this_chunk_edge_displacements[j]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + tmp_weights.size(), - tmp_weights.begin()); - (*edgelist_partitioned_weights)[i].push_back(std::move(tmp_weights)); - } - (*this_chunk_weights).resize(0, handle.get_stream()); - (*this_chunk_weights).shrink_to_fit(handle.get_stream()); - } - - if (this_chunk_edge_ids) { - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # 
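
The reworked bookkeeping above keeps a single offsets vector per chunk instead of materializing one sub-buffer per (partition, segment); the offsets are just the per-slot edge counts run through an inclusive scan shifted one slot right (a sketch, illustrative helper name):

#include <cstddef>
#include <numeric>
#include <vector>

std::vector<size_t> counts_to_offsets(std::vector<size_t> const& counts)
{
  std::vector<size_t> offsets(counts.size() + 1, 0);
  std::inclusive_scan(counts.begin(), counts.end(), offsets.begin() + 1);
  return offsets;
}
// counts {3, 0, 2} -> offsets {0, 3, 3, 5}; slot j spans [offsets[j], offsets[j + 1])
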
segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_edge_ids(h_this_chunk_edge_counts[j], - handle.get_stream()); - auto input_first = (*this_chunk_edge_ids).begin() + h_this_chunk_edge_displacements[j]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + tmp_edge_ids.size(), - tmp_edge_ids.begin()); - (*edgelist_partitioned_edge_ids)[i].push_back(std::move(tmp_edge_ids)); - } - (*this_chunk_edge_ids).resize(0, handle.get_stream()); - (*this_chunk_edge_ids).shrink_to_fit(handle.get_stream()); - } + h_this_chunk_edge_offsets.begin() + 1); + edgelist_edge_offset_vectors[i] = std::move(h_this_chunk_edge_offsets); + } - if (this_chunk_edge_types) { - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_edge_types(h_this_chunk_edge_counts[j], - handle.get_stream()); - auto input_first = (*this_chunk_edge_types).begin() + h_this_chunk_edge_displacements[j]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + tmp_edge_types.size(), - tmp_edge_types.begin()); - (*edgelist_partitioned_edge_types)[i].push_back(std::move(tmp_edge_types)); - } - (*this_chunk_edge_types).resize(0, handle.get_stream()); - (*this_chunk_edge_types).shrink_to_fit(handle.get_stream()); + // 3. compress edge chunk source/destination vertices to cut intermediate peak memory requirement + + std::optional>> edgelist_compressed_srcs{std::nullopt}; + std::optional>> edgelist_compressed_dsts{std::nullopt}; + if (compressed_v_size < sizeof(vertex_t)) { + edgelist_compressed_srcs = std::vector>{}; + edgelist_compressed_dsts = std::vector>{}; + (*edgelist_compressed_srcs).reserve(num_chunks); + (*edgelist_compressed_dsts).reserve(num_chunks); + for (size_t i = 0; i < num_chunks; ++i) { // iterate over input edge chunks + // compress source values + auto tmp_srcs = rmm::device_uvector(edgelist_srcs[i].size() * compressed_v_size, + handle.get_stream()); + auto input_src_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [src_first = edgelist_srcs[i].begin(), compressed_v_size] __device__(size_t i) { + auto v = static_cast(*(src_first + (i / compressed_v_size))); + return static_cast((v >> (8 * (i % compressed_v_size))) & uint64_t{0xff}); + })); + thrust::copy(handle.get_thrust_policy(), + input_src_first, + input_src_first + edgelist_srcs[i].size() * compressed_v_size, + tmp_srcs.begin()); + edgelist_srcs[i].resize(0, handle.get_stream()); + edgelist_srcs[i].shrink_to_fit(handle.get_stream()); + (*edgelist_compressed_srcs).push_back(std::move(tmp_srcs)); + + // compress destination values + + auto tmp_dsts = rmm::device_uvector(edgelist_dsts[i].size() * compressed_v_size, + handle.get_stream()); + auto input_dst_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [dst_first = edgelist_dsts[i].begin(), compressed_v_size] __device__(size_t i) { + auto v = static_cast(*(dst_first + (i / compressed_v_size))); + return static_cast((v >> (8 * (i % compressed_v_size))) & uint64_t{0xff}); + })); + thrust::copy(handle.get_thrust_policy(), + input_dst_first, + input_dst_first + edgelist_dsts[i].size() * compressed_v_size, + tmp_dsts.begin()); + edgelist_dsts[i].resize(0, handle.get_stream()); + edgelist_dsts[i].shrink_to_fit(handle.get_stream()); + (*edgelist_compressed_dsts).push_back(std::move(tmp_dsts)); } } - edgelist_srcs.clear(); - 
edgelist_dsts.clear(); - if (edgelist_weights) { (*edgelist_weights).clear(); } - if (edgelist_edge_ids) { (*edgelist_edge_ids).clear(); } - if (edgelist_edge_types) { (*edgelist_edge_types).clear(); } - // 2. split the grouped edge chunks to local partitions + // 4. compute additional copy_offset vectors - auto edgelist_intra_partition_segment_offsets = std::vector>(minor_comm_size); - - std::vector> edge_partition_edgelist_srcs{}; - edge_partition_edgelist_srcs.reserve(minor_comm_size); - std::vector> edge_partition_edgelist_dsts{}; - edge_partition_edgelist_dsts.reserve(minor_comm_size); - auto edge_partition_edgelist_weights = - edgelist_partitioned_weights ? std::make_optional>>() - : std::nullopt; - if (edgelist_partitioned_weights) { (*edge_partition_edgelist_weights).reserve(minor_comm_size); } - auto edge_partition_edgelist_edge_ids = - edgelist_partitioned_edge_ids - ? std::make_optional>>() - : std::nullopt; - if (edgelist_partitioned_edge_ids) { - (*edge_partition_edgelist_edge_ids).reserve(minor_comm_size); - } - auto edge_partition_edgelist_edge_types = - edgelist_partitioned_edge_types - ? std::make_optional>>() - : std::nullopt; - if (edgelist_partitioned_edge_types) { - (*edge_partition_edgelist_edge_types).reserve(minor_comm_size); - } - - for (int i = 0; i < minor_comm_size; ++i) { // iterate over local edge partitions + std::vector edge_partition_edge_counts(minor_comm_size); + std::vector> edge_partition_intra_partition_segment_offset_vectors( + minor_comm_size); + std::vector> edge_partition_intra_segment_copy_output_displacement_vectors( + minor_comm_size); + for (int i = 0; i < minor_comm_size; ++i) { edge_t edge_count{0}; std::vector intra_partition_segment_sizes(major_comm_size, 0); - std::vector intra_segment_copy_output_displacements(major_comm_size * - edgelist_partitioned_srcs.size()); + std::vector intra_segment_copy_output_displacements(major_comm_size * num_chunks); for (int j = 0; j < major_comm_size /* # segments in the local minor range */; ++j) { edge_t displacement{0}; - for (size_t k = 0; k < edgelist_partitioned_srcs.size() /* # input edge chunks */; ++k) { - auto segment_size = edgelist_partitioned_srcs[k][i * major_comm_size + j].size(); + for (size_t k = 0; k < num_chunks; ++k) { + auto segment_size = edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - + edgelist_edge_offset_vectors[k][i * major_comm_size + j]; edge_count += segment_size; intra_partition_segment_sizes[j] += segment_size; - intra_segment_copy_output_displacements[j * edgelist_partitioned_srcs.size() + k] = - displacement; + intra_segment_copy_output_displacements[j * num_chunks + k] = displacement; displacement += segment_size; } } @@ -1201,93 +1310,133 @@ create_graph_from_edgelist_impl( intra_partition_segment_sizes.end(), intra_partition_segment_offsets.begin() + 1); - rmm::device_uvector tmp_srcs(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_srcs.size(); ++k) { - auto& input_buffer = edgelist_partitioned_srcs[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_srcs.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_srcs.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); - } - } - edge_partition_edgelist_srcs.push_back(std::move(tmp_srcs)); + edge_partition_edge_counts[i] = edge_count; + 
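
Step 4 above pins down, for every (local edge partition i, minor segment j, input chunk k), where chunk k's slice of segment j lands in partition i's output buffer. The placement rule, spelled out on host containers (a sketch, illustrative name):

#include <cstddef>
#include <vector>

// Destination element offset of chunk k's slice of segment j within one edge partition.
size_t output_offset(std::vector<size_t> const& segment_offsets,  // size: num_segments + 1
                     std::vector<size_t> const& displacements,    // size: num_segments * num_chunks
                     size_t num_chunks,
                     size_t j /* segment */,
                     size_t k /* chunk */)
{
  return segment_offsets[j] + displacements[j * num_chunks + k];
}
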
edge_partition_intra_partition_segment_offset_vectors[i] = + std::move(intra_partition_segment_offsets); + edge_partition_intra_segment_copy_output_displacement_vectors[i] = + std::move(intra_segment_copy_output_displacements); + } - rmm::device_uvector tmp_dsts(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { - auto& input_buffer = edgelist_partitioned_dsts[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_dsts.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); - } - } - edge_partition_edgelist_dsts.push_back(std::move(tmp_dsts)); + // 5. split the grouped edge chunks to local partitions - if (edge_partition_edgelist_weights) { - rmm::device_uvector tmp_weights(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { - auto& input_buffer = (*edgelist_partitioned_weights)[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_weights.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); - } - } - (*edge_partition_edgelist_weights).push_back(std::move(tmp_weights)); - } + std::vector> edge_partition_edgelist_srcs{}; + std::vector> edge_partition_edgelist_dsts{}; + std::optional>> edge_partition_edgelist_weights{ + std::nullopt}; + std::optional>> edge_partition_edgelist_edge_ids{ + std::nullopt}; + std::optional>> edge_partition_edgelist_edge_types{ + std::nullopt}; - if (edge_partition_edgelist_edge_ids) { - rmm::device_uvector tmp_edge_ids(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { - auto& input_buffer = (*edgelist_partitioned_edge_ids)[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_edge_ids.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); - } - } - (*edge_partition_edgelist_edge_ids).push_back(std::move(tmp_edge_ids)); - } + std::optional>> + edge_partition_edgelist_compressed_srcs{}; + std::optional>> + edge_partition_edgelist_compressed_dsts{}; - if (edge_partition_edgelist_edge_types) { - rmm::device_uvector tmp_edge_types(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { - auto& input_buffer = (*edgelist_partitioned_edge_types)[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_edge_types.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); - } - } - 
(*edge_partition_edgelist_edge_types).push_back(std::move(tmp_edge_types)); - } + if (compressed_v_size < sizeof(vertex_t)) { + edge_partition_edgelist_compressed_srcs = + split_edge_chunk_compressed_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_compressed_srcs), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors, + compressed_v_size); + + edge_partition_edgelist_compressed_dsts = + split_edge_chunk_compressed_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_compressed_dsts), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors, + compressed_v_size); + } else { + edge_partition_edgelist_srcs = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(edgelist_srcs), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + + edge_partition_edgelist_dsts = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(edgelist_dsts), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + } + + if (edgelist_weights) { + edge_partition_edgelist_weights = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_weights), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + } + if (edgelist_edge_ids) { + edge_partition_edgelist_edge_ids = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_edge_ids), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + } + if (edgelist_edge_types) { + edge_partition_edgelist_edge_types = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_edge_types), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + } - edgelist_intra_partition_segment_offsets[i] = std::move(intra_partition_segment_offsets); + // 6. 
decompress edge chunk source/destination vertices to cut intermediate peak memory + // requirement + + if (compressed_v_size < sizeof(vertex_t)) { + assert(edge_partition_edgelist_compressed_srcs); + assert(edge_partition_edgelist_compressed_dsts); + + edge_partition_edgelist_srcs.reserve(minor_comm_size); + edge_partition_edgelist_dsts.reserve(minor_comm_size); + + for (int i = 0; i < minor_comm_size; ++i) { + rmm::device_uvector tmp_srcs(edge_partition_edge_counts[i], handle.get_stream()); + decompress_vertices( + handle, + raft::device_span((*edge_partition_edgelist_compressed_srcs)[i].data(), + (*edge_partition_edgelist_compressed_srcs)[i].size()), + raft::device_span(tmp_srcs.data(), tmp_srcs.size()), + compressed_v_size); + edge_partition_edgelist_srcs.push_back(std::move(tmp_srcs)); + (*edge_partition_edgelist_compressed_srcs)[i].resize(0, handle.get_stream()); + (*edge_partition_edgelist_compressed_srcs)[i].shrink_to_fit(handle.get_stream()); + + rmm::device_uvector tmp_dsts(edge_partition_edge_counts[i], handle.get_stream()); + decompress_vertices( + handle, + raft::device_span((*edge_partition_edgelist_compressed_dsts)[i].data(), + (*edge_partition_edgelist_compressed_dsts)[i].size()), + raft::device_span(tmp_dsts.data(), tmp_dsts.size()), + compressed_v_size); + edge_partition_edgelist_dsts.push_back(std::move(tmp_dsts)); + (*edge_partition_edgelist_compressed_dsts)[i].resize(0, handle.get_stream()); + (*edge_partition_edgelist_compressed_dsts)[i].shrink_to_fit(handle.get_stream()); + } } return create_graph_from_partitioned_edgelist(edgelist_srcs.data(), edgelist_srcs.size()), raft::device_span(edgelist_dsts.data(), edgelist_dsts.size()))), - "Invalid input arguments: graph_properties.is_symmetric is true but the input edge list is " + "Invalid input arguments: graph_properties.is_symmetric is true but the input edge " + "list is " "not symmetric."); } @@ -1377,7 +1527,8 @@ create_graph_from_edgelist_impl( handle, raft::device_span(edgelist_srcs.data(), edgelist_srcs.size()), raft::device_span(edgelist_dsts.data(), edgelist_dsts.size())), - "Invalid input arguments: graph_properties.is_multigraph is false but the input edge list " + "Invalid input arguments: graph_properties.is_multigraph is false but the input edge " + "list " "has parallel edges."); } } @@ -1605,7 +1756,8 @@ create_graph_from_edgelist_impl( cugraph::graph_meta_t{ num_vertices, graph_properties, - renumber ? std::optional>{meta.segment_offsets} : std::nullopt}), + renumber ? 
std::optional>{meta.segment_offsets} : std::nullopt, + meta.hypersparse_degree_offsets}), std::move(edge_weights), std::move(edge_ids), std::move(edge_types), @@ -1759,15 +1911,15 @@ create_graph_from_edgelist_impl( renumber); if (graph_properties.is_symmetric) { - CUGRAPH_EXPECTS( - (check_symmetric( - handle, - raft::device_span(aggregate_edgelist_srcs.data(), - aggregate_edgelist_srcs.size()), - raft::device_span(aggregate_edgelist_dsts.data(), - aggregate_edgelist_dsts.size()))), - "Invalid input arguments: graph_properties.is_symmetric is true but the input edge list is " - "not symmetric."); + CUGRAPH_EXPECTS((check_symmetric( + handle, + raft::device_span(aggregate_edgelist_srcs.data(), + aggregate_edgelist_srcs.size()), + raft::device_span(aggregate_edgelist_dsts.data(), + aggregate_edgelist_dsts.size()))), + "Invalid input arguments: graph_properties.is_symmetric is true but the " + "input edge list is " + "not symmetric."); } if (!graph_properties.is_multigraph) { @@ -1777,7 +1929,8 @@ create_graph_from_edgelist_impl( aggregate_edgelist_srcs.size()), raft::device_span(aggregate_edgelist_dsts.data(), aggregate_edgelist_dsts.size())), - "Invalid input arguments: graph_properties.is_multigraph is false but the input edge list " + "Invalid input arguments: graph_properties.is_multigraph is false but " + "the input edge list " "has parallel edges."); } } diff --git a/cpp/src/structure/detail/structure_utils.cuh b/cpp/src/structure/detail/structure_utils.cuh index 1ef975c1dec..86e3c45ca2f 100644 --- a/cpp/src/structure/detail/structure_utils.cuh +++ b/cpp/src/structure/detail/structure_utils.cuh @@ -60,7 +60,8 @@ rmm::device_uvector compute_sparse_offsets( bool edgelist_major_sorted, rmm::cuda_stream_view stream_view) { - rmm::device_uvector offsets((major_range_last - major_range_first) + 1, stream_view); + rmm::device_uvector offsets(static_cast(major_range_last - major_range_first) + 1, + stream_view); if (edgelist_major_sorted) { offsets.set_element_to_zero_async(0, stream_view); thrust::upper_bound(rmm::exec_policy(stream_view), @@ -77,7 +78,9 @@ rmm::device_uvector compute_sparse_offsets( edgelist_major_first, edgelist_major_last, [offset_view, major_range_first] __device__(auto v) { - atomicAdd(&offset_view[v - major_range_first], edge_t{1}); + cuda::atomic_ref atomic_counter( + offset_view[v - major_range_first]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); }); thrust::exclusive_scan( @@ -246,30 +249,112 @@ sort_and_compress_edgelist(rmm::device_uvector&& edgelist_srcs, rmm::device_uvector offsets(0, stream_view); rmm::device_uvector indices(0, stream_view); - auto edge_first = thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_minors.begin()); if (edgelist_minors.size() > mem_frugal_threshold) { - offsets = compute_sparse_offsets(edgelist_majors.begin(), - edgelist_majors.end(), - major_range_first, - major_range_last, - false, - stream_view); + static_assert((sizeof(vertex_t) == 4) || (sizeof(vertex_t) == 8)); + if ((sizeof(vertex_t) == 8) && (static_cast(major_range_last - major_range_first) <= + static_cast(std::numeric_limits::max()))) { + rmm::device_uvector edgelist_major_offsets(edgelist_majors.size(), stream_view); + thrust::transform( + rmm::exec_policy_nosync(stream_view), + edgelist_majors.begin(), + edgelist_majors.end(), + edgelist_major_offsets.begin(), + cuda::proclaim_return_type([major_range_first] __device__(vertex_t major) { + return static_cast(major - major_range_first); + })); + edgelist_majors.resize(0, stream_view); 
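
compute_sparse_offsets above (now using a cuda::atomic_ref instead of a raw atomicAdd) is the usual count-then-scan CSR construction. The same logic, sequentially on host (a sketch, illustrative name):

#include <cstddef>
#include <numeric>
#include <vector>

std::vector<int> sparse_offsets(std::vector<int> const& majors, int range_first, int range_size)
{
  std::vector<int> offsets(static_cast<size_t>(range_size) + 1, 0);
  for (auto v : majors) { ++offsets[v - range_first]; }                     // degree histogram
  std::exclusive_scan(offsets.begin(), offsets.end(), offsets.begin(), 0);  // counts -> offsets
  return offsets;
}
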
+ edgelist_majors.shrink_to_fit(stream_view); + + offsets = + compute_sparse_offsets(edgelist_major_offsets.begin(), + edgelist_major_offsets.end(), + uint32_t{0}, + static_cast(major_range_last - major_range_first), + false, + stream_view); + std::array pivots{}; + for (size_t i = 0; i < 3; ++i) { + pivots[i] = static_cast(thrust::distance( + offsets.begin(), + thrust::lower_bound(rmm::exec_policy(stream_view), + offsets.begin(), + offsets.end(), + static_cast((edgelist_major_offsets.size() * (i + 1)) / 4)))); + } - auto pivot = major_range_first + static_cast(thrust::distance( - offsets.begin(), - thrust::lower_bound(rmm::exec_policy(stream_view), - offsets.begin(), - offsets.end(), - edgelist_minors.size() / 2))); - auto second_first = - detail::mem_frugal_partition(edge_first, - edge_first + edgelist_minors.size(), - thrust_tuple_get, 0>{}, - pivot, - stream_view); - thrust::sort(rmm::exec_policy(stream_view), edge_first, second_first); - thrust::sort(rmm::exec_policy(stream_view), second_first, edge_first + edgelist_minors.size()); + auto pair_first = + thrust::make_zip_iterator(edgelist_major_offsets.begin(), edgelist_minors.begin()); + auto second_half_first = + detail::mem_frugal_partition(pair_first, + pair_first + edgelist_major_offsets.size(), + thrust_tuple_get, 0>{}, + pivots[1], + stream_view); + auto second_quarter_first = + detail::mem_frugal_partition(pair_first, + second_half_first, + thrust_tuple_get, 0>{}, + pivots[0], + stream_view); + auto last_quarter_first = + detail::mem_frugal_partition(second_half_first, + pair_first + edgelist_major_offsets.size(), + thrust_tuple_get, 0>{}, + pivots[2], + stream_view); + thrust::sort(rmm::exec_policy(stream_view), pair_first, second_quarter_first); + thrust::sort(rmm::exec_policy(stream_view), second_quarter_first, second_half_first); + thrust::sort(rmm::exec_policy(stream_view), second_half_first, last_quarter_first); + thrust::sort(rmm::exec_policy(stream_view), + last_quarter_first, + pair_first + edgelist_major_offsets.size()); + } else { + offsets = compute_sparse_offsets(edgelist_majors.begin(), + edgelist_majors.end(), + major_range_first, + major_range_last, + false, + stream_view); + std::array pivots{}; + for (size_t i = 0; i < 3; ++i) { + pivots[i] = + major_range_first + + static_cast(thrust::distance( + offsets.begin(), + thrust::lower_bound(rmm::exec_policy(stream_view), + offsets.begin(), + offsets.end(), + static_cast((edgelist_minors.size() * (i + 1)) / 4)))); + } + auto edge_first = thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_minors.begin()); + auto second_half_first = + detail::mem_frugal_partition(edge_first, + edge_first + edgelist_majors.size(), + thrust_tuple_get, 0>{}, + pivots[1], + stream_view); + auto second_quarter_first = + detail::mem_frugal_partition(edge_first, + second_half_first, + thrust_tuple_get, 0>{}, + pivots[0], + stream_view); + auto last_quarter_first = + detail::mem_frugal_partition(second_half_first, + edge_first + edgelist_majors.size(), + thrust_tuple_get, 0>{}, + pivots[2], + stream_view); + thrust::sort(rmm::exec_policy(stream_view), edge_first, second_quarter_first); + thrust::sort(rmm::exec_policy(stream_view), second_quarter_first, second_half_first); + thrust::sort(rmm::exec_policy(stream_view), second_half_first, last_quarter_first); + thrust::sort( + rmm::exec_policy(stream_view), last_quarter_first, edge_first + edgelist_majors.size()); + edgelist_majors.resize(0, stream_view); + edgelist_majors.shrink_to_fit(stream_view); + } } else { + auto edge_first = 
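
The quartering above generalizes the previous halve-then-sort scheme: three pivots read from the offsets array at the 1/4, 1/2, and 3/4 edge marks split the range so each sort call touches roughly a quarter of the edges, shrinking the sort's temporary allocation accordingly. A host-side sketch, with std::partition standing in for detail::mem_frugal_partition (pivots must satisfy pivot1 <= pivot2 <= pivot3):

#include <algorithm>
#include <vector>

void mem_frugal_sort(std::vector<int>& keys, int pivot1, int pivot2, int pivot3)
{
  auto half = std::partition(keys.begin(), keys.end(), [=](int v) { return v < pivot2; });
  auto q1   = std::partition(keys.begin(), half, [=](int v) { return v < pivot1; });
  auto q3   = std::partition(half, keys.end(), [=](int v) { return v < pivot3; });
  std::sort(keys.begin(), q1);
  std::sort(q1, half);
  std::sort(half, q3);
  std::sort(q3, keys.end());
}
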
thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_minors.begin()); thrust::sort(rmm::exec_policy(stream_view), edge_first, edge_first + edgelist_minors.size()); offsets = compute_sparse_offsets(edgelist_majors.begin(), edgelist_majors.end(), @@ -277,12 +362,11 @@ sort_and_compress_edgelist(rmm::device_uvector&& edgelist_srcs, major_range_last, true, stream_view); + edgelist_majors.resize(0, stream_view); + edgelist_majors.shrink_to_fit(stream_view); } indices = std::move(edgelist_minors); - edgelist_majors.resize(0, stream_view); - edgelist_majors.shrink_to_fit(stream_view); - std::optional> dcs_nzd_vertices{std::nullopt}; if (major_hypersparse_first) { std::tie(offsets, dcs_nzd_vertices) = compress_hypersparse_offsets(std::move(offsets), diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index ef43b7b13ec..6661f0488d8 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -146,8 +146,7 @@ update_local_sorted_unique_edge_majors_minors( auto num_segments_per_vertex_partition = static_cast(meta.edge_partition_segment_offsets.size() / minor_comm_size); - auto use_dcs = - num_segments_per_vertex_partition > (detail::num_sparse_segments_per_vertex_partition + 2); + auto use_dcs = edge_partition_dcs_nzd_vertices.has_value(); std::optional>> local_sorted_unique_edge_majors{ std::nullopt}; @@ -166,14 +165,15 @@ update_local_sorted_unique_edge_majors_minors( // 1. Update local_sorted_unique_edge_minors & local_sorted_unique_edge_minor_offsets - { + if (detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold > 0.0) { auto [minor_range_first, minor_range_last] = meta.partition.local_edge_partition_minor_range(); auto minor_range_size = meta.partition.local_edge_partition_minor_range_size(); - rmm::device_uvector minor_bitmaps( - (minor_range_size + (sizeof(uint32_t) * 8 - 1)) / (sizeof(uint32_t) * 8), - handle.get_stream()); - thrust::fill( - handle.get_thrust_policy(), minor_bitmaps.begin(), minor_bitmaps.end(), uint32_t{0}); + rmm::device_uvector minor_bitmaps(packed_bool_size(minor_range_size), + handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + minor_bitmaps.begin(), + minor_bitmaps.end(), + packed_bool_empty_mask()); for (size_t i = 0; i < edge_partition_indices.size(); ++i) { thrust::for_each(handle.get_thrust_policy(), edge_partition_indices[i].begin(), @@ -281,92 +281,96 @@ update_local_sorted_unique_edge_majors_minors( // 2. 
Update local_sorted_unique_edge_majors & local_sorted_unique_edge_major_offsets - std::vector num_local_unique_edge_major_counts(edge_partition_offsets.size()); - for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { - num_local_unique_edge_major_counts[i] += thrust::count_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(edge_partition_offsets[i].size() - 1)), - has_nzd_t{edge_partition_offsets[i].data(), vertex_t{0}}); - } - auto num_local_unique_edge_majors = std::reduce(num_local_unique_edge_major_counts.begin(), - num_local_unique_edge_major_counts.end()); - - vertex_t aggregate_major_range_size{0}; - for (size_t i = 0; i < meta.partition.number_of_local_edge_partitions(); ++i) { - aggregate_major_range_size += meta.partition.local_edge_partition_major_range_size(i); - } - - auto max_major_properties_fill_ratio = - host_scalar_allreduce(comm, - static_cast(num_local_unique_edge_majors) / - static_cast(aggregate_major_range_size), - raft::comms::op_t::MAX, - handle.get_stream()); + if (detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold > 0.0) { + std::vector num_local_unique_edge_major_counts(edge_partition_offsets.size()); + for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { + num_local_unique_edge_major_counts[i] = thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(static_cast(edge_partition_offsets[i].size() - 1)), + has_nzd_t{edge_partition_offsets[i].data(), vertex_t{0}}); + } + auto num_local_unique_edge_majors = std::reduce(num_local_unique_edge_major_counts.begin(), + num_local_unique_edge_major_counts.end()); - if (max_major_properties_fill_ratio < - detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold) { - auto const chunk_size = - static_cast(std::min(1.0 / max_major_properties_fill_ratio, 1024.0)); + vertex_t aggregate_major_range_size{0}; + for (size_t i = 0; i < meta.partition.number_of_local_edge_partitions(); ++i) { + aggregate_major_range_size += meta.partition.local_edge_partition_major_range_size(i); + } - local_sorted_unique_edge_majors = std::vector>{}; - local_sorted_unique_edge_major_chunk_start_offsets = - std::vector>{}; + auto max_major_properties_fill_ratio = + host_scalar_allreduce(comm, + static_cast(num_local_unique_edge_majors) / + static_cast(aggregate_major_range_size), + raft::comms::op_t::MAX, + handle.get_stream()); - (*local_sorted_unique_edge_majors).reserve(edge_partition_offsets.size()); - (*local_sorted_unique_edge_major_chunk_start_offsets).reserve(edge_partition_offsets.size()); - for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { - auto [major_range_first, major_range_last] = - meta.partition.local_edge_partition_major_range(i); - auto sparse_range_last = - use_dcs - ? 
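
The chunk size computed just below is the inverse of the worst per-rank fill ratio, capped at 1024, so each chunk of the major range holds about one stored vertex on average and chunk-local searches stay short (a sketch, illustrative name):

#include <algorithm>
#include <cstddef>

size_t kv_chunk_size(double max_fill_ratio /* in (0, 1] */)
{
  return static_cast<size_t>(std::min(1.0 / max_fill_ratio, 1024.0));
}
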
(major_range_first + - meta.edge_partition_segment_offsets[num_segments_per_vertex_partition * i + + if (max_major_properties_fill_ratio < + detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold) { + auto const chunk_size = + static_cast(std::min(1.0 / max_major_properties_fill_ratio, 1024.0)); + + local_sorted_unique_edge_majors = std::vector>{}; + local_sorted_unique_edge_major_chunk_start_offsets = + std::vector>{}; + + (*local_sorted_unique_edge_majors).reserve(edge_partition_offsets.size()); + (*local_sorted_unique_edge_major_chunk_start_offsets).reserve(edge_partition_offsets.size()); + for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { + auto [major_range_first, major_range_last] = + meta.partition.local_edge_partition_major_range(i); + auto sparse_range_last = + use_dcs + ? (major_range_first + + meta + .edge_partition_segment_offsets[num_segments_per_vertex_partition * i + detail::num_sparse_segments_per_vertex_partition]) - : major_range_last; - - rmm::device_uvector unique_edge_majors(num_local_unique_edge_major_counts[i], - handle.get_stream()); - CUGRAPH_EXPECTS( - sparse_range_last - major_range_first < std::numeric_limits::max(), - "copy_if will fail (https://github.com/NVIDIA/thrust/issues/1302), work-around required."); - auto cur_size = thrust::distance( - unique_edge_majors.begin(), - thrust::copy_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(major_range_first), - thrust::make_counting_iterator(sparse_range_last), + : major_range_last; + + rmm::device_uvector unique_edge_majors(num_local_unique_edge_major_counts[i], + handle.get_stream()); + CUGRAPH_EXPECTS(sparse_range_last - major_range_first < std::numeric_limits::max(), + "copy_if will fail (https://github.com/NVIDIA/thrust/issues/1302), " + "work-around required."); + auto cur_size = thrust::distance( unique_edge_majors.begin(), - has_nzd_t{edge_partition_offsets[i].data(), major_range_first})); - if (use_dcs) { - thrust::copy(handle.get_thrust_policy(), - (*edge_partition_dcs_nzd_vertices)[i].begin(), - (*edge_partition_dcs_nzd_vertices)[i].end(), - unique_edge_majors.begin() + cur_size); + thrust::copy_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(major_range_first), + thrust::make_counting_iterator(sparse_range_last), + unique_edge_majors.begin(), + has_nzd_t{edge_partition_offsets[i].data(), major_range_first})); + if (use_dcs) { + thrust::copy(handle.get_thrust_policy(), + (*edge_partition_dcs_nzd_vertices)[i].begin(), + (*edge_partition_dcs_nzd_vertices)[i].end(), + unique_edge_majors.begin() + cur_size); + } + + auto num_chunks = static_cast( + ((major_range_last - major_range_first) + (chunk_size - size_t{1})) / chunk_size); + rmm::device_uvector unique_edge_major_chunk_start_offsets(num_chunks + size_t{1}, + handle.get_stream()); + + auto chunk_start_vertex_first = + thrust::make_transform_iterator(thrust::make_counting_iterator(vertex_t{0}), + detail::multiply_and_add_t{ + static_cast(chunk_size), major_range_first}); + thrust::lower_bound(handle.get_thrust_policy(), + unique_edge_majors.begin(), + unique_edge_majors.end(), + chunk_start_vertex_first, + chunk_start_vertex_first + num_chunks, + unique_edge_major_chunk_start_offsets.begin()); + unique_edge_major_chunk_start_offsets.set_element( + num_chunks, static_cast(unique_edge_majors.size()), handle.get_stream()); + + (*local_sorted_unique_edge_majors).push_back(std::move(unique_edge_majors)); + (*local_sorted_unique_edge_major_chunk_start_offsets) + 
.push_back(std::move(unique_edge_major_chunk_start_offsets)); } - - auto num_chunks = static_cast( - ((major_range_last - major_range_first) + (chunk_size - size_t{1})) / chunk_size); - rmm::device_uvector unique_edge_major_chunk_start_offsets(num_chunks + size_t{1}, - handle.get_stream()); - - auto chunk_start_vertex_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(vertex_t{0}), - detail::multiply_and_add_t{static_cast(chunk_size), major_range_first}); - thrust::lower_bound(handle.get_thrust_policy(), - unique_edge_majors.begin(), - unique_edge_majors.end(), - chunk_start_vertex_first, - chunk_start_vertex_first + num_chunks, - unique_edge_major_chunk_start_offsets.begin()); - unique_edge_major_chunk_start_offsets.set_element( - num_chunks, static_cast(unique_edge_majors.size()), handle.get_stream()); - - (*local_sorted_unique_edge_majors).push_back(std::move(unique_edge_majors)); - (*local_sorted_unique_edge_major_chunk_start_offsets) - .push_back(std::move(unique_edge_major_chunk_start_offsets)); + local_sorted_unique_edge_major_chunk_size = chunk_size; } - local_sorted_unique_edge_major_chunk_size = chunk_size; } return std::make_tuple(std::move(local_sorted_unique_edge_majors), @@ -378,6 +382,50 @@ update_local_sorted_unique_edge_majors_minors( std::move(local_sorted_unique_edge_minor_vertex_partition_offsets)); } +template +std::enable_if_t>> +compute_edge_partition_dcs_nzd_range_bitmaps( + raft::handle_t const& handle, + graph_meta_t const& meta, + std::vector> const& edge_partition_dcs_nzd_vertices) +{ + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + auto num_segments_per_vertex_partition = + static_cast(meta.edge_partition_segment_offsets.size() / minor_comm_size); + + std::vector> edge_partition_dcs_nzd_range_bitmaps{}; + edge_partition_dcs_nzd_range_bitmaps.reserve(edge_partition_dcs_nzd_vertices.size()); + for (size_t i = 0; i < edge_partition_dcs_nzd_vertices.size(); ++i) { + raft::host_span segment_offsets( + meta.edge_partition_segment_offsets.data() + num_segments_per_vertex_partition * i, + num_segments_per_vertex_partition); + rmm::device_uvector bitmap( + packed_bool_size(segment_offsets[detail::num_sparse_segments_per_vertex_partition + 1] - + segment_offsets[detail::num_sparse_segments_per_vertex_partition]), + handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); + auto major_range_first = meta.partition.local_edge_partition_major_range_first(i); + auto major_hypersparse_first = + major_range_first + segment_offsets[detail::num_sparse_segments_per_vertex_partition]; + thrust::for_each(handle.get_thrust_policy(), + edge_partition_dcs_nzd_vertices[i].begin(), + edge_partition_dcs_nzd_vertices[i].end(), + [bitmap = raft::device_span(bitmap.data(), bitmap.size()), + major_hypersparse_first] __device__(auto major) { + auto offset = major - major_hypersparse_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(offset)]); + word.fetch_or(packed_bool_mask(offset), cuda::std::memory_order_relaxed); + }); + edge_partition_dcs_nzd_range_bitmaps.push_back(std::move(bitmap)); + } + + return edge_partition_dcs_nzd_range_bitmaps; +} + } // namespace template @@ -400,7 +448,8 @@ graph_t @@ -452,7 +506,8 @@ graph_t(indices.size()), meta.properties), offsets_(std::move(offsets)), indices_(std::move(indices)), - segment_offsets_(meta.segment_offsets) + 
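
The range-bitmap fill in compute_edge_partition_dcs_nzd_range_bitmaps above lets many threads OR bits into the same 32-bit word concurrently. The same operation with C++20's std::atomic_ref standing in for cuda::atomic_ref (a sketch, illustrative name):

#include <atomic>
#include <cstdint>
#include <vector>

void set_bit_atomic(std::vector<uint32_t>& bitmap, uint32_t v_offset)
{
  std::atomic_ref<uint32_t> word(bitmap[v_offset / 32]);
  word.fetch_or(uint32_t{1} << (v_offset % 32), std::memory_order_relaxed);
}
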
segment_offsets_(meta.segment_offsets), + hypersparse_degree_offsets_(meta.hypersparse_degree_offsets) { } diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index f925a142737..31de9b1e5d3 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -488,14 +488,18 @@ graph_view_t> const& edge_partition_indices, std::optional>> const& edge_partition_dcs_nzd_vertices, + std::optional>> const& + edge_partition_dcs_nzd_range_bitmaps, graph_view_meta_t meta) : detail::graph_base_t( meta.number_of_vertices, meta.number_of_edges, meta.properties), edge_partition_offsets_(edge_partition_offsets), edge_partition_indices_(edge_partition_indices), edge_partition_dcs_nzd_vertices_(edge_partition_dcs_nzd_vertices), + edge_partition_dcs_nzd_range_bitmaps_(edge_partition_dcs_nzd_range_bitmaps), partition_(meta.partition), edge_partition_segment_offsets_(meta.edge_partition_segment_offsets), + edge_partition_hypersparse_degree_offsets_(meta.edge_partition_hypersparse_degree_offsets), local_sorted_unique_edge_srcs_(meta.local_sorted_unique_edge_srcs), local_sorted_unique_edge_src_chunk_start_offsets_( meta.local_sorted_unique_edge_src_chunk_start_offsets), @@ -538,7 +542,8 @@ graph_view_t #include #include -#ifdef TIMING -#include -#endif #include @@ -127,10 +122,6 @@ extract_induced_subgraphs( raft::device_span subgraph_vertices, bool do_expensive_check) { -#ifdef TIMING - HighResTimer hr_timer; - hr_timer.start("extract_induced_subgraphs"); -#endif // 1. check input arguments if (do_expensive_check) { @@ -281,10 +272,6 @@ extract_induced_subgraphs( true, handle.get_stream()); -#ifdef TIMING - hr_timer.stop(); - hr_timer.display_and_clear(std::cout); -#endif return std::make_tuple(std::move(edge_majors), std::move(edge_minors), std::move(edge_weights), diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 41f81d72ab1..bd7d48ac314 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -51,6 +51,8 @@ #include #include +#include + #include #include #include @@ -233,128 +235,299 @@ std::optional find_locally_unused_ext_vertex_id( : std::nullopt /* if the entire range of vertex_t is used */; } -// returns renumber map and segment_offsets +// returns renumber map, segment_offsets, and hypersparse_degree_offsets template -std::tuple, std::vector, vertex_t> compute_renumber_map( - raft::handle_t const& handle, - std::optional>&& local_vertices, - std::vector const& edgelist_majors, - std::vector const& edgelist_minors, - std::vector const& edgelist_edge_counts) +std::tuple, + std::vector, + std::optional>, + vertex_t> +compute_renumber_map(raft::handle_t const& handle, + std::optional>&& local_vertices, + std::vector const& edgelist_majors, + std::vector const& edgelist_minors, + std::vector const& edgelist_edge_counts) { - rmm::device_uvector sorted_local_vertices(0, handle.get_stream()); - - edge_t num_local_edges = std::reduce(edgelist_edge_counts.begin(), edgelist_edge_counts.end()); - - // 1. if local_vertices.has_value() is false, find unique vertices from edge majors (to construct - // local_vertices) + // 1. 
if local_vertices.has_value() is false, find unique vertices from edge majors & minors (to + // construct local_vertices) - rmm::device_uvector sorted_unique_majors(0, handle.get_stream()); + rmm::device_uvector sorted_local_vertices(0, handle.get_stream()); if (!local_vertices) { - sorted_unique_majors.resize(num_local_edges, handle.get_stream()); - size_t major_offset{0}; - for (size_t i = 0; i < edgelist_majors.size(); ++i) { - thrust::copy(handle.get_thrust_policy(), - edgelist_majors[i], - edgelist_majors[i] + edgelist_edge_counts[i], - sorted_unique_majors.begin() + major_offset); - thrust::sort(handle.get_thrust_policy(), - sorted_unique_majors.begin() + major_offset, - sorted_unique_majors.begin() + major_offset + edgelist_edge_counts[i]); - major_offset += static_cast(thrust::distance( - sorted_unique_majors.begin() + major_offset, - thrust::unique(handle.get_thrust_policy(), - sorted_unique_majors.begin() + major_offset, - sorted_unique_majors.begin() + major_offset + edgelist_edge_counts[i]))); + constexpr size_t num_bins{ + 8}; // increase the number of bins to cut peak memory usage (at the expense of additional + // computing), limit the maximum temporary memory usage to "size of local edge list + // majors|minors * 2 / # bins" + constexpr uint32_t hash_seed = + 1; // shouldn't be 0 (in that case this hash function will coincide with the hash function + // used to map vertices to GPUs, and we may not see the expected randomization) + + auto edge_major_count_vectors = num_bins > 1 + ? std::make_optional>>( + edgelist_majors.size(), std::vector(num_bins)) + : std::nullopt; + if (edge_major_count_vectors) { + for (size_t i = 0; i < edgelist_majors.size(); ++i) { + rmm::device_uvector d_edge_major_counts(num_bins, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + d_edge_major_counts.begin(), + d_edge_major_counts.end(), + edge_t{0}); + thrust::for_each( + handle.get_thrust_policy(), + edgelist_majors[i], + edgelist_majors[i] + edgelist_edge_counts[i], + [counts = raft::device_span(d_edge_major_counts.data(), + d_edge_major_counts.size())] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + cuda::atomic_ref atomic_counter( + counts[hash_func(v) % num_bins]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); + }); + raft::update_host((*edge_major_count_vectors)[i].data(), + d_edge_major_counts.data(), + d_edge_major_counts.size(), + handle.get_stream()); + } } - sorted_unique_majors.resize(major_offset, handle.get_stream()); - if (edgelist_majors.size() > 1) { - thrust::sort( - handle.get_thrust_policy(), sorted_unique_majors.begin(), sorted_unique_majors.end()); + auto edge_minor_count_vectors = num_bins > 1 + ? 
std::make_optional>>( + edgelist_minors.size(), std::vector(num_bins)) + : std::nullopt; + if (edge_minor_count_vectors) { + for (size_t i = 0; i < edgelist_minors.size(); ++i) { + rmm::device_uvector d_edge_minor_counts(num_bins, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + d_edge_minor_counts.begin(), + d_edge_minor_counts.end(), + edge_t{0}); + thrust::for_each( + handle.get_thrust_policy(), + edgelist_minors[i], + edgelist_minors[i] + edgelist_edge_counts[i], + [counts = raft::device_span(d_edge_minor_counts.data(), + d_edge_minor_counts.size())] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + cuda::atomic_ref atomic_counter( + counts[hash_func(v) % num_bins]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); + }); + raft::update_host((*edge_minor_count_vectors)[i].data(), + d_edge_minor_counts.data(), + d_edge_minor_counts.size(), + handle.get_stream()); + } } - sorted_unique_majors.shrink_to_fit(handle.get_stream()); - } - - // 2. if local_vertices.has_value() is false, find unique vertices from edge minors (to construct - // local_vertices) - rmm::device_uvector sorted_unique_minors(0, handle.get_stream()); - if (!local_vertices) { - sorted_unique_minors.resize(num_local_edges, handle.get_stream()); - size_t minor_offset{0}; - for (size_t i = 0; i < edgelist_minors.size(); ++i) { - thrust::copy(handle.get_thrust_policy(), - edgelist_minors[i], - edgelist_minors[i] + edgelist_edge_counts[i], - sorted_unique_minors.begin() + minor_offset); - thrust::sort(handle.get_thrust_policy(), - sorted_unique_minors.begin() + minor_offset, - sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]); - minor_offset += static_cast(thrust::distance( - sorted_unique_minors.begin() + minor_offset, - thrust::unique(handle.get_thrust_policy(), - sorted_unique_minors.begin() + minor_offset, - sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]))); - } - sorted_unique_minors.resize(minor_offset, handle.get_stream()); - if (edgelist_minors.size() > 1) { - thrust::sort( - handle.get_thrust_policy(), sorted_unique_minors.begin(), sorted_unique_minors.end()); - sorted_unique_minors.resize(thrust::distance(sorted_unique_minors.begin(), - thrust::unique(handle.get_thrust_policy(), - sorted_unique_minors.begin(), - sorted_unique_minors.end())), - handle.get_stream()); - } - sorted_unique_minors.shrink_to_fit(handle.get_stream()); - } + handle.sync_stream(); - // 3. update sorted_local_vertices. 
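The per-bin edge-major/minor counts computed above let each pass size its temporary buffer exactly. The overall binning pattern is easier to see in a minimal host-side sketch (illustrative only: std::hash stands in for the cuco::detail::MurmurHash3_32 hasher used in the patch, and plain std::vector for the device buffers):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

// Sketch: extract sorted unique vertices one hash bin at a time, so peak
// temporary memory is bounded by the largest bin (roughly input size /
// num_bins for a well-behaved hash) rather than by the full edge list.
std::vector<int64_t> sorted_unique_binned(std::vector<int64_t> const& vertices, size_t num_bins)
{
  std::vector<int64_t> result{};  // stays sorted and duplicate-free across passes
  for (size_t bin = 0; bin < num_bins; ++bin) {
    std::vector<int64_t> tmp{};  // the copy_if step on the device path
    for (auto v : vertices) {
      if (std::hash<int64_t>{}(v) % num_bins == bin) { tmp.push_back(v); }
    }
    std::sort(tmp.begin(), tmp.end());
    tmp.erase(std::unique(tmp.begin(), tmp.end()), tmp.end());
    // two bins never share a vertex, so merging two sorted unique sets yields
    // a sorted unique set with no further unique pass (the same reasoning as
    // the "merging two unique sets from different hash bins" comment below)
    std::vector<int64_t> merged(result.size() + tmp.size());
    std::merge(result.begin(), result.end(), tmp.begin(), tmp.end(), merged.begin());
    result = std::move(merged);
  }
  return result;
}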
- // if local_vertices.has_value() is false, reconstruct local_vertices first + for (size_t i = 0; i < num_bins; ++i) { + rmm::device_uvector this_bin_sorted_unique_majors(0, handle.get_stream()); + { + std::vector> edge_partition_tmp_majors{}; // for bin "i" + edge_partition_tmp_majors.reserve(edgelist_majors.size()); + for (size_t j = 0; j < edgelist_majors.size(); ++j) { + rmm::device_uvector tmp_majors(0, handle.get_stream()); + if (num_bins > 1) { + tmp_majors.resize((*edge_major_count_vectors)[j][i], handle.get_stream()); + thrust::copy_if(handle.get_thrust_policy(), + edgelist_majors[j], + edgelist_majors[j] + edgelist_edge_counts[j], + tmp_majors.begin(), + [i] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + return (static_cast(hash_func(v) % num_bins) == i); + }); + } else { + tmp_majors.resize(edgelist_edge_counts[j], handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edgelist_majors[j], + edgelist_majors[j] + edgelist_edge_counts[j], + tmp_majors.begin()); + } + thrust::sort(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end()); + tmp_majors.resize( + thrust::distance( + tmp_majors.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end())), + handle.get_stream()); + tmp_majors.shrink_to_fit(handle.get_stream()); + + edge_partition_tmp_majors.push_back(std::move(tmp_majors)); + } + if constexpr (multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + std::vector tx_counts(minor_comm_size); + for (int j = 0; j < minor_comm_size; ++j) { + tx_counts[j] = edge_partition_tmp_majors[j].size(); + } + this_bin_sorted_unique_majors.resize(std::reduce(tx_counts.begin(), tx_counts.end()), + handle.get_stream()); + size_t output_offset{0}; + for (size_t j = 0; j < edge_partition_tmp_majors.size(); ++j) { + thrust::copy(handle.get_thrust_policy(), + edge_partition_tmp_majors[j].begin(), + edge_partition_tmp_majors[j].end(), + this_bin_sorted_unique_majors.begin() + output_offset); + output_offset += edge_partition_tmp_majors[j].size(); + } + this_bin_sorted_unique_majors = shuffle_and_unique_segment_sorted_values( + minor_comm, this_bin_sorted_unique_majors.begin(), tx_counts, handle.get_stream()); + } else { + this_bin_sorted_unique_majors = std::move(edge_partition_tmp_majors[0]); + } + } else { + this_bin_sorted_unique_majors = std::move(edge_partition_tmp_majors[0]); + } + } - if (local_vertices) { + rmm::device_uvector this_bin_sorted_unique_minors(0, handle.get_stream()); + { + std::vector> edge_partition_tmp_minors{}; // for bin "i" + edge_partition_tmp_minors.reserve(edgelist_minors.size()); + for (size_t j = 0; j < edgelist_minors.size(); ++j) { + rmm::device_uvector tmp_minors(0, handle.get_stream()); + if (num_bins > 1) { + tmp_minors.resize((*edge_minor_count_vectors)[j][i], handle.get_stream()); + thrust::copy_if(handle.get_thrust_policy(), + edgelist_minors[j], + edgelist_minors[j] + edgelist_edge_counts[j], + tmp_minors.begin(), + [i] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + return (static_cast(hash_func(v) % num_bins) == i); + }); + } else { + tmp_minors.resize(edgelist_edge_counts[j], handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edgelist_minors[j], + edgelist_minors[j] + edgelist_edge_counts[j], + tmp_minors.begin()); + } + thrust::sort(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end()); + 
tmp_minors.resize( + thrust::distance( + tmp_minors.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end())), + handle.get_stream()); + tmp_minors.shrink_to_fit(handle.get_stream()); + + edge_partition_tmp_minors.push_back(std::move(tmp_minors)); + } + if (edge_partition_tmp_minors.size() == 1) { + this_bin_sorted_unique_minors = std::move(edge_partition_tmp_minors[0]); + } else { + edge_t aggregate_size{0}; + for (size_t j = 0; j < edge_partition_tmp_minors.size(); ++j) { + aggregate_size += edge_partition_tmp_minors[j].size(); + } + this_bin_sorted_unique_minors.resize(aggregate_size, handle.get_stream()); + size_t output_offset{0}; + for (size_t j = 0; j < edge_partition_tmp_minors.size(); ++j) { + thrust::copy(handle.get_thrust_policy(), + edge_partition_tmp_minors[j].begin(), + edge_partition_tmp_minors[j].end(), + this_bin_sorted_unique_minors.begin() + output_offset); + output_offset += edge_partition_tmp_minors[j].size(); + } + edge_partition_tmp_minors.clear(); + thrust::sort(handle.get_thrust_policy(), + this_bin_sorted_unique_minors.begin(), + this_bin_sorted_unique_minors.end()); + this_bin_sorted_unique_minors.resize( + thrust::distance(this_bin_sorted_unique_minors.begin(), + thrust::unique(handle.get_thrust_policy(), + this_bin_sorted_unique_minors.begin(), + this_bin_sorted_unique_minors.end())), + handle.get_stream()); + this_bin_sorted_unique_minors.shrink_to_fit(handle.get_stream()); + } + if constexpr (multi_gpu) { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + if (major_comm_size > 1) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + compute_gpu_id_from_ext_vertex_t gpu_id_func{ + comm_size, major_comm_size, minor_comm_size}; + auto d_tx_counts = groupby_and_count( + this_bin_sorted_unique_minors.begin(), + this_bin_sorted_unique_minors.end(), + [major_comm_size, minor_comm_size, gpu_id_func] __device__(auto v) { + return partition_manager::compute_major_comm_rank_from_global_comm_rank( + major_comm_size, minor_comm_size, gpu_id_func(v)); + }, + major_comm_size, + std::numeric_limits::max(), + handle.get_stream()); + std::vector h_tx_counts(d_tx_counts.size()); + raft::update_host( + h_tx_counts.data(), d_tx_counts.data(), d_tx_counts.size(), handle.get_stream()); + handle.sync_stream(); + std::vector tx_displacements(h_tx_counts.size()); + std::exclusive_scan( + h_tx_counts.begin(), h_tx_counts.end(), tx_displacements.begin(), size_t{0}); + for (int j = 0; j < major_comm_size; ++j) { + thrust::sort( + handle.get_thrust_policy(), + this_bin_sorted_unique_minors.begin() + tx_displacements[j], + this_bin_sorted_unique_minors.begin() + (tx_displacements[j] + h_tx_counts[j])); + } + this_bin_sorted_unique_minors = shuffle_and_unique_segment_sorted_values( + major_comm, this_bin_sorted_unique_minors.begin(), h_tx_counts, handle.get_stream()); + } + } + } + rmm::device_uvector this_bin_sorted_unique_vertices(0, handle.get_stream()); + { + rmm::device_uvector merged_vertices( + this_bin_sorted_unique_majors.size() + this_bin_sorted_unique_minors.size(), + handle.get_stream()); + thrust::merge(handle.get_thrust_policy(), + this_bin_sorted_unique_majors.begin(), + this_bin_sorted_unique_majors.end(), + this_bin_sorted_unique_minors.begin(), + 
this_bin_sorted_unique_minors.end(), + merged_vertices.begin()); + this_bin_sorted_unique_majors.resize(0, handle.get_stream()); + this_bin_sorted_unique_majors.shrink_to_fit(handle.get_stream()); + this_bin_sorted_unique_minors.resize(0, handle.get_stream()); + this_bin_sorted_unique_minors.shrink_to_fit(handle.get_stream()); + merged_vertices.resize(thrust::distance(merged_vertices.begin(), + thrust::unique(handle.get_thrust_policy(), + merged_vertices.begin(), + merged_vertices.end())), + handle.get_stream()); + merged_vertices.shrink_to_fit(handle.get_stream()); + this_bin_sorted_unique_vertices = std::move(merged_vertices); + } + if (sorted_local_vertices.size() == 0) { + sorted_local_vertices = std::move(this_bin_sorted_unique_vertices); + } else { + rmm::device_uvector merged_vertices( + sorted_local_vertices.size() + this_bin_sorted_unique_vertices.size(), + handle.get_stream()); + thrust::merge(handle.get_thrust_policy(), + sorted_local_vertices.begin(), + sorted_local_vertices.end(), + this_bin_sorted_unique_vertices.begin(), + this_bin_sorted_unique_vertices.end(), + merged_vertices.begin()); // merging two unique sets from different hash + // bins, so the merged set can't have duplicates + sorted_local_vertices = std::move(merged_vertices); + } + } + } else { sorted_local_vertices = std::move(*local_vertices); thrust::sort( handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); - } else { - sorted_local_vertices.resize(sorted_unique_majors.size() + sorted_unique_minors.size(), - handle.get_stream()); - - thrust::merge(handle.get_thrust_policy(), - sorted_unique_majors.begin(), - sorted_unique_majors.end(), - sorted_unique_minors.begin(), - sorted_unique_minors.end(), - sorted_local_vertices.begin()); - - sorted_unique_majors.resize(0, handle.get_stream()); - sorted_unique_majors.shrink_to_fit(handle.get_stream()); - sorted_unique_minors.resize(0, handle.get_stream()); - sorted_unique_minors.shrink_to_fit(handle.get_stream()); - - sorted_local_vertices.resize(thrust::distance(sorted_local_vertices.begin(), - thrust::unique(handle.get_thrust_policy(), - sorted_local_vertices.begin(), - sorted_local_vertices.end())), - handle.get_stream()); - sorted_local_vertices.shrink_to_fit(handle.get_stream()); - - if constexpr (multi_gpu) { - sorted_local_vertices = - cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning( - handle, std::move(sorted_local_vertices)); - thrust::sort( - handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); - sorted_local_vertices.resize(thrust::distance(sorted_local_vertices.begin(), - thrust::unique(handle.get_thrust_policy(), - sorted_local_vertices.begin(), - sorted_local_vertices.end())), - handle.get_stream()); - sorted_local_vertices.shrink_to_fit(handle.get_stream()); - } } + // 2. find an unused vertex ID + auto locally_unused_vertex_id = find_locally_unused_ext_vertex_id( handle, raft::device_span(sorted_local_vertices.data(), sorted_local_vertices.size()), @@ -363,17 +536,9 @@ std::tuple, std::vector, vertex_t> compu "Invalid input arguments: there is no unused value in the entire range of " "vertex_t, increase vertex_t to 64 bit."); - // 4. compute global degrees for the sorted local vertices + // 3. 
compute global degrees for the sorted local vertices rmm::device_uvector sorted_local_vertex_degrees(0, handle.get_stream()); - std::optional> stream_pool_indices{ - std::nullopt}; // FIXME: move this inside the if statement - - auto constexpr num_chunks = size_t{ - 2}; // tuning parameter, this trade-offs # binary searches (up to num_chunks times more binary - // searches can be necessary if num_unique_majors << edgelist_edge_counts[i]) and temporary - // buffer requirement (cut by num_chunks times), currently set to 2 to avoid peak memory - // usage happening in this part (especially when minor_comm_size is small) if constexpr (multi_gpu) { auto& comm = handle.get_comms(); @@ -386,94 +551,37 @@ std::tuple, std::vector, vertex_t> compu auto edge_partition_major_range_sizes = host_scalar_allgather(minor_comm, sorted_local_vertices.size(), handle.get_stream()); - if ((minor_comm_size >= 2) && (handle.get_stream_pool_size() >= 2)) { - auto vertex_edge_counts = host_scalar_allreduce( - comm, - thrust::make_tuple(static_cast(sorted_local_vertices.size()), num_local_edges), - raft::comms::op_t::SUM, - handle.get_stream()); - // memory footprint vs parallelism trade-off - // peak memory requirement per loop is approximately - // (V/P) * (sizeof(vertex_t) + sizeof(edge_t)) + - // (E / (comm_size * minor_comm_size)) / num_chunks * sizeof(vertex_t) * 2 + - // std::min(V/P, (E / (comm_size * minor_comm_size)) / num_chunks) * (sizeof(vertex_t) + - // sizeof(edge_t)) - // and limit temporary memory requirement to (E / comm_size) * sizeof(vertex_t) - auto avg_vertex_degree = thrust::get<0>(vertex_edge_counts) > 0 - ? static_cast(thrust::get<1>(vertex_edge_counts)) / - static_cast(thrust::get<0>(vertex_edge_counts)) - : double{0.0}; - auto num_streams = static_cast( - (avg_vertex_degree * sizeof(vertex_t)) / - (static_cast(sizeof(vertex_t) + sizeof(edge_t)) + - (((avg_vertex_degree / minor_comm_size) / num_chunks) * sizeof(vertex_t) * 2) + - (std::min(1.0, ((avg_vertex_degree / minor_comm_size) / num_chunks)) * - (sizeof(vertex_t) + sizeof(edge_t))))); - if (num_streams >= 2) { - stream_pool_indices = std::vector(num_streams); - std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); - handle.sync_stream(); - } - } - for (int i = 0; i < minor_comm_size; ++i) { - auto loop_stream = stream_pool_indices - ? 
handle.get_stream_from_stream_pool(i % (*stream_pool_indices).size()) - : handle.get_stream(); - - rmm::device_uvector sorted_majors(edge_partition_major_range_sizes[i], loop_stream); + rmm::device_uvector sorted_majors(edge_partition_major_range_sizes[i], + handle.get_stream()); device_bcast(minor_comm, sorted_local_vertices.data(), sorted_majors.data(), edge_partition_major_range_sizes[i], i, - loop_stream); + handle.get_stream()); - rmm::device_uvector sorted_major_degrees(sorted_majors.size(), loop_stream); - thrust::fill(rmm::exec_policy(loop_stream), + rmm::device_uvector sorted_major_degrees(sorted_majors.size(), handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), sorted_major_degrees.begin(), sorted_major_degrees.end(), edge_t{0}); - rmm::device_uvector tmp_majors(0, loop_stream); - tmp_majors.reserve( - (static_cast(edgelist_edge_counts[i]) + (num_chunks - 1)) / num_chunks, - loop_stream); - size_t offset{0}; - for (size_t j = 0; j < num_chunks; ++j) { - size_t this_chunk_size = - std::min(tmp_majors.capacity(), static_cast(edgelist_edge_counts[i]) - offset); - tmp_majors.resize(this_chunk_size, loop_stream); - thrust::copy(rmm::exec_policy(loop_stream), - edgelist_majors[i] + offset, - edgelist_majors[i] + offset + tmp_majors.size(), - tmp_majors.begin()); - thrust::sort(rmm::exec_policy(loop_stream), tmp_majors.begin(), tmp_majors.end()); - auto num_unique_majors = - thrust::count_if(rmm::exec_policy(loop_stream), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(tmp_majors.size()), - is_first_in_run_t{tmp_majors.data()}); - rmm::device_uvector tmp_keys(num_unique_majors, loop_stream); - rmm::device_uvector tmp_values(num_unique_majors, loop_stream); - thrust::reduce_by_key(rmm::exec_policy(loop_stream), - tmp_majors.begin(), - tmp_majors.end(), - thrust::make_constant_iterator(edge_t{1}), - tmp_keys.begin(), - tmp_values.begin()); - - auto kv_pair_first = - thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); - thrust::for_each(rmm::exec_policy(loop_stream), - kv_pair_first, - kv_pair_first + tmp_keys.size(), - search_and_increment_degree_t{ - sorted_majors.data(), - static_cast(sorted_majors.size()), - sorted_major_degrees.data()}); - offset += this_chunk_size; - } + thrust::for_each( + handle.get_thrust_policy(), + edgelist_majors[i], + edgelist_majors[i] + edgelist_edge_counts[i], + [sorted_majors = + raft::device_span(sorted_majors.data(), sorted_majors.size()), + sorted_major_degrees = raft::device_span( + sorted_major_degrees.data(), sorted_major_degrees.size())] __device__(auto major) { + auto it = + thrust::lower_bound(thrust::seq, sorted_majors.begin(), sorted_majors.end(), major); + assert((it != sorted_majors.end()) && (*it == major)); + cuda::atomic_ref atomic_counter( + sorted_major_degrees[thrust::distance(sorted_majors.begin(), it)]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); + }); device_reduce(minor_comm, sorted_major_degrees.begin(), @@ -481,11 +589,9 @@ std::tuple, std::vector, vertex_t> compu edge_partition_major_range_sizes[i], raft::comms::op_t::SUM, i, - loop_stream); + handle.get_stream()); if (i == minor_comm_rank) { sorted_local_vertex_degrees = std::move(sorted_major_degrees); } } - - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } } else { assert(edgelist_majors.size() == 1); @@ -495,47 +601,24 @@ std::tuple, std::vector, vertex_t> compu sorted_local_vertex_degrees.end(), edge_t{0}); - rmm::device_uvector tmp_majors(0, 
handle.get_stream());
- tmp_majors.reserve(static_cast(edgelist_edge_counts[0] + (num_chunks - 1)) / num_chunks,
- handle.get_stream());
- size_t offset{0};
- for (size_t i = 0; i < num_chunks; ++i) {
- size_t this_chunk_size =
- std::min(tmp_majors.capacity(), static_cast(edgelist_edge_counts[0]) - offset);
- tmp_majors.resize(this_chunk_size, handle.get_stream());
- thrust::copy(handle.get_thrust_policy(),
- edgelist_majors[0] + offset,
- edgelist_majors[0] + offset + tmp_majors.size(),
- tmp_majors.begin());
- thrust::sort(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end());
- auto num_unique_majors =
- thrust::count_if(handle.get_thrust_policy(),
- thrust::make_counting_iterator(size_t{0}),
- thrust::make_counting_iterator(tmp_majors.size()),
- is_first_in_run_t{tmp_majors.data()});
- rmm::device_uvector tmp_keys(num_unique_majors, handle.get_stream());
- rmm::device_uvector tmp_values(num_unique_majors, handle.get_stream());
- thrust::reduce_by_key(handle.get_thrust_policy(),
- tmp_majors.begin(),
- tmp_majors.end(),
- thrust::make_constant_iterator(edge_t{1}),
- tmp_keys.begin(),
- tmp_values.begin());
-
- auto kv_pair_first =
- thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin()));
- thrust::for_each(handle.get_thrust_policy(),
- kv_pair_first,
- kv_pair_first + tmp_keys.size(),
- search_and_increment_degree_t{
- sorted_local_vertices.data(),
- static_cast(sorted_local_vertices.size()),
- sorted_local_vertex_degrees.data()});
- offset += this_chunk_size;
- }
+ thrust::for_each(handle.get_thrust_policy(),
+ edgelist_majors[0],
+ edgelist_majors[0] + edgelist_edge_counts[0],
+ [sorted_majors = raft::device_span(
+ sorted_local_vertices.data(), sorted_local_vertices.size()),
+ sorted_major_degrees = raft::device_span(
+ sorted_local_vertex_degrees.data(),
+ sorted_local_vertex_degrees.size())] __device__(auto major) {
+ auto it = thrust::lower_bound(
+ thrust::seq, sorted_majors.begin(), sorted_majors.end(), major);
+ assert((it != sorted_majors.end()) && (*it == major));
+ cuda::atomic_ref atomic_counter(
+ sorted_major_degrees[thrust::distance(sorted_majors.begin(), it)]);
+ atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed);
+ });
 }

- // 4. sort local vertices by degree (descending)
+ // 4. sort local vertices by degree (descending)

 thrust::sort_by_key(handle.get_thrust_policy(),
 sorted_local_vertex_degrees.begin(),
@@ -543,7 +626,7 @@ std::tuple, std::vector, vertex_t> compu
 sorted_local_vertices.begin(),
 thrust::greater());

- // 5. compute segment_offsets
+ // 5.
compute segment_offsets static_assert(detail::num_sparse_segments_per_vertex_partition == 3); static_assert((detail::low_degree_threshold <= detail::mid_degree_threshold) && @@ -553,57 +636,85 @@ std::tuple, std::vector, vertex_t> compu (detail::hypersparse_threshold_ratio <= 1.0)); size_t mid_degree_threshold{detail::mid_degree_threshold}; size_t low_degree_threshold{detail::low_degree_threshold}; - size_t hypersparse_degree_threshold{0}; + size_t hypersparse_degree_threshold{1}; if (multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); mid_degree_threshold *= minor_comm_size; low_degree_threshold *= minor_comm_size; - hypersparse_degree_threshold = - static_cast(minor_comm_size * detail::hypersparse_threshold_ratio); + hypersparse_degree_threshold = std::max( + static_cast(minor_comm_size * detail::hypersparse_threshold_ratio), size_t{1}); } - auto num_segments_per_vertex_partition = - detail::num_sparse_segments_per_vertex_partition + - (hypersparse_degree_threshold > 0 ? size_t{2} : size_t{1}); // last is 0-degree segment - rmm::device_uvector d_thresholds(num_segments_per_vertex_partition - 1, - handle.get_stream()); - auto h_thresholds = - hypersparse_degree_threshold > 0 - ? std::vector{static_cast(mid_degree_threshold), - static_cast(low_degree_threshold), - static_cast(hypersparse_degree_threshold), - std::min(static_cast(hypersparse_degree_threshold), edge_t{1})} - : std::vector{static_cast(mid_degree_threshold), - static_cast(low_degree_threshold), - edge_t{1}}; - raft::update_device( - d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), handle.get_stream()); - - rmm::device_uvector d_segment_offsets(num_segments_per_vertex_partition + 1, - handle.get_stream()); - auto vertex_count = static_cast(sorted_local_vertices.size()); - d_segment_offsets.set_element_to_zero_async(0, handle.get_stream()); - d_segment_offsets.set_element( - num_segments_per_vertex_partition, vertex_count, handle.get_stream()); + std::vector h_segment_offsets{}; + std::optional> h_hypersparse_degree_offsets{}; + { + auto num_partitions = detail::num_sparse_segments_per_vertex_partition /* high, mid, low */ + + (hypersparse_degree_threshold > 1 + ? 
hypersparse_degree_threshold - size_t{1} + /* one partition per each global degree in the hypersparse region */ + : size_t{0}) + + size_t{1} /* zero */; + rmm::device_uvector d_thresholds(num_partitions - 1, handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + d_thresholds.begin(), + d_thresholds.end(), + [mid_degree_threshold, + low_degree_threshold, + hypersparse_degree_threshold] __device__(size_t i) { + if (i == 0) { + return mid_degree_threshold; // high,mid boundary + } else if (i == 1) { + return low_degree_threshold; // mid, low boundary + } else { + assert(hypersparse_degree_threshold > (i - 2)); + return hypersparse_degree_threshold - (i - 2); + } + }); + rmm::device_uvector d_offsets(num_partitions + 1, handle.get_stream()); + d_offsets.set_element_to_zero_async(0, handle.get_stream()); + auto vertex_count = static_cast(sorted_local_vertices.size()); + d_offsets.set_element(num_partitions, vertex_count, handle.get_stream()); + thrust::upper_bound(handle.get_thrust_policy(), + sorted_local_vertex_degrees.begin(), + sorted_local_vertex_degrees.end(), + d_thresholds.begin(), + d_thresholds.end(), + d_offsets.begin() + 1, + thrust::greater{}); + std::vector h_offsets(d_offsets.size()); + raft::update_host(h_offsets.data(), d_offsets.data(), d_offsets.size(), handle.get_stream()); + handle.sync_stream(); - thrust::upper_bound(handle.get_thrust_policy(), - sorted_local_vertex_degrees.begin(), - sorted_local_vertex_degrees.end(), - d_thresholds.begin(), - d_thresholds.end(), - d_segment_offsets.begin() + 1, - thrust::greater{}); - - std::vector h_segment_offsets(d_segment_offsets.size()); - raft::update_host(h_segment_offsets.data(), - d_segment_offsets.data(), - d_segment_offsets.size(), - handle.get_stream()); - handle.sync_stream(); + auto num_segments_per_vertex_partition = + detail::num_sparse_segments_per_vertex_partition + + (hypersparse_degree_threshold > 1 ? 
size_t{2} : size_t{1}); // last is 0-degree segment + h_segment_offsets.resize(num_segments_per_vertex_partition + 1); + std::copy(h_offsets.begin(), + h_offsets.begin() + num_sparse_segments_per_vertex_partition + 1, + h_segment_offsets.begin()); + *(h_segment_offsets.rbegin()) = *(h_offsets.rbegin()); + if (hypersparse_degree_threshold > 1) { + *(h_segment_offsets.rbegin() + 1) = *(h_offsets.rbegin() + 1); + + h_hypersparse_degree_offsets = std::vector(hypersparse_degree_threshold); + std::copy(h_offsets.begin() + num_sparse_segments_per_vertex_partition, + h_offsets.begin() + num_sparse_segments_per_vertex_partition + + (hypersparse_degree_threshold - 1), + (*h_hypersparse_degree_offsets).begin()); + auto shift = (*h_hypersparse_degree_offsets)[0]; + std::transform((*h_hypersparse_degree_offsets).begin(), + (*h_hypersparse_degree_offsets).end(), + (*h_hypersparse_degree_offsets).begin(), + [shift](auto offset) { return offset - shift; }); + *((*h_hypersparse_degree_offsets).rbegin()) = *(h_offsets.rbegin() + 1); + } + } - return std::make_tuple( - std::move(sorted_local_vertices), h_segment_offsets, *locally_unused_vertex_id); + return std::make_tuple(std::move(sorted_local_vertices), + h_segment_offsets, + h_hypersparse_degree_offsets, + *locally_unused_vertex_id); } template @@ -789,32 +900,28 @@ void expensive_check_edgelist( } template -std::vector aggregate_segment_offsets(raft::handle_t const& handle, - std::vector const& segment_offsets) +std::vector aggregate_offset_vectors(raft::handle_t const& handle, + std::vector const& offsets) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); - rmm::device_uvector d_segment_offsets(segment_offsets.size(), handle.get_stream()); - raft::update_device( - d_segment_offsets.data(), segment_offsets.data(), segment_offsets.size(), handle.get_stream()); - rmm::device_uvector d_aggregate_segment_offsets( - minor_comm_size * d_segment_offsets.size(), handle.get_stream()); - minor_comm.allgather(d_segment_offsets.data(), - d_aggregate_segment_offsets.data(), - d_segment_offsets.size(), - handle.get_stream()); - - std::vector h_aggregate_segment_offsets(d_aggregate_segment_offsets.size(), - vertex_t{0}); - raft::update_host(h_aggregate_segment_offsets.data(), - d_aggregate_segment_offsets.data(), - d_aggregate_segment_offsets.size(), + rmm::device_uvector d_offsets(offsets.size(), handle.get_stream()); + raft::update_device(d_offsets.data(), offsets.data(), offsets.size(), handle.get_stream()); + rmm::device_uvector d_aggregate_offset_vectors(minor_comm_size * d_offsets.size(), + handle.get_stream()); + minor_comm.allgather( + d_offsets.data(), d_aggregate_offset_vectors.data(), d_offsets.size(), handle.get_stream()); + + std::vector h_aggregate_offset_vectors(d_aggregate_offset_vectors.size(), vertex_t{0}); + raft::update_host(h_aggregate_offset_vectors.data(), + d_aggregate_offset_vectors.data(), + d_aggregate_offset_vectors.size(), handle.get_stream()); handle.sync_stream(); // this is necessary as h_aggregate_offsets can be used right after return. 
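Since the degree array is sorted in descending order, each threshold lookup in the segment-offset computation above is a single binary search. A host-side restatement of that computation (illustrative only, with std::upper_bound plus std::greater in place of the thrust call, and made-up example values):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iterator>
#include <vector>

// offsets[i + 1] = number of vertices whose degree is >= desc_thresholds[i];
// with thresholds {mid, low, hypersparse, hypersparse - 1, ..., 1} this yields
// the high/mid/low/per-hypersparse-degree/zero segment boundaries.
std::vector<size_t> degree_segment_offsets(std::vector<int64_t> const& desc_degrees,
                                           std::vector<int64_t> const& desc_thresholds)
{
  std::vector<size_t> offsets(desc_thresholds.size() + 2);
  offsets.front() = 0;
  offsets.back()  = desc_degrees.size();
  for (size_t i = 0; i < desc_thresholds.size(); ++i) {
    // first vertex whose degree drops below desc_thresholds[i]
    auto it = std::upper_bound(
      desc_degrees.begin(), desc_degrees.end(), desc_thresholds[i], std::greater<int64_t>{});
    offsets[i + 1] = static_cast<size_t>(std::distance(desc_degrees.begin(), it));
  }
  return offsets;
}

// e.g. degrees {9, 7, 4, 4, 2, 1, 1, 0} with thresholds {8, 3, 2, 1} give
// offsets {0, 1, 4, 5, 7, 8}: segments {9} | {7, 4, 4} | {2} | {1, 1} | {0}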
- return h_aggregate_segment_offsets; + return h_aggregate_offset_vectors; } } // namespace detail @@ -857,10 +964,10 @@ renumber_edgelist( (*edgelist_intra_partition_segment_offsets).size() == static_cast(minor_comm_size), "Invalid input arguments: erroneous (*edgelist_intra_partition_segment_offsets).size()."); for (size_t i = 0; i < edgelist_majors.size(); ++i) { - CUGRAPH_EXPECTS( - (*edgelist_intra_partition_segment_offsets)[i].size() == - static_cast(major_comm_size + 1), - "Invalid input arguments: erroneous (*edgelist_intra_partition_segment_offsets)[].size()."); + CUGRAPH_EXPECTS((*edgelist_intra_partition_segment_offsets)[i].size() == + static_cast(major_comm_size + 1), + "Invalid input arguments: erroneous " + "(*edgelist_intra_partition_segment_offsets)[].size()."); CUGRAPH_EXPECTS( std::is_sorted((*edgelist_intra_partition_segment_offsets)[i].begin(), (*edgelist_intra_partition_segment_offsets)[i].end()), @@ -868,8 +975,8 @@ renumber_edgelist( CUGRAPH_EXPECTS( ((*edgelist_intra_partition_segment_offsets)[i][0] == 0) && ((*edgelist_intra_partition_segment_offsets)[i].back() == edgelist_edge_counts[i]), - "Invalid input arguments: (*edgelist_intra_partition_segment_offsets)[][0] should be 0 and " - "(*edgelist_intra_partition_segment_offsets)[].back() should coincide with " + "Invalid input arguments: (*edgelist_intra_partition_segment_offsets)[][0] should be 0 " + "and (*edgelist_intra_partition_segment_offsets)[].back() should coincide with " "edgelist_edge_counts[]."); } } @@ -893,7 +1000,10 @@ renumber_edgelist( // 1. compute renumber map - auto [renumber_map_labels, vertex_partition_segment_offsets, locally_unused_vertex_id] = + auto [renumber_map_labels, + vertex_partition_segment_offsets, + vertex_partition_hypersparse_degree_offsets, + locally_unused_vertex_id] = detail::compute_renumber_map(handle, std::move(local_vertices), edgelist_const_majors, @@ -966,11 +1076,16 @@ renumber_edgelist( } } - if ((static_cast(partition.local_edge_partition_minor_range_size() * - 2.5 /* tuning parameter */) >= - static_cast(number_of_edges / comm_size)) && - edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) - // part than the O(E/P) part + double approx_mem_requirements = + static_cast(partition.local_edge_partition_minor_range_size()) * + (static_cast( + sizeof(vertex_t)) /* rmm::device_uvector renumber_map_minor_labels */ + + + static_cast(sizeof(vertex_t) * 2) * + 2.5 /* kv_store_t renumber_map, * 2.5 to consider load factor */); + if ((approx_mem_requirements > + static_cast(handle.get_device_properties().totalGlobalMem) * 0.05) && + edgelist_intra_partition_segment_offsets) { vertex_t max_segment_size{0}; for (int i = 0; i < major_comm_size; ++i) { auto minor_range_vertex_partition_id = @@ -1020,10 +1135,10 @@ renumber_edgelist( recvcounts[i] = partition.vertex_partition_range_size(minor_range_vertex_partition_id); } std::vector displacements(recvcounts.size(), 0); - std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + std::exclusive_scan(recvcounts.begin(), recvcounts.end(), displacements.begin(), size_t{0}); device_allgatherv(major_comm, - renumber_map_labels.begin(), - renumber_map_minor_labels.begin(), + renumber_map_labels.data(), + renumber_map_minor_labels.data(), recvcounts, displacements, handle.get_stream()); @@ -1045,12 +1160,20 @@ renumber_edgelist( } auto edge_partition_segment_offsets = - detail::aggregate_segment_offsets(handle, vertex_partition_segment_offsets); + 
detail::aggregate_offset_vectors(handle, vertex_partition_segment_offsets); + auto edge_partition_hypersparse_degree_offsets = + vertex_partition_hypersparse_degree_offsets + ? std::make_optional( + detail::aggregate_offset_vectors(handle, *vertex_partition_hypersparse_degree_offsets)) + : std::nullopt; return std::make_tuple( std::move(renumber_map_labels), - renumber_meta_t{ - number_of_vertices, number_of_edges, partition, edge_partition_segment_offsets}); + renumber_meta_t{number_of_vertices, + number_of_edges, + partition, + edge_partition_segment_offsets, + edge_partition_hypersparse_degree_offsets}); } template @@ -1078,7 +1201,10 @@ renumber_edgelist(raft::handle_t const& handle, std::nullopt); } - auto [renumber_map_labels, segment_offsets, locally_unused_vertex_id] = + auto [renumber_map_labels, + segment_offsets, + hypersparse_degree_offsets, + locally_unused_vertex_id] = detail::compute_renumber_map( handle, std::move(vertices), @@ -1099,8 +1225,9 @@ renumber_edgelist(raft::handle_t const& handle, renumber_map_view.find( edgelist_minors, edgelist_minors + num_edgelist_edges, edgelist_minors, handle.get_stream()); - return std::make_tuple(std::move(renumber_map_labels), - renumber_meta_t{segment_offsets}); + return std::make_tuple( + std::move(renumber_map_labels), + renumber_meta_t{segment_offsets, hypersparse_degree_offsets}); } } // namespace cugraph diff --git a/cpp/src/structure/renumber_utils_impl.cuh b/cpp/src/structure/renumber_utils_impl.cuh index 3efa58d9632..69c7c556bd8 100644 --- a/cpp/src/structure/renumber_utils_impl.cuh +++ b/cpp/src/structure/renumber_utils_impl.cuh @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -363,7 +364,7 @@ void renumber_ext_vertices(raft::handle_t const& handle, } std::unique_ptr> renumber_map_ptr{nullptr}; - if (multi_gpu) { + if constexpr (multi_gpu) { auto& comm = handle.get_comms(); auto const comm_size = comm.get_size(); auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); @@ -402,11 +403,12 @@ void renumber_ext_vertices(raft::handle_t const& handle, rmm::device_uvector int_vertices_for_sorted_unique_ext_vertices(0, handle.get_stream()); auto [unique_ext_vertices, int_vertices_for_unique_ext_vertices] = - collect_values_for_unique_keys(handle, + collect_values_for_unique_keys(comm, local_renumber_map.view(), std::move(sorted_unique_ext_vertices), detail::compute_gpu_id_from_ext_vertex_t{ - comm_size, major_comm_size, minor_comm_size}); + comm_size, major_comm_size, minor_comm_size}, + handle.get_stream()); renumber_map_ptr = std::make_unique>( unique_ext_vertices.begin(), @@ -573,7 +575,6 @@ void unrenumber_int_vertices(raft::handle_t const& handle, auto local_int_vertex_first = vertex_partition_id == 0 ? 
vertex_t{0} : vertex_partition_range_lasts[vertex_partition_id - 1]; - auto local_int_vertex_last = vertex_partition_range_lasts[vertex_partition_id]; rmm::device_uvector sorted_unique_int_vertices(num_vertices, handle.get_stream()); sorted_unique_int_vertices.resize( @@ -595,16 +596,20 @@ void unrenumber_int_vertices(raft::handle_t const& handle, sorted_unique_int_vertices.end())), handle.get_stream()); - auto [unique_int_vertices, ext_vertices_for_unique_int_vertices] = - collect_values_for_unique_int_vertices(handle, - std::move(sorted_unique_int_vertices), - renumber_map_labels, - vertex_partition_range_lasts); + auto ext_vertices_for_sorted_unique_int_vertices = + collect_values_for_sorted_unique_int_vertices( + comm, + raft::device_span(sorted_unique_int_vertices.data(), + sorted_unique_int_vertices.size()), + renumber_map_labels, + vertex_partition_range_lasts, + local_int_vertex_first, + handle.get_stream()); kv_store_t renumber_map( - unique_int_vertices.begin(), - unique_int_vertices.begin() + unique_int_vertices.size(), - ext_vertices_for_unique_int_vertices.begin(), + sorted_unique_int_vertices.begin(), + sorted_unique_int_vertices.end(), + ext_vertices_for_sorted_unique_int_vertices.begin(), invalid_vertex_id::value, invalid_vertex_id::value, handle.get_stream()); diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 8a18dedd2ab..ba40db1f085 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -16,8 +16,9 @@ #pragma once #include "prims/fill_edge_src_dst_property.cuh" +#include "prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh" #include "prims/reduce_op.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -51,6 +52,24 @@ namespace cugraph { namespace { +template +struct direction_optimizing_info_t { + rmm::device_uvector + approx_out_degrees; // if graph_view.local_vertex_partition_segment_offsets().has_value() is + // true, holds approximate degrees only for the high and mid degree + // segments; otherwise, exact + rmm::device_uvector visited_bitmap; + std::optional> nzd_unvisited_vertices{ + std::nullopt}; // valid only during bottom-up iterations + std::optional num_nzd_unvisited_low_degree_vertices{ + std::nullopt}; // to decide between topdown vs bottomup, relevant only when + // graph_view.local_vertex_partition_segment_offsets().has_value() is true + std::optional num_nzd_unvisited_hypersparse_vertices{ + std::nullopt}; // to decide between topdown vs bottomup, relevant only when + // graph_view.local_vertex_partition_segment_offsets().has_value() && + // graph_view.use_dcs() are both true +}; + template struct topdown_e_op_t { detail::edge_partition_endpoint_property_device_view_t @@ -69,18 +88,25 @@ struct topdown_e_op_t { } }; -template +template struct bottomup_e_op_t { - detail::edge_partition_endpoint_property_device_view_t + __device__ vertex_t operator()( + vertex_t src, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const + { + return dst; + } +}; + +template +struct bottomup_pred_op_t { + detail::edge_partition_endpoint_property_device_view_t prev_visited_flags{}; // visited in the previous iterations vertex_t dst_first{}; - __device__ thrust::optional operator()( + __device__ bool operator()( vertex_t src, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const { - auto dst_offset 
= dst - dst_first;
- auto old = prev_visited_flags.get(dst_offset);
- return old ? thrust::optional{dst} : thrust::nullopt;
+ return prev_visited_flags.get(dst - dst_first);
 }
};

@@ -144,14 +170,27 @@ void bfs(raft::handle_t const& handle,
 if constexpr (GraphViewType::is_multi_gpu) {
 is_sorted = static_cast(host_scalar_allreduce(handle.get_comms(),
 static_cast(is_sorted),
- raft::comms::op_t::SUM,
+ raft::comms::op_t::MIN,
 handle.get_stream()));
 }
-
 CUGRAPH_EXPECTS(
 is_sorted,
 "Invalid input arguments: input sources should be sorted in the non-descending order.");

+ bool no_duplicates = (static_cast(thrust::count_if(
+ handle.get_thrust_policy(),
+ thrust::make_counting_iterator(size_t{0}),
+ thrust::make_counting_iterator(n_sources),
+ is_first_in_run_t{sources})) == n_sources);
+ if constexpr (GraphViewType::is_multi_gpu) {
+ no_duplicates = static_cast(host_scalar_allreduce(handle.get_comms(),
+ static_cast(no_duplicates),
+ raft::comms::op_t::MIN,
+ handle.get_stream()));
+ }
+ CUGRAPH_EXPECTS(no_duplicates,
+ "Invalid input arguments: input sources should not have duplicates.");
+
 auto num_invalid_vertices =
 thrust::count_if(handle.get_thrust_policy(),
 sources,
@@ -189,34 +228,119 @@ void bfs(raft::handle_t const& handle,

 // 3. update meta data for direction optimizing BFS

- constexpr edge_t direction_optimizing_alpha = 14;
- constexpr vertex_t direction_optimizing_beta = 24;
+ auto segment_offsets = graph_view.local_vertex_partition_segment_offsets();
+
+ double direction_optimizing_alpha =
+ (graph_view.number_of_vertices() > 0)
+ ? ((static_cast(graph_view.compute_number_of_edges(handle)) /
+ static_cast(graph_view.number_of_vertices())) *
+ (1.0 / 3.75) /* tuning parameter */)
+ : double{1.0};
+ constexpr vertex_t direction_optimizing_beta = 24; // tuning parameter

- std::optional> out_degrees{std::nullopt};
- std::optional> nzd_unvisited_vertices{std::nullopt};
+ std::optional> aux_info{std::nullopt};
 if (direction_optimizing) {
- out_degrees = graph_view.compute_out_degrees(handle);
- nzd_unvisited_vertices = rmm::device_uvector(
- graph_view.local_vertex_partition_range_size(), handle.get_stream());
- (*nzd_unvisited_vertices)
- .resize(thrust::distance(
- (*nzd_unvisited_vertices).begin(),
- thrust::copy_if(
- handle.get_thrust_policy(),
- thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()),
- thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last()),
- (*nzd_unvisited_vertices).begin(),
- [vertex_partition,
- sources = raft::device_span(sources, n_sources),
- out_degrees = raft::device_span(
- (*out_degrees).data(), (*out_degrees).size())] __device__(vertex_t v) {
- auto v_offset =
- vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
- return (out_degrees[v_offset] > edge_t{0}) &&
- !thrust::binary_search(thrust::seq, sources.begin(), sources.end(), v);
- })),
- handle.get_stream());
- (*nzd_unvisited_vertices).shrink_to_fit(handle.get_stream());
+ rmm::device_uvector approx_out_degrees(0, handle.get_stream());
+ if (segment_offsets) { // exploit internal knowledge for exhaustive performance optimization for
 // large-scale benchmarking (the else path is sufficient for small
 // clusters with few tens of GPUs)
+ size_t partition_idx{0};
+ size_t partition_size{1};
+ if constexpr (GraphViewType::is_multi_gpu) {
+ auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+ auto const minor_comm_rank = minor_comm.get_rank();
+ auto const minor_comm_size = minor_comm.get_size();
+
partition_idx = static_cast(minor_comm_rank); + partition_size = static_cast(minor_comm_size); + } + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto edge_mask_view = graph_view.edge_mask_view(); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, partition_idx) + : thrust::nullopt; + auto high_and_mid_degree_segment_size = + (*segment_offsets)[2]; // compute local degrees for high & mid degree segments only, for + // low & hypersparse segments, use low_degree_threshold * + // partition_size * 0.5 & partition_size * + // hypersparse_threshold_ratio * 0.5 as approximate out degrees + if (edge_partition_e_mask) { + approx_out_degrees = edge_partition.compute_local_degrees_with_mask( + (*edge_partition_e_mask).value_first(), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()) + + high_and_mid_degree_segment_size, + handle.get_stream()); + } else { + approx_out_degrees = edge_partition.compute_local_degrees( + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()) + + high_and_mid_degree_segment_size, + handle.get_stream()); + } + thrust::transform(handle.get_thrust_policy(), + approx_out_degrees.begin(), + approx_out_degrees.end(), + approx_out_degrees.begin(), + multiplier_t{static_cast( + partition_size)}); // local_degrees => approximate global degrees + } else { + approx_out_degrees = graph_view.compute_out_degrees(handle); // exact + } + + rmm::device_uvector visited_bitmap( + packed_bool_size(graph_view.local_vertex_partition_range_size()), handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + visited_bitmap.begin(), + visited_bitmap.end(), + packed_bool_empty_mask()); + thrust::for_each( + handle.get_thrust_policy(), + sources, + sources + n_sources, + [bitmap = raft::device_span(visited_bitmap.data(), visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(v_offset)]); + word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + }); + + std::optional num_nzd_unvisited_low_degree_vertices{std::nullopt}; + std::optional num_nzd_unvisited_hypersparse_vertices{std::nullopt}; + if (segment_offsets) { + num_nzd_unvisited_low_degree_vertices = (*segment_offsets)[3] - (*segment_offsets)[2]; + if (graph_view.use_dcs()) { + num_nzd_unvisited_hypersparse_vertices = (*segment_offsets)[4] - (*segment_offsets)[3]; + } + if (n_sources > 0) { + std::vector h_sources(n_sources); + raft::update_host(h_sources.data(), sources, n_sources, handle.get_stream()); + handle.sync_stream(); + for (size_t i = 0; i < h_sources.size(); ++i) { + auto v_offset = h_sources[i] - graph_view.local_vertex_partition_range_first(); + if ((v_offset >= (*segment_offsets)[2]) && (v_offset < (*segment_offsets)[3])) { + --(*num_nzd_unvisited_low_degree_vertices); + } else if (graph_view.use_dcs()) { + if ((v_offset >= (*segment_offsets)[3]) && (v_offset < (*segment_offsets)[4])) { + --(*num_nzd_unvisited_hypersparse_vertices); + } + } + } + } + } + + aux_info = + direction_optimizing_info_t{std::move(approx_out_degrees), + std::move(visited_bitmap), + std::nullopt, + 
num_nzd_unvisited_low_degree_vertices, + num_nzd_unvisited_hypersparse_vertices}; } // 4. initialize BFS frontier @@ -237,7 +361,6 @@ void bfs(raft::handle_t const& handle, handle, graph_view); // this may mark some vertices visited in previous iterations as unvisited // (but this is OK as we check prev_dst_visited_flags first) fill_edge_dst_property(handle, graph_view, dst_visited_flags.mutable_view(), false); - fill_edge_dst_property(handle, graph_view, vertex_frontier.bucket(bucket_idx_cur).begin(), @@ -247,12 +370,12 @@ void bfs(raft::handle_t const& handle, // 4. BFS iteration vertex_t depth{0}; - bool top_down = true; - auto cur_aggregate_vertex_frontier_size = + bool topdown = true; + auto cur_aggregate_frontier_size = static_cast(vertex_frontier.bucket(bucket_idx_cur).aggregate_size()); while (true) { - vertex_t next_aggregate_vertex_frontier_size{}; - if (top_down) { + vertex_t next_aggregate_frontier_size{}; + if (topdown) { topdown_e_op_t e_op{}; e_op.prev_visited_flags = detail::edge_partition_endpoint_property_device_view_t( @@ -263,14 +386,15 @@ void bfs(raft::handle_t const& handle, e_op.dst_first = graph_view.local_edge_partition_dst_range_first(); auto [new_frontier_vertex_buffer, predecessor_buffer] = - transform_reduce_v_frontier_outgoing_e_by_dst(handle, - graph_view, - vertex_frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - e_op, - reduce_op::any()); + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op, + reduce_op::any()); auto input_pair_first = thrust::make_zip_iterator(thrust::make_constant_iterator(depth + 1), predecessor_buffer.begin()); @@ -286,9 +410,9 @@ void bfs(raft::handle_t const& handle, key_bucket_t( handle, std::move(new_frontier_vertex_buffer)); - next_aggregate_vertex_frontier_size = + next_aggregate_frontier_size = static_cast(vertex_frontier.bucket(bucket_idx_next).aggregate_size()); - if (next_aggregate_vertex_frontier_size == 0) { break; } + if (next_aggregate_frontier_size == 0) { break; } fill_edge_dst_property(handle, graph_view, @@ -298,65 +422,146 @@ void bfs(raft::handle_t const& handle, true); if (direction_optimizing) { - auto m_f = thrust::transform_reduce( - handle.get_thrust_policy(), - vertex_frontier.bucket(bucket_idx_next).begin(), - vertex_frontier.bucket(bucket_idx_next).end(), - cuda::proclaim_return_type( - [vertex_partition, - out_degrees = raft::device_span( - (*out_degrees).data(), (*out_degrees).size())] __device__(vertex_t v) { - auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - return out_degrees[v_offset]; - }), - edge_t{0}, - thrust::plus{}); + if (vertex_frontier.bucket(bucket_idx_next).size() > 0) { + thrust::for_each( + handle.get_thrust_policy(), + vertex_frontier.bucket(bucket_idx_next).begin(), + vertex_frontier.bucket(bucket_idx_next).end(), + [bitmap = raft::device_span((*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(v_offset)]); + word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + }); + } + double m_f{0.0}; + double m_u{0.0}; { - rmm::device_uvector 
tmp_vertices((*nzd_unvisited_vertices).size(), - handle.get_stream()); - tmp_vertices.resize( - thrust::distance(tmp_vertices.begin(), - thrust::set_difference(handle.get_thrust_policy(), - (*nzd_unvisited_vertices).begin(), - (*nzd_unvisited_vertices).end(), - vertex_frontier.bucket(bucket_idx_next).begin(), - vertex_frontier.bucket(bucket_idx_next).end(), - tmp_vertices.begin())), - handle.get_stream()); - nzd_unvisited_vertices = std::move(tmp_vertices); + size_t partition_size{1}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + partition_size = static_cast(minor_comm_size); + } + + auto f_vertex_first = vertex_frontier.bucket(bucket_idx_next).begin(); + auto f_vertex_last = vertex_frontier.bucket(bucket_idx_next).end(); + + if (segment_offsets) { + // FIXME: this actually over-estimates for graphs with power-law degree distribution + auto approx_low_segment_degree = + static_cast(low_degree_threshold * partition_size) * 0.5; + auto approx_hypersparse_segment_degree = + static_cast(partition_size) * hypersparse_threshold_ratio * 0.5; + auto f_segment_offsets = compute_key_segment_offsets( + vertex_frontier.bucket(bucket_idx_next).begin(), + vertex_frontier.bucket(bucket_idx_next).end(), + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); + *((*aux_info).num_nzd_unvisited_low_degree_vertices) -= + (f_segment_offsets[3] - f_segment_offsets[2]); + if (graph_view.use_dcs()) { + *((*aux_info).num_nzd_unvisited_hypersparse_vertices) -= + (f_segment_offsets[4] - f_segment_offsets[3]); + } + f_vertex_last = f_vertex_first + f_segment_offsets[2]; + m_f = static_cast((f_segment_offsets[3] - f_segment_offsets[2])) * + approx_low_segment_degree; + if (graph_view.use_dcs()) { + m_f += static_cast(f_segment_offsets[4] - f_segment_offsets[3]) * + approx_hypersparse_segment_degree; + } + + m_u = static_cast(*((*aux_info).num_nzd_unvisited_low_degree_vertices)) * + approx_low_segment_degree; + if (graph_view.use_dcs()) { + m_u += static_cast(*((*aux_info).num_nzd_unvisited_hypersparse_vertices)) * + approx_hypersparse_segment_degree; + } + } + + m_f += static_cast(thrust::transform_reduce( + handle.get_thrust_policy(), + f_vertex_first, + f_vertex_last, + cuda::proclaim_return_type( + [out_degrees = raft::device_span((*aux_info).approx_out_degrees.data(), + (*aux_info).approx_out_degrees.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(vertex_t v) { + auto v_offset = v - v_first; + return out_degrees[v_offset]; + }), + edge_t{0}, + thrust::plus{})); + + m_u += static_cast(thrust::transform_reduce( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(segment_offsets + ? 
(*segment_offsets)[2] + : graph_view.local_vertex_partition_range_size()), + cuda::proclaim_return_type( + [out_degrees = raft::device_span((*aux_info).approx_out_degrees.data(), + (*aux_info).approx_out_degrees.size()), + bitmap = raft::device_span( + (*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size())] __device__(vertex_t v_offset) { + auto word = bitmap[packed_bool_offset(v_offset)]; + if ((word & packed_bool_mask(v_offset)) != packed_bool_empty_mask()) { // visited + return edge_t{0}; + } else { + return out_degrees[v_offset]; + } + }), + edge_t{0}, + thrust::plus{})); } - auto m_u = thrust::transform_reduce( - handle.get_thrust_policy(), - (*nzd_unvisited_vertices).begin(), - (*nzd_unvisited_vertices).end(), - cuda::proclaim_return_type( - [vertex_partition, - out_degrees = raft::device_span( - (*out_degrees).data(), (*out_degrees).size())] __device__(vertex_t v) { - auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - return out_degrees[v_offset]; - }), - edge_t{0}, - thrust::plus{}); - auto aggregate_m_f = - GraphViewType::is_multi_gpu - ? host_scalar_allreduce( - handle.get_comms(), m_f, raft::comms::op_t::SUM, handle.get_stream()) - : m_f; - auto aggregate_m_u = - GraphViewType::is_multi_gpu - ? host_scalar_allreduce( - handle.get_comms(), m_u, raft::comms::op_t::SUM, handle.get_stream()) - : m_u; + auto aggregate_m_f = m_f; + auto aggregate_m_u = m_u; + if constexpr (GraphViewType::is_multi_gpu) { + auto tmp = host_scalar_allreduce(handle.get_comms(), + thrust::make_tuple(m_f, m_u), + raft::comms::op_t::SUM, + handle.get_stream()); + aggregate_m_f = thrust::get<0>(tmp); + aggregate_m_u = thrust::get<1>(tmp); + } if ((aggregate_m_f * direction_optimizing_alpha > aggregate_m_u) && - (next_aggregate_vertex_frontier_size >= cur_aggregate_vertex_frontier_size)) { - top_down = false; + (next_aggregate_frontier_size >= cur_aggregate_frontier_size)) { + topdown = false; + (*aux_info).nzd_unvisited_vertices = rmm::device_uvector( + segment_offsets ? *((*segment_offsets).rbegin() + 1) + : graph_view.local_vertex_partition_range_size(), + handle.get_stream()); + (*((*aux_info).nzd_unvisited_vertices)) + .resize( + thrust::distance( + (*((*aux_info).nzd_unvisited_vertices)).begin(), + thrust::copy_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), + thrust::make_counting_iterator( + segment_offsets ? 
graph_view.local_vertex_partition_range_first() + + *((*segment_offsets).rbegin() + 1) + : graph_view.local_vertex_partition_range_last()), + (*((*aux_info).nzd_unvisited_vertices)).begin(), + [bitmap = raft::device_span((*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + auto word = bitmap[packed_bool_offset(v_offset)]; + return ((word & packed_bool_mask(v_offset)) == packed_bool_empty_mask()); + })), + handle.get_stream()); } } - if (top_down) { // staying in top-down + if (topdown) { // staying in top-down vertex_frontier.bucket(bucket_idx_cur) = key_bucket_t(handle); vertex_frontier.swap_buckets(bucket_idx_cur, bucket_idx_next); @@ -364,63 +569,122 @@ void bfs(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_cur) = key_bucket_t( handle, - raft::device_span((*nzd_unvisited_vertices).data(), - (*nzd_unvisited_vertices).size())); + raft::device_span((*((*aux_info).nzd_unvisited_vertices)).data(), + (*((*aux_info).nzd_unvisited_vertices)).size())); vertex_frontier.bucket(bucket_idx_next) = key_bucket_t(handle); } } else { // bottom up - bottomup_e_op_t e_op{}; - e_op.prev_visited_flags = - detail::edge_partition_endpoint_property_device_view_t( - prev_dst_visited_flags.mutable_view()); - e_op.dst_first = graph_view.local_edge_partition_dst_range_first(); - auto [new_frontier_vertex_buffer, predecessor_buffer] = - transform_reduce_v_frontier_outgoing_e_by_src(handle, - graph_view, - vertex_frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - e_op, - reduce_op::any()); + rmm::device_uvector new_frontier_vertex_buffer(0, handle.get_stream()); + { + bottomup_e_op_t e_op{}; + bottomup_pred_op_t pred_op{}; + pred_op.prev_visited_flags = + detail::edge_partition_endpoint_property_device_view_t( + prev_dst_visited_flags.view()); + pred_op.dst_first = graph_view.local_edge_partition_dst_range_first(); + + rmm::device_uvector predecessor_buffer( + vertex_frontier.bucket(bucket_idx_cur).size(), handle.get_stream()); + per_v_transform_reduce_if_outgoing_e(handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op, + invalid_vertex, + reduce_op::any(), + pred_op, + predecessor_buffer.begin(), + true); + auto input_pair_first = thrust::make_zip_iterator(thrust::make_constant_iterator(depth + 1), + predecessor_buffer.begin()); + + // FIXME: this scatter_if and the resize below can be concurrently executed. 
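The scatter_if just below records (depth + 1, predecessor) only for the frontier vertices whose bottom-up search found a previously visited neighbor. A serial host-side equivalent of that filtered scatter (illustrative only; names are hypothetical):

#include <cstddef>
#include <cstdint>
#include <vector>

// For each current-frontier vertex whose bottom-up result is valid, write its
// BFS distance and predecessor at the vertex's offset in the output arrays
// (mirrors scatter_if with the frontier offsets as the map, the predecessor
// buffer as the stencil, and predecessor != invalid as the predicate).
void record_bottomup_results(std::vector<int64_t> const& frontier,
                             std::vector<int64_t> const& predecessors,  // one per frontier vertex
                             int64_t invalid_vertex,
                             int64_t v_first,  // local vertex partition range first
                             int64_t depth,
                             std::vector<int64_t>& distances,
                             std::vector<int64_t>& predecessors_out)
{
  for (size_t i = 0; i < frontier.size(); ++i) {
    if (predecessors[i] != invalid_vertex) {  // the bottom-up search succeeded
      auto v_offset              = frontier[i] - v_first;
      distances[v_offset]        = depth + 1;
      predecessors_out[v_offset] = predecessors[i];
    }
  }
}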
+ thrust::scatter_if( + handle.get_thrust_policy(), + input_pair_first, + input_pair_first + predecessor_buffer.size(), + thrust::make_transform_iterator( + vertex_frontier.bucket(bucket_idx_cur).cbegin(), + detail::shift_left_t{graph_view.local_vertex_partition_range_first()}), + predecessor_buffer.begin(), + thrust::make_zip_iterator(distances, predecessor_first), + detail::is_not_equal_t{invalid_vertex}); + + new_frontier_vertex_buffer.resize(predecessor_buffer.size(), handle.get_stream()); + new_frontier_vertex_buffer.resize( + thrust::distance(new_frontier_vertex_buffer.begin(), + thrust::copy_if(handle.get_thrust_policy(), + vertex_frontier.bucket(bucket_idx_cur).cbegin(), + vertex_frontier.bucket(bucket_idx_cur).cend(), + predecessor_buffer.begin(), + new_frontier_vertex_buffer.begin(), + detail::is_not_equal_t{invalid_vertex})), + handle.get_stream()); - auto input_pair_first = thrust::make_zip_iterator(thrust::make_constant_iterator(depth + 1), - predecessor_buffer.begin()); - thrust::scatter( - handle.get_thrust_policy(), - input_pair_first, - input_pair_first + new_frontier_vertex_buffer.size(), - thrust::make_transform_iterator( + assert(direction_optimizing); + + thrust::for_each( + handle.get_thrust_policy(), new_frontier_vertex_buffer.begin(), - detail::shift_left_t{graph_view.local_vertex_partition_range_first()}), - thrust::make_zip_iterator(distances, predecessor_first)); + new_frontier_vertex_buffer.end(), + [bitmap = raft::device_span((*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(v_offset)]); + word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + }); + (*((*aux_info).nzd_unvisited_vertices)) + .resize( + thrust::distance( + (*((*aux_info).nzd_unvisited_vertices)).begin(), + thrust::remove_if( + handle.get_thrust_policy(), + (*((*aux_info).nzd_unvisited_vertices)).begin(), + (*((*aux_info).nzd_unvisited_vertices)).end(), + [bitmap = raft::device_span((*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + auto word = bitmap[packed_bool_offset(v_offset)]; + return ((word & packed_bool_mask(v_offset)) != packed_bool_empty_mask()); + })), + handle.get_stream()); - assert(direction_optimizing); + if (segment_offsets) { + auto key_segment_offsets = compute_key_segment_offsets( + new_frontier_vertex_buffer.begin(), + new_frontier_vertex_buffer.end(), + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); + *((*aux_info).num_nzd_unvisited_low_degree_vertices) -= + key_segment_offsets[3] - key_segment_offsets[2]; + if (graph_view.use_dcs()) { + *((*aux_info).num_nzd_unvisited_hypersparse_vertices) -= + key_segment_offsets[4] - key_segment_offsets[3]; + } + } + } - { - rmm::device_uvector tmp_vertices((*nzd_unvisited_vertices).size(), - handle.get_stream()); - tmp_vertices.resize( - thrust::distance(tmp_vertices.begin(), - thrust::set_difference(handle.get_thrust_policy(), - (*nzd_unvisited_vertices).begin(), - (*nzd_unvisited_vertices).end(), - new_frontier_vertex_buffer.begin(), - new_frontier_vertex_buffer.end(), - tmp_vertices.begin())), + next_aggregate_frontier_size = static_cast(new_frontier_vertex_buffer.size()); + auto 
aggregate_nzd_unvisited_vertices = + static_cast((*((*aux_info).nzd_unvisited_vertices)).size()); + if constexpr (GraphViewType::is_multi_gpu) { + auto tmp = host_scalar_allreduce( + handle.get_comms(), + thrust::make_tuple(next_aggregate_frontier_size, aggregate_nzd_unvisited_vertices), + raft::comms::op_t::SUM, handle.get_stream()); - nzd_unvisited_vertices = std::move(tmp_vertices); + next_aggregate_frontier_size = thrust::get<0>(tmp); + aggregate_nzd_unvisited_vertices = thrust::get<1>(tmp); } - next_aggregate_vertex_frontier_size = - GraphViewType::is_multi_gpu - ? host_scalar_allreduce(handle.get_comms(), - static_cast(new_frontier_vertex_buffer.size()), - raft::comms::op_t::SUM, - handle.get_stream()) - : static_cast(new_frontier_vertex_buffer.size()); - if (next_aggregate_vertex_frontier_size == 0) { break; } + if (next_aggregate_frontier_size == 0) { break; } fill_edge_dst_property(handle, graph_view, @@ -429,21 +693,13 @@ void bfs(raft::handle_t const& handle, prev_dst_visited_flags.mutable_view(), true); - auto aggregate_nzd_unvisted_vertices = - GraphViewType::is_multi_gpu - ? host_scalar_allreduce(handle.get_comms(), - static_cast((*nzd_unvisited_vertices).size()), - raft::comms::op_t::SUM, - handle.get_stream()) - : static_cast((*nzd_unvisited_vertices).size()); - - if ((next_aggregate_vertex_frontier_size * direction_optimizing_beta < - aggregate_nzd_unvisted_vertices) && - (next_aggregate_vertex_frontier_size < cur_aggregate_vertex_frontier_size)) { - top_down = true; + // switch back to top-down when the frontier becomes small relative to the remaining + // unvisited vertices and is still shrinking + if ((next_aggregate_frontier_size * direction_optimizing_beta < + aggregate_nzd_unvisited_vertices) && + (next_aggregate_frontier_size < cur_aggregate_frontier_size)) { + topdown = true; } - if (top_down) { // swithcing to top-down + if (topdown) { // switching to top-down vertex_frontier.bucket(bucket_idx_cur) = key_bucket_t( handle, std::move(new_frontier_vertex_buffer)); @@ -451,11 +707,11 @@ void bfs(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_cur) = key_bucket_t( handle, - raft::device_span((*nzd_unvisited_vertices).data(), - (*nzd_unvisited_vertices).size())); + raft::device_span((*((*aux_info).nzd_unvisited_vertices)).data(), + (*((*aux_info).nzd_unvisited_vertices)).size())); } } - cur_aggregate_vertex_frontier_size = next_aggregate_vertex_frontier_size; + cur_aggregate_frontier_size = next_aggregate_frontier_size; depth++; if (depth >= depth_limit) { break; } diff --git a/cpp/src/traversal/extract_bfs_paths_impl.cuh b/cpp/src/traversal/extract_bfs_paths_impl.cuh index 40030e2e39c..d228460bec3 100644 --- a/cpp/src/traversal/extract_bfs_paths_impl.cuh +++ b/cpp/src/traversal/extract_bfs_paths_impl.cuh @@ -220,11 +220,15 @@ std::tuple, vertex_t> extract_bfs_paths( detail::decrement_position{}); if constexpr (multi_gpu) { - current_frontier = collect_values_for_int_vertices(handle, - current_frontier.begin(), - current_frontier.end(), - predecessors, - h_vertex_partition_range_lasts); + auto& comm = handle.get_comms(); + current_frontier = + collect_values_for_int_vertices(comm, + current_frontier.begin(), + current_frontier.end(), + predecessors, + h_vertex_partition_range_lasts, + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); } else { thrust::transform(handle.get_thrust_policy(), current_frontier.begin(), diff --git a/cpp/src/traversal/k_hop_nbrs_impl.cuh b/cpp/src/traversal/k_hop_nbrs_impl.cuh index acf3cfe8fc5..44fa21a5252 100644 --- a/cpp/src/traversal/k_hop_nbrs_impl.cuh +++ b/cpp/src/traversal/k_hop_nbrs_impl.cuh @@ -16,7 +16,7 @@ #pragma once
#include "prims/reduce_op.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/vertex_frontier.cuh" #include @@ -147,15 +147,15 @@ k_hop_nbrs(raft::handle_t const& handle, rmm::device_uvector nbrs(0, handle.get_stream()); for (size_t iter = 0; iter < k; ++iter) { auto new_frontier_key_buffer = - transform_reduce_v_frontier_outgoing_e_by_dst(handle, - push_graph_view, - frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - e_op_t{}, - reduce_op::null{}, - do_expensive_check); + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst(handle, + push_graph_view, + frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op_t{}, + reduce_op::null{}, + do_expensive_check); if (iter < (k - 1)) { frontier.bucket(bucket_idx_cur).clear(); frontier.bucket(bucket_idx_cur) diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index e1b7444b92f..b3cd0d57c67 100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -22,7 +22,7 @@ #include "prims/kv_store.cuh" #include "prims/reduce_op.cuh" #include "prims/transform_reduce_e.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -641,7 +641,6 @@ rmm::device_uvector od_shortest_distances( cutoff, invalid_distance}; detail::transform_reduce_v_frontier_call_e_op_t< - false, thrust::tuple, weight_t, vertex_t, @@ -653,8 +652,8 @@ rmm::device_uvector od_shortest_distances( auto new_frontier_tagged_vertex_buffer = allocate_dataframe_buffer>(0, handle.get_stream()); - std::tie(new_frontier_tagged_vertex_buffer, distance_buffer) = detail:: - extract_transform_v_frontier_e, weight_t>( + std::tie(new_frontier_tagged_vertex_buffer, distance_buffer) = + detail::extract_transform_v_frontier_e, weight_t>( handle, graph_view, vertex_frontier.bucket(bucket_idx_near), @@ -675,12 +674,14 @@ rmm::device_uvector od_shortest_distances( resize_dataframe_buffer(new_frontier_tagged_vertex_buffer, 0, handle.get_stream()); shrink_to_fit_dataframe_buffer(new_frontier_tagged_vertex_buffer, handle.get_stream()); - std::tie(new_frontier_keys, distance_buffer) = - detail::sort_and_reduce_buffer_elements>( + std::tie(new_frontier_keys, distance_buffer) = detail:: + sort_and_reduce_buffer_elements>( handle, std::move(new_frontier_keys), std::move(distance_buffer), - reduce_op::minimum()); + reduce_op::minimum(), + std::make_tuple(vertex_t{0}, graph_view.number_of_vertices()), + std::nullopt); } vertex_frontier.bucket(bucket_idx_near).clear(); diff --git a/cpp/src/traversal/sssp_impl.cuh b/cpp/src/traversal/sssp_impl.cuh index 47908524feb..3429672b151 100644 --- a/cpp/src/traversal/sssp_impl.cuh +++ b/cpp/src/traversal/sssp_impl.cuh @@ -19,7 +19,7 @@ #include "prims/fill_edge_src_dst_property.cuh" #include "prims/reduce_op.cuh" #include "prims/transform_reduce_e.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include 
"prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -197,7 +197,7 @@ void sssp(raft::handle_t const& handle, push_graph_view.local_vertex_partition_view()); auto [new_frontier_vertex_buffer, distance_predecessor_buffer] = - transform_reduce_v_frontier_outgoing_e_by_dst( + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( handle, push_graph_view, vertex_frontier.bucket(bucket_idx_cur_near), diff --git a/cpp/src/utilities/collect_comm.cuh b/cpp/src/utilities/collect_comm.cuh index 2197409fe26..dc4267aac57 100644 --- a/cpp/src/utilities/collect_comm.cuh +++ b/cpp/src/utilities/collect_comm.cuh @@ -50,79 +50,73 @@ namespace cugraph { -// for the keys in kv_store_view, key_to_gpu_id_op(key) should coincide with comm.get_rank() -template -decltype(allocate_dataframe_buffer(0, - rmm::cuda_stream_view{})) -collect_values_for_keys(raft::handle_t const& handle, - KVStoreViewType kv_store_view, - KeyIterator collect_key_first, - KeyIterator collect_key_last, - KeyToGPUIdOp key_to_gpu_id_op) +// for the keys in kv_store_view, key_to_comm_rank_op(key) should coincide with comm.get_rank() +template +dataframe_buffer_type_t collect_values_for_keys( + raft::comms::comms_t const& comm, + KVStoreViewType kv_store_view, + KeyIterator collect_key_first, + KeyIterator collect_key_last, + KeyToCommRankOp key_to_comm_rank_op, + rmm::cuda_stream_view stream_view) { using key_t = typename KVStoreViewType::key_type; static_assert(std::is_same_v::value_type, key_t>); using value_t = typename KVStoreViewType::value_type; - auto& comm = handle.get_comms(); - // 1. collect values for the unique keys in [collect_key_first, collect_key_last) rmm::device_uvector unique_keys(thrust::distance(collect_key_first, collect_key_last), - handle.get_stream()); + stream_view); thrust::copy( - handle.get_thrust_policy(), collect_key_first, collect_key_last, unique_keys.begin()); - thrust::sort(handle.get_thrust_policy(), unique_keys.begin(), unique_keys.end()); + rmm::exec_policy_nosync(stream_view), collect_key_first, collect_key_last, unique_keys.begin()); + thrust::sort(rmm::exec_policy_nosync(stream_view), unique_keys.begin(), unique_keys.end()); unique_keys.resize( thrust::distance( unique_keys.begin(), - thrust::unique(handle.get_thrust_policy(), unique_keys.begin(), unique_keys.end())), - handle.get_stream()); + thrust::unique(rmm::exec_policy(stream_view), unique_keys.begin(), unique_keys.end())), + stream_view); - auto values_for_unique_keys = allocate_dataframe_buffer(0, handle.get_stream()); + auto values_for_unique_keys = allocate_dataframe_buffer(0, stream_view); { - rmm::device_uvector rx_unique_keys(0, handle.get_stream()); + rmm::device_uvector rx_unique_keys(0, stream_view); std::vector rx_value_counts{}; std::tie(rx_unique_keys, rx_value_counts) = groupby_gpu_id_and_shuffle_values( comm, unique_keys.begin(), unique_keys.end(), - [key_to_gpu_id_op] __device__(auto val) { return key_to_gpu_id_op(val); }, - handle.get_stream()); + [key_to_comm_rank_op] __device__(auto val) { return key_to_comm_rank_op(val); }, + stream_view); auto values_for_rx_unique_keys = - allocate_dataframe_buffer(rx_unique_keys.size(), handle.get_stream()); + allocate_dataframe_buffer(rx_unique_keys.size(), stream_view); kv_store_view.find(rx_unique_keys.begin(), rx_unique_keys.end(), get_dataframe_buffer_begin(values_for_rx_unique_keys), - handle.get_stream()); + stream_view); - auto rx_values_for_unique_keys = allocate_dataframe_buffer(0, handle.get_stream()); - std::tie(rx_values_for_unique_keys, std::ignore) = - 
shuffle_values(comm, - get_dataframe_buffer_begin(values_for_rx_unique_keys), - rx_value_counts, - handle.get_stream()); + auto rx_values_for_unique_keys = allocate_dataframe_buffer(0, stream_view); + std::tie(rx_values_for_unique_keys, std::ignore) = shuffle_values( + comm, get_dataframe_buffer_begin(values_for_rx_unique_keys), rx_value_counts, stream_view); values_for_unique_keys = std::move(rx_values_for_unique_keys); } // 2. build a kv_store_t object for the k, v pairs in unique_keys, values_for_unique_keys. - kv_store_t unique_key_value_store( - handle.get_stream()); + kv_store_t unique_key_value_store(stream_view); if constexpr (KVStoreViewType::binary_search) { unique_key_value_store = kv_store_t(std::move(unique_keys), std::move(values_for_unique_keys), kv_store_view.invalid_value(), false, - handle.get_stream()); + stream_view); } else { auto kv_pair_first = thrust::make_zip_iterator( thrust::make_tuple(unique_keys.begin(), get_dataframe_buffer_begin(values_for_unique_keys))); auto valid_kv_pair_last = - thrust::remove_if(handle.get_thrust_policy(), + thrust::remove_if(rmm::exec_policy(stream_view), kv_pair_first, kv_pair_first + unique_keys.size(), [invalid_value = kv_store_view.invalid_value()] __device__(auto pair) { @@ -136,176 +130,173 @@ collect_values_for_keys(raft::handle_t const& handle, get_dataframe_buffer_begin(values_for_unique_keys), kv_store_view.invalid_key(), kv_store_view.invalid_value(), - handle.get_stream()); + stream_view); - unique_keys.resize(0, handle.get_stream()); - resize_dataframe_buffer(values_for_unique_keys, 0, handle.get_stream()); - unique_keys.shrink_to_fit(handle.get_stream()); - shrink_to_fit_dataframe_buffer(values_for_unique_keys, handle.get_stream()); + unique_keys.resize(0, stream_view); + resize_dataframe_buffer(values_for_unique_keys, 0, stream_view); + unique_keys.shrink_to_fit(stream_view); + shrink_to_fit_dataframe_buffer(values_for_unique_keys, stream_view); } auto unique_key_value_store_view = unique_key_value_store.view(); // 3. 
find values for [collect_key_first, collect_key_last) auto value_buffer = allocate_dataframe_buffer( - thrust::distance(collect_key_first, collect_key_last), handle.get_stream()); - unique_key_value_store_view.find(collect_key_first, - collect_key_last, - get_dataframe_buffer_begin(value_buffer), - handle.get_stream()); + thrust::distance(collect_key_first, collect_key_last), stream_view); + unique_key_value_store_view.find( + collect_key_first, collect_key_last, get_dataframe_buffer_begin(value_buffer), stream_view); return value_buffer; } -// for the keys in kv_store_view, key_to_gpu_id_op(key) should coincide with comm.get_rank() -template +// for the keys in kv_store_view, key_to_comm_rank_op(key) should coincide with comm.get_rank() +template std::tuple, - decltype(allocate_dataframe_buffer( - 0, cudaStream_t{nullptr}))> + dataframe_buffer_type_t> collect_values_for_unique_keys( - raft::handle_t const& handle, + raft::comms::comms_t const& comm, KVStoreViewType kv_store_view, rmm::device_uvector&& collect_unique_keys, - KeyToGPUIdOp key_to_gpu_id_op) + KeyToCommRankOp key_to_comm_rank_op, + rmm::cuda_stream_view stream_view) { using key_t = typename KVStoreViewType::key_type; using value_t = typename KVStoreViewType::value_type; - auto& comm = handle.get_comms(); - - auto values_for_collect_unique_keys = allocate_dataframe_buffer(0, handle.get_stream()); + auto values_for_collect_unique_keys = allocate_dataframe_buffer(0, stream_view); { auto [rx_unique_keys, rx_value_counts] = groupby_gpu_id_and_shuffle_values( comm, collect_unique_keys.begin(), collect_unique_keys.end(), - [key_to_gpu_id_op] __device__(auto val) { return key_to_gpu_id_op(val); }, - handle.get_stream()); + [key_to_comm_rank_op] __device__(auto val) { return key_to_comm_rank_op(val); }, + stream_view); auto values_for_rx_unique_keys = - allocate_dataframe_buffer(rx_unique_keys.size(), handle.get_stream()); + allocate_dataframe_buffer(rx_unique_keys.size(), stream_view); kv_store_view.find(rx_unique_keys.begin(), rx_unique_keys.end(), get_dataframe_buffer_begin(values_for_rx_unique_keys), - handle.get_stream()); + stream_view); - std::tie(values_for_collect_unique_keys, std::ignore) = - shuffle_values(comm, - get_dataframe_buffer_begin(values_for_rx_unique_keys), - rx_value_counts, - handle.get_stream()); + std::tie(values_for_collect_unique_keys, std::ignore) = shuffle_values( + comm, get_dataframe_buffer_begin(values_for_rx_unique_keys), rx_value_counts, stream_view); } return std::make_tuple(std::move(collect_unique_keys), std::move(values_for_collect_unique_keys)); } template -std::tuple< - rmm::device_uvector, - decltype(allocate_dataframe_buffer::value_type>( - 0, cudaStream_t{nullptr}))> -collect_values_for_unique_int_vertices(raft::handle_t const& handle, - rmm::device_uvector&& collect_unique_int_vertices, - ValueIterator local_value_first, - std::vector const& vertex_partition_range_lasts) +dataframe_buffer_type_t::value_type> +collect_values_for_sorted_unique_int_vertices( + raft::comms::comms_t const& comm, + raft::device_span collect_sorted_unique_int_vertices, + ValueIterator local_value_first, + std::vector const& comm_rank_vertex_partition_range_lasts, + vertex_t local_vertex_partition_range_first, + rmm::cuda_stream_view stream_view) { using value_t = typename thrust::iterator_traits::value_type; - auto& comm = handle.get_comms(); - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_size = major_comm.get_size(); - auto const major_comm_rank = 
major_comm.get_rank(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - auto const minor_comm_rank = minor_comm.get_rank(); + // 1. find tx_counts - // 1. groupby and shuffle internal vertices + rmm::device_uvector d_range_lasts(comm_rank_vertex_partition_range_lasts.size(), + stream_view); + raft::update_device(d_range_lasts.data(), + comm_rank_vertex_partition_range_lasts.data(), + comm_rank_vertex_partition_range_lasts.size(), + stream_view); - rmm::device_uvector d_vertex_partition_range_lasts(vertex_partition_range_lasts.size(), - handle.get_stream()); - raft::update_device(d_vertex_partition_range_lasts.data(), - vertex_partition_range_lasts.data(), - vertex_partition_range_lasts.size(), - handle.get_stream()); + rmm::device_uvector d_offsets(d_range_lasts.size() - 1, stream_view); + thrust::lower_bound(rmm::exec_policy_nosync(stream_view), + collect_sorted_unique_int_vertices.begin(), + collect_sorted_unique_int_vertices.end(), + d_range_lasts.begin(), + d_range_lasts.begin() + (d_range_lasts.size() - 1), + d_offsets.begin()); - auto [rx_int_vertices, rx_int_vertex_counts] = groupby_gpu_id_and_shuffle_values( - comm, - collect_unique_int_vertices.begin(), - collect_unique_int_vertices.end(), - detail::compute_gpu_id_from_int_vertex_t{ - raft::device_span(d_vertex_partition_range_lasts.data(), - d_vertex_partition_range_lasts.size()), - major_comm_size, - minor_comm_size}, - handle.get_stream()); - - // 2: Lookup return values - - auto vertex_partition_id = - partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks( - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank); - auto local_int_vertex_first = - vertex_partition_id == 0 ? vertex_t{0} : vertex_partition_range_lasts[vertex_partition_id - 1]; - - auto value_buffer = - allocate_dataframe_buffer(rx_int_vertices.size(), handle.get_stream()); - thrust::transform(handle.get_thrust_policy(), + std::vector h_offsets(d_offsets.size() + 2); + raft::update_host(h_offsets.data() + 1, d_offsets.data(), d_offsets.size(), stream_view); + h_offsets[0] = 0; + h_offsets.back() = collect_sorted_unique_int_vertices.size(); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view)); + + // h_offsets = {0, boundaries of each rank's vertex range in the sorted input, total count}; + // adjacent differences give the number of vertices to send to each rank + std::vector tx_counts(comm_rank_vertex_partition_range_lasts.size()); + std::adjacent_difference(h_offsets.begin() + 1, h_offsets.end(), tx_counts.begin()); + + // 2. shuffle sorted unique internal vertices to the owning ranks + + auto [rx_int_vertices, rx_counts] = + shuffle_values(comm, collect_sorted_unique_int_vertices.begin(), tx_counts, stream_view); + + // 3. Lookup return values + + auto value_buffer = allocate_dataframe_buffer(rx_int_vertices.size(), stream_view); + thrust::transform(rmm::exec_policy_nosync(stream_view), rx_int_vertices.begin(), rx_int_vertices.end(), get_dataframe_buffer_begin(value_buffer), - [local_value_first, local_int_vertex_first] __device__(auto v) { - return local_value_first[v - local_int_vertex_first]; + [local_value_first, local_vertex_partition_range_first] __device__(auto v) { + return local_value_first[v - local_vertex_partition_range_first]; }); + rx_int_vertices.resize(0, stream_view); + rx_int_vertices.shrink_to_fit(stream_view); - // 3: Shuffle results back to original GPU + // 4.
Shuffle results back to the original ranks - std::tie(value_buffer, std::ignore) = shuffle_values( - comm, get_dataframe_buffer_begin(value_buffer), rx_int_vertex_counts, handle.get_stream()); + std::tie(value_buffer, std::ignore) = + shuffle_values(comm, get_dataframe_buffer_begin(value_buffer), rx_counts, stream_view); - return std::make_tuple(std::move(collect_unique_int_vertices), std::move(value_buffer)); + return value_buffer; } template -decltype(allocate_dataframe_buffer::value_type>( - 0, cudaStream_t{nullptr})) +dataframe_buffer_type_t::value_type> collect_values_for_int_vertices( - raft::handle_t const& handle, + raft::comms::comms_t const& comm, VertexIterator collect_vertex_first, VertexIterator collect_vertex_last, ValueIterator local_value_first, std::vector::value_type> const& - vertex_partition_range_lasts) + comm_rank_vertex_partition_range_lasts, + typename thrust::iterator_traits::value_type local_vertex_partition_range_first, + rmm::cuda_stream_view stream_view) { using vertex_t = typename thrust::iterator_traits::value_type; using value_t = typename thrust::iterator_traits::value_type; size_t input_size = thrust::distance(collect_vertex_first, collect_vertex_last); - rmm::device_uvector sorted_unique_int_vertices(input_size, handle.get_stream()); + rmm::device_uvector sorted_unique_int_vertices(input_size, stream_view); - raft::copy( - sorted_unique_int_vertices.data(), collect_vertex_first, input_size, handle.get_stream()); + raft::copy(sorted_unique_int_vertices.data(), collect_vertex_first, input_size, stream_view); - thrust::sort(handle.get_thrust_policy(), + thrust::sort(rmm::exec_policy_nosync(stream_view), sorted_unique_int_vertices.begin(), sorted_unique_int_vertices.end()); - auto last = thrust::unique(handle.get_thrust_policy(), + auto last = thrust::unique(rmm::exec_policy(stream_view), sorted_unique_int_vertices.begin(), sorted_unique_int_vertices.end()); sorted_unique_int_vertices.resize(thrust::distance(sorted_unique_int_vertices.begin(), last), - handle.get_stream()); - - auto [unique_int_vertices, tmp_value_buffer] = collect_values_for_unique_int_vertices( - handle, std::move(sorted_unique_int_vertices), local_value_first, vertex_partition_range_lasts); + stream_view); - kv_store_t kv_map(std::move(unique_int_vertices), + auto tmp_value_buffer = collect_values_for_sorted_unique_int_vertices( + comm, + raft::device_span(sorted_unique_int_vertices.data(), + sorted_unique_int_vertices.size()), + local_value_first, + comm_rank_vertex_partition_range_lasts, + local_vertex_partition_range_first, + stream_view); + + kv_store_t kv_map(std::move(sorted_unique_int_vertices), std::move(tmp_value_buffer), invalid_vertex_id::value, false, - handle.get_stream()); + stream_view); auto device_view = detail::kv_binary_search_store_device_view_t(kv_map.view()); - auto value_buffer = allocate_dataframe_buffer(input_size, handle.get_stream()); - thrust::transform(handle.get_thrust_policy(), + auto value_buffer = allocate_dataframe_buffer(input_size, stream_view); + thrust::transform(rmm::exec_policy_nosync(stream_view), collect_vertex_first, collect_vertex_last, get_dataframe_buffer_begin(value_buffer), diff --git a/cpp/src/utilities/shuffle_vertex_pairs.cuh b/cpp/src/utilities/shuffle_vertex_pairs.cuh index 70327db5ffb..1cf2493cd28 100644 --- a/cpp/src/utilities/shuffle_vertex_pairs.cuh +++ b/cpp/src/utilities/shuffle_vertex_pairs.cuh @@ -61,10 +61,10 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl( (edge_ids ? sizeof(edge_t) : size_t{0}) + (edge_types ? 
sizeof(edge_type_t) : size_t{0}); auto constexpr mem_frugal_ratio = - 0.1; // if the expected temporary buffer size exceeds the mem_frugal_ratio of the - // total_global_mem, switch to the memory frugal approach (thrust::sort is used to - // group-by by default, and thrust::sort requires temporary buffer comparable to the input - // data size) + 0.05; // if the expected temporary buffer size exceeds the mem_frugal_ratio of the + // total_global_mem, switch to the memory frugal approach (thrust::sort is used to + // group-by by default, and thrust::sort requires temporary buffer comparable to the + // input data size) auto mem_frugal_threshold = static_cast(static_cast(total_global_mem / element_size) * mem_frugal_ratio); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a2eeafea8cf..44963f91515 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -698,9 +698,9 @@ if(BUILD_CUGRAPH_MG_TESTS) ConfigureTestMG(MG_COUNT_IF_V_TEST prims/mg_count_if_v.cu) ############################################################################################### - # - MG PRIMS TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_SRC_DST tests -------------------------- - ConfigureTestMG(MG_TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_SRC_DST_TEST - prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu) + # - MG PRIMS TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_DST tests ------------------------------ + ConfigureTestMG(MG_TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_DST_TEST + prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu) ############################################################################################### # - MG PRIMS REDUCE_V tests ------------------------------------------------------------------- diff --git a/cpp/tests/c_api/mg_test_utils.h b/cpp/tests/c_api/mg_test_utils.h index b040d8dc529..12ca16bfe35 100644 --- a/cpp/tests/c_api/mg_test_utils.h +++ b/cpp/tests/c_api/mg_test_utils.h @@ -36,6 +36,15 @@ } \ } while (0) +#define C_NCCL_TRY(call) \ + do { \ + ncclResult_t const status = call; \ + if (ncclSuccess != status) { \ + printf("NCCL call='%s' at file=%s line=%d failed.\n", #call, __FILE__, __LINE__); \ + exit(1); \ + } \ + } while (0) + #define C_CUDA_TRY(call) \ do { \ cudaError_t const status = call; \ diff --git a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu similarity index 74% rename from cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu rename to cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu index 9b7e24856fe..085077017b3 100644 --- a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu +++ b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu @@ -14,7 +14,7 @@ * limitations under the License.
*/ -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/vertex_frontier.cuh" #include "utilities/base_fixture.hpp" #include "utilities/conversion_utilities.hpp" @@ -203,48 +203,7 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement handle_->get_comms().barrier(); - hr_timer.start("MG transform_reduce_v_frontier_outgoing_e_by_src"); - } - - auto mg_reduce_by_src_new_frontier_key_buffer = - cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); - [[maybe_unused]] auto mg_reduce_by_src_payload_buffer = - cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); - - if constexpr (std::is_same_v) { - mg_reduce_by_src_new_frontier_key_buffer = - cugraph::transform_reduce_v_frontier_outgoing_e_by_src( - *handle_, - mg_graph_view, - mg_vertex_frontier.bucket(bucket_idx_cur), - mg_src_prop.view(), - mg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::null{}); - } else { - std::tie(mg_reduce_by_src_new_frontier_key_buffer, mg_reduce_by_src_payload_buffer) = - cugraph::transform_reduce_v_frontier_outgoing_e_by_src( - *handle_, - mg_graph_view, - mg_vertex_frontier.bucket(bucket_idx_cur), - mg_src_prop.view(), - mg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::plus{}); - } - - if (cugraph::test::g_perf) { - RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - handle_->get_comms().barrier(); - hr_timer.stop(); - hr_timer.display_and_clear(std::cout); - } - if (cugraph::test::g_perf) { - RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - handle_->get_comms().barrier(); - hr_timer.start("MG transform_reduce_v_frontier_outgoing_e_by_src"); + hr_timer.start("MG transform_reduce_v_frontier_outgoing_e_by_dst"); } auto mg_reduce_by_dst_new_frontier_key_buffer = @@ -286,64 +245,27 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst // 3. 
compare SG & MG results if (prims_usecase.check_correctness) { - if constexpr (std::is_same_v) { - cugraph::unrenumber_int_vertices( - *handle_, - mg_reduce_by_src_new_frontier_key_buffer.begin(), - mg_reduce_by_src_new_frontier_key_buffer.size(), - (*mg_renumber_map).data(), - mg_graph_view.vertex_partition_range_lasts()); - - cugraph::unrenumber_int_vertices( - *handle_, - mg_reduce_by_dst_new_frontier_key_buffer.begin(), - mg_reduce_by_dst_new_frontier_key_buffer.size(), - (*mg_renumber_map).data(), - mg_graph_view.vertex_partition_range_lasts()); - } else { - cugraph::unrenumber_int_vertices( - *handle_, - std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).begin(), - std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).size(), - (*mg_renumber_map).data(), - mg_graph_view.vertex_partition_range_lasts()); - - cugraph::unrenumber_int_vertices( - *handle_, - std::get<0>(mg_reduce_by_dst_new_frontier_key_buffer).begin(), - std::get<0>(mg_reduce_by_dst_new_frontier_key_buffer).size(), - (*mg_renumber_map).data(), - mg_graph_view.vertex_partition_range_lasts()); - } - - auto mg_reduce_by_src_aggregate_new_frontier_key_buffer = - cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); - if constexpr (std::is_same_v) { - mg_reduce_by_src_aggregate_new_frontier_key_buffer = - cugraph::test::device_gatherv(*handle_, - mg_reduce_by_src_new_frontier_key_buffer.data(), - mg_reduce_by_src_new_frontier_key_buffer.size()); - } else { - std::get<0>(mg_reduce_by_src_aggregate_new_frontier_key_buffer) = - cugraph::test::device_gatherv( - *handle_, - std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).data(), - std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).size()); - std::get<1>(mg_reduce_by_src_aggregate_new_frontier_key_buffer) = - cugraph::test::device_gatherv( - *handle_, - std::get<1>(mg_reduce_by_src_new_frontier_key_buffer).data(), - std::get<1>(mg_reduce_by_src_new_frontier_key_buffer).size()); - } - auto mg_reduce_by_dst_aggregate_new_frontier_key_buffer = cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); if constexpr (std::is_same_v) { + cugraph::unrenumber_local_int_vertices(*handle_, + mg_reduce_by_dst_new_frontier_key_buffer.data(), + mg_reduce_by_dst_new_frontier_key_buffer.size(), + (*mg_renumber_map).data(), + mg_graph_view.local_vertex_partition_range_first(), + mg_graph_view.local_vertex_partition_range_last()); mg_reduce_by_dst_aggregate_new_frontier_key_buffer = cugraph::test::device_gatherv(*handle_, mg_reduce_by_dst_new_frontier_key_buffer.data(), mg_reduce_by_dst_new_frontier_key_buffer.size()); } else { + cugraph::unrenumber_local_int_vertices( + *handle_, + std::get<0>(mg_reduce_by_dst_new_frontier_key_buffer).data(), + std::get<0>(mg_reduce_by_dst_new_frontier_key_buffer).size(), + (*mg_renumber_map).data(), + mg_graph_view.local_vertex_partition_range_first(), + mg_graph_view.local_vertex_partition_range_last()); std::get<0>(mg_reduce_by_dst_aggregate_new_frontier_key_buffer) = cugraph::test::device_gatherv( *handle_, @@ -356,26 +278,6 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst std::get<1>(mg_reduce_by_dst_new_frontier_key_buffer).size()); } - [[maybe_unused]] auto mg_reduce_by_src_aggregate_payload_buffer = - cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); - if constexpr (!std::is_same_v) { - if constexpr (std::is_arithmetic_v) { - mg_reduce_by_src_aggregate_payload_buffer = - cugraph::test::device_gatherv(*handle_, - mg_reduce_by_src_payload_buffer.data(), - mg_reduce_by_src_payload_buffer.size()); - 
} else { - std::get<0>(mg_reduce_by_src_aggregate_payload_buffer) = - cugraph::test::device_gatherv(*handle_, - std::get<0>(mg_reduce_by_src_payload_buffer).data(), - std::get<0>(mg_reduce_by_src_payload_buffer).size()); - std::get<1>(mg_reduce_by_src_aggregate_payload_buffer) = - cugraph::test::device_gatherv(*handle_, - std::get<1>(mg_reduce_by_src_payload_buffer).data(), - std::get<1>(mg_reduce_by_src_payload_buffer).size()); - } - } - [[maybe_unused]] auto mg_reduce_by_dst_aggregate_payload_buffer = cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); if constexpr (!std::is_same_v) { @@ -409,22 +311,11 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst if (handle_->get_comms().get_rank() == int{0}) { if constexpr (std::is_same_v) { - thrust::sort( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(mg_reduce_by_src_aggregate_new_frontier_key_buffer)); - thrust::sort( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(mg_reduce_by_dst_aggregate_new_frontier_key_buffer), cugraph::get_dataframe_buffer_end(mg_reduce_by_dst_aggregate_new_frontier_key_buffer)); } else { - thrust::sort_by_key( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(mg_reduce_by_src_aggregate_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_payload_buffer)); - thrust::sort_by_key( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(mg_reduce_by_dst_aggregate_new_frontier_key_buffer), @@ -471,34 +362,6 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst .insert(cugraph::get_dataframe_buffer_begin(sg_key_buffer), cugraph::get_dataframe_buffer_end(sg_key_buffer)); - auto sg_reduce_by_src_new_frontier_key_buffer = - cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); - [[maybe_unused]] auto sg_reduce_by_src_payload_buffer = - cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); - if constexpr (std::is_same_v) { - sg_reduce_by_src_new_frontier_key_buffer = - cugraph::transform_reduce_v_frontier_outgoing_e_by_src( - *handle_, - sg_graph_view, - sg_vertex_frontier.bucket(bucket_idx_cur), - sg_src_prop.view(), - sg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::null{}); - } else { - std::tie(sg_reduce_by_src_new_frontier_key_buffer, sg_reduce_by_src_payload_buffer) = - cugraph::transform_reduce_v_frontier_outgoing_e_by_src( - *handle_, - sg_graph_view, - sg_vertex_frontier.bucket(bucket_idx_cur), - sg_src_prop.view(), - sg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::plus{}); - } - auto sg_reduce_by_dst_new_frontier_key_buffer = cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); [[maybe_unused]] auto sg_reduce_by_dst_payload_buffer = @@ -528,22 +391,11 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst } if constexpr (std::is_same_v) { - thrust::sort( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(sg_reduce_by_src_new_frontier_key_buffer)); - thrust::sort( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_new_frontier_key_buffer), cugraph::get_dataframe_buffer_end(sg_reduce_by_dst_new_frontier_key_buffer)); } else { - 
thrust::sort_by_key( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_payload_buffer)); - thrust::sort_by_key( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_new_frontier_key_buffer), @@ -551,14 +403,7 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_payload_buffer)); } - bool key_passed = thrust::equal( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_new_frontier_key_buffer)); - ASSERT_TRUE(key_passed); - - key_passed = thrust::equal( + auto key_passed = thrust::equal( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_new_frontier_key_buffer), cugraph::get_dataframe_buffer_end(sg_reduce_by_dst_new_frontier_key_buffer), @@ -567,13 +412,6 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst if constexpr (!std::is_same_v) { bool payload_passed = thrust::equal( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_payload_buffer), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_payload_buffer), - cugraph::get_dataframe_buffer_end(mg_reduce_by_src_aggregate_payload_buffer)); - ASSERT_TRUE(payload_passed); - - payload_passed = thrust::equal( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_payload_buffer), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_payload_buffer), diff --git a/cpp/tests/traversal/mg_bfs_test.cpp b/cpp/tests/traversal/mg_bfs_test.cpp index 4d4b83e275b..3cd712798e3 100644 --- a/cpp/tests/traversal/mg_bfs_test.cpp +++ b/cpp/tests/traversal/mg_bfs_test.cpp @@ -100,16 +100,6 @@ class Tests_MGBFS : public ::testing::TestWithParam initialize_mg_handle(size_t pool_size = 64); +std::unique_ptr initialize_mg_handle( + size_t pool_size = 8 /* default value of CUDA_DEVICE_MAX_CONNECTIONS */); // NCCL lazily initializes for P2P, and this enforces P2P initialization for better performance // measurements diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index 0a706d1cf80..ed96ba23917 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -24,7 +24,6 @@ #include #include #include -#include // legacy coo_to_csr #include @@ -234,7 +233,8 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { construct_edgelist(raft::handle_t const& handle, bool test_weighted, bool store_transposed, - bool multi_gpu) const + bool multi_gpu, + bool shuffle = true) const { CUGRAPH_EXPECTS( (size_t{1} << scale_) <= static_cast(std::numeric_limits::max()), @@ -246,7 +246,7 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { // cuMemAddressReserve // (https://developer.nvidia.com/blog/introducing-low-level-gpu-virtual-memory-management), we // can reduce the temporary memory requirement to (1 / num_partitions) * (original data size) - size_t constexpr num_partitions_per_gpu = 4; + size_t constexpr num_partitions_per_gpu = 8; size_t num_partitions = num_partitions_per_gpu * static_cast(multi_gpu ? 
handle.get_comms().get_size() : 1); @@ -330,7 +330,7 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { handle, std::move(tmp_src_v), std::move(tmp_dst_v), std::move(tmp_weights_v)); } - if (multi_gpu) { + if (multi_gpu && shuffle) { std::tie(store_transposed ? tmp_dst_v : tmp_src_v, store_transposed ? tmp_src_v : tmp_dst_v, tmp_weights_v, @@ -375,7 +375,7 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { translate(handle, vertex_v); - if (multi_gpu) { + if (multi_gpu && shuffle) { vertex_v = cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning( handle, std::move(vertex_v)); } @@ -391,6 +391,8 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { void set_edge_factor(size_t edge_factor) { edge_factor_ = edge_factor; } + bool undirected() const { return undirected_; } + private: size_t scale_{}; size_t edge_factor_{}; @@ -762,39 +764,5 @@ construct_graph(raft::handle_t const& handle, return std::make_tuple(std::move(graph), std::move(edge_weights), std::move(renumber_map)); } -namespace legacy { - -template -std::unique_ptr> construct_graph_csr( - raft::handle_t const& handle, input_usecase_t const& input_usecase, bool test_weighted) -{ - auto [d_src_v, d_dst_v, d_weight_v, d_vertices_v, is_symmetric] = - input_usecase.template construct_edgelist( - handle, test_weighted, false, false); - vertex_t num_vertices{}; // assuming that vertex IDs are non-negative consecutive integers - if (d_vertices_v) { - num_vertices = - max_element( - handle, raft::device_span((*d_vertices_v).data(), (*d_vertices_v).size())) + - 1; - } else { - num_vertices = - std::max( - max_element(handle, raft::device_span(d_src_v.data(), d_src_v.size())), - max_element(handle, raft::device_span(d_dst_v.data(), d_dst_v.size()))) + - 1; - } - - cugraph::legacy::GraphCOOView cooview( - d_src_v.data(), - d_dst_v.data(), - d_weight_v ? d_weight_v->data() : nullptr, - num_vertices, - static_cast(d_src_v.size())); - - return cugraph::coo_to_csr(cooview); -} - -} // namespace legacy } // namespace test } // namespace cugraph