From 460d8e468a73e2d8edee625b9a9fe97ef24fa36f Mon Sep 17 00:00:00 2001 From: Mike Sarahan Date: Fri, 22 Nov 2024 16:36:57 -0600 Subject: [PATCH 1/7] add telemetry (#4740) Enables telemetry during cugraph's build process. This parses github job metadata to obtain timing information. It should have very little impact on overall build time, and should not interfere with any build tools. This implements emitting OpenTelemetry traces and spans, as described in https://github.com/rapidsai/build-infra/issues/139 Authors: - Mike Sarahan (https://github.com/msarahan) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cugraph/pull/4740 --- .github/workflows/pr.yaml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index e82342dfd9..c8bf94b098 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -10,6 +10,7 @@ concurrency: cancel-in-progress: true jobs: + # Please keep pr-builder as the top job here pr-builder: needs: - changed-files @@ -25,14 +26,24 @@ jobs: - wheel-tests-pylibcugraph - wheel-build-cugraph - wheel-tests-cugraph + - telemetry-setup - devcontainer secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12 if: always() with: needs: ${{ toJSON(needs) }} + telemetry-setup: + runs-on: ubuntu-latest + continue-on-error: true + env: + OTEL_SERVICE_NAME: "pr-cugraph" + steps: + - name: Telemetry setup + uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main changed-files: secrets: inherit + needs: telemetry-setup uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12 with: files_yaml: | @@ -63,9 +74,11 @@ jobs: - '!notebooks/**' checks: secrets: inherit + needs: telemetry-setup uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12 with: enable_check_generated_files: false + ignored_pr_jobs: telemetry-summarize conda-cpp-build: needs: checks secrets: inherit @@ -161,6 +174,7 @@ jobs: script: ci/test_wheel_cugraph.sh devcontainer: secrets: inherit + needs: telemetry-setup uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.12 with: arch: '["amd64"]' @@ -171,3 +185,17 @@ jobs: sccache -z; build-all --verbose -j$(nproc --ignore=1) -DBUILD_CUGRAPH_MG_TESTS=ON; sccache -s; + telemetry-summarize: + runs-on: ubuntu-latest + needs: pr-builder + if: always() + continue-on-error: true + steps: + - name: Load stashed telemetry env vars + uses: rapidsai/shared-actions/telemetry-dispatch-load-base-env-vars@main + with: + load_service_name: true + - name: Telemetry summarize + uses: rapidsai/shared-actions/telemetry-dispatch-write-summary@main + with: + cert_concat: "${{ secrets.OTEL_EXPORTER_OTLP_CA_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_KEY }}" From 3478fb57965ef896ba6ebc1ece69876de92b604c Mon Sep 17 00:00:00 2001 From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com> Date: Mon, 25 Nov 2024 08:09:57 -0800 Subject: [PATCH 2/7] Increase max_iterations in MG HITS TEST (#4783) Increase max_iterations in MG HITS tests (with edge masking). With masking, we may end up with different graphs with different numbers of GPUs; this results in higher iteration counts for convergence for certain GPU counts. Increase the maximum iteration count to consider this. 
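As an aside for readers unfamiliar with HITS, the sketch below (plain single-GPU C++ for illustration only — not cugraph's multi-GPU implementation; the function and variable names such as hits_converges and out_neighbors are hypothetical) shows the generic pattern the test exercises: iterate until the hub scores stop changing, or give up at an iteration cap. A structurally different graph (e.g. after edge masking, which here can vary with the GPU count) may need more iterations before the score change drops below the tolerance, so too small a cap makes the loop report non-convergence even when the algorithm is behaving correctly.

```cpp
// Illustrative sketch only: a generic HITS power iteration with a convergence
// tolerance and an iteration cap. Not cugraph code; names are hypothetical.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <utility>
#include <vector>

bool hits_converges(std::vector<std::vector<std::size_t>> const& out_neighbors,
                    double epsilon,
                    std::size_t max_iterations)
{
  std::size_t const n = out_neighbors.size();
  if (n == 0) { return true; }
  std::vector<double> hubs(n, 1.0 / static_cast<double>(n));
  std::vector<double> authorities(n, 0.0);
  for (std::size_t iter = 0; iter < max_iterations; ++iter) {
    // authority(v) = sum of hub scores over v's in-neighbors
    std::fill(authorities.begin(), authorities.end(), 0.0);
    for (std::size_t u = 0; u < n; ++u) {
      for (auto v : out_neighbors[u]) { authorities[v] += hubs[u]; }
    }
    // hub(u) = sum of authority scores over u's out-neighbors
    std::vector<double> new_hubs(n, 0.0);
    for (std::size_t u = 0; u < n; ++u) {
      for (auto v : out_neighbors[u]) { new_hubs[u] += authorities[v]; }
    }
    // L1-normalize the hub scores and measure the change since the last iteration
    double norm = 0.0;
    for (auto h : new_hubs) { norm += h; }
    if (norm == 0.0) { return true; }  // no edges: trivially converged
    double diff = 0.0;
    for (std::size_t u = 0; u < n; ++u) {
      new_hubs[u] /= norm;
      diff += std::fabs(new_hubs[u] - hubs[u]);
    }
    hubs = std::move(new_hubs);
    if (diff < epsilon) { return true; }  // converged within the cap
  }
  return false;  // cap reached first; a test asserting convergence would fail here
}
```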
Authors: - Seunghwa Kang (https://github.com/seunghwak) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) URL: https://github.com/rapidsai/cugraph/pull/4783 --- cpp/tests/link_analysis/mg_hits_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/link_analysis/mg_hits_test.cpp b/cpp/tests/link_analysis/mg_hits_test.cpp index eb2a9bcd72..83e7647226 100644 --- a/cpp/tests/link_analysis/mg_hits_test.cpp +++ b/cpp/tests/link_analysis/mg_hits_test.cpp @@ -91,7 +91,7 @@ class Tests_MGHits : public ::testing::TestWithParam d_mg_hubs(mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream()); From d71424346f7b10a507b2c7abffaaf288dafa4b0b Mon Sep 17 00:00:00 2001 From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com> Date: Mon, 25 Nov 2024 08:11:02 -0800 Subject: [PATCH 3/7] Primitives & BFS performance improvements (#4751) This PR includes multiple updates to cut peak memory usage in graph creation and improve performance of BFS on scale-free graphs. * Add a bitmap for non-zero local degree vertices in the hypersparse region; this information can be used to quickly filter out locally zero degree vertices which don't need to be processed in multiple instances. * Store (global-)degree offsets for vertices in the hypersparse region; this information can used to quickly identify the vertices with a certain global degree (e.g. for global degree 1 vertices, we can skip inter-GPU reduction as we know each vertex has only one neighbor). * Skip kernel invocations in computing edge counts if the vertex list is empty. * Add asynchronous functions to compute edge counts. This helps in preventing unnecessary serialization when we can process multiple such functions concurrently. * Replace rmm::exec_policy with rmm::exec_policy_nosync in multiple places; the former enforces stream synchronization at the end. The latter does not. * Enforce cache line alignment in NCCL communication in multiple places (NCCL communication performance is significantly affected by cache line alignment, often leading to 30-40% or more differences). * For primitives working on a subset of vertices, broadcast a vertex list using a bitmap if the vertex frontier size is large. If the vertex frontier size is small (in case vertex_t is 8B and the local vertex partition range can fit into 4B), use vertex offsets instead of vertices to cut communication volume. * Merge multiple host scalar communication function calls to a single one. * Increase multi-stream concurrency in detail::extract_transform_e & detail::per_v_transform_reduce_e * Multiple optimizations in template specialization (for update_major == true && reduce_op == any && key type is vertex && working on a subset of vertices) in detail::per_v_transform_reduce_e (this includes pre-processing vertices with non-zero local degrees; so we don't need to process such vertices using multiple GPUs, pre-filtering of zero local degree vertices, allreduce communication to reduce shuffle communication volumes, and special treatment of global degree 1 vertices, and so on). * Multiple optimizations & specializations in detail::fill_edge_minor_property that works on a subset of vertices (this includes kernel fusion, specialization for bitmap properties including direct broadcast to the property buffer and special treatments for vertex partition boundaries, and so on). 
* Added multiple optimizations & specializations in transform_reduce_v_frontier_outgoing_e (especially for reduce_op::any and to cut communication volumes and to filter out (key, value) pairs that won't contribute to the final results). * Multiple low-level optimizations in direction optimizing BFS (including approximations in determining between bottom -up and top-down). * Multiple optimizations to cut peak memory usage in graph creation. Authors: - Seunghwa Kang (https://github.com/seunghwak) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) URL: https://github.com/rapidsai/cugraph/pull/4751 --- .../cugraph/edge_partition_device_view.cuh | 139 +- cpp/include/cugraph/edge_partition_view.hpp | 10 +- cpp/include/cugraph/graph.hpp | 24 +- cpp/include/cugraph/graph_functions.hpp | 41 +- cpp/include/cugraph/graph_view.hpp | 73 +- cpp/include/cugraph/partition_manager.hpp | 26 +- .../cugraph/utilities/dataframe_buffer.hpp | 73 +- cpp/include/cugraph/utilities/device_comm.hpp | 124 +- cpp/include/cugraph/utilities/misc_utils.cuh | 2 +- .../cugraph/utilities/shuffle_comm.cuh | 289 +- .../cugraph/utilities/thrust_tuple_utils.hpp | 26 + .../betweenness_centrality_impl.cuh | 20 +- .../approx_weighted_matching_impl.cuh | 5 +- cpp/src/community/detail/common_methods.cuh | 5 +- cpp/src/community/detail/refine_impl.cuh | 15 +- .../weakly_connected_components_impl.cuh | 39 +- cpp/src/cores/core_number_impl.cuh | 19 +- cpp/src/lookup/lookup_src_dst_impl.cuh | 5 +- .../detail/extract_transform_v_frontier_e.cuh | 1549 ++++-- cpp/src/prims/detail/multi_stream_utils.cuh | 156 + .../detail/optional_dataframe_buffer.hpp | 174 +- .../prims/detail/per_v_transform_reduce_e.cuh | 4374 +++++++++++++++++ cpp/src/prims/detail/prim_functors.cuh | 22 + cpp/src/prims/detail/prim_utils.cuh | 104 + cpp/src/prims/extract_transform_e.cuh | 4 +- ...xtract_transform_v_frontier_outgoing_e.cuh | 24 +- cpp/src/prims/fill_edge_src_dst_property.cuh | 859 +++- ..._v_pair_transform_dst_nbr_intersection.cuh | 95 +- ...r_v_random_select_transform_outgoing_e.cuh | 162 +- ...m_reduce_dst_key_aggregated_outgoing_e.cuh | 5 +- ...ransform_reduce_if_incoming_outgoing_e.cuh | 421 ++ ...v_transform_reduce_incoming_outgoing_e.cuh | 1336 +---- ...rm_reduce_v_frontier_outgoing_e_by_dst.cuh | 1196 +++++ ...educe_v_frontier_outgoing_e_by_src_dst.cuh | 585 --- .../prims/update_edge_src_dst_property.cuh | 318 +- cpp/src/prims/vertex_frontier.cuh | 248 +- .../create_graph_from_edgelist_impl.cuh | 651 ++- cpp/src/structure/detail/structure_utils.cuh | 136 +- cpp/src/structure/graph_impl.cuh | 233 +- cpp/src/structure/graph_view_impl.cuh | 7 +- cpp/src/structure/induced_subgraph_impl.cuh | 13 - cpp/src/structure/renumber_edgelist_impl.cuh | 761 +-- cpp/src/structure/renumber_utils_impl.cuh | 29 +- cpp/src/traversal/bfs_impl.cuh | 582 ++- cpp/src/traversal/extract_bfs_paths_impl.cuh | 14 +- cpp/src/traversal/k_hop_nbrs_impl.cuh | 20 +- .../traversal/od_shortest_distances_impl.cuh | 15 +- cpp/src/traversal/sssp_impl.cuh | 4 +- cpp/src/utilities/collect_comm.cuh | 245 +- cpp/src/utilities/shuffle_vertex_pairs.cuh | 8 +- cpp/tests/CMakeLists.txt | 6 +- cpp/tests/c_api/mg_test_utils.h | 9 + ...rm_reduce_v_frontier_outgoing_e_by_dst.cu} | 194 +- cpp/tests/traversal/mg_bfs_test.cpp | 34 +- cpp/tests/utilities/mg_utilities.cpp | 4 +- cpp/tests/utilities/mg_utilities.hpp | 5 +- cpp/tests/utilities/test_graphs.hpp | 46 +- 57 files changed, 11526 insertions(+), 4057 deletions(-) create mode 100644 cpp/src/prims/detail/multi_stream_utils.cuh create 
mode 100644 cpp/src/prims/detail/per_v_transform_reduce_e.cuh create mode 100644 cpp/src/prims/detail/prim_utils.cuh create mode 100644 cpp/src/prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh create mode 100644 cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh delete mode 100644 cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh rename cpp/tests/prims/{mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu => mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu} (74%) diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index 583b0a3721..628c3cc10c 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -204,6 +204,7 @@ class edge_partition_device_view_t view) : detail::edge_partition_device_view_base_t(view.offsets(), view.indices()), dcs_nzd_vertices_(detail::to_thrust_optional(view.dcs_nzd_vertices())), + dcs_nzd_range_bitmap_(detail::to_thrust_optional(view.dcs_nzd_range_bitmap())), major_hypersparse_first_(detail::to_thrust_optional(view.major_hypersparse_first())), major_range_first_(view.major_range_first()), major_range_last_(view.major_range_last()), @@ -218,6 +219,7 @@ class edge_partition_device_view_t()); } + template + __host__ void compute_number_of_edges_async(MajorIterator major_first, + MajorIterator major_last, + raft::device_span count /* size = 1 */, + rmm::cuda_stream_view stream) const + { + if (thrust::distance(major_first, major_last) == 0) { + RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); + } + + rmm::device_uvector d_tmp_storage(0, stream); + size_t tmp_storage_bytes{0}; + + if (dcs_nzd_vertices_) { + auto local_degree_first = thrust::make_transform_iterator( + major_first, + detail::local_degree_op_t{ + this->offsets_, major_range_first_, *dcs_nzd_vertices_, *major_hypersparse_first_}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + } else { + auto local_degree_first = thrust::make_transform_iterator( + major_first, + detail::local_degree_op_t{ + this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + } + } + __host__ rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const { rmm::device_uvector local_degrees(this->major_range_size(), stream); if (dcs_nzd_vertices_) { assert(major_hypersparse_first_); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(this->major_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -266,7 +328,7 @@ class edge_partition_device_view_tmajor_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -284,7 +346,7 @@ class 
edge_partition_device_view_t local_degrees(thrust::distance(major_first, major_last), stream); if (dcs_nzd_vertices_) { assert(major_hypersparse_first_); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), major_first, major_last, local_degrees.begin(), @@ -295,7 +357,7 @@ class edge_partition_device_view_tmajor_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -368,7 +431,7 @@ class edge_partition_device_view_tmajor_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -394,7 +457,7 @@ class edge_partition_device_view_t> for consistency (see + // dcs_nzd_range_bitmap()) __host__ __device__ thrust::optional dcs_nzd_vertices() const { return dcs_nzd_vertices_ ? thrust::optional{(*dcs_nzd_vertices_).data()} @@ -528,10 +593,20 @@ class edge_partition_device_view_t> dcs_nzd_range_bitmap() + const + { + return dcs_nzd_range_bitmap_ + ? thrust::make_optional>( + (*dcs_nzd_range_bitmap_).data(), (*dcs_nzd_range_bitmap_).size()) + : thrust::nullopt; + } + private: // should be trivially copyable to device thrust::optional> dcs_nzd_vertices_{thrust::nullopt}; + thrust::optional> dcs_nzd_range_bitmap_{thrust::nullopt}; thrust::optional major_hypersparse_first_{thrust::nullopt}; vertex_t major_range_first_{0}; @@ -558,6 +633,7 @@ class edge_partition_device_view_t()); } + template + __host__ void compute_number_of_edges_async(MajorIterator major_first, + MajorIterator major_last, + raft::device_span count /* size = 1 */, + rmm::cuda_stream_view stream) const + { + if (thrust::distance(major_first, major_last) == 0) { + RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); + } + + rmm::device_uvector d_tmp_storage(0, stream); + size_t tmp_storage_bytes{0}; + + auto local_degree_first = thrust::make_transform_iterator( + major_first, + detail::local_degree_op_t{this->offsets_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + } + __host__ rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const { rmm::device_uvector local_degrees(this->major_range_size(), stream); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(this->major_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -595,7 +709,7 @@ class edge_partition_device_view_t local_degrees(thrust::distance(major_first, major_last), stream); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), major_first, major_last, local_degrees.begin(), @@ -613,6 +727,7 @@ class edge_partition_device_view_t local_degrees(this->major_range_size(), stream); thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(this->major_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -660,7 +775,7 @@ class edge_partition_device_view_t local_degrees(thrust::distance(major_first, major_last), stream); thrust::transform( 
- rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), major_first, major_last, local_degrees.begin(), diff --git a/cpp/include/cugraph/edge_partition_view.hpp b/cpp/include/cugraph/edge_partition_view.hpp index 4246527371..27c5705dfc 100644 --- a/cpp/include/cugraph/edge_partition_view.hpp +++ b/cpp/include/cugraph/edge_partition_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,6 +56,7 @@ class edge_partition_view_t offsets, raft::device_span indices, std::optional> dcs_nzd_vertices, + std::optional> dcs_nzd_range_bitmap, std::optional major_hypersparse_first, vertex_t major_range_first, vertex_t major_range_last, @@ -64,6 +65,7 @@ class edge_partition_view_t(offsets, indices), dcs_nzd_vertices_(dcs_nzd_vertices), + dcs_nzd_range_bitmap_(dcs_nzd_range_bitmap), major_hypersparse_first_(major_hypersparse_first), major_range_first_(major_range_first), major_range_last_(major_range_last), @@ -78,6 +80,11 @@ class edge_partition_view_t> dcs_nzd_range_bitmap() const + { + return dcs_nzd_range_bitmap_; + } + std::optional major_hypersparse_first() const { return major_hypersparse_first_; } vertex_t major_range_first() const { return major_range_first_; } @@ -90,6 +97,7 @@ class edge_partition_view_t> dcs_nzd_vertices_{std::nullopt}; + std::optional> dcs_nzd_range_bitmap_{std::nullopt}; std::optional major_hypersparse_first_{std::nullopt}; vertex_t major_range_first_{0}; diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index 0607b39153..290f4b3c4d 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -48,6 +48,7 @@ struct graph_meta_t> { partition_t partition{}; std::vector edge_partition_segment_offsets{}; + std::optional> edge_partition_hypersparse_degree_offsets{}; vertex_t num_local_unique_edge_srcs{}; vertex_t num_local_unique_edge_dsts{}; @@ -61,6 +62,7 @@ struct graph_meta_t> { // segment offsets based on vertex degree, relevant only if vertex IDs are renumbered std::optional> segment_offsets{std::nullopt}; + std::optional> hypersparse_degree_offsets{std::nullopt}; }; // graph_t is an owning graph class (note that graph_view_t is a non-owning graph class) @@ -101,6 +103,11 @@ class graph_t>>( (*edge_partition_dcs_nzd_vertices_).size()) : std::nullopt; + auto dcs_nzd_range_bitmaps = + edge_partition_dcs_nzd_range_bitmaps_ + ? 
std::make_optional>>( + (*edge_partition_dcs_nzd_range_bitmaps_).size()) + : std::nullopt; for (size_t i = 0; i < offsets.size(); ++i) { offsets[i] = raft::device_span(edge_partition_offsets_[i].data(), edge_partition_offsets_[i].size()); @@ -111,6 +118,11 @@ class graph_t((*edge_partition_dcs_nzd_vertices_)[i].data(), (*edge_partition_dcs_nzd_vertices_)[i].size()); } + if (dcs_nzd_range_bitmaps) { + (*dcs_nzd_range_bitmaps)[i] = + raft::device_span((*edge_partition_dcs_nzd_range_bitmaps_)[i].data(), + (*edge_partition_dcs_nzd_range_bitmaps_)[i].size()); + } } std::conditional_t{ this->number_of_vertices(), this->number_of_edges(), this->properties_, partition_, edge_partition_segment_offsets_, + edge_partition_hypersparse_degree_offsets_, local_sorted_unique_edge_srcs, local_sorted_unique_edge_src_chunk_start_offsets, local_sorted_unique_edge_src_chunk_size_, @@ -224,10 +238,13 @@ class graph_t>> edge_partition_dcs_nzd_vertices_{ std::nullopt}; + std::optional>> edge_partition_dcs_nzd_range_bitmaps_{ + std::nullopt}; partition_t partition_{}; // segment offsets within the vertex partition based on vertex degree std::vector edge_partition_segment_offsets_{}; + std::optional> edge_partition_hypersparse_degree_offsets_{}; // if valid, store row/column properties in key/value pairs (this saves memory if # unique edge // sources/destinations << V / major_comm_size|minor_comm_size). @@ -290,7 +307,11 @@ class graph_t(offsets_.data(), offsets_.size()), raft::device_span(indices_.data(), indices_.size()), graph_view_meta_t{ - this->number_of_vertices(), this->number_of_edges(), this->properties_, segment_offsets_}); + this->number_of_vertices(), + this->number_of_edges(), + this->properties_, + segment_offsets_, + hypersparse_degree_offsets_}); } private: @@ -299,6 +320,7 @@ class graph_t> segment_offsets_{}; + std::optional> hypersparse_degree_offsets_{}; }; template diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 866ab16ee9..6a03b9a645 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -41,11 +41,13 @@ struct renumber_meta_t> edge_t number_of_edges{}; partition_t partition{}; std::vector edge_partition_segment_offsets{}; + std::optional> edge_partition_hypersparse_degree_offsets{}; }; template struct renumber_meta_t> { std::vector segment_offsets{}; + std::optional> hypersparse_degree_offsets{}; }; /** @@ -244,7 +246,7 @@ void unrenumber_int_vertices(raft::handle_t const& handle, * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -284,7 +286,7 @@ std::enable_if_t unrenumber_local_int_edges( * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. 
* @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -346,7 +348,7 @@ void renumber_local_ext_vertices(raft::handle_t const& handle, * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam edge_type_t Type of edge types. Needs to be an integral type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -388,7 +390,7 @@ decompress_to_edgelist( * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -421,7 +423,7 @@ symmetrize_edgelist(raft::handle_t const& handle, * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -463,7 +465,7 @@ symmetrize_graph( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -505,7 +507,7 @@ transpose_graph( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. 
CUDA stream, communicator, and @@ -549,7 +551,7 @@ transpose_graph_storage( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -625,7 +627,7 @@ void relabel(raft::handle_t const& handle, * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -678,7 +680,7 @@ extract_induced_subgraphs( * @tparam edge_type_t Type of edge type. Needs to be an integral type, currently only int32_t is * supported * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -743,7 +745,7 @@ create_graph_from_edgelist(raft::handle_t const& handle, * @tparam edge_type_t Type of edge type. Needs to be an integral type, currently only int32_t is * supported * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -807,7 +809,7 @@ create_graph_from_edgelist( * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -829,7 +831,7 @@ std::tuple, rmm::device_uvector> get_two * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. 
* @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -856,7 +858,7 @@ rmm::device_uvector compute_in_weight_sums( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -883,7 +885,7 @@ rmm::device_uvector compute_out_weight_sums( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -910,7 +912,7 @@ weight_t compute_max_in_weight_sum( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -937,7 +939,7 @@ weight_t compute_max_out_weight_sum( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -963,7 +965,7 @@ weight_t compute_total_edge_weight( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. 
transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -1114,7 +1116,8 @@ shuffle_external_vertex_value_pairs(raft::handle_t const& handle, * @param edge_ids Optional list of edge ids * @param edge_types Optional list of edge types * @return Tuple of vectors storing edge sources, destinations, optional weights, - * optional edge ids, optional edge types mapped to this GPU. + * optional edge ids, optional edge types mapped to this GPU and a vector storing the + * number of edges received from each GPU. */ template std::tuple, diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index a2ff3166fa..d109fbdac9 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -165,7 +165,12 @@ class partition_t { return vertex_partition_range_last(partition_idx) - vertex_partition_range_first(partition_idx); } - size_t number_of_local_edge_partitions() const { return minor_comm_size_; } + size_t number_of_local_edge_partitions() const { return static_cast(minor_comm_size_); } + size_t coinciding_local_edge_partition_idx() const + { + return static_cast(minor_comm_rank_); + } // the major range of coinciding_local_edge_partition_idx()'th local edge partition coincides + // with the local vertex partition range // major: source of the edge partition (if not transposed) or destination of the edge partition // (if transposed). @@ -249,9 +254,13 @@ double constexpr edge_partition_src_dst_property_values_kv_pair_fill_ratio_thres // use the hypersparse format (currently, DCSR or DCSC) for the vertices with their degrees smaller // than minor_comm_size * hypersparse_threshold_ratio, should be less than 1.0 double constexpr hypersparse_threshold_ratio = 0.5; -size_t constexpr low_degree_threshold{raft::warp_size()}; -size_t constexpr mid_degree_threshold{1024}; -size_t constexpr num_sparse_segments_per_vertex_partition{3}; +size_t constexpr low_degree_threshold{ + raft::warp_size()}; // belongs to the low degree segment if the global degree is smaller than + // this value. +size_t constexpr mid_degree_threshold{ + 1024}; // belongs to the medium degree segment if the global degree is smaller than this value, + // otherwise, belongs to the high degree segment. 
+size_t constexpr num_sparse_segments_per_vertex_partition{3}; // high, mid, low // Common for both graph_view_t & graph_t and both single-GPU & multi-GPU versions template @@ -313,6 +322,7 @@ struct graph_view_meta_t edge_partition_segment_offsets{}; + std::optional> edge_partition_hypersparse_degree_offsets{}; std::conditional_t>, @@ -356,6 +366,7 @@ struct graph_view_meta_t> segment_offsets{std::nullopt}; + std::optional> hypersparse_degree_offsets{std::nullopt}; }; // graph_view_t is a non-owning graph class (note that graph_t is an owning graph class) @@ -380,6 +391,8 @@ class graph_view_t> const& edge_partition_indices, std::optional>> const& edge_partition_dcs_nzd_vertices, + std::optional>> const& + edge_partition_dcs_nzd_range_bitmaps, graph_view_meta_t meta); std::vector vertex_partition_range_offsets() const @@ -552,6 +565,12 @@ class graph_view_t> local_vertex_partition_segment_offsets() const + { + auto partition_idx = partition_.coinciding_local_edge_partition_idx(); + return local_edge_partition_segment_offsets(partition_idx); + } + std::optional> local_edge_partition_segment_offsets( size_t partition_idx) const { @@ -563,6 +582,28 @@ class graph_view_t> local_vertex_partition_hypersparse_degree_offsets() const + { + auto partition_idx = partition_.coinciding_local_edge_partition_idx(); + return local_edge_partition_hypersparse_degree_offsets(partition_idx); + } + + std::optional> local_edge_partition_hypersparse_degree_offsets( + size_t partition_idx) const + { + auto num_degrees_per_vertex_partition = + edge_partition_hypersparse_degree_offsets_ + ? ((*edge_partition_hypersparse_degree_offsets_).size() / edge_partition_offsets_.size()) + : size_t{0}; + return edge_partition_hypersparse_degree_offsets_ + ? std::make_optional>( + (*edge_partition_hypersparse_degree_offsets_).begin() + + partition_idx * num_degrees_per_vertex_partition, + (*edge_partition_hypersparse_degree_offsets_).begin() + + (partition_idx + 1) * num_degrees_per_vertex_partition) + : std::nullopt; + } + vertex_partition_view_t local_vertex_partition_view() const { return vertex_partition_view_t(this->number_of_vertices(), @@ -605,6 +646,9 @@ class graph_view_t>> edge_partition_dcs_nzd_vertices_{}; + std::optional>> + edge_partition_dcs_nzd_range_bitmaps_{}; partition_t partition_{}; // segment offsets based on vertex degree std::vector edge_partition_segment_offsets_{}; + std::optional> edge_partition_hypersparse_degree_offsets_{}; // if valid, store source/destination property values in key/value pairs (this saves memory if # // unique edge sources/destinations << V / major_comm_size|minor_comm_size). 
@@ -903,6 +950,11 @@ class graph_view_t> local_vertex_partition_segment_offsets() const + { + return local_edge_partition_segment_offsets(size_t{0}); + } + std::optional> local_edge_partition_segment_offsets( size_t partition_idx = 0) const { @@ -910,6 +962,18 @@ class graph_view_t> local_vertex_partition_hypersparse_degree_offsets() const + { + return local_edge_partition_hypersparse_degree_offsets(size_t{0}); + } + + std::optional> local_edge_partition_hypersparse_degree_offsets( + size_t partition_idx = 0) const + { + assert(partition_idx == 0); + return hypersparse_degree_offsets_; + } + vertex_partition_view_t local_vertex_partition_view() const { return vertex_partition_view_t(this->number_of_vertices()); @@ -1050,6 +1114,7 @@ class graph_view_t> segment_offsets_{std::nullopt}; + std::optional> hypersparse_degree_offsets_{std::nullopt}; std::optional> edge_mask_view_{std::nullopt}; }; diff --git a/cpp/include/cugraph/partition_manager.hpp b/cpp/include/cugraph/partition_manager.hpp index 309b169e64..2eb210fb7c 100644 --- a/cpp/include/cugraph/partition_manager.hpp +++ b/cpp/include/cugraph/partition_manager.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -71,6 +71,30 @@ class partition_manager { : (major_comm_rank * minor_comm_size + minor_comm_rank); } +#ifdef __CUDACC__ + __host__ __device__ +#endif + static int + compute_major_comm_rank_from_global_comm_rank(int major_comm_size, + int minor_comm_size, + int comm_rank) + { + return map_major_comm_to_gpu_row_comm ? comm_rank % major_comm_size + : comm_rank / minor_comm_size; + } + +#ifdef __CUDACC__ + __host__ __device__ +#endif + static int + compute_minor_comm_rank_from_global_comm_rank(int major_comm_size, + int minor_comm_size, + int comm_rank) + { + return map_major_comm_to_gpu_row_comm ? 
comm_rank / major_comm_size + : comm_rank % minor_comm_size; + } + #ifdef __CUDACC__ __host__ __device__ #endif diff --git a/cpp/include/cugraph/utilities/dataframe_buffer.hpp b/cpp/include/cugraph/utilities/dataframe_buffer.hpp index a20613c65e..6d47ec540d 100644 --- a/cpp/include/cugraph/utilities/dataframe_buffer.hpp +++ b/cpp/include/cugraph/utilities/dataframe_buffer.hpp @@ -82,6 +82,53 @@ auto allocate_dataframe_buffer(size_t buffer_size, rmm::cuda_stream_view stream_ std::make_index_sequence(), buffer_size, stream_view); } +template +struct dataframe_buffer_type { + using type = decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{})); +}; + +template +using dataframe_buffer_type_t = typename dataframe_buffer_type::type; + +template +std::optional> try_allocate_dataframe_buffer( + size_t buffer_size, rmm::cuda_stream_view stream_view) +{ + try { + return allocate_dataframe_buffer(buffer_size, stream_view); + } catch (std::exception const& e) { + return std::nullopt; + } +} + +template +struct dataframe_buffer_iterator_type { + using type = typename rmm::device_uvector::iterator; +}; + +template +struct dataframe_buffer_iterator_type> { + using type = thrust::zip_iterator::iterator...>>; +}; + +template +using dataframe_buffer_iterator_type_t = typename dataframe_buffer_iterator_type::type; + +template +struct dataframe_buffer_const_iterator_type { + using type = typename rmm::device_uvector::const_iterator; +}; + +template +struct dataframe_buffer_const_iterator_type> { + using type = + thrust::zip_iterator::const_iterator...>>; +}; + +template +using dataframe_buffer_const_iterator_type_t = + typename dataframe_buffer_const_iterator_type::type; + template void reserve_dataframe_buffer(BufferType& buffer, size_t new_buffer_capacity, @@ -206,30 +253,4 @@ auto get_dataframe_buffer_cend(BufferType& buffer) std::make_index_sequence::value>(), buffer); } -template -struct dataframe_buffer_value_type { - using type = void; -}; - -template -struct dataframe_buffer_value_type> { - using type = T; -}; - -template -struct dataframe_buffer_value_type...>> { - using type = thrust::tuple; -}; - -template -using dataframe_buffer_value_type_t = typename dataframe_buffer_value_type::type; - -template -struct dataframe_buffer_type { - using type = decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{})); -}; - -template -using dataframe_buffer_type_t = typename dataframe_buffer_type::type; - } // namespace cugraph diff --git a/cpp/include/cugraph/utilities/device_comm.hpp b/cpp/include/cugraph/utilities/device_comm.hpp index ffb0f7d9e5..07de2d0646 100644 --- a/cpp/include/cugraph/utilities/device_comm.hpp +++ b/cpp/include/cugraph/utilities/device_comm.hpp @@ -55,7 +55,7 @@ auto iter_to_raw_ptr(thrust::detail::normal_iterator> iter } template -std::enable_if_t::value, void> +std::enable_if_t, void> device_isend_impl(raft::comms::comms_t const& comm, InputIterator input_first, size_t count, @@ -76,7 +76,7 @@ std::enable_if_t::value, void> device_isend_ raft::comms::request_t* request) { static_assert( - std::is_same::value_type, OutputValueType>::value); + std::is_same_v::value_type, OutputValueType>); comm.isend(iter_to_raw_ptr(input_first), count, dst, tag, request); } @@ -136,7 +136,7 @@ device_irecv_impl(raft::comms::comms_t const& comm, { static_assert( - std::is_same::value_type>::value); + std::is_same_v::value_type>); comm.irecv(iter_to_raw_ptr(output_first), count, src, tag, request); } @@ -200,7 +200,7 @@ device_sendrecv_impl(raft::comms::comms_t const& comm, { 
using value_type = typename std::iterator_traits::value_type; static_assert( - std::is_same::value_type, value_type>::value); + std::is_same_v::value_type, value_type>); comm.device_sendrecv(iter_to_raw_ptr(input_first), tx_count, dst, @@ -286,7 +286,7 @@ device_multicast_sendrecv_impl(raft::comms::comms_t const& comm, { using value_type = typename std::iterator_traits::value_type; static_assert( - std::is_same::value_type, value_type>::value); + std::is_same_v::value_type, value_type>); comm.device_multicast_sendrecv(iter_to_raw_ptr(input_first), tx_counts, tx_offsets, @@ -379,8 +379,8 @@ device_bcast_impl(raft::comms::comms_t const& comm, int root, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.bcast( iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, root, stream_view.value()); } @@ -440,8 +440,8 @@ device_allreduce_impl(raft::comms::comms_t const& comm, raft::comms::op_t op, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.allreduce( iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, op, stream_view.value()); } @@ -503,8 +503,8 @@ device_reduce_impl(raft::comms::comms_t const& comm, int root, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.reduce(iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, @@ -548,6 +548,62 @@ struct device_reduce_tuple_iterator_element_impl +std::enable_if_t::value, void> +device_allgather_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) +{ + // no-op +} + +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allgather_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) +{ + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); + comm.allgather( + iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), sendcount, stream_view.value()); +} + +template +struct device_allgather_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) const + { + device_allgather_impl(comm, + thrust::get(input_first.get_iterator_tuple()), + thrust::get(output_first.get_iterator_tuple()), + sendcount, + stream_view); + device_allgather_tuple_iterator_element_impl().run( + comm, input_first, output_first, sendcount, stream_view); + } +}; + +template +struct device_allgather_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) const + { + } +}; + template std::enable_if_t::value, void> device_allgatherv_impl(raft::comms::comms_t const& comm, @@ -571,8 +627,8 @@ device_allgatherv_impl(raft::comms::comms_t const& comm, std::vector const& displacements, 
rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.allgatherv(iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), recvcounts.data(), @@ -639,8 +695,8 @@ device_gatherv_impl(raft::comms::comms_t const& comm, int root, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.gatherv(iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), sendcount, @@ -1000,6 +1056,44 @@ device_reduce(raft::comms::comms_t const& comm, .run(comm, input_first, output_first, count, op, root, stream_view); } +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allgather(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) +{ + detail::device_allgather_impl(comm, input_first, output_first, sendcount, stream_view); +} + +template +std::enable_if_t< + is_thrust_tuple_of_arithmetic::value_type>::value && + is_thrust_tuple::value_type>::value, + void> +device_allgather(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) +{ + static_assert( + thrust::tuple_size::value_type>::value == + thrust::tuple_size::value_type>::value); + + size_t constexpr tuple_size = + thrust::tuple_size::value_type>::value; + + detail::device_allgather_tuple_iterator_element_impl() + .run(comm, input_first, output_first, sendcount, stream_view); +} + template std::enable_if_t< std::is_arithmetic::value_type>::value, diff --git a/cpp/include/cugraph/utilities/misc_utils.cuh b/cpp/include/cugraph/utilities/misc_utils.cuh index 633dabe5b4..91a349007d 100644 --- a/cpp/include/cugraph/utilities/misc_utils.cuh +++ b/cpp/include/cugraph/utilities/misc_utils.cuh @@ -81,7 +81,7 @@ std::tuple, std::vector> compute_offset_aligned_ return std::make_tuple(h_chunk_offsets, h_element_offsets); } else { - return std::make_tuple(std::vector{{0, offsets.size() - 1}}, + return std::make_tuple(std::vector{{0, static_cast(offsets.size() - 1)}}, std::vector{{0, num_elements}}); } } diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index 3cbd35b4bc..98fa2cb170 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -50,6 +51,8 @@ namespace cugraph { namespace detail { +constexpr size_t cache_line_size = 128; + template struct compute_group_id_count_pair_t { GroupIdIterator group_id_first{}; @@ -76,6 +79,7 @@ inline std::tuple, std::vector> compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const& comm, rmm::device_uvector const& d_tx_value_counts, + bool drop_empty_ranks, rmm::cuda_stream_view stream_view) { auto const comm_size = comm.get_size(); @@ -111,28 +115,30 @@ compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const& comm, std::partial_sum(tx_counts.begin(), tx_counts.end() - 1, tx_offsets.begin() + 1); std::partial_sum(rx_counts.begin(), rx_counts.end() - 1, rx_offsets.begin() + 1); - int num_tx_dst_ranks{0}; - int num_rx_src_ranks{0}; - for (int i = 0; i < comm_size; ++i) 
{ - if (tx_counts[i] != 0) { - tx_counts[num_tx_dst_ranks] = tx_counts[i]; - tx_offsets[num_tx_dst_ranks] = tx_offsets[i]; - tx_dst_ranks[num_tx_dst_ranks] = tx_dst_ranks[i]; - ++num_tx_dst_ranks; - } - if (rx_counts[i] != 0) { - rx_counts[num_rx_src_ranks] = rx_counts[i]; - rx_offsets[num_rx_src_ranks] = rx_offsets[i]; - rx_src_ranks[num_rx_src_ranks] = rx_src_ranks[i]; - ++num_rx_src_ranks; + if (drop_empty_ranks) { + int num_tx_dst_ranks{0}; + int num_rx_src_ranks{0}; + for (int i = 0; i < comm_size; ++i) { + if (tx_counts[i] != 0) { + tx_counts[num_tx_dst_ranks] = tx_counts[i]; + tx_offsets[num_tx_dst_ranks] = tx_offsets[i]; + tx_dst_ranks[num_tx_dst_ranks] = tx_dst_ranks[i]; + ++num_tx_dst_ranks; + } + if (rx_counts[i] != 0) { + rx_counts[num_rx_src_ranks] = rx_counts[i]; + rx_offsets[num_rx_src_ranks] = rx_offsets[i]; + rx_src_ranks[num_rx_src_ranks] = rx_src_ranks[i]; + ++num_rx_src_ranks; + } } + tx_counts.resize(num_tx_dst_ranks); + tx_offsets.resize(num_tx_dst_ranks); + tx_dst_ranks.resize(num_tx_dst_ranks); + rx_counts.resize(num_rx_src_ranks); + rx_offsets.resize(num_rx_src_ranks); + rx_src_ranks.resize(num_rx_src_ranks); } - tx_counts.resize(num_tx_dst_ranks); - tx_offsets.resize(num_tx_dst_ranks); - tx_dst_ranks.resize(num_tx_dst_ranks); - rx_counts.resize(num_rx_src_ranks); - rx_offsets.resize(num_rx_src_ranks); - rx_src_ranks.resize(num_rx_src_ranks); return std::make_tuple(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks); } @@ -823,6 +829,8 @@ auto shuffle_values(raft::comms::comms_t const& comm, std::vector const& tx_value_counts, rmm::cuda_stream_view stream_view) { + using value_t = typename thrust::iterator_traits::value_type; + auto const comm_size = comm.get_size(); rmm::device_uvector d_tx_value_counts(comm_size, stream_view); @@ -836,11 +844,10 @@ auto shuffle_values(raft::comms::comms_t const& comm, std::vector rx_offsets{}; std::vector rx_src_ranks{}; std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = - detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, true, stream_view); - auto rx_value_buffer = - allocate_dataframe_buffer::value_type>( - rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); + auto rx_value_buffer = allocate_dataframe_buffer( + rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); // (if num_tx_dst_ranks == num_rx_src_ranks == comm_size). device_multicast_sendrecv(comm, @@ -866,6 +873,236 @@ auto shuffle_values(raft::comms::comms_t const& comm, return std::make_tuple(std::move(rx_value_buffer), rx_counts); } +// Add gaps in the receive buffer to enforce that the sent data offset and the received data offset +// have the same alignment for every rank. 
This is faster assuming that @p alignment ensures cache +// line alignment in both send & receive buffer (tested with NCCL 2.23.4) +template +auto shuffle_values( + raft::comms::comms_t const& comm, + TxValueIterator tx_value_first, + std::vector const& tx_value_counts, + size_t alignment, // # elements + std::optional::value_type> fill_value, + rmm::cuda_stream_view stream_view) +{ + using value_t = typename thrust::iterator_traits::value_type; + + auto const comm_size = comm.get_size(); + + std::vector tx_value_displacements(tx_value_counts.size()); + std::exclusive_scan( + tx_value_counts.begin(), tx_value_counts.end(), tx_value_displacements.begin(), size_t{0}); + + std::vector tx_unaligned_counts(comm_size); + std::vector tx_displacements(comm_size); + std::vector tx_aligned_counts(comm_size); + std::vector tx_aligned_displacements(comm_size); + std::vector rx_unaligned_counts(comm_size); + std::vector rx_displacements(comm_size); + std::vector rx_aligned_counts(comm_size); + std::vector rx_aligned_displacements(comm_size); + std::vector tx_ranks(comm_size); + std::iota(tx_ranks.begin(), tx_ranks.end(), int{0}); + auto rx_ranks = tx_ranks; + for (size_t i = 0; i < tx_value_counts.size(); ++i) { + tx_unaligned_counts[i] = 0; + if (tx_value_displacements[i] % alignment != 0) { + tx_unaligned_counts[i] = + std::min(alignment - (tx_value_displacements[i] % alignment), tx_value_counts[i]); + } + tx_displacements[i] = tx_value_displacements[i]; + tx_aligned_counts[i] = tx_value_counts[i] - tx_unaligned_counts[i]; + tx_aligned_displacements[i] = tx_value_displacements[i] + tx_unaligned_counts[i]; + } + + rmm::device_uvector d_tx_unaligned_counts(tx_unaligned_counts.size(), stream_view); + rmm::device_uvector d_tx_aligned_counts(tx_aligned_counts.size(), stream_view); + rmm::device_uvector d_rx_unaligned_counts(rx_unaligned_counts.size(), stream_view); + rmm::device_uvector d_rx_aligned_counts(rx_aligned_counts.size(), stream_view); + raft::update_device(d_tx_unaligned_counts.data(), + tx_unaligned_counts.data(), + tx_unaligned_counts.size(), + stream_view); + raft::update_device( + d_tx_aligned_counts.data(), tx_aligned_counts.data(), tx_aligned_counts.size(), stream_view); + std::vector tx_counts(comm_size, size_t{1}); + std::vector tx_offsets(comm_size); + std::iota(tx_offsets.begin(), tx_offsets.end(), size_t{0}); + auto rx_counts = tx_counts; + auto rx_offsets = tx_offsets; + cugraph::device_multicast_sendrecv(comm, + d_tx_unaligned_counts.data(), + tx_counts, + tx_offsets, + tx_ranks, + d_rx_unaligned_counts.data(), + rx_counts, + rx_offsets, + rx_ranks, + stream_view); + cugraph::device_multicast_sendrecv(comm, + d_tx_aligned_counts.data(), + tx_counts, + tx_offsets, + tx_ranks, + d_rx_aligned_counts.data(), + rx_counts, + rx_offsets, + rx_ranks, + stream_view); + raft::update_host(rx_unaligned_counts.data(), + d_rx_unaligned_counts.data(), + d_rx_unaligned_counts.size(), + stream_view); + raft::update_host( + rx_aligned_counts.data(), d_rx_aligned_counts.data(), d_rx_aligned_counts.size(), stream_view); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view)); + size_t offset{0}; + for (size_t i = 0; i < rx_counts.size(); ++i) { + auto target_alignment = (alignment - rx_unaligned_counts[i]) % alignment; + auto cur_alignment = offset % alignment; + if (target_alignment >= cur_alignment) { + offset += target_alignment - cur_alignment; + } else { + offset += (target_alignment + alignment) - cur_alignment; + } + rx_displacements[i] = offset; + rx_aligned_displacements[i] = rx_displacements[i] 
+ rx_unaligned_counts[i];
+    offset = rx_aligned_displacements[i] + rx_aligned_counts[i];
+  }
+
+  auto rx_values = allocate_dataframe_buffer(
+    rx_aligned_displacements.back() + rx_aligned_counts.back(), stream_view);
+  if (fill_value) {
+    thrust::fill(rmm::exec_policy_nosync(stream_view),
+                 get_dataframe_buffer_begin(rx_values),
+                 get_dataframe_buffer_end(rx_values),
+                 *fill_value);
+  }
+  cugraph::device_multicast_sendrecv(comm,
+                                     tx_value_first,
+                                     tx_unaligned_counts,
+                                     tx_displacements,
+                                     tx_ranks,
+                                     get_dataframe_buffer_begin(rx_values),
+                                     rx_unaligned_counts,
+                                     rx_displacements,
+                                     rx_ranks,
+                                     stream_view);
+  cugraph::device_multicast_sendrecv(comm,
+                                     tx_value_first,
+                                     tx_aligned_counts,
+                                     tx_aligned_displacements,
+                                     tx_ranks,
+                                     get_dataframe_buffer_begin(rx_values),
+                                     rx_aligned_counts,
+                                     rx_aligned_displacements,
+                                     rx_ranks,
+                                     stream_view);
+
+  return std::make_tuple(std::move(rx_values),
+                         tx_unaligned_counts,
+                         tx_aligned_counts,
+                         tx_displacements,
+                         rx_unaligned_counts,
+                         rx_aligned_counts,
+                         rx_displacements);
+}
+
+// this uses less memory than calling shuffle_values then sort & unique but requires comm.get_size()
+// - 1 communication steps
+template
+auto shuffle_and_unique_segment_sorted_values(
+  raft::comms::comms_t const& comm,
+  TxValueIterator
+    segment_sorted_tx_value_first,  // sorted within each segment (segment sizes:
+                                    // tx_value_counts[i], where i = [0, comm_size); and better be
+                                    // unique to reduce communication volume
+  std::vector const& tx_value_counts,
+  rmm::cuda_stream_view stream_view)
+{
+  using value_t = typename thrust::iterator_traits::value_type;
+
+  auto const comm_rank = comm.get_rank();
+  auto const comm_size = comm.get_size();
+
+  auto sorted_unique_values = allocate_dataframe_buffer(0, stream_view);
+  if (comm_size == 1) {
+    resize_dataframe_buffer(sorted_unique_values, tx_value_counts[comm_rank], stream_view);
+    thrust::copy(rmm::exec_policy_nosync(stream_view),
+                 segment_sorted_tx_value_first,
+                 segment_sorted_tx_value_first + tx_value_counts[comm_rank],
+                 get_dataframe_buffer_begin(sorted_unique_values));
+    resize_dataframe_buffer(
+      sorted_unique_values,
+      thrust::distance(get_dataframe_buffer_begin(sorted_unique_values),
+                       thrust::unique(rmm::exec_policy_nosync(stream_view),
+                                      get_dataframe_buffer_begin(sorted_unique_values),
+                                      get_dataframe_buffer_end(sorted_unique_values))),
+      stream_view);
+  } else {
+    rmm::device_uvector d_tx_value_counts(comm_size, stream_view);
+    raft::update_device(
+      d_tx_value_counts.data(), tx_value_counts.data(), comm_size, stream_view.value());
+
+    std::vector tx_counts{};
+    std::vector tx_offsets{};
+    std::vector rx_counts{};
+    std::vector rx_offsets{};
+    std::tie(tx_counts, tx_offsets, std::ignore, rx_counts, rx_offsets, std::ignore) =
+      detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, false, stream_view);
+
+    d_tx_value_counts.resize(0, stream_view);
+    d_tx_value_counts.shrink_to_fit(stream_view);
+
+    for (int i = 1; i < comm_size; ++i) {
+      auto dst = (comm_rank + i) % comm_size;
+      auto src =
+        static_cast((static_cast(comm_rank) + static_cast(comm_size - i)) %
+                    static_cast(comm_size));
+      auto rx_sorted_values = allocate_dataframe_buffer(rx_counts[src], stream_view);
+      device_sendrecv(comm,
+                      segment_sorted_tx_value_first + tx_offsets[dst],
+                      tx_counts[dst],
+                      dst,
+                      get_dataframe_buffer_begin(rx_sorted_values),
+                      rx_counts[src],
+                      src,
+                      stream_view);
+      auto merged_sorted_values = allocate_dataframe_buffer(
+        (i == 1 ?
tx_counts[comm_rank] : size_dataframe_buffer(sorted_unique_values)) + + rx_counts[src], + stream_view); + if (i == 1) { + thrust::merge( + rmm::exec_policy_nosync(stream_view), + segment_sorted_tx_value_first + tx_offsets[comm_rank], + segment_sorted_tx_value_first + (tx_offsets[comm_rank] + tx_counts[comm_rank]), + get_dataframe_buffer_begin(rx_sorted_values), + get_dataframe_buffer_end(rx_sorted_values), + get_dataframe_buffer_begin(merged_sorted_values)); + } else { + thrust::merge(rmm::exec_policy_nosync(stream_view), + get_dataframe_buffer_begin(sorted_unique_values), + get_dataframe_buffer_end(sorted_unique_values), + get_dataframe_buffer_begin(rx_sorted_values), + get_dataframe_buffer_end(rx_sorted_values), + get_dataframe_buffer_begin(merged_sorted_values)); + } + resize_dataframe_buffer( + merged_sorted_values, + thrust::distance(get_dataframe_buffer_begin(merged_sorted_values), + thrust::unique(rmm::exec_policy_nosync(stream_view), + get_dataframe_buffer_begin(merged_sorted_values), + get_dataframe_buffer_end(merged_sorted_values))), + stream_view); + sorted_unique_values = std::move(merged_sorted_values); + } + } + shrink_to_fit_dataframe_buffer(sorted_unique_values, stream_view); + return sorted_unique_values; +} + template auto groupby_gpu_id_and_shuffle_values(raft::comms::comms_t const& comm, ValueIterator tx_value_first /* [INOUT */, @@ -889,7 +1126,7 @@ auto groupby_gpu_id_and_shuffle_values(raft::comms::comms_t const& comm, std::vector rx_offsets{}; std::vector rx_src_ranks{}; std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = - detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, true, stream_view); auto rx_value_buffer = allocate_dataframe_buffer::value_type>( @@ -943,7 +1180,7 @@ auto groupby_gpu_id_and_shuffle_kv_pairs(raft::comms::comms_t const& comm, std::vector rx_offsets{}; std::vector rx_src_ranks{}; std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = - detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, true, stream_view); rmm::device_uvector::value_type> rx_keys( rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); diff --git a/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp b/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp index 2c36ed3335..29b9d132ef 100644 --- a/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp +++ b/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp @@ -64,6 +64,18 @@ size_t sum_thrust_tuple_element_sizes(std::index_sequence) return (... 
+ sizeof(typename thrust::tuple_element::type)); } +template +size_t min_thrust_tuple_element_sizes(std::index_sequence) +{ + return std::min(sizeof(typename thrust::tuple_element::type)...); +} + +template +size_t max_thrust_tuple_element_sizes(std::index_sequence) +{ + return std::max(sizeof(typename thrust::tuple_element::type)...); +} + template auto thrust_tuple_to_std_tuple(TupleType tup, std::index_sequence) { @@ -181,6 +193,20 @@ constexpr size_t sum_thrust_tuple_element_sizes() std::make_index_sequence::value>()); } +template +constexpr size_t min_thrust_tuple_element_sizes() +{ + return detail::min_thrust_tuple_element_sizes( + std::make_index_sequence::value>()); +} + +template +constexpr size_t max_thrust_tuple_element_sizes() +{ + return detail::max_thrust_tuple_element_sizes( + std::make_index_sequence::value>()); +} + template auto thrust_tuple_to_std_tuple(TupleType tup) { diff --git a/cpp/src/centrality/betweenness_centrality_impl.cuh b/cpp/src/centrality/betweenness_centrality_impl.cuh index 8ae49ed207..88ef3987a0 100644 --- a/cpp/src/centrality/betweenness_centrality_impl.cuh +++ b/cpp/src/centrality/betweenness_centrality_impl.cuh @@ -23,7 +23,7 @@ #include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh" #include "prims/transform_e.cuh" #include "prims/transform_reduce_v.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -133,15 +133,15 @@ std::tuple, rmm::device_uvector> brandes_b update_edge_src_property(handle, graph_view, sigmas.begin(), src_sigmas.mutable_view()); update_edge_dst_property(handle, graph_view, distances.begin(), dst_distances.mutable_view()); - auto [new_frontier, new_sigma] = - transform_reduce_v_frontier_outgoing_e_by_dst(handle, - graph_view, - vertex_frontier.bucket(bucket_idx_cur), - src_sigmas.view(), - dst_distances.view(), - cugraph::edge_dummy_property_t{}.view(), - brandes_e_op_t{}, - reduce_op::plus()); + auto [new_frontier, new_sigma] = cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + src_sigmas.view(), + dst_distances.view(), + cugraph::edge_dummy_property_t{}.view(), + brandes_e_op_t{}, + reduce_op::plus()); update_v_frontier(handle, graph_view, diff --git a/cpp/src/community/approx_weighted_matching_impl.cuh b/cpp/src/community/approx_weighted_matching_impl.cuh index a0ccfa52ff..869ed4e7ae 100644 --- a/cpp/src/community/approx_weighted_matching_impl.cuh +++ b/cpp/src/community/approx_weighted_matching_impl.cuh @@ -243,11 +243,12 @@ std::tuple, weight_t> approximate_weighted_matchin major_comm_size, minor_comm_size}; - candidates_of_candidates = cugraph::collect_values_for_keys(handle, + candidates_of_candidates = cugraph::collect_values_for_keys(comm, target_candidate_map.view(), candidates.begin(), candidates.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); } else { candidates_of_candidates.resize(candidates.size(), handle.get_stream()); diff --git a/cpp/src/community/detail/common_methods.cuh b/cpp/src/community/detail/common_methods.cuh index e17abdb370..18fb3fdb25 100644 --- a/cpp/src/community/detail/common_methods.cuh +++ b/cpp/src/community/detail/common_methods.cuh @@ -289,11 +289,12 @@ rmm::device_uvector update_clustering_by_delta_modularity( invalid_vertex_id::value, std::numeric_limits::max(), 
handle.get_stream()); - vertex_cluster_weights_v = cugraph::collect_values_for_keys(handle, + vertex_cluster_weights_v = cugraph::collect_values_for_keys(comm, cluster_key_weight_map.view(), next_clusters_v.begin(), next_clusters_v.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); src_cluster_weights = edge_src_property_t, weight_t>(handle, diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh index 62b66ed5f4..d69c1463ed 100644 --- a/cpp/src/community/detail/refine_impl.cuh +++ b/cpp/src/community/detail/refine_impl.cuh @@ -181,11 +181,12 @@ refine_clustering( comm_size, major_comm_size, minor_comm_size}; vertex_louvain_cluster_weights = - cugraph::collect_values_for_keys(handle, + cugraph::collect_values_for_keys(comm, cluster_key_weight_map.view(), louvain_assignment_of_vertices.begin(), louvain_assignment_of_vertices.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); } else { vertex_louvain_cluster_weights.resize(louvain_assignment_of_vertices.size(), @@ -473,11 +474,12 @@ refine_clustering( // comm_size, major_comm_size, minor_comm_size}; louvain_of_leiden_keys_used_in_edge_reduction = - cugraph::collect_values_for_keys(handle, + cugraph::collect_values_for_keys(comm, leiden_to_louvain_map.view(), leiden_keys_used_in_edge_reduction.begin(), leiden_keys_used_in_edge_reduction.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); } else { louvain_of_leiden_keys_used_in_edge_reduction.resize( leiden_keys_used_in_edge_reduction.size(), handle.get_stream()); @@ -864,11 +866,12 @@ refine_clustering( // comm_size, major_comm_size, minor_comm_size}; lovain_of_leiden_cluster_keys = - cugraph::collect_values_for_keys(handle, + cugraph::collect_values_for_keys(comm, leiden_to_louvain_map.view(), leiden_keys_to_read_louvain.begin(), leiden_keys_to_read_louvain.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); } else { lovain_of_leiden_cluster_keys.resize(leiden_keys_to_read_louvain.size(), handle.get_stream()); diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh index 468f4f7280..219bc3c4d1 100644 --- a/cpp/src/components/weakly_connected_components_impl.cuh +++ b/cpp/src/components/weakly_connected_components_impl.cuh @@ -16,7 +16,7 @@ #pragma once #include "prims/fill_edge_src_dst_property.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -550,24 +550,25 @@ void weakly_connected_components_impl(raft::handle_t const& handle, auto old_num_edge_inserts = num_edge_inserts.value(handle.get_stream()); resize_dataframe_buffer(edge_buffer, old_num_edge_inserts + max_pushes, handle.get_stream()); - auto new_frontier_tagged_vertex_buffer = transform_reduce_v_frontier_outgoing_e_by_dst( - handle, - level_graph_view, - vertex_frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - e_op_t{ - GraphViewType::is_multi_gpu - ? 
detail::edge_partition_endpoint_property_device_view_t( - edge_dst_components.mutable_view()) - : detail::edge_partition_endpoint_property_device_view_t( - detail::edge_minor_property_view_t(level_components, - vertex_t{0})), - level_graph_view.local_edge_partition_dst_range_first(), - get_dataframe_buffer_begin(edge_buffer), - num_edge_inserts.data()}, - reduce_op::null()); + auto new_frontier_tagged_vertex_buffer = + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + handle, + level_graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op_t{ + GraphViewType::is_multi_gpu + ? detail::edge_partition_endpoint_property_device_view_t( + edge_dst_components.mutable_view()) + : detail::edge_partition_endpoint_property_device_view_t( + detail::edge_minor_property_view_t(level_components, + vertex_t{0})), + level_graph_view.local_edge_partition_dst_range_first(), + get_dataframe_buffer_begin(edge_buffer), + num_edge_inserts.data()}, + reduce_op::null()); update_v_frontier(handle, level_graph_view, diff --git a/cpp/src/cores/core_number_impl.cuh b/cpp/src/cores/core_number_impl.cuh index d807ccac5a..a2b6f6430f 100644 --- a/cpp/src/cores/core_number_impl.cuh +++ b/cpp/src/cores/core_number_impl.cuh @@ -16,7 +16,7 @@ #pragma once #include "prims/reduce_v.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -222,14 +222,15 @@ void core_number(raft::handle_t const& handle, if (graph_view.is_symmetric() || ((degree_type == k_core_degree_type_t::IN) || (degree_type == k_core_degree_type_t::INOUT))) { auto [new_frontier_vertex_buffer, delta_buffer] = - transform_reduce_v_frontier_outgoing_e_by_dst(handle, - graph_view, - vertex_frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - dst_core_numbers.view(), - edge_dummy_property_t{}.view(), - e_op_t{k, delta}, - reduce_op::plus()); + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + dst_core_numbers.view(), + edge_dummy_property_t{}.view(), + e_op_t{k, delta}, + reduce_op::plus()); update_v_frontier( handle, diff --git a/cpp/src/lookup/lookup_src_dst_impl.cuh b/cpp/src/lookup/lookup_src_dst_impl.cuh index 1c8c39fd6d..45bbf870d8 100644 --- a/cpp/src/lookup/lookup_src_dst_impl.cuh +++ b/cpp/src/lookup/lookup_src_dst_impl.cuh @@ -115,12 +115,13 @@ struct lookup_container_t::lookup_con auto const minor_comm_size = minor_comm.get_size(); value_buffer = cugraph::collect_values_for_keys( - handle, + comm, kv_store_object->view(), edge_ids_to_lookup.begin(), edge_ids_to_lookup.end(), cugraph::detail::compute_gpu_id_from_ext_edge_id_t{ - comm_size, major_comm_size, minor_comm_size}); + comm_size, major_comm_size, minor_comm_size}, + handle.get_stream()); } else { cugraph::resize_dataframe_buffer( value_buffer, edge_ids_to_lookup.size(), handle.get_stream()); diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 177c79ace8..2b89d214fd 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -15,9 +15,11 @@ */ #pragma once +#include 
"prims/detail/multi_stream_utils.cuh" #include "prims/detail/optional_dataframe_buffer.hpp" #include "prims/detail/prim_functors.cuh" #include "prims/property_op_utils.cuh" +#include "prims/vertex_frontier.cuh" #include #include @@ -72,9 +74,9 @@ __device__ void push_buffer_element(BufferKeyOutputIterator buffer_key_output_fi e_op_result_t e_op_result) { using output_key_t = - typename optional_dataframe_buffer_value_type_t::value; + typename optional_dataframe_buffer_iterator_value_type_t::value; using output_value_t = - typename optional_dataframe_buffer_value_type_t::value; + typename optional_dataframe_buffer_iterator_value_type_t::value; assert(e_op_result.has_value()); @@ -118,7 +120,6 @@ __device__ void warp_push_buffer_elements( } template buffer_idx(*buffer_idx_ptr); - int32_t constexpr shared_array_size = max_one_e_per_frontier_key - ? int32_t{1} /* dummy */ - : extract_transform_v_frontier_e_kernel_block_size; - __shared__ std::conditional_t - warp_local_degree_inclusive_sums[shared_array_size]; - __shared__ std::conditional_t - warp_key_local_edge_offsets[shared_array_size]; + __shared__ edge_t + warp_local_degree_inclusive_sums[extract_transform_v_frontier_e_kernel_block_size]; + __shared__ edge_t warp_key_local_edge_offsets[extract_transform_v_frontier_e_kernel_block_size]; using WarpScan = cub::WarpScan; - __shared__ std:: - conditional_t - temp_storage; + __shared__ typename WarpScan::TempStorage temp_storage; auto indices = edge_partition.indices(); @@ -216,98 +211,74 @@ __global__ static void extract_transform_v_frontier_e_hypersparse_or_low_degree( } } - if constexpr (max_one_e_per_frontier_key) { - // each thread processes one frontier key, exits if any edge returns a valid output + auto min_key_idx = static_cast(idx - (idx % raft::warp_size())); // inclusive + auto max_key_idx = + static_cast(std::min(static_cast(min_key_idx) + raft::warp_size(), + static_cast(num_keys))); // exclusive - e_op_result_t e_op_result{thrust::nullopt}; - auto key = *(key_first + idx); + // update warp_local_degree_inclusive_sums & warp_key_local_edge_offsets - if (edge_partition_e_mask) { - for (edge_t i = 0; i < local_degree; ++i) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - e_op_result = call_e_op(key, edge_offset + i); - if (e_op_result) { break; } - } - } - } else { - for (edge_t i = 0; i < local_degree; ++i) { - e_op_result = call_e_op(key, edge_offset + i); - if (e_op_result) { break; } - } - } - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } else { - auto min_key_idx = static_cast(idx - (idx % raft::warp_size())); // inclusive - auto max_key_idx = - static_cast(std::min(static_cast(min_key_idx) + raft::warp_size(), - static_cast(num_keys))); // exclusive - - // update warp_local_degree_inclusive_sums & warp_key_local_edge_offsets - - warp_key_local_edge_offsets[threadIdx.x] = edge_offset; - WarpScan(temp_storage) - .InclusiveSum(local_degree, warp_local_degree_inclusive_sums[threadIdx.x]); - __syncwarp(); + warp_key_local_edge_offsets[threadIdx.x] = edge_offset; + WarpScan(temp_storage) + .InclusiveSum(local_degree, warp_local_degree_inclusive_sums[threadIdx.x]); + __syncwarp(); - // all the threads in a warp collectively process local edges for the keys in [key_first + - // min_key_idx, key_first + max_key_idx) + // all the threads in a warp collectively process local edges for the keys in [key_first + + // min_key_idx, key_first + max_key_idx) - auto num_edges_this_warp = 
warp_local_degree_inclusive_sums[warp_id * raft::warp_size() + - (max_key_idx - min_key_idx) - 1]; - auto rounded_up_num_edges_this_warp = - ((static_cast(num_edges_this_warp) + (raft::warp_size() - 1)) / raft::warp_size()) * - raft::warp_size(); + auto num_edges_this_warp = warp_local_degree_inclusive_sums[warp_id * raft::warp_size() + + (max_key_idx - min_key_idx) - 1]; + auto rounded_up_num_edges_this_warp = + ((static_cast(num_edges_this_warp) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); - auto this_warp_inclusive_sum_first = - warp_local_degree_inclusive_sums + warp_id * raft::warp_size(); - auto this_warp_inclusive_sum_last = - this_warp_inclusive_sum_first + (max_key_idx - min_key_idx); + auto this_warp_inclusive_sum_first = + warp_local_degree_inclusive_sums + warp_id * raft::warp_size(); + auto this_warp_inclusive_sum_last = this_warp_inclusive_sum_first + (max_key_idx - min_key_idx); - if (edge_partition_e_mask) { - for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { - e_op_result_t e_op_result{thrust::nullopt}; - - if (i < static_cast(num_edges_this_warp)) { - auto key_idx_this_warp = static_cast(thrust::distance( - this_warp_inclusive_sum_first, - thrust::upper_bound( - thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); - auto local_edge_offset = - warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + - static_cast(i - ((key_idx_this_warp == 0) ? edge_t{0} - : *(this_warp_inclusive_sum_first + - (key_idx_this_warp - 1)))); - if ((*edge_partition_e_mask).get(local_edge_offset)) { - auto key = *(key_first + (min_key_idx + key_idx_this_warp)); - e_op_result = call_e_op(key, local_edge_offset); - } - } + if (edge_partition_e_mask) { + for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { + e_op_result_t e_op_result{thrust::nullopt}; - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } - } else { - for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { - e_op_result_t e_op_result{thrust::nullopt}; - - if (i < static_cast(num_edges_this_warp)) { - auto key_idx_this_warp = static_cast(thrust::distance( - this_warp_inclusive_sum_first, - thrust::upper_bound( - thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); - auto local_edge_offset = - warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + - static_cast(i - ((key_idx_this_warp == 0) ? edge_t{0} - : *(this_warp_inclusive_sum_first + - (key_idx_this_warp - 1)))); + if (i < static_cast(num_edges_this_warp)) { + auto key_idx_this_warp = static_cast(thrust::distance( + this_warp_inclusive_sum_first, + thrust::upper_bound( + thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); + auto local_edge_offset = + warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + + static_cast(i - ((key_idx_this_warp == 0) ? 
edge_t{0} + : *(this_warp_inclusive_sum_first + + (key_idx_this_warp - 1)))); + if ((*edge_partition_e_mask).get(local_edge_offset)) { auto key = *(key_first + (min_key_idx + key_idx_this_warp)); e_op_result = call_e_op(key, local_edge_offset); } + } - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + } + } else { + for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { + e_op_result_t e_op_result{thrust::nullopt}; + + if (i < static_cast(num_edges_this_warp)) { + auto key_idx_this_warp = static_cast(thrust::distance( + this_warp_inclusive_sum_first, + thrust::upper_bound( + thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); + auto local_edge_offset = + warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + + static_cast(i - ((key_idx_this_warp == 0) ? edge_t{0} + : *(this_warp_inclusive_sum_first + + (key_idx_this_warp - 1)))); + auto key = *(key_first + (min_key_idx + key_idx_this_warp)); + e_op_result = call_e_op(key, local_edge_offset); } + + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } } @@ -315,8 +286,7 @@ __global__ static void extract_transform_v_frontier_e_hypersparse_or_low_degree( } } -template buffer_idx(*buffer_idx_ptr); - using WarpReduce = cub::WarpReduce; - __shared__ std::conditional_t - temp_storage[max_one_e_per_frontier_key - ? (extract_transform_v_frontier_e_kernel_block_size / raft::warp_size()) - : int32_t{1} /* dummy */]; - while (idx < static_cast(thrust::distance(key_first, key_last))) { auto key = *(key_first + idx); auto major = thrust_tuple_get_or_identity(key); auto major_offset = edge_partition.major_offset_from_major_nocheck(major); vertex_t const* indices{nullptr}; edge_t local_edge_offset{}; - edge_t local_out_degree{}; - thrust::tie(indices, local_edge_offset, local_out_degree) = + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = edge_partition.local_edges(major_offset); - auto rounded_up_local_out_degree = - ((static_cast(local_out_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * + auto rounded_up_local_degree = + ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * raft::warp_size(); auto call_e_op = call_e_op_t(local_out_degree)) && + if ((i < static_cast(local_degree)) && ((*edge_partition_e_mask).get(local_edge_offset + i))) { e_op_result = call_e_op(i); } - if constexpr (max_one_e_per_frontier_key) { - auto first_valid_lane_id = - WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) - .Reduce(e_op_result ? 
lane_id : raft::warp_size(), cub::Min()); - first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); - if (lane_id == first_valid_lane_id) { - auto push_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); - push_buffer_element( - buffer_key_output_first, buffer_value_output_first, push_idx, e_op_result); - } - if (first_valid_lane_id != raft::warp_size()) { break; } - } else { - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } } else { - for (size_t i = lane_id; i < rounded_up_local_out_degree; i += raft::warp_size()) { + for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { e_op_result_t e_op_result{thrust::nullopt}; - if (i < static_cast(local_out_degree)) { e_op_result = call_e_op(i); } - - if constexpr (max_one_e_per_frontier_key) { - auto first_valid_lane_id = - WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) - .Reduce(e_op_result ? lane_id : raft::warp_size(), cub::Min()); - first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); - if (lane_id == first_valid_lane_id) { - auto push_buffer_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); - push_buffer_element( - buffer_key_output_first, buffer_value_output_first, push_buffer_idx, e_op_result); - } - if (first_valid_lane_id != raft::warp_size()) { break; } - } else { - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } + if (i < static_cast(local_degree)) { e_op_result = call_e_op(i); } + + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } } @@ -446,8 +382,7 @@ __global__ static void extract_transform_v_frontier_e_mid_degree( } } -template edge_partition, KeyIterator key_first, - KeyIterator key_last, + raft::device_span key_local_degree_offsets, EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, @@ -482,123 +417,234 @@ __global__ static void extract_transform_v_frontier_e_high_degree( typename EdgePartitionEdgeValueInputWrapper::value_type, EdgeOp>::type; - auto const warp_id = threadIdx.x / raft::warp_size(); + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; auto const lane_id = threadIdx.x % raft::warp_size(); - auto idx = static_cast(blockIdx.x); - - cuda::atomic_ref buffer_idx(*buffer_idx_ptr); - - using BlockReduce = cub::BlockReduce; - __shared__ std::conditional_t - temp_storage; - __shared__ int32_t output_thread_id; - - while (idx < static_cast(thrust::distance(key_first, key_last))) { - auto key = *(key_first + idx); - auto major = thrust_tuple_get_or_identity(key); - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - vertex_t const* indices{nullptr}; - edge_t local_edge_offset{}; - edge_t local_out_degree{}; - thrust::tie(indices, local_edge_offset, local_out_degree) = - edge_partition.local_edges(major_offset); - auto rounded_up_local_out_degree = ((static_cast(local_out_degree) + - (extract_transform_v_frontier_e_kernel_block_size - 1)) / - extract_transform_v_frontier_e_kernel_block_size) * - extract_transform_v_frontier_e_kernel_block_size; - auto call_e_op = call_e_op_t{edge_partition, - 
edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - key, - major_offset, - indices, - local_edge_offset}; + auto idx = static_cast(tid); - if (edge_partition_e_mask) { - for (size_t i = threadIdx.x; i < rounded_up_local_out_degree; i += blockDim.x) { - e_op_result_t e_op_result{thrust::nullopt}; - if ((i < static_cast(local_out_degree)) && - ((*edge_partition_e_mask).get(local_edge_offset + i))) { - e_op_result = call_e_op(i); - } + cuda::atomic_ref buffer_idx(*buffer_idx_ptr); - if constexpr (max_one_e_per_frontier_key) { - auto first_valid_thread_id = - BlockReduce(temp_storage) - .Reduce(e_op_result ? threadIdx.x : extract_transform_v_frontier_e_kernel_block_size, - cub::Min()); - if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } - __syncthreads(); - if (threadIdx.x == output_thread_id) { - auto push_buffer_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); - push_buffer_element( - buffer_key_output_first, buffer_value_output_first, push_buffer_idx, e_op_result); - } - if (output_thread_id != extract_transform_v_frontier_e_kernel_block_size) { break; } - } else { - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } - } - } else { - for (size_t i = threadIdx.x; i < rounded_up_local_out_degree; i += blockDim.x) { - e_op_result_t e_op_result{thrust::nullopt}; - if (i < static_cast(local_out_degree)) { e_op_result = call_e_op(i); } - - if constexpr (max_one_e_per_frontier_key) { - auto first_valid_thread_id = - BlockReduce(temp_storage) - .Reduce(e_op_result ? threadIdx.x : extract_transform_v_frontier_e_kernel_block_size, - cub::Min()); - if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } - __syncthreads(); - if (threadIdx.x == output_thread_id) { - auto push_buffer_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); - push_buffer_element( - buffer_key_output_first, buffer_value_output_first, push_buffer_idx, e_op_result); - } - if (output_thread_id != extract_transform_v_frontier_e_kernel_block_size) { break; } - } else { - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + auto num_edges = *(key_local_degree_offsets.rbegin()); + size_t rounded_up_num_edges = + ((static_cast(num_edges) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); + while (idx < rounded_up_num_edges) { + e_op_result_t e_op_result{thrust::nullopt}; + if (idx < num_edges) { + auto key_idx = thrust::distance( + key_local_degree_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, key_local_degree_offsets.begin() + 1, key_local_degree_offsets.end(), idx)); + auto key = *(key_first + key_idx); + auto major = thrust_tuple_get_or_identity(key); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t local_edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = + edge_partition.local_edges(major_offset); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + local_edge_offset}; + + auto e_idx = static_cast(idx - key_local_degree_offsets[key_idx]); + if (edge_partition_e_mask) { + if ((*edge_partition_e_mask).get(local_edge_offset + e_idx)) { + e_op_result = call_e_op(e_idx); } + } else { + e_op_result = 
call_e_op(e_idx); } } + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - idx += gridDim.x; + idx += gridDim.x * blockDim.x; + } +} + +template +void extract_transform_v_frontier_e_edge_partition( + raft::handle_t const& handle, + edge_partition_device_view_t edge_partition, + InputKeyIterator edge_partition_frontier_key_first, + InputKeyIterator edge_partition_frontier_key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + OptionalOutputKeyIterator output_key_first, + OptionalOutputValueIterator output_value_first, + raft::device_span count /* size = 1 */, + EdgeOp e_op, + std::optional> high_segment_key_local_degree_offsets, + std::optional high_segment_edge_count, + std::optional> key_segment_offsets, + std::optional> const& edge_partition_stream_pool_indices) +{ + size_t stream_pool_size{0}; + if (edge_partition_stream_pool_indices) { + stream_pool_size = (*edge_partition_stream_pool_indices).size(); + } + if (key_segment_offsets) { + if (((*key_segment_offsets)[1] > 0) && ((*high_segment_edge_count) > 0)) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[0 % stream_pool_size]) + : handle.get_stream(); + + raft::grid_1d_thread_t update_grid((*high_segment_edge_count), + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_high_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first, + raft::device_span((*high_segment_key_local_degree_offsets).data(), + (*high_segment_key_local_degree_offsets).size()), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } + if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[1 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_warp_t update_grid((*key_segment_offsets)[2] - (*key_segment_offsets)[1], + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_mid_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first + (*key_segment_offsets)[1], + edge_partition_frontier_key_first + (*key_segment_offsets)[2], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } + if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2] > 0) { + auto exec_stream = edge_partition_stream_pool_indices + ? 
handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[2 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_thread_t update_grid((*key_segment_offsets)[3] - (*key_segment_offsets)[2], + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_hypersparse_or_low_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first + (*key_segment_offsets)[2], + edge_partition_frontier_key_first + (*key_segment_offsets)[3], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } + if (edge_partition.dcs_nzd_vertex_count() && + ((*key_segment_offsets)[4] - (*key_segment_offsets)[3] > 0)) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[3 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_thread_t update_grid((*key_segment_offsets)[4] - (*key_segment_offsets)[3], + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_hypersparse_or_low_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first + (*key_segment_offsets)[3], + edge_partition_frontier_key_first + (*key_segment_offsets)[4], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } + } else { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[0 % stream_pool_size]) + : handle.get_stream(); + + auto frontier_size = static_cast( + thrust::distance(edge_partition_frontier_key_first, edge_partition_frontier_key_last)); + if (frontier_size > 0) { + raft::grid_1d_thread_t update_grid(frontier_size, + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + + extract_transform_v_frontier_e_hypersparse_or_low_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first, + edge_partition_frontier_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } } } template -std::tuple< - decltype(allocate_optional_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{})), - decltype(allocate_optional_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> +std::tuple, + optional_dataframe_buffer_type_t> extract_transform_v_frontier_e(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& frontier, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, @@ -607,7 +653,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, { using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; - using key_t = typename VertexFrontierBucketType::key_type; + using key_t = typename KeyBucketType::key_type; using output_key_t = OutputKeyT; using output_value_t = OutputValueT; @@ -653,6 +699,9 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, thrust::optional, thrust::optional>>>); + constexpr bool try_bitmap 
= GraphViewType::is_multi_gpu && std::is_same_v && + KeyBucketType::is_sorted_unique; + if (do_expensive_check) { auto frontier_vertex_first = thrust_tuple_get_or_identity(frontier.begin()); @@ -673,10 +722,15 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, "Invalid input argument: frontier includes out-of-range keys."); } + [[maybe_unused]] constexpr auto max_segments = + detail::num_sparse_segments_per_vertex_partition + size_t{1}; + + // 1. pre-process frontier data + auto frontier_key_first = frontier.begin(); auto frontier_key_last = frontier.end(); auto frontier_keys = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - if constexpr (!VertexFrontierBucketType::is_sorted_unique) { + if constexpr (!KeyBucketType::is_sorted_unique) { resize_dataframe_buffer(frontier_keys, frontier.size(), handle.get_stream()); thrust::copy(handle.get_thrust_policy(), frontier_key_first, @@ -689,209 +743,708 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, frontier_key_last = get_dataframe_buffer_end(frontier_keys); } - // 1. fill the buffers + std::optional> key_segment_offsets{std::nullopt}; + { // drop zero degree vertices & compute key_segment_offsets + size_t partition_idx{0}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + partition_idx = static_cast(minor_comm.get_rank()); + } + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + if (segment_offsets) { + if (thrust::distance(frontier_key_first, frontier_key_last) > 0) { + key_segment_offsets = compute_key_segment_offsets( + frontier_key_first, + frontier_key_last, + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); + (*key_segment_offsets).back() = *((*key_segment_offsets).rbegin() + 1); + frontier_key_last = frontier_key_first + (*key_segment_offsets).back(); + } else { + key_segment_offsets = std::vector((*segment_offsets).size(), 0); + } + } + } + + // 2. compute local max_pushes - auto key_buffer = - allocate_optional_dataframe_buffer(size_t{0}, handle.get_stream()); - auto value_buffer = - allocate_optional_dataframe_buffer(size_t{0}, handle.get_stream()); - rmm::device_scalar buffer_idx(size_t{0}, handle.get_stream()); + size_t local_max_pushes{}; + { + size_t partition_idx{}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + partition_idx = static_cast(minor_comm_rank); + } + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto frontier_major_first = + thrust_tuple_get_or_identity(frontier_key_first); + auto frontier_major_last = + thrust_tuple_get_or_identity(frontier_key_last); + // for an edge-masked graph, we can pass edge mask to compute tighter bound (at the expense of + // additional computing) + local_max_pushes = edge_partition.compute_number_of_edges( + frontier_major_first, frontier_major_last, handle.get_stream()); + } + + // 3. 
communication over minor_comm std::vector local_frontier_sizes{}; + std::conditional_t, std::byte /* dummy */> + max_tmp_buffer_sizes{}; + std::conditional_t, std::byte /* dummy */> + tmp_buffer_size_per_loop_approximations{}; + std::conditional_t, std::byte /* dummy */> + local_frontier_range_firsts{}; + std::conditional_t, std::byte /* dummy */> + local_frontier_range_lasts{}; + std::optional>> key_segment_offset_vectors{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - local_frontier_sizes = host_scalar_allgather( - minor_comm, - static_cast(thrust::distance(frontier_key_first, frontier_key_last)), - handle.get_stream()); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + auto max_tmp_buffer_size = + static_cast(static_cast(handle.get_device_properties().totalGlobalMem) * 0.2); + size_t approx_tmp_buffer_size_per_loop{}; + { + size_t key_size{0}; + if constexpr (std::is_arithmetic_v) { + key_size = sizeof(key_t); + } else { + key_size = cugraph::sum_thrust_tuple_element_sizes(); + } + size_t output_key_size{0}; + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + output_key_size = sizeof(output_key_t); + } else { + output_key_size = cugraph::sum_thrust_tuple_element_sizes(); + } + } + size_t output_value_size{0}; + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + output_value_size = sizeof(output_value_t); + } else { + output_value_size = cugraph::sum_thrust_tuple_element_sizes(); + } + } + approx_tmp_buffer_size_per_loop = + static_cast(thrust::distance(frontier_key_first, frontier_key_last)) * key_size + + local_max_pushes * (output_key_size + output_value_size); + } + + size_t num_scalars = + 3; // local_frontier_size, max_tmp_buffer_size, approx_tmp_buffer_size_per_loop + if constexpr (try_bitmap) { + num_scalars += 2; // local_frontier_range_first, local_frontier_range_last + } + if (key_segment_offsets) { num_scalars += (*key_segment_offsets).size(); } + rmm::device_uvector d_aggregate_tmps(minor_comm_size * num_scalars, + handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + d_aggregate_tmps.begin() + num_scalars * minor_comm_rank, + d_aggregate_tmps.begin() + (num_scalars * minor_comm_rank + (try_bitmap ? 
5 : 3)), + [frontier_key_first, + max_tmp_buffer_size, + approx_tmp_buffer_size_per_loop, + v_list_size = static_cast(thrust::distance(frontier_key_first, frontier_key_last)), + vertex_partition_range_first = + graph_view.local_vertex_partition_range_first()] __device__(size_t i) { + if (i == 0) { + return v_list_size; + } else if (i == 1) { + return max_tmp_buffer_size; + } else if (i == 2) { + return approx_tmp_buffer_size_per_loop; + } + if constexpr (try_bitmap) { + if (i == 3) { + vertex_t first{}; + if (v_list_size > 0) { + first = *frontier_key_first; + } else { + first = vertex_partition_range_first; + } + assert(static_cast(static_cast(first)) == first); + return static_cast(first); + } else if (i == 4) { + assert(i == 4); + vertex_t last{}; + if (v_list_size > 0) { + last = *(frontier_key_first + (v_list_size - 1)) + 1; + } else { + last = vertex_partition_range_first; + } + assert(static_cast(static_cast(last)) == last); + return static_cast(last); + } + } + assert(false); + return size_t{0}; + }); + if (key_segment_offsets) { + raft::update_device( + d_aggregate_tmps.data() + (minor_comm_rank * num_scalars + (try_bitmap ? 5 : 3)), + (*key_segment_offsets).data(), + (*key_segment_offsets).size(), + handle.get_stream()); + } + + if (minor_comm_size > 1) { + device_allgather(minor_comm, + d_aggregate_tmps.data() + minor_comm_rank * num_scalars, + d_aggregate_tmps.data(), + num_scalars, + handle.get_stream()); + } + + std::vector h_aggregate_tmps(d_aggregate_tmps.size()); + raft::update_host(h_aggregate_tmps.data(), + d_aggregate_tmps.data(), + d_aggregate_tmps.size(), + handle.get_stream()); + handle.sync_stream(); + local_frontier_sizes = std::vector(minor_comm_size); + max_tmp_buffer_sizes = std::vector(minor_comm_size); + tmp_buffer_size_per_loop_approximations = std::vector(minor_comm_size); + if constexpr (try_bitmap) { + local_frontier_range_firsts = std::vector(minor_comm_size); + local_frontier_range_lasts = std::vector(minor_comm_size); + } + if (key_segment_offsets) { + key_segment_offset_vectors = std::vector>{}; + (*key_segment_offset_vectors).reserve(minor_comm_size); + } + for (int i = 0; i < minor_comm_size; ++i) { + local_frontier_sizes[i] = h_aggregate_tmps[i * num_scalars]; + max_tmp_buffer_sizes[i] = h_aggregate_tmps[i * num_scalars + 1]; + tmp_buffer_size_per_loop_approximations[i] = h_aggregate_tmps[i * num_scalars + 2]; + if constexpr (try_bitmap) { + local_frontier_range_firsts[i] = + static_cast(h_aggregate_tmps[i * num_scalars + 3]); + local_frontier_range_lasts[i] = + static_cast(h_aggregate_tmps[i * num_scalars + 4]); + } + if (key_segment_offsets) { + (*key_segment_offset_vectors) + .emplace_back(h_aggregate_tmps.begin() + (i * num_scalars + (try_bitmap ? 5 : 3)), + h_aggregate_tmps.begin() + + (i * num_scalars + (try_bitmap ? 
5 : 3) + (*key_segment_offsets).size())); + } + } } else { local_frontier_sizes = std::vector{static_cast( static_cast(thrust::distance(frontier_key_first, frontier_key_last)))}; + if (key_segment_offsets) { + key_segment_offset_vectors = std::vector>(1); + (*key_segment_offset_vectors)[0] = *key_segment_offsets; + } + } + + // update frontier bitmap (used to reduce broadcast bandwidth size) + + bool v_compressible{false}; + std:: + conditional_t>, std::byte /* dummy */> + frontier_bitmap{}; + std:: + conditional_t>, std::byte /* dummy */> + compressed_frontier{}; + if constexpr (try_bitmap) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + auto const minor_comm_rank = minor_comm.get_rank(); + + if constexpr (sizeof(vertex_t) == 8) { + vertex_t local_frontier_max_range_size{0}; + for (int i = 0; i < minor_comm_size; ++i) { + auto range_size = local_frontier_range_lasts[i] - local_frontier_range_firsts[i]; + local_frontier_max_range_size = std::max(range_size, local_frontier_max_range_size); + } + if (local_frontier_max_range_size <= + std::numeric_limits::max()) { // broadcast 32 bit offset values instead of 64 + // bit vertex IDs + v_compressible = true; + } + } + + double avg_fill_ratio{0.0}; + for (int i = 0; i < minor_comm_size; ++i) { + auto num_keys = static_cast(local_frontier_sizes[i]); + auto range_size = local_frontier_range_lasts[i] - local_frontier_range_firsts[i]; + avg_fill_ratio += + (range_size > 0) ? (num_keys / static_cast(range_size)) : double{0.0}; + } + avg_fill_ratio /= static_cast(minor_comm_size); + constexpr double threshold_ratio = + 8.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); + auto avg_frontier_size = + std::reduce(local_frontier_sizes.begin(), local_frontier_sizes.end()) / + static_cast(minor_comm_size); + + if ((avg_fill_ratio > threshold_ratio) && + (static_cast(avg_frontier_size) > + packed_bools_per_word() * + 32 /* tuning parameter, to consider additional kernel launch overhead */)) { + frontier_bitmap = + compute_vertex_list_bitmap_info(frontier_key_first, + frontier_key_last, + local_frontier_range_firsts[minor_comm_rank], + local_frontier_range_lasts[minor_comm_rank], + handle.get_stream()); + } else if (v_compressible) { + rmm::device_uvector tmps(local_frontier_sizes[minor_comm_rank], + handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + frontier_key_first, + frontier_key_last, + tmps.begin(), + cuda::proclaim_return_type( + [range_first = local_frontier_range_firsts[minor_comm_rank]] __device__( + auto v) { return static_cast(v - range_first); })); + compressed_frontier = std::move(tmps); + } + } + } + + // set-up stream ppol + + std::optional> stream_pool_indices{std::nullopt}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + auto max_tmp_buffer_size = + std::reduce(max_tmp_buffer_sizes.begin(), max_tmp_buffer_sizes.end()) / + static_cast(minor_comm_size); + auto approx_tmp_buffer_size_per_loop = + std::reduce(tmp_buffer_size_per_loop_approximations.begin(), + tmp_buffer_size_per_loop_approximations.end()) / + static_cast(minor_comm_size); + size_t num_streams_per_loop{1}; + if (graph_view.local_vertex_partition_segment_offsets() && + (handle.get_stream_pool_size() >= max_segments)) { + num_streams_per_loop = std::max( + std::min(size_t{8} / 
graph_view.number_of_local_edge_partitions(), max_segments),
+        size_t{
+          1});  // Note that "CUDA_DEVICE_MAX_CONNECTIONS (default: 8, can be set to [1, 32])" sets
+                // the number of queues, if the total number of streams exceeds this number, jobs on
+                // different streams can be sent to one queue leading to false dependency. Setting
+                // num_concurrent_loops above the number of queues has some benefits in NCCL
+                // communications but creating too many streams just for compute may not help.
+    }
+    stream_pool_indices = init_stream_pool_indices(max_tmp_buffer_size,
+                                                   approx_tmp_buffer_size_per_loop,
+                                                   graph_view.number_of_local_edge_partitions(),
+                                                   num_streams_per_loop,
+                                                   handle.get_stream_pool_size());
+    if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; }
+  }
+
+  size_t num_concurrent_loops{1};
+  std::optional> loop_stream_pool_indices{
+    std::nullopt};  // first num_concurrent_loops streams from stream_pool_indices
+  if (stream_pool_indices) {
+    num_concurrent_loops =
+      std::min(graph_view.number_of_local_edge_partitions(), (*stream_pool_indices).size());
+    loop_stream_pool_indices = std::vector(num_concurrent_loops);
+    std::iota((*loop_stream_pool_indices).begin(), (*loop_stream_pool_indices).end(), size_t{0});
+  }
+  rmm::device_uvector counters(num_concurrent_loops, handle.get_stream());
+
+  if constexpr (!GraphViewType::is_multi_gpu) {
+    if (loop_stream_pool_indices) { handle.sync_stream(); }
+  }
+
+  // 4. fill the buffers
+
+  std::vector> key_buffers{};
+  std::vector> value_buffers{};
+  key_buffers.reserve(graph_view.number_of_local_edge_partitions());
+  value_buffers.reserve(graph_view.number_of_local_edge_partitions());
+  auto edge_mask_view = graph_view.edge_mask_view();
-  for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
-    auto edge_partition =
-      edge_partition_device_view_t(
-        graph_view.local_edge_partition_view(i));
-    auto edge_partition_e_mask =
-      edge_mask_view
-        ?
thrust::make_optional< - detail::edge_partition_edge_property_device_view_t>( - *edge_mask_view, i) - : thrust::nullopt; - - auto edge_partition_frontier_key_buffer = - allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - vertex_t edge_partition_frontier_size = static_cast(local_frontier_sizes[i]); - auto edge_partition_frontier_key_first = frontier_key_first; - auto edge_partition_frontier_key_last = frontier_key_last; + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i += num_concurrent_loops) { + auto loop_count = + std::min(num_concurrent_loops, graph_view.number_of_local_edge_partitions() - i); + + std::conditional_t< + GraphViewType::is_multi_gpu, + std::conditional_t< + try_bitmap, + std::vector, rmm::device_uvector>>, + std::vector>>, + std::byte /* dummy */> + edge_partition_key_buffers{}; if constexpr (GraphViewType::is_multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + edge_partition_key_buffers.reserve(loop_count); + + std::conditional_t>>, + std::byte /* dummy */> + edge_partition_bitmap_buffers{}; + if constexpr (try_bitmap) { + if (frontier_bitmap) { + edge_partition_bitmap_buffers = std::vector>{}; + (*edge_partition_bitmap_buffers).reserve(loop_count); + } + } - resize_dataframe_buffer( - edge_partition_frontier_key_buffer, edge_partition_frontier_size, handle.get_stream()); - - device_bcast(minor_comm, - frontier_key_first, - get_dataframe_buffer_begin(edge_partition_frontier_key_buffer), - edge_partition_frontier_size, - static_cast(i), - handle.get_stream()); - - edge_partition_frontier_key_first = - get_dataframe_buffer_begin(edge_partition_frontier_key_buffer); - edge_partition_frontier_key_last = - get_dataframe_buffer_end(edge_partition_frontier_key_buffer); - } - - auto edge_partition_frontier_major_first = - thrust_tuple_get_or_identity( - edge_partition_frontier_key_first); - auto edge_partition_frontier_major_last = - thrust_tuple_get_or_identity( - edge_partition_frontier_key_last); - - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - auto max_pushes = max_one_e_per_frontier_key ? 
edge_partition_frontier_size - : edge_partition.compute_number_of_edges( - edge_partition_frontier_major_first, - edge_partition_frontier_major_last, - handle.get_stream()); - - auto new_buffer_size = buffer_idx.value(handle.get_stream()) + max_pushes; - resize_optional_dataframe_buffer( - key_buffer, new_buffer_size, handle.get_stream()); - resize_optional_dataframe_buffer( - value_buffer, new_buffer_size, handle.get_stream()); - - edge_partition_src_input_device_view_t edge_partition_src_value_input{}; - edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; - if constexpr (GraphViewType::is_storage_transposed) { - edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input); - edge_partition_dst_value_input = - edge_partition_dst_input_device_view_t(edge_dst_value_input, i); - } else { - edge_partition_src_value_input = - edge_partition_src_input_device_view_t(edge_src_value_input, i); - edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + bool use_bitmap_buffer = false; + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { + (*edge_partition_bitmap_buffers) + .emplace_back(packed_bool_size(local_frontier_range_lasts[partition_idx] - + local_frontier_range_firsts[partition_idx]), + handle.get_stream()); + use_bitmap_buffer = true; + } + } + if (!use_bitmap_buffer) { + bool allocated{false}; + if constexpr (try_bitmap) { + if (v_compressible) { + edge_partition_key_buffers.push_back(rmm::device_uvector( + local_frontier_sizes[partition_idx], handle.get_stream())); + allocated = true; + } + } + if (!allocated) { + edge_partition_key_buffers.push_back(allocate_dataframe_buffer( + local_frontier_sizes[partition_idx], handle.get_stream())); + } + } + } + + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + if constexpr (try_bitmap) { + if (frontier_bitmap) { + device_bcast(minor_comm, + (*frontier_bitmap).data(), + get_dataframe_buffer_begin((*edge_partition_bitmap_buffers)[j]), + size_dataframe_buffer((*edge_partition_bitmap_buffers)[j]), + static_cast(partition_idx), + handle.get_stream()); + } else if (compressed_frontier) { + device_bcast(minor_comm, + (*compressed_frontier).data(), + get_dataframe_buffer_begin(std::get<0>(edge_partition_key_buffers[j])), + local_frontier_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } else { + device_bcast(minor_comm, + frontier_key_first, + get_dataframe_buffer_begin(std::get<1>(edge_partition_key_buffers[j])), + local_frontier_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } + } else { + device_bcast(minor_comm, + frontier_key_first, + get_dataframe_buffer_begin(edge_partition_key_buffers[j]), + local_frontier_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } + } + device_group_end(minor_comm); + if (loop_stream_pool_indices) { handle.sync_stream(); } + + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + std::variant, rmm::device_uvector> keys = + rmm::device_uvector(0, loop_stream); + if (v_compressible) { + std::get<0>(keys).resize(local_frontier_sizes[partition_idx], loop_stream); + } else { + keys = + rmm::device_uvector(local_frontier_sizes[partition_idx], loop_stream); + } + + auto& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + + auto range_first = local_frontier_range_firsts[partition_idx]; + auto range_last = local_frontier_range_lasts[partition_idx]; + if (keys.index() == 0) { + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + get_dataframe_buffer_begin(std::get<0>(keys)), + raft::device_span(counters.data() + j, + size_t{1}), // dummy, we already know the counts + uint32_t{0}, + static_cast(range_last - range_first), + loop_stream); + } else { + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + get_dataframe_buffer_begin(std::get<1>(keys)), + raft::device_span(counters.data() + j, + size_t{1}), // dummy, we already know the counts + range_first, + range_last, + loop_stream); + } + + edge_partition_key_buffers.push_back(std::move(keys)); + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + (*edge_partition_bitmap_buffers).clear(); + } + } } - auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, i); - if (segment_offsets) { - static_assert(num_sparse_segments_per_vertex_partition == 3); - std::vector h_thresholds(num_sparse_segments_per_vertex_partition + - (graph_view.use_dcs() ? 1 : 0) - 1); - h_thresholds[0] = edge_partition.major_range_first() + (*segment_offsets)[1]; - h_thresholds[1] = edge_partition.major_range_first() + (*segment_offsets)[2]; - if (graph_view.use_dcs()) { - h_thresholds[2] = edge_partition.major_range_first() + (*segment_offsets)[3]; + std::vector> output_key_buffers{}; + output_key_buffers.reserve(loop_count); + std::vector> output_value_buffers{}; + output_value_buffers.reserve(loop_count); + std::vector edge_partition_max_push_counts(loop_count); + + std::optional>> + high_segment_key_local_degree_offset_vectors{std::nullopt}; + std::optional> high_segment_edge_counts{std::nullopt}; + if (key_segment_offset_vectors) { + high_segment_key_local_degree_offset_vectors = std::vector>{}; + (*high_segment_key_local_degree_offset_vectors).reserve(loop_count); + high_segment_edge_counts = std::vector(loop_count); + } + + edge_partition_max_push_counts[0] = local_max_pushes; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + if (static_cast(partition_idx) != minor_comm_rank) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + + auto const& keys = edge_partition_key_buffers[j]; + + bool computed{false}; + if constexpr (try_bitmap) { + if (keys.index() == 0) { + auto major_first = thrust::make_transform_iterator( + std::get<0>(keys).begin(), + cuda::proclaim_return_type( + [range_first = + local_frontier_range_firsts[partition_idx]] __device__(uint32_t v_offset) { + return range_first + static_cast(v_offset); + })); + edge_partition.compute_number_of_edges_async( + major_first, + major_first + std::get<0>(keys).size(), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + computed = true; + } + } + if (!computed) { + dataframe_buffer_const_iterator_type_t key_first{}; + size_t num_keys{}; + if constexpr (try_bitmap) { + assert(keys.index() == 1); + key_first = get_dataframe_buffer_begin(std::get<1>(keys)); + num_keys = std::get<1>(keys).size(); + } else { + key_first = get_dataframe_buffer_begin(keys); + num_keys = size_dataframe_buffer(keys); + } + auto major_first = thrust_tuple_get_or_identity(key_first); + edge_partition.compute_number_of_edges_async( + major_first, + major_first + num_keys, + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + raft::update_host( + edge_partition_max_push_counts.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); + if (static_cast(minor_comm_rank / num_concurrent_loops) == + (i / num_concurrent_loops)) { + edge_partition_max_push_counts[minor_comm_rank % num_concurrent_loops] = local_max_pushes; + } } - rmm::device_uvector d_thresholds(h_thresholds.size(), handle.get_stream()); - raft::update_device( - d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), handle.get_stream()); - rmm::device_uvector d_offsets(d_thresholds.size(), handle.get_stream()); - thrust::lower_bound(handle.get_thrust_policy(), - edge_partition_frontier_major_first, - edge_partition_frontier_major_last, - d_thresholds.begin(), - d_thresholds.end(), - d_offsets.begin()); - std::vector h_offsets(d_offsets.size()); - raft::update_host(h_offsets.data(), d_offsets.data(), d_offsets.size(), handle.get_stream()); - RAFT_CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - h_offsets.push_back(edge_partition_frontier_size); - // FIXME: we may further improve performance by 1) concurrently running kernels on different - // segments; 2) individually tuning block sizes for different segments; and 3) adding one - // more segment for very high degree vertices and running segmented reduction - if (h_offsets[0] > 0) { - raft::grid_1d_block_t update_grid(h_offsets[0], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_high_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first, - edge_partition_frontier_key_first + h_offsets[0], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), - e_op); + } + + if (key_segment_offset_vectors) { + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + 
j; + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + rmm::device_uvector high_segment_key_local_degree_offsets( + key_segment_offsets[1] + 1, loop_stream); + high_segment_key_local_degree_offsets.set_element_to_zero_async(0, loop_stream); + bool computed{false}; + if constexpr (try_bitmap) { + auto const& keys = edge_partition_key_buffers[j]; + if (keys.index() == 0) { + auto key_local_degree_first = thrust::make_transform_iterator( + std::get<0>(keys).begin(), + cuda::proclaim_return_type( + [edge_partition, + range_first = + local_frontier_range_firsts[partition_idx]] __device__(uint32_t v_offset) { + auto major = range_first + static_cast(v_offset); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + return static_cast(edge_partition.local_degree(major_offset)); + })); + thrust::inclusive_scan(rmm::exec_policy_nosync(loop_stream), + key_local_degree_first, + key_local_degree_first + key_segment_offsets[1], + high_segment_key_local_degree_offsets.begin() + 1); + computed = true; + } + } + if (!computed) { + auto key_first = frontier_key_first; + if constexpr (GraphViewType::is_multi_gpu) { + auto const& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + assert(keys.index() == 1); + key_first = get_dataframe_buffer_begin(std::get<1>(keys)); + } else { + key_first = get_dataframe_buffer_begin(keys); + } + } + auto key_local_degree_first = thrust::make_transform_iterator( + key_first, cuda::proclaim_return_type([edge_partition] __device__(auto key) { + auto major = thrust_tuple_get_or_identity(key); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + return static_cast(edge_partition.local_degree(major_offset)); + })); + thrust::inclusive_scan(rmm::exec_policy_nosync(loop_stream), + key_local_degree_first, + key_local_degree_first + key_segment_offsets[1], + high_segment_key_local_degree_offsets.begin() + 1); + } + raft::update_host((*high_segment_edge_counts).data() + j, + high_segment_key_local_degree_offsets.data() + key_segment_offsets[1], + 1, + loop_stream); + (*high_segment_key_local_degree_offset_vectors) + .push_back(std::move(high_segment_key_local_degree_offsets)); } - if (h_offsets[1] - h_offsets[0] > 0) { - raft::grid_1d_warp_t update_grid(h_offsets[1] - h_offsets[0], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_mid_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first + h_offsets[0], - edge_partition_frontier_key_first + h_offsets[1], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), - e_op); + + // to ensure that *high_segment_edge_counts[] is valid + if (loop_stream_pool_indices) { + handle.sync_stream_pool(*loop_stream_pool_indices); + } else { + handle.sync_stream(); } - if (h_offsets[2] - h_offsets[1] > 0) { - raft::grid_1d_thread_t update_grid(h_offsets[2] - h_offsets[1], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - 
extract_transform_v_frontier_e_hypersparse_or_low_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first + h_offsets[1], - edge_partition_frontier_key_first + h_offsets[2], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), - e_op); + } + + for (size_t j = 0; j < loop_count; ++j) { + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + output_key_buffers.push_back(allocate_optional_dataframe_buffer( + edge_partition_max_push_counts[j], loop_stream)); + output_value_buffers.push_back(allocate_optional_dataframe_buffer( + edge_partition_max_push_counts[j], loop_stream)); + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + + thrust::fill( + handle.get_thrust_policy(), counters.begin(), counters.begin() + loop_count, size_t{0}); + if (loop_stream_pool_indices) { handle.sync_stream(); } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, partition_idx) + : thrust::nullopt; + size_t num_streams_per_loop{1}; + if (stream_pool_indices) { + assert((*stream_pool_indices).size() >= num_concurrent_loops); + num_streams_per_loop = (*stream_pool_indices).size() / num_concurrent_loops; } - if (edge_partition.dcs_nzd_vertex_count() && (h_offsets[3] - h_offsets[2] > 0)) { - raft::grid_1d_thread_t update_grid(h_offsets[3] - h_offsets[2], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_hypersparse_or_low_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first + h_offsets[2], - edge_partition_frontier_key_first + h_offsets[3], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), - e_op); + auto edge_partition_stream_pool_indices = + stream_pool_indices + ? 
std::make_optional>( + (*stream_pool_indices).data() + j * num_streams_per_loop, num_streams_per_loop) + : std::nullopt; + + edge_partition_src_input_device_view_t edge_partition_src_value_input{}; + edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; + if constexpr (GraphViewType::is_storage_transposed) { + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input, partition_idx); + } else { + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input, partition_idx); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input); } - } else { - if (edge_partition_frontier_size > 0) { - raft::grid_1d_thread_t update_grid(edge_partition_frontier_size, - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - - extract_transform_v_frontier_e_hypersparse_or_low_degree - <<>>( + auto edge_partition_e_value_input = + edge_partition_e_input_device_view_t(edge_value_input, partition_idx); + + bool computed{false}; + if constexpr (try_bitmap) { + auto const& keys = edge_partition_key_buffers[j]; + if (keys.index() == 0) { + auto edge_partition_frontier_key_first = thrust::make_transform_iterator( + std::get<0>(keys).begin(), + cuda::proclaim_return_type( + [range_first = local_frontier_range_firsts[partition_idx]] __device__( + uint32_t v_offset) { return range_first + static_cast(v_offset); })); + auto edge_partition_frontier_key_last = + edge_partition_frontier_key_first + std::get<0>(keys).size(); + extract_transform_v_frontier_e_edge_partition( + handle, edge_partition, edge_partition_frontier_key_first, edge_partition_frontier_key_last, @@ -899,24 +1452,150 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, edge_partition_dst_value_input, edge_partition_e_value_input, edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), - e_op); + get_optional_dataframe_buffer_begin(output_key_buffers[j]), + get_optional_dataframe_buffer_begin(output_value_buffers[j]), + raft::device_span(counters.data() + j, size_t{1}), + e_op, + high_segment_key_local_degree_offset_vectors + ? std::make_optional>( + (*high_segment_key_local_degree_offset_vectors)[j].data(), + (*high_segment_key_local_degree_offset_vectors)[j].size()) + : std::nullopt, + high_segment_edge_counts ? std::make_optional((*high_segment_edge_counts)[j]) + : std::nullopt, + key_segment_offset_vectors ? 
std::make_optional>( + (*key_segment_offset_vectors)[partition_idx].data(), + (*key_segment_offset_vectors)[partition_idx].size()) + : std::nullopt, + edge_partition_stream_pool_indices); + computed = true; + } + } + if (!computed) { + auto edge_partition_frontier_key_first = frontier_key_first; + auto edge_partition_frontier_key_last = frontier_key_last; + if constexpr (GraphViewType::is_multi_gpu) { + auto const& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + assert(keys.index() == 1); + edge_partition_frontier_key_first = std::get<1>(keys).begin(); + edge_partition_frontier_key_last = std::get<1>(keys).end(); + } else { + edge_partition_frontier_key_first = get_dataframe_buffer_begin(keys); + edge_partition_frontier_key_last = get_dataframe_buffer_end(keys); + } + } + + extract_transform_v_frontier_e_edge_partition( + handle, + edge_partition, + edge_partition_frontier_key_first, + edge_partition_frontier_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + get_optional_dataframe_buffer_begin(output_key_buffers[j]), + get_optional_dataframe_buffer_begin(output_value_buffers[j]), + raft::device_span(counters.data() + j, size_t{1}), + e_op, + high_segment_key_local_degree_offset_vectors + ? std::make_optional>( + (*high_segment_key_local_degree_offset_vectors)[j].data(), + (*high_segment_key_local_degree_offset_vectors)[j].size()) + : std::nullopt, + high_segment_edge_counts ? std::make_optional((*high_segment_edge_counts)[j]) + : std::nullopt, + key_segment_offset_vectors ? std::make_optional>( + (*key_segment_offset_vectors)[partition_idx].data(), + (*key_segment_offset_vectors)[partition_idx].size()) + : std::nullopt, + edge_partition_stream_pool_indices); } } - } - // 2. resize and return the buffers + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } - auto new_buffer_size = buffer_idx.value(handle.get_stream()); + std::vector h_counts(loop_count); + raft::update_host(h_counts.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); - resize_optional_dataframe_buffer(key_buffer, new_buffer_size, handle.get_stream()); - shrink_to_fit_optional_dataframe_buffer(key_buffer, handle.get_stream()); + for (size_t j = 0; j < loop_count; ++j) { + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); - resize_optional_dataframe_buffer( - value_buffer, new_buffer_size, handle.get_stream()); - shrink_to_fit_optional_dataframe_buffer(value_buffer, handle.get_stream()); + auto tmp_buffer_size = h_counts[j]; + if (tmp_buffer_size > 0) { + auto& tmp_key_buffer = output_key_buffers[j]; + auto& tmp_value_buffer = output_value_buffers[j]; + + resize_optional_dataframe_buffer( + tmp_key_buffer, tmp_buffer_size, loop_stream); + // skip shrink_to_fit before return to cut execution time + + resize_optional_dataframe_buffer( + tmp_value_buffer, tmp_buffer_size, loop_stream); + // skip shrink_to_fit before return to cut execution time + + key_buffers.push_back(std::move(tmp_key_buffer)); + value_buffers.push_back(std::move(tmp_value_buffer)); + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + } + + // 3. 
concatenate and return the buffers + + auto key_buffer = allocate_optional_dataframe_buffer(0, handle.get_stream()); + auto value_buffer = allocate_optional_dataframe_buffer(0, handle.get_stream()); + if (key_buffers.size() == 0) { + /* nothing to do */ + } else if (key_buffers.size() == 1) { + key_buffer = std::move(key_buffers[0]); + value_buffer = std::move(value_buffers[0]); + shrink_to_fit_optional_dataframe_buffer(key_buffer, handle.get_stream()); + shrink_to_fit_optional_dataframe_buffer(value_buffer, handle.get_stream()); + } else { + std::vector buffer_sizes(key_buffers.size()); + static_assert(!std::is_same_v || !std::is_same_v); + for (size_t i = 0; i < key_buffers.size(); ++i) { + if constexpr (!std::is_same_v) { + buffer_sizes[i] = size_optional_dataframe_buffer(key_buffers[i]); + } else { + buffer_sizes[i] = size_optional_dataframe_buffer(value_buffers[i]); + } + } + auto buffer_size = std::reduce(buffer_sizes.begin(), buffer_sizes.end()); + resize_optional_dataframe_buffer(key_buffer, buffer_size, handle.get_stream()); + resize_optional_dataframe_buffer( + value_buffer, buffer_size, handle.get_stream()); + std::vector buffer_displacements(buffer_sizes.size()); + std::exclusive_scan( + buffer_sizes.begin(), buffer_sizes.end(), buffer_displacements.begin(), size_t{0}); + handle.sync_stream(); + for (size_t i = 0; i < key_buffers.size(); ++i) { + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[i]) + : handle.get_stream(); + if constexpr (!std::is_same_v) { + thrust::copy( + rmm::exec_policy_nosync(loop_stream), + get_optional_dataframe_buffer_cbegin(key_buffers[i]), + get_optional_dataframe_buffer_cend(key_buffers[i]), + get_optional_dataframe_buffer_begin(key_buffer) + buffer_displacements[i]); + } + + if constexpr (!std::is_same_v) { + thrust::copy(rmm::exec_policy_nosync(loop_stream), + get_optional_dataframe_buffer_cbegin(value_buffers[i]), + get_optional_dataframe_buffer_cend(value_buffers[i]), + get_optional_dataframe_buffer_begin(value_buffer) + + buffer_displacements[i]); + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + } return std::make_tuple(std::move(key_buffer), std::move(value_buffer)); } diff --git a/cpp/src/prims/detail/multi_stream_utils.cuh b/cpp/src/prims/detail/multi_stream_utils.cuh new file mode 100644 index 0000000000..76ef3fb0de --- /dev/null +++ b/cpp/src/prims/detail/multi_stream_utils.cuh @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +#include +#include + +#include +#include +#include + +#include + +#include +#include + +namespace cugraph { + +namespace detail { + +inline std::vector init_stream_pool_indices(size_t max_tmp_buffer_size, + size_t approx_tmp_buffer_size_per_loop, + size_t loop_count, + size_t num_streams_per_loop, + size_t max_streams) +{ + size_t num_streams = std::min(loop_count * num_streams_per_loop, + raft::round_down_safe(max_streams, num_streams_per_loop)); + + auto num_concurrent_loops = + (approx_tmp_buffer_size_per_loop > 0) + ? std::max(max_tmp_buffer_size / approx_tmp_buffer_size_per_loop, size_t{1}) + : loop_count; + num_streams = std::min(num_concurrent_loops * num_streams_per_loop, num_streams); + + std::vector stream_pool_indices(num_streams); + std::iota(stream_pool_indices.begin(), stream_pool_indices.end(), size_t{0}); + + return stream_pool_indices; +} + +// this assumes that the caller already knows how many items will be copied. +template +void copy_if_nosync(InputIterator input_first, + InputIterator input_last, + FlagIterator flag_first, + OutputIterator output_first, + raft::device_span count /* size = 1 */, + rmm::cuda_stream_view stream_view) +{ + CUGRAPH_EXPECTS( + static_cast(thrust::distance(input_first, input_last)) <= + static_cast(std::numeric_limits::max()), + "cugraph::detail::copy_if_nosync relies on cub::DeviceSelect::Flagged which uses int for input " + "size, but thrust::distance(input_first, input_last) exceeds std::numeric_limits::max()."); + + size_t tmp_storage_bytes{0}; + size_t input_size = static_cast(thrust::distance(input_first, input_last)); + + cub::DeviceSelect::Flagged(static_cast(nullptr), + tmp_storage_bytes, + input_first, + flag_first, + output_first, + count.data(), + input_size, + stream_view); + + auto d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, stream_view); + + cub::DeviceSelect::Flagged(d_tmp_storage.data(), + tmp_storage_bytes, + input_first, + flag_first, + output_first, + count.data(), + input_size, + stream_view); +} + +template +void count_nosync(InputIterator input_first, + InputIterator input_last, + raft::device_span count /* size = 1 */, + typename thrust::iterator_traits::value_type value, + rmm::cuda_stream_view stream_view) +{ + CUGRAPH_EXPECTS( + static_cast(thrust::distance(input_first, input_last)) <= + static_cast(std::numeric_limits::max()), + "cugraph::detail::count_nosync relies on cub::DeviceReduce::Sum which uses int for input size, " + "but thrust::distance(input_first, input_last) exceeds std::numeric_limits::max()."); + + size_t tmp_storage_bytes{0}; + size_t input_size = static_cast(thrust::distance(input_first, input_last)); + + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + input_first, + count.data(), + input_size, + stream_view); + + auto d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, stream_view); + + cub::DeviceReduce::Sum( + d_tmp_storage.data(), tmp_storage_bytes, input_first, count.data(), input_size, stream_view); +} + +template +void sum_nosync( + InputIterator input_first, + InputIterator input_last, + raft::device_span::value_type> sum /* size = 1 */, + rmm::cuda_stream_view stream_view) +{ + CUGRAPH_EXPECTS( + static_cast(thrust::distance(input_first, input_last)) <= + static_cast(std::numeric_limits::max()), + "cugraph::detail::count_nosync relies on cub::DeviceReduce::Sum which uses int for input size, " + "but thrust::distance(input_first, input_last) exceeds std::numeric_limits::max()."); + + size_t tmp_storage_bytes{0}; + size_t 
input_size = static_cast(thrust::distance(input_first, input_last)); + + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + input_first, + sum.data(), + input_size, + stream_view); + + auto d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, stream_view); + + cub::DeviceReduce::Sum( + d_tmp_storage.data(), tmp_storage_bytes, input_first, sum.data(), input_size, stream_view); +} + +} // namespace detail + +} // namespace cugraph diff --git a/cpp/src/prims/detail/optional_dataframe_buffer.hpp b/cpp/src/prims/detail/optional_dataframe_buffer.hpp index 87c095f8e8..6657b91f13 100644 --- a/cpp/src/prims/detail/optional_dataframe_buffer.hpp +++ b/cpp/src/prims/detail/optional_dataframe_buffer.hpp @@ -26,152 +26,130 @@ namespace detail { // we cannot use thrust::iterator_traits::value_type if Iterator is void* (reference to // void is not allowed) template -struct optional_dataframe_buffer_value_type_t; +struct optional_dataframe_buffer_iterator_value_type_t; template -struct optional_dataframe_buffer_value_type_t>> { +struct optional_dataframe_buffer_iterator_value_type_t< + Iterator, + std::enable_if_t>> { using value = typename thrust::iterator_traits::value_type; }; template -struct optional_dataframe_buffer_value_type_t>> { +struct optional_dataframe_buffer_iterator_value_type_t< + Iterator, + std::enable_if_t>> { using value = void; }; -template >* = nullptr> -std::byte allocate_optional_dataframe_buffer(size_t size, rmm::cuda_stream_view stream) -{ - return std::byte{0}; // dummy -} - -template >* = nullptr> +template auto allocate_optional_dataframe_buffer(size_t size, rmm::cuda_stream_view stream) { - return allocate_dataframe_buffer(size, stream); + if constexpr (std::is_same_v) { + return std::byte{0}; // dummy + } else { + return allocate_dataframe_buffer(size, stream); + } } -template >* = nullptr> -void* get_optional_dataframe_buffer_begin(std::byte& optional_dataframe_buffer) -{ - return static_cast(nullptr); -} +template +struct optional_dataframe_buffer_type { + using type = decltype(allocate_optional_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{})); +}; -template >* = nullptr> -auto get_optional_dataframe_buffer_begin( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer) -{ - return get_dataframe_buffer_begin(optional_dataframe_buffer); -} +template +using optional_dataframe_buffer_type_t = typename optional_dataframe_buffer_type::type; -template >* = nullptr> -void* get_optional_dataframe_buffer_end(std::byte& optional_dataframe_buffer) +template +auto get_optional_dataframe_buffer_begin( + optional_dataframe_buffer_type_t& optional_dataframe_buffer) { - return static_cast(nullptr); + if constexpr (std::is_same_v) { + return static_cast(nullptr); + } else { + return get_dataframe_buffer_begin(optional_dataframe_buffer); + } } -template >* = nullptr> +template auto get_optional_dataframe_buffer_end( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer) + optional_dataframe_buffer_type_t& optional_dataframe_buffer) { - return get_dataframe_buffer_end(optional_dataframe_buffer); + if constexpr (std::is_same_v) { + return static_cast(nullptr); + } else { + return get_dataframe_buffer_end(optional_dataframe_buffer); + } } -template >* = nullptr> -void const* get_optional_dataframe_buffer_cbegin(std::byte const& optional_dataframe_buffer) -{ - return static_cast(nullptr); -} - -template >* = nullptr> +template auto get_optional_dataframe_buffer_cbegin( - std::decay_t(size_t{0}, 
rmm::cuda_stream_view{}))> const& - optional_dataframe_buffer) + optional_dataframe_buffer_type_t const& optional_dataframe_buffer) { - return get_dataframe_buffer_cbegin(optional_dataframe_buffer); + if constexpr (std::is_same_v) { + return static_cast(nullptr); + } else { + return get_dataframe_buffer_cbegin(optional_dataframe_buffer); + } } -template >* = nullptr> -void const* get_optional_dataframe_buffer_cend(std::byte const& optional_dataframe_buffer) -{ - return static_cast(nullptr); -} - -template >* = nullptr> +template auto get_optional_dataframe_buffer_cend( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))> const& - optional_dataframe_buffer) -{ - return get_dataframe_buffer_cend(optional_dataframe_buffer); -} - -template >* = nullptr> -void reserve_optional_dataframe_buffer(std::byte& optional_dataframe_buffer, - size_t new_buffer_capacity, - rmm::cuda_stream_view stream_view) + optional_dataframe_buffer_type_t const& optional_dataframe_buffer) { - return; + if constexpr (std::is_same_v) { + return static_cast(nullptr); + } else { + return get_dataframe_buffer_cend(optional_dataframe_buffer); + } } -template >* = nullptr> +template void reserve_optional_dataframe_buffer( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer, + optional_dataframe_buffer_type_t& optional_dataframe_buffer, size_t new_buffer_capacity, rmm::cuda_stream_view stream_view) { - return reserve_dataframe_buffer(optional_dataframe_buffer, new_buffer_capacity, stream_view); -} - -template >* = nullptr> -void resize_optional_dataframe_buffer(std::byte& optional_dataframe_buffer, - size_t new_buffer_size, - rmm::cuda_stream_view stream_view) -{ - return; + if constexpr (std::is_same_v) { + return; + } else { + return reserve_dataframe_buffer(optional_dataframe_buffer, new_buffer_capacity, stream_view); + } } -template >* = nullptr> +template void resize_optional_dataframe_buffer( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer, + optional_dataframe_buffer_type_t& optional_dataframe_buffer, size_t new_buffer_size, rmm::cuda_stream_view stream_view) { - return resize_dataframe_buffer(optional_dataframe_buffer, new_buffer_size, stream_view); + if constexpr (std::is_same_v) { + return; + } else { + return resize_dataframe_buffer(optional_dataframe_buffer, new_buffer_size, stream_view); + } } -template >* = nullptr> -void shrink_to_fit_optional_dataframe_buffer(std::byte& optional_dataframe_buffer, - rmm::cuda_stream_view stream_view) -{ - return; -} - -template >* = nullptr> +template void shrink_to_fit_optional_dataframe_buffer( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer, - rmm::cuda_stream_view stream_view) -{ - return shrink_to_fit_dataframe_buffer(optional_dataframe_buffer, stream_view); -} - -template >* = nullptr> -size_t size_optional_dataframe_buffer(std::byte const& optional_dataframe_buffer) + optional_dataframe_buffer_type_t& optional_dataframe_buffer, rmm::cuda_stream_view stream_view) { - return size_t{0}; + if constexpr (std::is_same_v) { + return; + } else { + return shrink_to_fit_dataframe_buffer(optional_dataframe_buffer, stream_view); + } } -template >* = nullptr> +template size_t size_optional_dataframe_buffer( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))> const& - optional_dataframe_buffer) + optional_dataframe_buffer_type_t& optional_dataframe_buffer) { - return size_dataframe_buffer(optional_dataframe_buffer); + if constexpr (std::is_same_v) { + return size_t{0}; + } else { + return 
size_dataframe_buffer(optional_dataframe_buffer); + } } } // namespace detail diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh new file mode 100644 index 0000000000..311b16e71e --- /dev/null +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -0,0 +1,4374 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "detail/graph_partition_utils.cuh" +#include "prims/detail/multi_stream_utils.cuh" +#include "prims/detail/optional_dataframe_buffer.hpp" +#include "prims/detail/prim_functors.cuh" +#include "prims/detail/prim_utils.cuh" +#include "prims/fill_edge_src_dst_property.cuh" +#include "prims/property_op_utils.cuh" +#include "prims/reduce_op.cuh" +#include "prims/vertex_frontier.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cugraph { + +namespace detail { + +int32_t constexpr per_v_transform_reduce_e_kernel_block_size = 512; +int32_t constexpr per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size = 128; + +template +struct iterator_value_type_or_default_t; + +template +struct iterator_value_type_or_default_t>> { + using value_type = default_t; // if Iterator is invalid (void*), value_type = default_t +}; + +template +struct iterator_value_type_or_default_t>> { + using value_type = typename thrust::iterator_traits< + Iterator>::value_type; // if iterator is valid, value_type = typename + // thrust::iterator_traits::value_type +}; + +template +__device__ auto init_pred_op( + edge_partition_device_view_t const& edge_partition, + EdgePartitionSrcValueInputWrapper const& edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper const& edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper const& edge_partition_e_value_input, + PredOp const& pred_op, + key_t key, + typename GraphViewType::vertex_type major_offset, + typename GraphViewType::vertex_type const* indices, + typename GraphViewType::edge_type edge_offset) +{ + if constexpr (std::is_same_v< + PredOp, + const_true_e_op_t>) { + return call_const_true_e_op_t{}; + } else { + return call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset}; + } +} + +template +struct transform_and_atomic_reduce_t { + edge_partition_device_view_t const& edge_partition{}; + vertex_t const* indices{nullptr}; + TransformOp const& transform_op{}; + PredOp const& pred_op{}; + ResultValueOutputIteratorOrWrapper& result_value_output{}; + + __device__ void operator()(edge_t i) const + { + if (pred_op(i)) { + auto 
e_op_result = transform_op(i); + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + if constexpr (multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } +}; + +template +__device__ void update_result_value_output( + edge_partition_device_view_t const& edge_partition, + vertex_t const* indices, + edge_t local_degree, + TransformOp const& transform_op, + result_t init, + ReduceOp const& reduce_op, + PredOp const& pred_op, + size_t output_idx /* relevent only when update_major === true */, + ResultValueOutputIteratorOrWrapper& result_value_output) +{ + if constexpr (update_major) { + result_t val{}; + if constexpr (std::is_same_v>) { + if constexpr (std::is_same_v>) { // init is selected only when no + // edges return a valid value + val = init; + for (edge_t i = 0; i < local_degree; ++i) { + auto tmp = transform_op(i); + val = tmp; + break; + } + } else { + val = thrust::transform_reduce(thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + transform_op, + init, + reduce_op); + } + } else { + val = init; + for (edge_t i = 0; i < local_degree; ++i) { + if (pred_op(i)) { + auto tmp = transform_op(i); + if constexpr (std::is_same_v>) { // init is selected only when + // no edges return a valid + // value + val = tmp; + break; + } else { + val = reduce_op(val, tmp); + } + } + } + } + *(result_value_output + output_idx) = val; + } else { + thrust::for_each(thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + transform_and_atomic_reduce_t{ + edge_partition, indices, transform_op, pred_op, result_value_output}); + } +} + +template +__global__ static void per_v_transform_reduce_e_hypersparse( + edge_partition_device_view_t edge_partition, + OptionalKeyIterator key_first, + OptionalKeyIterator key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper result_value_output, + EdgeOp e_op, + T init /* relevant only if update_major == true */, + ReduceOp reduce_op, + PredOp pred_op) +{ + constexpr bool use_input_key = !std::is_same_v; + static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< + ReduceOp>); // atomic_reduce is defined only when + // has_compatible_raft_comms_op_t is true + static_assert(update_major || !use_input_key); + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = + typename iterator_value_type_or_default_t::value_type; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto idx = static_cast(tid); + + size_t key_count{}; + if constexpr (use_input_key) { + key_count = static_cast(thrust::distance(key_first, key_last)); + } else { + key_count = *(edge_partition.dcs_nzd_vertex_count()); + } + + while (idx < key_count) { + key_t key{}; + vertex_t major{}; + thrust::optional major_idx{}; + if constexpr (use_input_key) { + key = *(key_first + idx); + major = thrust_tuple_get_or_identity(key); + major_idx = edge_partition.major_idx_from_major_nocheck(major); + } else { + key = *(edge_partition.major_from_major_hypersparse_idx_nocheck(static_cast(idx))); + major = key; + 
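        // Note: when no input key list is given, idx enumerates the hypersparse (DCS) rows,
        // which hold only the vertices with at least one local edge
        // (edge_partition.dcs_nzd_vertex_count() of them); hence the compressed row index
        // (major_idx, computed below) generally differs from the plain major_offset
        // (= major - major_range_first) used for the sparse segments.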
auto major_start_offset = static_cast(*(edge_partition.major_hypersparse_first()) - + edge_partition.major_range_first()); + major_idx = major_start_offset + idx; // major_offset != major_idx in the hypersparse region + } + + size_t output_idx = use_input_key ? idx : (major - *(edge_partition).major_hypersparse_first()); + if (major_idx) { + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, edge_offset, local_degree) = + edge_partition.local_edges(static_cast(*major_idx)); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + edge_offset}; + + auto call_pred_op = init_pred_op(edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset); + + if (edge_partition_e_mask) { + update_result_value_output( + edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + [&edge_partition_e_mask, &call_pred_op, edge_offset] __device__(edge_t i) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + return call_pred_op(i); + } else { + return false; + } + }, + output_idx, + result_value_output); + } else { + update_result_value_output(edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + call_pred_op, + output_idx, + result_value_output); + } + } else { + if constexpr (update_major) { *(result_value_output + output_idx) = init; } + } + idx += gridDim.x * blockDim.x; + } +} + +template +__global__ static void per_v_transform_reduce_e_low_degree( + edge_partition_device_view_t edge_partition, + KeyIterator key_first, + KeyIterator key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper result_value_output, + EdgeOp e_op, + T init /* relevant only if update_major == true */, + ReduceOp reduce_op, + PredOp pred_op) +{ + static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< + ReduceOp>); // atomic_reduce is defined only when + // has_compatible_raft_comms_op_t is true + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = typename thrust::iterator_traits::value_type; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto idx = static_cast(tid); + + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + auto major = thrust_tuple_get_or_identity(key); + + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, edge_offset, local_degree) = + edge_partition.local_edges(static_cast(major_offset)); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + edge_offset}; + + auto call_pred_op = init_pred_op(edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, 
+ indices, + edge_offset); + + if (edge_partition_e_mask) { + update_result_value_output( + edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + [&edge_partition_e_mask, &call_pred_op, edge_offset] __device__(edge_t i) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + return call_pred_op(i); + } else { + return false; + } + }, + idx, + result_value_output); + } else { + update_result_value_output(edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + call_pred_op, + idx, + result_value_output); + } + idx += gridDim.x * blockDim.x; + } +} + +template +__global__ static void per_v_transform_reduce_e_mid_degree( + edge_partition_device_view_t edge_partition, + KeyIterator key_first, + KeyIterator key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper result_value_output, + EdgeOp e_op, + T init /* relevant only if update_major == true */, + T identity_element /* relevant only if update_major == true */, + ReduceOp reduce_op, + PredOp pred_op) +{ + static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< + ReduceOp>); // atomic_reduce is defined only when + // has_compatible_raft_comms_op_t is true + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using e_op_result_t = T; + using key_t = typename thrust::iterator_traits::value_type; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + static_assert(per_v_transform_reduce_e_kernel_block_size % raft::warp_size() == 0); + auto const lane_id = tid % raft::warp_size(); + auto idx = static_cast(tid / raft::warp_size()); + + using WarpReduce = cub::WarpReduce< + std::conditional_t>, int32_t, e_op_result_t>>; + [[maybe_unused]] __shared__ + std::conditional_t + temp_storage[update_major ? (per_v_transform_reduce_e_kernel_block_size / raft::warp_size()) + : int32_t{1} /* dummy */]; + + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + auto major = thrust_tuple_get_or_identity(key); + + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + edge_offset}; + + auto call_pred_op = init_pred_op(edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset); + + [[maybe_unused]] std::conditional_t + reduced_e_op_result{}; + [[maybe_unused]] std::conditional_t>, + int32_t, + std::byte /* dummy */> + first_valid_lane_id{}; + if constexpr (update_major) { + reduced_e_op_result = + (lane_id == 0) ? 
init : identity_element; // init == identity_element for reduce_op::any + if constexpr (std::is_same_v>) { + first_valid_lane_id = raft::warp_size(); + } + } + + if (edge_partition_e_mask) { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); + for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { + thrust::optional e_op_result{thrust::nullopt}; + if ((i < static_cast(local_degree)) && + (*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(i)) { + e_op_result = call_e_op(i); + } + first_valid_lane_id = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) + .Reduce(e_op_result ? lane_id : raft::warp_size(), cub::Min()); + first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); + if (lane_id == first_valid_lane_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_lane_id != raft::warp_size()) { break; } + } + } else { + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { + if ((*edge_partition_e_mask).get(edge_offset + i) & call_pred_op(i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } else { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); + for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { + thrust::optional e_op_result{thrust::nullopt}; + if (i < static_cast(local_degree) && call_pred_op(i)) { + e_op_result = call_e_op(i); + } + first_valid_lane_id = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) + .Reduce(e_op_result ? lane_id : raft::warp_size(), cub::Min()); + first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); + if (lane_id == first_valid_lane_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_lane_id != raft::warp_size()) { break; } + } + } else { + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { + if (call_pred_op(i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } + + if constexpr (update_major) { + if constexpr (std::is_same_v>) { + if (lane_id == ((first_valid_lane_id == raft::warp_size()) ? 
0 : first_valid_lane_id)) { + *(result_value_output + idx) = reduced_e_op_result; + } + } else { + reduced_e_op_result = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) + .Reduce(reduced_e_op_result, reduce_op); + if (lane_id == 0) { *(result_value_output + idx) = reduced_e_op_result; } + } + } + + idx += gridDim.x * (blockDim.x / raft::warp_size()); + } +} + +template +__global__ static void per_v_transform_reduce_e_high_degree( + edge_partition_device_view_t edge_partition, + KeyIterator key_first, + KeyIterator key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper result_value_output, + EdgeOp e_op, + T init /* relevant only if update_major == true */, + T identity_element /* relevant only if update_major == true */, + ReduceOp reduce_op, + PredOp pred_op) +{ + static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< + ReduceOp>); // atomic_reduce is defined only when + // has_compatible_raft_comms_op_t is true + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using e_op_result_t = T; + using key_t = typename thrust::iterator_traits::value_type; + + auto idx = static_cast(blockIdx.x); + + using BlockReduce = cub::BlockReduce< + std::conditional_t>, int32_t, e_op_result_t>, + std::is_same_v> + ? per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size + : per_v_transform_reduce_e_kernel_block_size>; + [[maybe_unused]] __shared__ + std::conditional_t + temp_storage; + [[maybe_unused]] __shared__ + std::conditional_t>, + int32_t, + std::byte /* dummy */> + output_thread_id; + + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + auto major = thrust_tuple_get_or_identity(key); + + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + edge_offset}; + + auto call_pred_op = init_pred_op(edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset); + + [[maybe_unused]] std::conditional_t + reduced_e_op_result{}; + [[maybe_unused]] std::conditional_t>, + int32_t, + std::byte /* dummy */> + first_valid_thread_id{}; + if constexpr (update_major) { + reduced_e_op_result = threadIdx.x == 0 + ? 
init + : identity_element; // init == identity_element for reduce_op::any + if constexpr (std::is_same_v>) { + first_valid_thread_id = per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size; + } + } + + if (edge_partition_e_mask) { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + + (per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size - 1)) / + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) * + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size; + for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { + thrust::optional e_op_result{thrust::nullopt}; + if ((i < static_cast(local_degree)) && + (*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(i)) { + e_op_result = call_e_op(i); + } + first_valid_thread_id = + BlockReduce(temp_storage) + .Reduce(e_op_result + ? threadIdx.x + : per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size, + cub::Min()); + if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } + __syncthreads(); + first_valid_thread_id = output_thread_id; + if (threadIdx.x == first_valid_thread_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_thread_id != + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) { + break; + } + } + } else { + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + if ((*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } else { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + + (per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size - 1)) / + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) * + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size; + for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { + thrust::optional e_op_result{thrust::nullopt}; + if ((i < static_cast(local_degree)) && call_pred_op(i)) { + e_op_result = call_e_op(i); + } + first_valid_thread_id = + BlockReduce(temp_storage) + .Reduce(e_op_result + ? 
threadIdx.x + : per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size, + cub::Min()); + if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } + __syncthreads(); + first_valid_thread_id = output_thread_id; + if (threadIdx.x == first_valid_thread_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_thread_id != + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) { + break; + } + } + } else { + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + if (call_pred_op(i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } + + if constexpr (update_major) { + if constexpr (std::is_same_v>) { + if (threadIdx.x == ((first_valid_thread_id == + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) + ? 0 + : first_valid_thread_id)) { + *(result_value_output + idx) = reduced_e_op_result; + } + } else { + reduced_e_op_result = BlockReduce(temp_storage).Reduce(reduced_e_op_result, reduce_op); + if (threadIdx.x == 0) { *(result_value_output + idx) = reduced_e_op_result; } + } + } + + idx += gridDim.x; + } +} + +template +void compute_priorities( + raft::comms::comms_t const& comm, + ValueIterator value_first, + raft::device_span priorities, + std::optional, raft::device_span>> + hypersparse_key_offsets, // we may not have values for the entire "range_size" if + // hypersparse_key_offsets.has_value() is true + size_t contiguous_size, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + typename thrust::iterator_traits::value_type init, + bool ignore_local_values, + rmm::cuda_stream_view stream_view) +{ + auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); + + // For each vertex, select a comm_rank among the GPUs with a value other than init (if there are + // more than one, the GPU with (comm_rank == root) has the highest priority, GPUs in the same DGX + // node should be the next) + + if (ignore_local_values) { + thrust::fill(rmm::exec_policy_nosync(stream_view), + priorities.begin(), + priorities.end(), + std::numeric_limits::max()); + } else { + thrust::tabulate( + rmm::exec_policy_nosync(stream_view), + priorities.begin(), + priorities.begin() + contiguous_size, + [value_first, root, subgroup_size, init, comm_rank, comm_size] __device__(auto offset) { + auto val = *(value_first + offset); + return (val != init) + ? 
rank_to_priority( + comm_rank, root, subgroup_size, comm_size, static_cast(offset)) + : std::numeric_limits::max(); // lowest priority + }); + if (hypersparse_key_offsets) { + thrust::fill(rmm::exec_policy_nosync(stream_view), + priorities.begin() + contiguous_size, + priorities.end(), + std::numeric_limits::max()); + if ((*hypersparse_key_offsets).index() == 0) { + auto priority_first = thrust::make_transform_iterator( + std::get<0>(*hypersparse_key_offsets).begin(), + cuda::proclaim_return_type( + [root, subgroup_size, comm_rank, comm_size] __device__(uint32_t offset) { + return rank_to_priority( + comm_rank, root, subgroup_size, comm_size, static_cast(offset)); + })); + thrust::scatter_if( + rmm::exec_policy_nosync(stream_view), + priority_first, + priority_first + std::get<0>(*hypersparse_key_offsets).size(), + std::get<0>(*hypersparse_key_offsets).begin(), + value_first + contiguous_size, + priorities.begin(), + is_not_equal_t::value_type>{init}); + } else { + auto priority_first = thrust::make_transform_iterator( + std::get<1>(*hypersparse_key_offsets).begin(), + cuda::proclaim_return_type( + [root, subgroup_size, comm_rank, comm_size] __device__(size_t offset) { + return rank_to_priority( + comm_rank, root, subgroup_size, comm_size, static_cast(offset)); + })); + thrust::scatter_if( + rmm::exec_policy_nosync(stream_view), + priority_first, + priority_first + std::get<1>(*hypersparse_key_offsets).size(), + std::get<1>(*hypersparse_key_offsets).begin(), + value_first + contiguous_size, + priorities.begin(), + is_not_equal_t::value_type>{init}); + } + } + } +} + +// return selected ranks if root. +// otherwise, it is sufficient to just return bool flags indiciating whether this rank's values are +// selected or not. +template +std::variant, + int, + priority_t>> /* root, store selected ranks */, + std::optional> /* store bitmap */> +compute_selected_ranks_from_priorities( + raft::comms::comms_t const& comm, + raft::device_span priorities, + std::optional, raft::device_span>> + hypersparse_key_offsets, // we may not have values for the entire "range_size" if + // hypersparse_key_offsets.has_value() is true + size_t contiguous_size, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + bool ignore_local_values, + rmm::cuda_stream_view stream_view) +{ + auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); + + using rank_t = std::conditional_t, int, priority_t>; + + if (comm_rank == root) { + rmm::device_uvector selected_ranks(priorities.size(), stream_view); + auto offset_priority_pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); + thrust::transform(rmm::exec_policy_nosync(stream_view), + offset_priority_pair_first, + offset_priority_pair_first + priorities.size(), + selected_ranks.begin(), + [root, subgroup_size, comm_rank, comm_size] __device__(auto pair) { + auto offset = thrust::get<0>(pair); + auto priority = thrust::get<1>(pair); + auto rank = (priority == std::numeric_limits::max()) + ? comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, offset); + return static_cast(rank); + }); + return selected_ranks; + } else { + std::optional> keep_flags{std::nullopt}; + if (!ignore_local_values) { + keep_flags = rmm::device_uvector( + packed_bool_size(hypersparse_key_offsets + ? (contiguous_size + ((*hypersparse_key_offsets).index() == 0 + ? 
std::get<0>(*hypersparse_key_offsets).size() + : std::get<1>(*hypersparse_key_offsets).size())) + : contiguous_size), + stream_view); + thrust::fill(rmm::exec_policy_nosync(stream_view), + (*keep_flags).begin(), + (*keep_flags).end(), + packed_bool_empty_mask()); + auto offset_priority_pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); + thrust::for_each( + rmm::exec_policy_nosync(stream_view), + offset_priority_pair_first, + offset_priority_pair_first + contiguous_size, + [keep_flags = raft::device_span((*keep_flags).data(), (*keep_flags).size()), + root, + subgroup_size, + comm_rank, + comm_size] __device__(auto pair) { + auto offset = thrust::get<0>(pair); + auto priority = thrust::get<1>(pair); + auto rank = (priority == std::numeric_limits::max()) + ? comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, offset); + if (rank == comm_rank) { + cuda::atomic_ref word( + keep_flags[packed_bool_offset(offset)]); + word.fetch_or(packed_bool_mask(offset), cuda::std::memory_order_relaxed); + } + }); + if (hypersparse_key_offsets) { + if ((*hypersparse_key_offsets).index() == 0) { + auto pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(size_t{contiguous_size}), + std::get<0>(*hypersparse_key_offsets).begin()); + thrust::for_each( + rmm::exec_policy_nosync(stream_view), + pair_first, + pair_first + std::get<0>(*hypersparse_key_offsets).size(), + [priorities = raft::device_span(priorities.data(), priorities.size()), + keep_flags = raft::device_span((*keep_flags).data(), (*keep_flags).size()), + root, + subgroup_size, + comm_rank, + comm_size] __device__(auto pair) { + auto offset = thrust::get<1>(pair); + auto priority = priorities[offset]; + auto rank = + (priority == std::numeric_limits::max()) + ? comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, static_cast(offset)); + if (rank == comm_rank) { + cuda::atomic_ref word( + keep_flags[packed_bool_offset(thrust::get<0>(pair))]); + word.fetch_or(packed_bool_mask(thrust::get<0>(pair)), + cuda::std::memory_order_relaxed); + } + }); + } else { + auto pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(size_t{contiguous_size}), + std::get<1>(*hypersparse_key_offsets).begin()); + thrust::for_each( + rmm::exec_policy_nosync(stream_view), + pair_first, + pair_first + std::get<1>(*hypersparse_key_offsets).size(), + [priorities = raft::device_span(priorities.data(), priorities.size()), + keep_flags = raft::device_span((*keep_flags).data(), (*keep_flags).size()), + root, + subgroup_size, + comm_rank, + comm_size] __device__(auto pair) { + auto offset = thrust::get<1>(pair); + auto priority = priorities[offset]; + auto rank = + (priority == std::numeric_limits::max()) + ? 
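
// The keep_flags buffer manipulated above is a packed-bool bitmap: 32 flags per
// uint32_t word, so marking "this rank's value was selected for key i" is a
// fetch_or of a single bit (cuda::atomic_ref on the device, a plain |= in this
// serial sketch). The helpers below are simplified stand-ins for cugraph's
// packed_bool_* utilities.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

constexpr size_t packed_bool_size(size_t n) { return (n + 31) / 32; }
constexpr size_t packed_bool_offset(size_t i) { return i / 32; }
constexpr uint32_t packed_bool_mask(size_t i) { return uint32_t{1} << (i % 32); }

int main()
{
  size_t const num_keys = 70;
  std::vector<uint32_t> keep_flags(packed_bool_size(num_keys), 0);  // all bits cleared

  // Mark keys 3, 40 and 69 as selected on this rank.
  for (size_t i : {size_t{3}, size_t{40}, size_t{69}}) {
    keep_flags[packed_bool_offset(i)] |= packed_bool_mask(i);
  }

  for (size_t i = 0; i < num_keys; ++i) {
    if (keep_flags[packed_bool_offset(i)] & packed_bool_mask(i)) {
      std::cout << "key " << i << " is kept by this rank\n";
    }
  }
  return 0;
}
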
comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, static_cast(offset)); + if (rank == comm_rank) { + cuda::atomic_ref word( + keep_flags[packed_bool_offset(thrust::get<0>(pair))]); + word.fetch_or(packed_bool_mask(thrust::get<0>(pair)), + cuda::std::memory_order_relaxed); + } + }); + } + } + } + return keep_flags; + } +} + +template +void per_v_transform_reduce_e_edge_partition( + raft::handle_t const& handle, + edge_partition_device_view_t edge_partition, + OptionalKeyIterator edge_partition_key_first, + OptionalKeyIterator edge_partition_key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper output_buffer, + EdgeOp e_op, + T major_init, + T major_identity_element, + ReduceOp reduce_op, + PredOp pred_op, + std::optional> key_segment_offsets, + std::optional> const& edge_partition_stream_pool_indices) +{ + constexpr bool use_input_key = !std::is_same_v; + + using vertex_t = typename GraphViewType::vertex_type; + using segment_key_iterator_t = + std::conditional_t; + + size_t stream_pool_size{0}; + if (edge_partition_stream_pool_indices) { + stream_pool_size = (*edge_partition_stream_pool_indices).size(); + } + if (key_segment_offsets) { + static_assert(detail::num_sparse_segments_per_vertex_partition == 3); + + if (edge_partition.dcs_nzd_vertex_count()) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[0 % stream_pool_size]) + : handle.get_stream(); + + if constexpr (update_major && !use_input_key) { // this is necessary as we don't visit + // every vertex in the hypersparse segment + thrust::fill(rmm::exec_policy_nosync(exec_stream), + output_buffer + (*key_segment_offsets)[3], + output_buffer + (*key_segment_offsets)[4], + major_init); + } + + auto segment_size = use_input_key + ? ((*key_segment_offsets)[4] - (*key_segment_offsets)[3]) + : static_cast(*(edge_partition.dcs_nzd_vertex_count())); + if (segment_size > 0) { + raft::grid_1d_thread_t update_grid(segment_size, + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[3]; } + auto segment_key_first = edge_partition_key_first; + auto segment_key_last = edge_partition_key_last; + if constexpr (use_input_key) { + segment_key_first += (*key_segment_offsets)[3]; + segment_key_last = + segment_key_first + ((*key_segment_offsets)[4] - (*key_segment_offsets)[3]); + } else { + assert(segment_key_first == nullptr); + assert(segment_key_last == nullptr); + } + detail::per_v_transform_reduce_e_hypersparse + <<>>( + edge_partition, + segment_key_first, + segment_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + segment_output_buffer, + e_op, + major_init, + reduce_op, + pred_op); + } + } + if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2]) { + auto exec_stream = edge_partition_stream_pool_indices + ? 
handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[1 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_thread_t update_grid((*key_segment_offsets)[3] - (*key_segment_offsets)[2], + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[2]; } + std::optional + segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor + // is a deleted function, segment_key_first should always have a value + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + *segment_key_first += (*key_segment_offsets)[2]; + detail::per_v_transform_reduce_e_low_degree + <<>>( + edge_partition, + *segment_key_first, + *segment_key_first + ((*key_segment_offsets)[3] - (*key_segment_offsets)[2]), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + segment_output_buffer, + e_op, + major_init, + reduce_op, + pred_op); + } + if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[2 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_warp_t update_grid((*key_segment_offsets)[2] - (*key_segment_offsets)[1], + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[1]; } + std::optional + segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor + // is a deleted function, segment_key_first should always have a value + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + *segment_key_first += (*key_segment_offsets)[1]; + detail::per_v_transform_reduce_e_mid_degree + <<>>( + edge_partition, + *segment_key_first, + *segment_key_first + ((*key_segment_offsets)[2] - (*key_segment_offsets)[1]), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + segment_output_buffer, + e_op, + major_init, + major_identity_element, + reduce_op, + pred_op); + } + if ((*key_segment_offsets)[1] > 0) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[3 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_block_t update_grid( + (*key_segment_offsets)[1], + std::is_same_v> + ? 
detail::per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size + : detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + std::optional + segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor + // is a deleted function, segment_key_first should always have a value + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + detail::per_v_transform_reduce_e_high_degree + <<>>( + edge_partition, + *segment_key_first, + *segment_key_first + (*key_segment_offsets)[1], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_buffer, + e_op, + major_init, + major_identity_element, + reduce_op, + pred_op); + } + } else { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[0 % stream_pool_size]) + : handle.get_stream(); + + size_t num_keys{}; + if constexpr (use_input_key) { + num_keys = + static_cast(thrust::distance(edge_partition_key_first, edge_partition_key_last)); + } else { + num_keys = static_cast(edge_partition.major_range_size()); + } + + if (num_keys > size_t{0}) { + raft::grid_1d_thread_t update_grid(num_keys, + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + std::optional + segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor + // is a deleted function, segment_key_first should always have a value + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + detail::per_v_transform_reduce_e_low_degree + <<>>( + edge_partition, + *segment_key_first, + *segment_key_first + num_keys, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_buffer, + e_op, + major_init, + reduce_op, + pred_op); + } + } +} + +template +void per_v_transform_reduce_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + OptionalKeyIterator sorted_unique_key_first, + OptionalKeyIterator sorted_unique_key_last, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first) +{ + constexpr bool update_major = (incoming == GraphViewType::is_storage_transposed); + constexpr bool use_input_key = !std::is_same_v; + static_assert(update_major || !use_input_key); + constexpr bool filter_input_key = + GraphViewType::is_multi_gpu && use_input_key && + std::is_same_v>; // if GraphViewType::is_multi_gpu && update_major && + // std::is_same_v>, for any + // vertex in the frontier, we need to visit only local edges + // if we find any valid local edge (FIXME: this is + // applicable even when use_input_key is false). 
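
// The per-edge-partition helper above launches a differently shaped kernel for
// each degree segment: one block per key for the high-degree segment, one warp
// per key for mid-degree, one thread per key for the low-degree and hypersparse
// keys, each on its own stream from the pool. The sketch below only shows that
// dispatch structure; the segment sizes are hypothetical, and key_segment_offsets
// follows the convention used above (offsets[1] ends the high-degree segment,
// offsets[4] ends the hypersparse one).
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

int main()
{
  std::vector<size_t> const key_segment_offsets{0, 2, 10, 100, 130};  // hypothetical
  std::vector<std::string> const kernels{
    "high_degree (one block per key)",
    "mid_degree (one warp per key)",
    "low_degree (one thread per key)",
    "hypersparse (one thread per key, zero-degree keys may be skipped)"};

  for (size_t s = 0; s + 1 < key_segment_offsets.size(); ++s) {
    auto count = key_segment_offsets[s + 1] - key_segment_offsets[s];
    if (count > 0) {
      std::cout << "launch " << kernels[s] << " for keys [" << key_segment_offsets[s] << ", "
                << key_segment_offsets[s + 1] << ") on its own stream\n";
    }
  }
  return 0;
}
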
+ + static_assert( + ReduceOp::pure_function && + ((reduce_op::has_compatible_raft_comms_op_v && + reduce_op::has_identity_element_v) || + (update_major && + std::is_same_v>))); // current restriction, to support general + // reduction, we may need to take a less + // efficient code path + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = + typename iterator_value_type_or_default_t::value_type; + + using edge_partition_src_input_device_view_t = std::conditional_t< + std::is_same_v, + detail::edge_partition_endpoint_dummy_property_device_view_t, + detail::edge_partition_endpoint_property_device_view_t< + vertex_t, + typename EdgeSrcValueInputWrapper::value_iterator, + typename EdgeSrcValueInputWrapper::value_type>>; + using edge_partition_dst_input_device_view_t = std::conditional_t< + std::is_same_v, + detail::edge_partition_endpoint_dummy_property_device_view_t, + detail::edge_partition_endpoint_property_device_view_t< + vertex_t, + typename EdgeDstValueInputWrapper::value_iterator, + typename EdgeDstValueInputWrapper::value_type>>; + using edge_partition_e_input_device_view_t = std::conditional_t< + std::is_same_v, + detail::edge_partition_edge_dummy_property_device_view_t, + detail::edge_partition_edge_property_device_view_t< + edge_t, + typename EdgeValueInputWrapper::value_iterator, + typename EdgeValueInputWrapper::value_type>>; + + static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); + + constexpr bool try_bitmap = + GraphViewType::is_multi_gpu && use_input_key && std::is_same_v; + + [[maybe_unused]] constexpr auto max_segments = + detail::num_sparse_segments_per_vertex_partition + size_t{1}; + + // 1. drop zero degree keys & compute key_segment_offsets + + auto local_vertex_partition_segment_offsets = graph_view.local_vertex_partition_segment_offsets(); + + std::conditional_t>, std::byte /* dummy */> + key_segment_offsets{}; + auto sorted_unique_nzd_key_last = sorted_unique_key_last; + if constexpr (use_input_key) { + if (local_vertex_partition_segment_offsets) { + key_segment_offsets = compute_key_segment_offsets( + sorted_unique_key_first, + sorted_unique_nzd_key_last, + raft::host_span((*local_vertex_partition_segment_offsets).data(), + (*local_vertex_partition_segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); + (*key_segment_offsets).back() = *((*key_segment_offsets).rbegin() + 1); + sorted_unique_nzd_key_last = sorted_unique_key_first + (*key_segment_offsets).back(); + } + } + + // 2. 
initialize vertex value output buffer + + if constexpr (update_major) { // no vertices in the zero degree segment are visited (otherwise, + // no need to initialize) + if constexpr (use_input_key) { + thrust::fill(handle.get_thrust_policy(), + vertex_value_output_first + + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last), + vertex_value_output_first + + thrust::distance(sorted_unique_key_first, sorted_unique_key_last), + init); + } else { + if (local_vertex_partition_segment_offsets) { + thrust::fill( + handle.get_thrust_policy(), + vertex_value_output_first + *((*local_vertex_partition_segment_offsets).rbegin() + 1), + vertex_value_output_first + *((*local_vertex_partition_segment_offsets).rbegin()), + init); + } + } + } else { + if constexpr (GraphViewType::is_multi_gpu) { + /* no need to initialize (we use minor_tmp_buffer) */ + } else { + thrust::fill(handle.get_thrust_policy(), + vertex_value_output_first, + vertex_value_output_first + graph_view.local_vertex_partition_range_size(), + init); + } + } + + // 3. filter input keys & update key_segment_offsets + + auto edge_mask_view = graph_view.edge_mask_view(); + + auto tmp_key_buffer = + allocate_optional_dataframe_buffer>( + 0, handle.get_stream()); + auto tmp_output_indices = + allocate_optional_dataframe_buffer>( + 0, handle.get_stream()); + std::conditional_t, + VertexValueOutputIterator> + tmp_vertex_value_output_first{}; + if constexpr (filter_input_key) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(static_cast(minor_comm_rank))); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, static_cast(minor_comm_rank)) + : thrust::nullopt; + + std::optional> edge_partition_stream_pool_indices{std::nullopt}; + if (local_vertex_partition_segment_offsets && (handle.get_stream_pool_size() >= max_segments)) { + edge_partition_stream_pool_indices = std::vector(max_segments); + std::iota((*edge_partition_stream_pool_indices).begin(), + (*edge_partition_stream_pool_indices).end(), + size_t{0}); + } + + if (edge_partition_stream_pool_indices) { handle.sync_stream(); } + + edge_partition_src_input_device_view_t edge_partition_src_value_input{}; + edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; + if constexpr (GraphViewType::is_storage_transposed) { + edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input); + edge_partition_dst_value_input = edge_partition_dst_input_device_view_t( + edge_dst_value_input, static_cast(minor_comm_rank)); + } else { + edge_partition_src_value_input = edge_partition_src_input_device_view_t( + edge_src_value_input, static_cast(minor_comm_rank)); + edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input); + } + auto edge_partition_e_value_input = + edge_partition_e_input_device_view_t(edge_value_input, static_cast(minor_comm_rank)); + + per_v_transform_reduce_e_edge_partition( + handle, + edge_partition, + sorted_unique_key_first, + sorted_unique_nzd_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + vertex_value_output_first, + e_op, + init, + init, + reduce_op, + pred_op, + key_segment_offsets ? 
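
// The filter_input_key branch continuing below first processes only the local
// edge partition, then keeps just the frontier keys whose output still equals
// init (false positives are allowed), remembering each surviving key's original
// output position so later results can be written through a permutation
// iterator. A serial stand-in with made-up keys and partial results:
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
  int const init = -1;  // "no valid local edge found yet"
  std::vector<int> const keys{5, 9, 12, 17, 23};
  std::vector<int> const outputs{init, 42, init, init, 7};

  std::vector<int> tmp_keys;               // keys that still need remote contributions
  std::vector<size_t> tmp_output_indices;  // where their final values must be written
  for (size_t i = 0; i < keys.size(); ++i) {
    if (outputs[i] == init) {
      tmp_keys.push_back(keys[i]);
      tmp_output_indices.push_back(i);
    }
  }

  // Later, the reduced value for the j-th surviving key is scattered back as
  // outputs[tmp_output_indices[j]] = value.
  for (size_t j = 0; j < tmp_keys.size(); ++j) {
    std::cout << "key " << tmp_keys[j] << " -> output slot " << tmp_output_indices[j] << "\n";
  }
  return 0;
}
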
std::make_optional>( + (*key_segment_offsets).data(), (*key_segment_offsets).size()) + : std::nullopt, + edge_partition_stream_pool_indices ? std::make_optional>( + (*edge_partition_stream_pool_indices).data(), + (*edge_partition_stream_pool_indices).size()) + : std::nullopt); + + if (edge_partition_stream_pool_indices) { + handle.sync_stream_pool(*edge_partition_stream_pool_indices); + } + + auto num_tmp_keys = thrust::count( + handle.get_thrust_policy(), + vertex_value_output_first, + vertex_value_output_first + + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last), + init); // we allow false positives (some edge operations may actually return init) + + resize_optional_dataframe_buffer(tmp_key_buffer, num_tmp_keys, handle.get_stream()); + resize_optional_dataframe_buffer(tmp_output_indices, num_tmp_keys, handle.get_stream()); + + auto input_first = + thrust::make_zip_iterator(sorted_unique_key_first, thrust::make_counting_iterator(size_t{0})); + thrust::copy_if( + handle.get_thrust_policy(), + input_first, + input_first + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last), + vertex_value_output_first, + thrust::make_zip_iterator(get_optional_dataframe_buffer_begin(tmp_key_buffer), + get_optional_dataframe_buffer_begin(tmp_output_indices)), + is_equal_t{init}); + + sorted_unique_key_first = get_optional_dataframe_buffer_begin(tmp_key_buffer); + sorted_unique_nzd_key_last = get_optional_dataframe_buffer_end(tmp_key_buffer); + tmp_vertex_value_output_first = thrust::make_permutation_iterator( + vertex_value_output_first, get_optional_dataframe_buffer_begin(tmp_output_indices)); + + if (key_segment_offsets) { + key_segment_offsets = compute_key_segment_offsets( + sorted_unique_key_first, + sorted_unique_nzd_key_last, + raft::host_span((*local_vertex_partition_segment_offsets).data(), + (*local_vertex_partition_segment_offsets).size()), + edge_partition.major_range_first(), + handle.get_stream()); + assert((*key_segment_offsets).back() == *((*key_segment_offsets).rbegin() + 1)); + assert(sorted_uniue_nzd_key_last == sorted_unique_key_first + (*key_segment_offsets).back()); + } + } else { + tmp_vertex_value_output_first = vertex_value_output_first; + } + + /* 4. compute subgroup_size (used to compute priority in device_gatherv) */ + + [[maybe_unused]] std::conditional_t>, + int, + std::byte /* dummy */> + subgroup_size{}; + if constexpr (GraphViewType::is_multi_gpu && update_major && + std::is_same_v>) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + int num_gpus_per_node{}; + RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); + if (comm_size <= num_gpus_per_node) { + subgroup_size = minor_comm_size; + } else { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + subgroup_size = partition_manager::map_major_comm_to_gpu_row_comm + ? std::max(num_gpus_per_node / major_comm_size, int{1}) + : std::min(minor_comm_size, num_gpus_per_node); + } + } + + // 5. 
collect max_tmp_buffer_size, approx_tmp_buffer_size_per_loop, local_key_list_sizes, + // local_v_list_range_firsts, local_v_list_range_lasts, local_key_list_deg1_sizes, + // key_segment_offset_vectors + + std::conditional_t, std::byte /* dummy */> + max_tmp_buffer_sizes{}; + std::conditional_t, std::byte /* dummy */> + tmp_buffer_size_per_loop_approximations{}; + std::conditional_t, std::byte /* dummy */> + local_key_list_sizes{}; + std::conditional_t, std::byte /* dummy */> + local_v_list_range_firsts{}; + std::conditional_t, std::byte /* dummy */> + local_v_list_range_lasts{}; + std::conditional_t>, std::byte /* dummy */> + local_key_list_deg1_sizes{}; // if global degree is 1, any valid local value should be selected + std::conditional_t>>, + std::byte /* dummy */> + key_segment_offset_vectors{}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + auto max_tmp_buffer_size = + static_cast(static_cast(handle.get_device_properties().totalGlobalMem) * 0.2); + size_t approx_tmp_buffer_size_per_loop{0}; + if constexpr (update_major) { + size_t key_size{0}; + if constexpr (use_input_key) { + if constexpr (std::is_arithmetic_v) { + key_size = sizeof(key_t); + } else { + key_size = sum_thrust_tuple_element_sizes(); + } + } + size_t value_size{0}; + if constexpr (std::is_arithmetic_v) { + value_size = sizeof(T); + } else { + value_size = sum_thrust_tuple_element_sizes(); + } + + size_t major_range_size{}; + if constexpr (use_input_key) { + major_range_size = static_cast( + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)); + ; + } else { + major_range_size = graph_view.local_vertex_partition_range_size(); + } + size_t size_per_key{}; + if constexpr (filter_input_key) { + size_per_key = + key_size + + value_size / 2; // to reflect that many keys will be filtered out, note that this is a + // simple approximation, memory requirement in this case is much more + // complex as we store additional temporary variables + + } else { + size_per_key = key_size + value_size; + } + approx_tmp_buffer_size_per_loop = major_range_size * size_per_key; + } + + size_t num_scalars = 2; // max_tmp_buffer_size, approx_tmp_buffer_size_per_loop + size_t num_scalars_less_key_segment_offsets = num_scalars; + if constexpr (use_input_key) { + num_scalars += 1; // local_key_list_size + if constexpr (try_bitmap) { + num_scalars += 2; // local_key_list_range_first, local_key_list_range_last + } + if (filter_input_key && graph_view.use_dcs()) { + num_scalars += 1; // local_key_list_degree_1_size + } + num_scalars_less_key_segment_offsets = num_scalars; + if (key_segment_offsets) { num_scalars += (*key_segment_offsets).size(); } + } + + rmm::device_uvector d_aggregate_tmps(minor_comm_size * num_scalars, + handle.get_stream()); + auto hypersparse_degree_offsets = + graph_view.local_vertex_partition_hypersparse_degree_offsets(); + thrust::tabulate( + handle.get_thrust_policy(), + d_aggregate_tmps.begin() + num_scalars * minor_comm_rank, + d_aggregate_tmps.begin() + num_scalars * minor_comm_rank + + num_scalars_less_key_segment_offsets, + [max_tmp_buffer_size, + approx_tmp_buffer_size_per_loop, + sorted_unique_key_first, + sorted_unique_nzd_key_last, + deg1_v_first = (filter_input_key && graph_view.use_dcs()) + ? 
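
// The tabulate/allgather sequence around this point packs every per-rank scalar
// (temporary-buffer sizes, key list size, range bounds, optional segment
// offsets) into one buffer so that a single device_allgather replaces several
// small host-scalar exchanges. A host-only simulation with made-up values
// (the collective itself is not performed here):
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
  constexpr int comm_size      = 3;
  constexpr size_t num_scalars = 4;  // e.g. max_tmp_buffer_size, size_per_loop, key_list_size, range bound

  std::vector<size_t> aggregate(comm_size * num_scalars);
  for (int r = 0; r < comm_size; ++r) {
    // In the real code each rank fills only its own slot and the allgather
    // replicates all slots everywhere; here we just fill everything locally.
    aggregate[r * num_scalars + 0] = 1000 + r;  // max_tmp_buffer_size
    aggregate[r * num_scalars + 1] = 100 + r;   // approx_tmp_buffer_size_per_loop
    aggregate[r * num_scalars + 2] = 10 + r;    // local key list size
    aggregate[r * num_scalars + 3] = 1 + r;     // e.g. a range bound
  }

  for (int r = 0; r < comm_size; ++r) {
    std::cout << "rank " << r << ": key list size = " << aggregate[r * num_scalars + 2] << "\n";
  }
  return 0;
}
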
thrust::make_optional(graph_view.local_vertex_partition_range_first() + + (*local_vertex_partition_segment_offsets)[3] + + *((*hypersparse_degree_offsets).rbegin() + 1)) + : thrust::nullopt, + vertex_partition_range_first = + graph_view.local_vertex_partition_range_first()] __device__(size_t i) { + if (i == 0) { + return max_tmp_buffer_size; + } else if (i == 1) { + return approx_tmp_buffer_size_per_loop; + } + if constexpr (use_input_key) { + auto v_list_size = static_cast( + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)); + if (i == 2) { return v_list_size; } + if constexpr (try_bitmap) { + if (i == 3) { + vertex_t first{}; + if (v_list_size > 0) { + first = *sorted_unique_key_first; + } else { + first = vertex_partition_range_first; + } + assert(static_cast(static_cast(first)) == first); + return static_cast(first); + } else if (i == 4) { + vertex_t last{}; + if (v_list_size > 0) { + last = *(sorted_unique_key_first + (v_list_size - 1)) + 1; + } else { + last = vertex_partition_range_first; + } + assert(static_cast(static_cast(last)) == last); + return static_cast(last); + } else if (i == 5) { + if (deg1_v_first) { + auto sorted_unique_v_first = thrust::make_transform_iterator( + sorted_unique_key_first, + cuda::proclaim_return_type([] __device__(auto key) { + return thrust_tuple_get_or_identity(key); + })); + return v_list_size - static_cast(thrust::distance( + sorted_unique_v_first, + thrust::lower_bound(thrust::seq, + sorted_unique_v_first, + sorted_unique_v_first + v_list_size, + deg1_v_first))); + } + } + } else { + if (i == 3) { + if (deg1_v_first) { + auto sorted_unique_v_first = thrust::make_transform_iterator( + sorted_unique_key_first, + cuda::proclaim_return_type([] __device__(auto key) { + return thrust_tuple_get_or_identity(key); + })); + return v_list_size - static_cast(thrust::distance( + sorted_unique_v_first, + thrust::lower_bound(thrust::seq, + sorted_unique_v_first, + sorted_unique_v_first + v_list_size, + deg1_v_first))); + } + } + } + } + assert(false); + return size_t{0}; + }); + if constexpr (use_input_key) { + if (key_segment_offsets) { + raft::update_device(d_aggregate_tmps.data() + (num_scalars * minor_comm_rank + + num_scalars_less_key_segment_offsets), + (*key_segment_offsets).data(), + (*key_segment_offsets).size(), + handle.get_stream()); + } + } + + if (minor_comm_size > 1) { + device_allgather(minor_comm, + d_aggregate_tmps.data() + minor_comm_rank * num_scalars, + d_aggregate_tmps.data(), + num_scalars, + handle.get_stream()); + } + + std::vector h_aggregate_tmps(d_aggregate_tmps.size()); + raft::update_host(h_aggregate_tmps.data(), + d_aggregate_tmps.data(), + d_aggregate_tmps.size(), + handle.get_stream()); + handle.sync_stream(); + max_tmp_buffer_sizes = std::vector(minor_comm_size); + tmp_buffer_size_per_loop_approximations = std::vector(minor_comm_size); + if constexpr (use_input_key) { + local_key_list_sizes = std::vector(minor_comm_size); + if constexpr (try_bitmap) { + local_v_list_range_firsts = std::vector(minor_comm_size); + local_v_list_range_lasts = std::vector(minor_comm_size); + } + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { + local_key_list_deg1_sizes = std::vector(minor_comm_size); + } + } + if (key_segment_offsets) { + key_segment_offset_vectors = std::vector>{}; + (*key_segment_offset_vectors).reserve(minor_comm_size); + } + } + for (int i = 0; i < minor_comm_size; ++i) { + max_tmp_buffer_sizes[i] = h_aggregate_tmps[i * num_scalars]; + tmp_buffer_size_per_loop_approximations[i] = 
h_aggregate_tmps[i * num_scalars + 1]; + if constexpr (use_input_key) { + local_key_list_sizes[i] = h_aggregate_tmps[i * num_scalars + 2]; + if constexpr (try_bitmap) { + local_v_list_range_firsts[i] = + static_cast(h_aggregate_tmps[i * num_scalars + 3]); + local_v_list_range_lasts[i] = + static_cast(h_aggregate_tmps[i * num_scalars + 4]); + } + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { + (*local_key_list_deg1_sizes)[i] = + static_cast(h_aggregate_tmps[i * num_scalars + (try_bitmap ? 5 : 3)]); + } + } + if (key_segment_offsets) { + (*key_segment_offset_vectors) + .emplace_back( + h_aggregate_tmps.begin() + i * num_scalars + num_scalars_less_key_segment_offsets, + h_aggregate_tmps.begin() + i * num_scalars + num_scalars_less_key_segment_offsets + + (*key_segment_offsets).size()); + } + } + } + } else { + if constexpr (use_input_key) { + local_key_list_sizes = std::vector{ + static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last))}; + if (key_segment_offsets) { + key_segment_offset_vectors = std::vector>(1); + (*key_segment_offset_vectors)[0] = *key_segment_offsets; + } + } + } + + // 6. compute optional bitmap info & compressed vertex list + + bool v_compressible{false}; + std:: + conditional_t>, std::byte /* dummy */> + v_list_bitmap{}; + std:: + conditional_t>, std::byte /* dummy */> + compressed_v_list{}; + if constexpr (try_bitmap) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + auto const minor_comm_rank = minor_comm.get_rank(); + + if constexpr (sizeof(vertex_t) == 8) { + vertex_t local_v_list_max_range_size{0}; + for (int i = 0; i < minor_comm_size; ++i) { + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + local_v_list_max_range_size = std::max(range_size, local_v_list_max_range_size); + } + if (local_v_list_max_range_size <= + std::numeric_limits::max()) { // broadcast 32bit offset values instead of 64 + // bit vertex IDs + v_compressible = true; + } + } + + double avg_fill_ratio{0.0}; + for (int i = 0; i < minor_comm_size; ++i) { + auto num_keys = static_cast(local_key_list_sizes[i]); + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + avg_fill_ratio += + (range_size > 0) ? (num_keys / static_cast(range_size)) : double{0.0}; + } + avg_fill_ratio /= static_cast(minor_comm_size); + double threshold_ratio = + 2.0 /* tuning parameter (consider that we need to reprodce vertex list from bitmap)*/ / + static_cast((v_compressible ? 
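
// The fill-ratio test being computed here decides how each rank broadcasts its
// frontier: a bitmap over the vertex range once the frontier is dense enough
// that range_size bits beat sending the IDs (or 32-bit offsets) themselves,
// otherwise compressed 32-bit offsets when the range fits, and raw vertex IDs
// as the fallback. The numbers below are made up, int64_t stands in for
// vertex_t, and the extra "average frontier large enough to amortize kernel
// launches" gate used above is omitted.
#include <cstdint>
#include <iostream>

int main()
{
  double const num_keys     = 3.0e6;   // frontier size on this rank
  double const range_size   = 20.0e6;  // local vertex partition range size
  bool const v_compressible = true;    // range fits in 32 bits

  double const fill_ratio      = num_keys / range_size;
  double const bits_per_key    = (v_compressible ? sizeof(uint32_t) : sizeof(int64_t)) * 8.0;
  double const threshold_ratio = 2.0 / bits_per_key;  // 2.0 accounts for re-expanding the bitmap

  if (fill_ratio > threshold_ratio) {
    std::cout << "broadcast a bitmap over the vertex range\n";
  } else if (v_compressible) {
    std::cout << "broadcast 32-bit offsets relative to the range start\n";
  } else {
    std::cout << "broadcast the vertex IDs as-is\n";
  }
  return 0;
}
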
sizeof(uint32_t) : sizeof(vertex_t)) * 8); + auto avg_key_list_size = + std::reduce(local_key_list_sizes.begin(), local_key_list_sizes.end()) / + static_cast(minor_comm_size); + + if ((avg_fill_ratio > threshold_ratio) && + (static_cast(avg_key_list_size) > + packed_bools_per_word() * + 32 /* tuning parameter, to considerr additional kernel launch overhead */)) { + v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_key_first, + sorted_unique_nzd_key_last, + local_v_list_range_firsts[minor_comm_rank], + local_v_list_range_lasts[minor_comm_rank], + handle.get_stream()); + } else if (v_compressible) { + rmm::device_uvector tmps(local_key_list_sizes[minor_comm_rank], + handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + sorted_unique_key_first, + sorted_unique_nzd_key_last, + tmps.begin(), + cuda::proclaim_return_type( + [range_first = local_v_list_range_firsts[minor_comm_rank]] __device__( + auto v) { return static_cast(v - range_first); })); + compressed_v_list = std::move(tmps); + } + } + } + + bool uint32_key_output_offset = false; + if constexpr (GraphViewType::is_multi_gpu && update_major && + std::is_same_v>) { + size_t max_key_offset_size = std::numeric_limits::max(); + if constexpr (filter_input_key) { + max_key_offset_size = std::reduce( + local_key_list_sizes.begin(), local_key_list_sizes.end(), size_t{0}, [](auto l, auto r) { + return std::max(l, r); + }); + } else { + static_assert(!use_input_key); + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(i)); + auto const& segment_offsets = graph_view.local_edge_partition_segment_offsets(i); + + auto output_range_size = + segment_offsets ? *((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ + : edge_partition.major_range_size(); + + max_key_offset_size = std::max(static_cast(output_range_size), max_key_offset_size); + } + } + uint32_key_output_offset = + (max_key_offset_size <= static_cast(std::numeric_limits::max())); + } + + // 7. set-up stream pool & events + + std::optional> stream_pool_indices{std::nullopt}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + auto max_tmp_buffer_size = + std::reduce(max_tmp_buffer_sizes.begin(), max_tmp_buffer_sizes.end()) / + static_cast(minor_comm_size); + auto approx_tmp_buffer_size_per_loop = + std::reduce(tmp_buffer_size_per_loop_approximations.begin(), + tmp_buffer_size_per_loop_approximations.end()) / + static_cast(minor_comm_size); + size_t num_streams_per_loop{1}; + if (local_vertex_partition_segment_offsets && (handle.get_stream_pool_size() >= max_segments)) { + num_streams_per_loop = std::max( + std::min(size_t{8} / graph_view.number_of_local_edge_partitions(), max_segments), + size_t{ + 1}); // Note that "CUDA_DEVICE_MAX_CONNECTIONS (default: 8, can be set to [1, 32])" sets + // the number of queues, if the total number of streams exceeds this number, jobs on + // different streams can be sent to one queue leading to false dependency. Setting + // num_concurrent_loops above the number of queues has some benefits in NCCL + // communications but creating too many streams just for compute may not help. 
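
// init_stream_pool_indices (called just below) decides how many edge partitions
// are processed concurrently. Assumed behaviour, sketched here (the actual
// implementation lives elsewhere in cugraph and may differ): concurrency is
// capped by how many per-loop temporary buffers fit in the memory budget and by
// how many streams the pool can supply.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

std::vector<size_t> init_stream_pool_indices_sketch(size_t max_tmp_buffer_size,
                                                    size_t approx_tmp_buffer_size_per_loop,
                                                    size_t num_edge_partitions,
                                                    size_t num_streams_per_loop,
                                                    size_t stream_pool_size)
{
  size_t num_loops =
    std::min({approx_tmp_buffer_size_per_loop > 0
                ? max_tmp_buffer_size / approx_tmp_buffer_size_per_loop
                : num_edge_partitions,
              num_edge_partitions,
              stream_pool_size / num_streams_per_loop});
  num_loops = std::max(num_loops, size_t{1});
  std::vector<size_t> indices(num_loops * num_streams_per_loop);
  std::iota(indices.begin(), indices.end(), size_t{0});
  return indices;
}

int main()
{
  auto indices = init_stream_pool_indices_sketch(size_t{1} << 30 /* ~1 GiB budget */,
                                                 size_t{96} << 20 /* ~96 MiB per loop */,
                                                 8 /* edge partitions */,
                                                 4 /* streams per loop */,
                                                 16 /* stream pool size */);
  std::cout << indices.size() << " stream indices -> " << indices.size() / 4
            << " concurrent loops\n";
  return 0;
}
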
+ } + stream_pool_indices = init_stream_pool_indices(max_tmp_buffer_size, + approx_tmp_buffer_size_per_loop, + graph_view.number_of_local_edge_partitions(), + num_streams_per_loop, + handle.get_stream_pool_size()); + if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; } + } + + size_t num_concurrent_loops{1}; + std::optional> loop_stream_pool_indices{ + std::nullopt}; // first num_concurrent_loops streams from stream_pool_indices + if (stream_pool_indices) { + num_concurrent_loops = + std::min(graph_view.number_of_local_edge_partitions(), (*stream_pool_indices).size()); + loop_stream_pool_indices = std::vector(num_concurrent_loops); + std::iota((*loop_stream_pool_indices).begin(), (*loop_stream_pool_indices).end(), size_t{0}); + } + + // 8. set-up temporary buffers + + using minor_tmp_buffer_type = std::conditional_t, + edge_dst_property_t>; + [[maybe_unused]] std::unique_ptr minor_tmp_buffer{}; + if constexpr (GraphViewType::is_multi_gpu && !update_major) { + minor_tmp_buffer = std::make_unique(handle, graph_view); + auto minor_init = init; + auto view = minor_tmp_buffer->view(); + if (view.keys()) { // defer applying the initial value to the end as minor_tmp_buffer ma not + // store values for the entire minor rangey + minor_init = ReduceOp::identity_element; + } else { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + minor_init = (major_comm_rank == 0) ? init : ReduceOp::identity_element; + } + fill_edge_minor_property(handle, graph_view, minor_tmp_buffer->mutable_view(), minor_init); + } + + using edge_partition_minor_output_device_view_t = + std::conditional_tmutable_view().value_first())>, + void /* dummy */>; + + auto counters = allocate_optional_dataframe_buffer< + std::conditional_t>( + num_concurrent_loops, handle.get_stream()); + + if constexpr (!GraphViewType::is_multi_gpu || !use_input_key) { + if (loop_stream_pool_indices) { handle.sync_stream(); } + } + + // 9. 
process local edge partitions + + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i += num_concurrent_loops) { + auto loop_count = + std::min(num_concurrent_loops, graph_view.number_of_local_edge_partitions() - i); + + std::conditional_t< + GraphViewType::is_multi_gpu && use_input_key, + std::conditional_t< + try_bitmap, + std::vector, rmm::device_uvector>>, + std::vector>>, + std::byte /* dummy */> + edge_partition_key_buffers{}; + std::conditional_t, rmm::device_uvector>>>, + std::byte /* dummy */> + edge_partition_hypersparse_key_offset_vectors{}; // drop zero local degree keys in th + // hypersparse regione + std::conditional_t>, std::byte /* dummy */> + edge_partition_deg1_hypersparse_key_offset_counts{}; + std::vector process_local_edges(loop_count, true); + + if constexpr (GraphViewType::is_multi_gpu && use_input_key) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + auto const minor_comm_rank = minor_comm.get_rank(); + + edge_partition_key_buffers.reserve(loop_count); + + std::conditional_t>>, + std::byte /* dummy */> + edge_partition_bitmap_buffers{std::nullopt}; + if constexpr (try_bitmap) { + if (v_list_bitmap) { + edge_partition_bitmap_buffers = std::vector>{}; + (*edge_partition_bitmap_buffers).reserve(loop_count); + } + } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + bool use_bitmap_buffer = false; + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { + (*edge_partition_bitmap_buffers) + .emplace_back(packed_bool_size(local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]), + handle.get_stream()); + use_bitmap_buffer = true; + } + } + if (!use_bitmap_buffer) { + bool allocated{false}; + if constexpr (try_bitmap) { + if (v_compressible) { + edge_partition_key_buffers.push_back(rmm::device_uvector( + local_key_list_sizes[partition_idx], handle.get_stream())); + allocated = true; + } + } + if (!allocated) { + edge_partition_key_buffers.push_back(allocate_dataframe_buffer( + local_key_list_sizes[partition_idx], handle.get_stream())); + } + } + + if constexpr (filter_input_key) { + if (static_cast(partition_idx) == minor_comm_rank) { + process_local_edges[j] = false; + } + } + } + + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + if constexpr (try_bitmap) { + if (v_list_bitmap) { + device_bcast(minor_comm, + (*v_list_bitmap).data(), + get_dataframe_buffer_begin((*edge_partition_bitmap_buffers)[j]), + size_dataframe_buffer((*edge_partition_bitmap_buffers)[j]), + static_cast(partition_idx), + handle.get_stream()); + } else if (compressed_v_list) { + device_bcast(minor_comm, + (*compressed_v_list).data(), + get_dataframe_buffer_begin(std::get<0>(edge_partition_key_buffers[j])), + local_key_list_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } else { + device_bcast(minor_comm, + sorted_unique_key_first, + get_dataframe_buffer_begin(std::get<1>(edge_partition_key_buffers[j])), + local_key_list_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } + } else { + device_bcast(minor_comm, + sorted_unique_key_first, + get_dataframe_buffer_begin(edge_partition_key_buffers[j]), + local_key_list_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } + } + device_group_end(minor_comm); + if (loop_stream_pool_indices) { handle.sync_stream(); } + + if constexpr 
(try_bitmap) { + if (edge_partition_bitmap_buffers) { + // copy keys from temporary bitmap buffers to key buffers (copy only the sparse segments + // if filter_input_key is true) + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + std::variant, rmm::device_uvector> keys = + rmm::device_uvector(0, loop_stream); + if (v_compressible) { + std::get<0>(keys).resize( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + } else { + keys = rmm::device_uvector( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + } + + auto& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + if (process_local_edges[j]) { + auto range_first = local_v_list_range_firsts[partition_idx]; + auto range_last = local_v_list_range_lasts[partition_idx]; + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { // skip copying the hypersparse segment + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + range_last = std::min(range_last, *(edge_partition.major_hypersparse_first())); + } + } + if (range_first < range_last) { + if (keys.index() == 0) { + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + get_dataframe_buffer_begin(std::get<0>(keys)), + raft::device_span( + counters.data() + j, + size_t{1}), // dummy, we already know the counts (i.e. + // (*key_segment_offset_vectors)[partition_idx][3]) + uint32_t{0}, + static_cast(range_last - range_first), + loop_stream); + } else { + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + get_dataframe_buffer_begin(std::get<1>(keys)), + raft::device_span( + counters.data() + j, + size_t{1}), // dummy, we already know the counts (i.e. + // (*key_segment_offset_vectors)[partition_idx][3]) + range_first, + range_last, + loop_stream); + } + } + } else { + rx_bitmap.resize(0, loop_stream); + rx_bitmap.shrink_to_fit(loop_stream); + } + edge_partition_key_buffers.push_back(std::move(keys)); + } + } + } + + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { + edge_partition_hypersparse_key_offset_vectors = + std::vector, rmm::device_uvector>>{}; + (*edge_partition_hypersparse_key_offset_vectors).reserve(loop_count); + edge_partition_deg1_hypersparse_key_offset_counts = std::vector(loop_count, 0); + + std::conditional_t, + rmm::device_uvector>>, + std::vector>>>, + std::byte /* dummy */> + edge_partition_new_key_buffers{}; + bool allocate_new_key_buffer{true}; + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { allocate_new_key_buffer = false; } + } + if (allocate_new_key_buffer) { // allocate new key buffers and copy the sparse segment + // keys to the new key buffers + if constexpr (try_bitmap) { + edge_partition_new_key_buffers = std::vector< + std::variant, rmm::device_uvector>>{}; + } else { + edge_partition_new_key_buffers = std::vector>{}; + } + (*edge_partition_new_key_buffers).reserve(loop_count); + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
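
// retrieve_vertex_list_from_bitmap (used above) turns a broadcast bitmap back
// into explicit frontier entries: bit b of word w being set means vertex
// range_first + w * 32 + b is present. The device version also reports the
// count through a small counter buffer; this serial sketch just rebuilds the
// list with made-up inputs.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
  using vertex_t = int64_t;
  vertex_t const range_first = 1000;
  std::vector<uint32_t> const bitmap{0x00000005u, 0x80000000u};  // global bits 0, 2 and 63 set

  std::vector<vertex_t> vertices;
  for (size_t w = 0; w < bitmap.size(); ++w) {
    for (int b = 0; b < 32; ++b) {
      if (bitmap[w] & (uint32_t{1} << b)) {
        vertices.push_back(range_first + static_cast<vertex_t>(w) * 32 + b);
      }
    }
  }

  // Prints 1000, 1002 and 1063.
  for (auto v : vertices) { std::cout << v << "\n"; }
  return 0;
}
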
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + if constexpr (try_bitmap) { + if (v_compressible) { + auto new_key_buffer = rmm::device_uvector( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + if (process_local_edges[j]) { + thrust::copy( + rmm::exec_policy_nosync(loop_stream), + get_dataframe_buffer_begin(std::get<0>(edge_partition_key_buffers[j])), + get_dataframe_buffer_begin(std::get<0>(edge_partition_key_buffers[j])) + + key_segment_offsets[3], + get_dataframe_buffer_begin(new_key_buffer)); + } else { + std::get<0>(edge_partition_key_buffers[j]).resize(0, loop_stream); + std::get<0>(edge_partition_key_buffers[j]).shrink_to_fit(loop_stream); + } + (*edge_partition_new_key_buffers).push_back(std::move(new_key_buffer)); + } else { + auto new_key_buffer = rmm::device_uvector( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + if (process_local_edges[j]) { + thrust::copy( + rmm::exec_policy_nosync(loop_stream), + get_dataframe_buffer_begin(std::get<1>(edge_partition_key_buffers[j])), + get_dataframe_buffer_begin(std::get<1>(edge_partition_key_buffers[j])) + + key_segment_offsets[3], + get_dataframe_buffer_begin(new_key_buffer)); + } else { + std::get<1>(edge_partition_key_buffers[j]).resize(0, loop_stream); + std::get<1>(edge_partition_key_buffers[j]).shrink_to_fit(loop_stream); + } + (*edge_partition_new_key_buffers).push_back(std::move(new_key_buffer)); + } + } else { + auto new_key_buffer = allocate_dataframe_buffer( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + if (process_local_edges[j]) { + thrust::copy(rmm::exec_policy_nosync(loop_stream), + get_dataframe_buffer_begin(edge_partition_key_buffers[j]), + get_dataframe_buffer_begin(edge_partition_key_buffers[j]) + + key_segment_offsets[3], + get_dataframe_buffer_begin(new_key_buffer)); + } else { + edge_partition_key_buffers[j].resize(0, loop_stream); + edge_partition_key_buffers[j].shrink_to_fit(loop_stream); + } + (*edge_partition_new_key_buffers).push_back(std::move(new_key_buffer)); + } + } + } + + if constexpr (try_bitmap) { // if we are using a bitmap buffer + if (v_list_bitmap) { + std::vector> input_count_offset_vectors{}; + input_count_offset_vectors.reserve(loop_count); + + std::vector> filtered_bitmap_vectors{}; + std::vector> output_count_offset_vectors{}; + filtered_bitmap_vectors.reserve(loop_count); + output_count_offset_vectors.reserve(loop_count); + + std::vector range_offset_firsts(loop_count, 0); + std::vector range_offset_lasts(loop_count, 0); + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + rmm::device_uvector input_count_offsets(0, loop_stream); + if (process_local_edges[j]) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); + + auto range_offset_first = + std::min((edge_partition.major_range_first() + (*segment_offsets)[3] > + local_v_list_range_firsts[partition_idx]) + ? 
((edge_partition.major_range_first() + (*segment_offsets)[3]) - + local_v_list_range_firsts[partition_idx]) + : vertex_t{0}, + local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]); + auto range_offset_last = + std::min(((edge_partition.major_range_first() + (*segment_offsets)[4]) > + local_v_list_range_firsts[partition_idx]) + ? ((edge_partition.major_range_first() + (*segment_offsets)[4]) - + local_v_list_range_firsts[partition_idx]) + : vertex_t{0}, + local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]); + if (range_offset_first < range_offset_last) { + auto const& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + auto input_count_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(packed_bool_offset(range_offset_first)), + cuda::proclaim_return_type( + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + range_offset_first] __device__(size_t i) { + auto word = range_bitmap[i]; + if (i == packed_bool_offset(range_offset_first)) { + word &= ~packed_bool_partial_mask( + range_offset_first % + packed_bools_per_word()); // clear the bits in the sparse region + } + return static_cast(__popc(word)); + })); + input_count_offsets.resize( + (rx_bitmap.size() - packed_bool_offset(range_offset_first)) + 1, loop_stream); + input_count_offsets.set_element_to_zero_async(0, loop_stream); + thrust::inclusive_scan( + rmm::exec_policy_nosync(loop_stream), + input_count_first, + input_count_first + + (rx_bitmap.size() - packed_bool_offset(range_offset_first)), + input_count_offsets.begin() + 1); + } + range_offset_firsts[j] = range_offset_first; + range_offset_lasts[j] = range_offset_last; + } + input_count_offset_vectors.push_back(std::move(input_count_offsets)); + } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + rmm::device_uvector filtered_bitmap(0, loop_stream); + rmm::device_uvector output_count_offsets(0, loop_stream); + if (process_local_edges[j]) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + + auto segment_bitmap = *(edge_partition.dcs_nzd_range_bitmap()); + + auto range_offset_first = range_offset_firsts[j]; + auto range_offset_last = range_offset_lasts[j]; + if (range_offset_first < range_offset_last) { + auto const& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + filtered_bitmap.resize( + rx_bitmap.size() - packed_bool_offset(range_offset_first), loop_stream); + thrust::tabulate( + rmm::exec_policy_nosync(loop_stream), + filtered_bitmap.begin(), + filtered_bitmap.end(), + cuda::proclaim_return_type( + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + segment_bitmap = raft::device_span(segment_bitmap.data(), + segment_bitmap.size()), + range_first = local_v_list_range_firsts[partition_idx], + range_offset_first, + range_offset_last, + major_hypersparse_first = + *(edge_partition.major_hypersparse_first())] __device__(size_t i) { + auto this_word_range_offset_first = cuda::std::max( + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()), + range_offset_first); + auto this_word_range_offset_last = + cuda::std::min(static_cast( + (packed_bool_offset(range_offset_first) + (i + 1)) * + packed_bools_per_word()), + range_offset_last); + auto range_lead_bits = static_cast(this_word_range_offset_first % + packed_bools_per_word()); + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask(range_offset_first % + packed_bools_per_word()); + } + auto this_word_hypersparse_offset_first = + (range_first + this_word_range_offset_first) - major_hypersparse_first; + auto num_bits = static_cast(this_word_range_offset_last - + this_word_range_offset_first); + auto hypersparse_lead_bits = + static_cast(this_word_hypersparse_offset_first) % + packed_bools_per_word(); + auto segment_bitmap_word = ((segment_bitmap[packed_bool_offset( + this_word_hypersparse_offset_first)] >> + hypersparse_lead_bits)) + << range_lead_bits; + auto remaining_bits = + (num_bits > (packed_bools_per_word() - hypersparse_lead_bits)) + ? 
(num_bits - (packed_bools_per_word() - hypersparse_lead_bits)) + : size_t{0}; + if (remaining_bits > 0) { + segment_bitmap_word |= + ((segment_bitmap + [packed_bool_offset(this_word_hypersparse_offset_first) + 1] & + packed_bool_partial_mask(remaining_bits)) + << ((packed_bools_per_word() - hypersparse_lead_bits) + + range_lead_bits)); + } + return range_bitmap_word & segment_bitmap_word; + })); + auto output_count_first = thrust::make_transform_iterator( + filtered_bitmap.begin(), + cuda::proclaim_return_type([] __device__(uint32_t word) { + return static_cast(__popc(word)); + })); + output_count_offsets.resize(filtered_bitmap.size() + 1, loop_stream); + output_count_offsets.set_element_to_zero_async(0, loop_stream); + thrust::inclusive_scan(rmm::exec_policy_nosync(loop_stream), + output_count_first, + output_count_first + filtered_bitmap.size(), + output_count_offsets.begin() + 1); + } + } + filtered_bitmap_vectors.push_back(std::move(filtered_bitmap)); + output_count_offset_vectors.push_back(std::move(output_count_offsets)); + } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + auto& keys = edge_partition_key_buffers[j]; + std::variant, rmm::device_uvector> offsets = + rmm::device_uvector(0, loop_stream); + if (uint32_key_output_offset) { + std::get<0>(offsets).resize(process_local_edges[j] + ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } else { + offsets = rmm::device_uvector( + process_local_edges[j] ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } + + if (process_local_edges[j]) { + auto range_offset_first = range_offset_firsts[j]; + auto range_offset_last = range_offset_lasts[j]; + if (range_offset_first < range_offset_last) { + auto const& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + auto const& input_count_offsets = input_count_offset_vectors[j]; + auto const& filtered_bitmap = filtered_bitmap_vectors[j]; + auto const& output_count_offsets = output_count_offset_vectors[j]; + + if (keys.index() == 0) { + if (offsets.index() == 0) { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(filtered_bitmap.size()), + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + filtered_bitmap = raft::device_span( + filtered_bitmap.data(), filtered_bitmap.size()), + input_count_offsets = raft::device_span( + input_count_offsets.data(), input_count_offsets.size()), + output_count_offsets = raft::device_span( + output_count_offsets.data(), output_count_offsets.size()), + output_key_first = + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + output_offset_first = std::get<0>(offsets).begin(), + range_offset_first, + start_key_offset = key_segment_offsets[3]] __device__(size_t i) { + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask( + range_offset_first % packed_bools_per_word()); + } + auto filtered_bitmap_word = filtered_bitmap[i]; + auto lead_bits = (i == 0) + ? 
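
// The expansion step below writes, for every set bit of a filtered bitmap word,
// both where the surviving key goes in the output (a prefix sum of per-word
// popcounts) and its offset into the originally broadcast key list (a popcount
// over the range bitmap's bits below that position; __popc over a partial mask
// plus __fns on the device). A serial sketch with made-up single-word bitmaps;
// the vertex-offset output written by the real kernel is omitted, and
// std::popcount needs C++20.
#include <bit>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
  std::vector<uint32_t> const range_bitmap{0x000000F2u};     // broadcast keys at bits 1,4,5,6,7
  std::vector<uint32_t> const filtered_bitmap{0x00000052u};  // of those, bits 1,4,6 survive
  size_t const start_key_offset = 100;  // where this range starts in the broadcast key list

  // Where each word's surviving keys start in the output.
  std::vector<size_t> output_start(filtered_bitmap.size() + 1, 0);
  for (size_t w = 0; w < filtered_bitmap.size(); ++w) {
    output_start[w + 1] = output_start[w] + std::popcount(filtered_bitmap[w]);
  }

  for (size_t w = 0; w < filtered_bitmap.size(); ++w) {
    int j = 0;
    for (int b = 0; b < 32; ++b) {
      if (filtered_bitmap[w] & (uint32_t{1} << b)) {
        uint32_t below    = (b == 0) ? 0u : (range_bitmap[w] & ((uint32_t{1} << b) - 1u));
        size_t key_offset = start_key_offset + std::popcount(below);
        std::cout << "output slot " << output_start[w] + j << " <- key offset " << key_offset
                  << "\n";
        ++j;
      }
    }
  }
  return 0;
}
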
static_cast(range_offset_first % + packed_bools_per_word()) + : static_cast(0); + auto this_word_start_v_offset = + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()); + auto this_word_start_key_offset = + static_cast(start_key_offset + input_count_offsets[i]); + auto this_word_output_start_offset = output_count_offsets[i]; + for (int j = 0; j < __popc(filtered_bitmap_word); ++j) { + auto jth_set_bit_pos = static_cast( + __fns(filtered_bitmap_word, lead_bits, j + 1)); + *(output_key_first + (this_word_output_start_offset + j)) = + this_word_start_v_offset + jth_set_bit_pos; + *(output_offset_first + (this_word_output_start_offset + j)) = + this_word_start_key_offset + + static_cast(__popc( + range_bitmap_word & packed_bool_partial_mask(jth_set_bit_pos))); + } + }); + } else { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(filtered_bitmap.size()), + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + filtered_bitmap = raft::device_span( + filtered_bitmap.data(), filtered_bitmap.size()), + input_count_offsets = raft::device_span( + input_count_offsets.data(), input_count_offsets.size()), + output_count_offsets = raft::device_span( + output_count_offsets.data(), output_count_offsets.size()), + output_key_first = + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + output_offset_first = std::get<1>(offsets).begin(), + range_offset_first, + start_key_offset = key_segment_offsets[3]] __device__(size_t i) { + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask( + range_offset_first % packed_bools_per_word()); + } + auto filtered_bitmap_word = filtered_bitmap[i]; + auto lead_bits = (i == 0) + ? 
static_cast(range_offset_first % + packed_bools_per_word()) + : static_cast(0); + auto this_word_start_v_offset = + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()); + auto this_word_start_key_offset = + static_cast(start_key_offset + input_count_offsets[i]); + auto this_word_output_start_offset = output_count_offsets[i]; + for (int j = 0; j < __popc(filtered_bitmap_word); ++j) { + auto jth_set_bit_pos = static_cast( + __fns(filtered_bitmap_word, lead_bits, j + 1)); + *(output_key_first + (this_word_output_start_offset + j)) = + this_word_start_v_offset + jth_set_bit_pos; + *(output_offset_first + (this_word_output_start_offset + j)) = + this_word_start_key_offset + + static_cast(__popc( + range_bitmap_word & packed_bool_partial_mask(jth_set_bit_pos))); + } + }); + } + } else { + if (offsets.index() == 0) { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(filtered_bitmap.size()), + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + filtered_bitmap = raft::device_span( + filtered_bitmap.data(), filtered_bitmap.size()), + input_count_offsets = raft::device_span( + input_count_offsets.data(), input_count_offsets.size()), + output_count_offsets = raft::device_span( + output_count_offsets.data(), output_count_offsets.size()), + output_key_first = + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + output_offset_first = std::get<0>(offsets).begin(), + range_first = local_v_list_range_firsts[partition_idx], + range_offset_first, + start_key_offset = key_segment_offsets[3]] __device__(size_t i) { + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask( + range_offset_first % packed_bools_per_word()); + } + auto filtered_bitmap_word = filtered_bitmap[i]; + auto lead_bits = (i == 0) + ? 
static_cast(range_offset_first % + packed_bools_per_word()) + : static_cast(0); + auto this_word_start_v = + range_first + + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()); + auto this_word_start_key_offset = + static_cast(start_key_offset + input_count_offsets[i]); + auto this_word_output_start_offset = output_count_offsets[i]; + for (int j = 0; j < __popc(filtered_bitmap_word); ++j) { + auto jth_set_bit_pos = static_cast( + __fns(filtered_bitmap_word, lead_bits, j + 1)); + *(output_key_first + (this_word_output_start_offset + j)) = + this_word_start_v + jth_set_bit_pos; + *(output_offset_first + (this_word_output_start_offset + j)) = + this_word_start_key_offset + + static_cast(__popc( + range_bitmap_word & packed_bool_partial_mask(jth_set_bit_pos))); + } + }); + } else { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(filtered_bitmap.size()), + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + filtered_bitmap = raft::device_span( + filtered_bitmap.data(), filtered_bitmap.size()), + input_count_offsets = raft::device_span( + input_count_offsets.data(), input_count_offsets.size()), + output_count_offsets = raft::device_span( + output_count_offsets.data(), output_count_offsets.size()), + output_key_first = + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + output_offset_first = std::get<1>(offsets).begin(), + range_first = local_v_list_range_firsts[partition_idx], + range_offset_first, + start_key_offset = key_segment_offsets[3]] __device__(size_t i) { + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask( + range_offset_first % packed_bools_per_word()); + } + auto filtered_bitmap_word = filtered_bitmap[i]; + auto lead_bits = (i == 0) + ? static_cast(range_offset_first % + packed_bools_per_word()) + : static_cast(0); + auto this_word_start_v = + range_first + + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()); + auto this_word_start_key_offset = + static_cast(start_key_offset + input_count_offsets[i]); + auto this_word_output_start_offset = output_count_offsets[i]; + for (int j = 0; j < __popc(filtered_bitmap_word); ++j) { + auto jth_set_bit_pos = static_cast( + __fns(filtered_bitmap_word, lead_bits, j + 1)); + *(output_key_first + (this_word_output_start_offset + j)) = + this_word_start_v + jth_set_bit_pos; + *(output_offset_first + (this_word_output_start_offset + j)) = + this_word_start_key_offset + + static_cast(__popc( + range_bitmap_word & packed_bool_partial_mask(jth_set_bit_pos))); + } + }); + } + } + thrust::transform( + rmm::exec_policy_nosync(loop_stream), + output_count_offsets.begin() + (output_count_offsets.size() - 1), + output_count_offsets.end(), + counters.data() + j, + typecast_t{}); + } else { + thrust::fill(rmm::exec_policy_nosync(loop_stream), + counters.data() + j, + counters.data() + (j + 1), + size_t{0}); + } + } + + (*edge_partition_hypersparse_key_offset_vectors).push_back(std::move(offsets)); + } + } + } + if (edge_partition_new_key_buffers) { // if there is no bitmap buffer + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + auto& keys = edge_partition_key_buffers[j]; + std::variant, rmm::device_uvector> offsets = + rmm::device_uvector(0, loop_stream); + if (uint32_key_output_offset) { + std::get<0>(offsets).resize(process_local_edges[j] + ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } else { + offsets = rmm::device_uvector( + process_local_edges[j] ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } + + if (process_local_edges[j]) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); + + auto segment_bitmap = *(edge_partition.dcs_nzd_range_bitmap()); + + auto& new_keys = (*edge_partition_new_key_buffers)[j]; + if constexpr (try_bitmap) { + assert(!v_list_bitmap); + if (keys.index() == 0) { + auto flag_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + cuda::proclaim_return_type( + [segment_bitmap = raft::device_span(segment_bitmap.data(), + segment_bitmap.size()), + range_first = local_v_list_range_firsts[partition_idx], + major_hypersparse_first = + *(edge_partition + .major_hypersparse_first())] __device__(uint32_t v_offset) { + auto v = range_first + static_cast(v_offset); + auto segment_offset = v - major_hypersparse_first; + return ((segment_bitmap[packed_bool_offset(segment_offset)] & + packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); + })); + if (offsets.index() == 0) { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(std::get<0>(keys)), + thrust::make_counting_iterator(uint32_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<0>(new_keys)) + + key_segment_offsets[3], + std::get<0>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(std::get<0>(keys)), + thrust::make_counting_iterator(size_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<0>(new_keys)) + + key_segment_offsets[3], + std::get<1>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } else { + auto flag_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(std::get<1>(keys)) + key_segment_offsets[3], + cuda::proclaim_return_type( + [segment_bitmap = raft::device_span(segment_bitmap.data(), + segment_bitmap.size()), + major_hypersparse_first = + *(edge_partition.major_hypersparse_first())] __device__(vertex_t v) { + auto segment_offset = v - major_hypersparse_first; + return ((segment_bitmap[packed_bool_offset(segment_offset)] & + packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); + })); + if (offsets.index() == 0) { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(std::get<1>(keys)), + thrust::make_counting_iterator(uint32_t{0})) + + 
key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<1>(new_keys)) + + key_segment_offsets[3], + std::get<0>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(std::get<1>(keys)), + thrust::make_counting_iterator(size_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<1>(new_keys)) + + key_segment_offsets[3], + std::get<1>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } + } else { + auto flag_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(keys) + key_segment_offsets[3], + cuda::proclaim_return_type( + [segment_bitmap = raft::device_span(segment_bitmap.data(), + segment_bitmap.size()), + major_hypersparse_first = + *(edge_partition.major_hypersparse_first())] __device__(auto key) { + auto segment_offset = + thrust_tuple_get_or_identity(key) - major_hypersparse_first; + return ((segment_bitmap[packed_bool_offset(segment_offset)] & + packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); + })); + if (offsets.index() == 0) { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(keys), + thrust::make_counting_iterator(uint32_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(new_keys) + key_segment_offsets[3], + std::get<0>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(keys), + thrust::make_counting_iterator(size_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(new_keys) + key_segment_offsets[3], + std::get<1>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } + } + + (*edge_partition_hypersparse_key_offset_vectors).push_back(std::move(offsets)); + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + if (edge_partition_new_key_buffers) { + for (size_t j = 0; j < loop_count; ++j) { + edge_partition_key_buffers[j] = std::move((*edge_partition_new_key_buffers)[j]); + } + } + if (edge_partition_bitmap_buffers) { (*edge_partition_bitmap_buffers).clear(); } + + std::vector h_counts(loop_count); + raft::update_host(h_counts.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + if (process_local_edges[j]) { + auto& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + auto& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + if (keys.index() == 0) { + resize_dataframe_buffer( + std::get<0>(keys), key_segment_offsets[3] + h_counts[j], loop_stream); + } else { + resize_dataframe_buffer( + std::get<1>(keys), key_segment_offsets[3] + h_counts[j], loop_stream); + } + } else { + resize_dataframe_buffer(keys, key_segment_offsets[3] + h_counts[j], loop_stream); + } + // skip shrink_to_fit to cut execution time + + auto& offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; + if (offsets.index() == 0) { + std::get<0>(offsets).resize(h_counts[j], loop_stream); + } else { + std::get<1>(offsets).resize(h_counts[j], loop_stream); + } + // skip shrink_to_fit to cut execution time + } + } + + { // update edge_partition_deg1_hypersparse_key_offset_counts + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + + std::vector h_ptrs( + loop_count); // pointers to hypersparse key offset vectors + std::vector h_scalars( + loop_count * 2); // (key offset vector sizes, start degree 1 key offset) + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + if (process_local_edges[j]) { + auto const& offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; + if (offsets.index() == 0) { + h_ptrs[j] = static_cast(std::get<0>(offsets).data()); + h_scalars[j * 2] = std::get<0>(offsets).size(); + } else { + h_ptrs[j] = static_cast(std::get<1>(offsets).data()); + h_scalars[j * 2] = std::get<1>(offsets).size(); + } + h_scalars[j * 2 + 1] = + local_key_list_sizes[partition_idx] - (*local_key_list_deg1_sizes)[partition_idx]; + } else { + h_ptrs[j] = static_cast(nullptr); + h_scalars[j * 2] = size_t{0}; + h_scalars[j * 2 + 1] = size_t{0}; + } + } + rmm::device_uvector d_ptrs(h_ptrs.size(), handle.get_stream()); + rmm::device_uvector d_scalars(h_scalars.size(), handle.get_stream()); + raft::update_device(d_ptrs.data(), h_ptrs.data(), h_ptrs.size(), handle.get_stream()); + raft::update_device( + d_scalars.data(), h_scalars.data(), h_scalars.size(), handle.get_stream()); + rmm::device_uvector d_counts(loop_count, handle.get_stream()); + thrust::transform( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(loop_count), + d_counts.begin(), + cuda::proclaim_return_type( + [d_ptrs = raft::device_span(d_ptrs.data(), d_ptrs.size()), + d_scalars = raft::device_span(d_scalars.data(), d_scalars.size()), + uint32_key_output_offset] __device__(auto i) { + auto first = d_ptrs[i]; + if (first != static_cast(nullptr)) { + auto size = d_scalars[i * 2]; + auto start_offset = d_scalars[i * 2 + 1]; + if (uint32_key_output_offset) { + auto casted_first = static_cast(first); + return size - static_cast(thrust::distance( + casted_first, + thrust::lower_bound(thrust::seq, + casted_first, + casted_first + size, + static_cast(start_offset)))); + } else { + auto casted_first = static_cast(first); + return size - + static_cast(thrust::distance( + casted_first, + thrust::lower_bound( + thrust::seq, casted_first, casted_first + size, start_offset))); + } + } else { + return size_t{0}; + } + })); + raft::update_host((*edge_partition_deg1_hypersparse_key_offset_counts).data(), + d_counts.data(), + d_counts.size(), + handle.get_stream()); + handle.sync_stream(); + } + } + } + } + + 
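+      // The block below allocates one output buffer per edge partition for the update_major
+      // path; when use_input_key is true each buffer is sized from the (possibly filtered)
+      // key list, otherwise from the local vertex segment offsets (excluding the zero degree
+      // segment).
+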
std::conditional_t>, + std::byte /* dummy */> + edge_partition_major_output_buffers{}; + if constexpr (GraphViewType::is_multi_gpu && update_major) { + edge_partition_major_output_buffers.reserve(loop_count); + } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + if constexpr (GraphViewType::is_multi_gpu && update_major) { + size_t buffer_size{0}; + if (process_local_edges[j]) { + if constexpr (use_input_key) { + auto const& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + if (keys.index() == 0) { + buffer_size = size_dataframe_buffer(std::get<0>(keys)); + } else { + buffer_size = size_dataframe_buffer(std::get<1>(keys)); + } + } else { + buffer_size = size_dataframe_buffer(keys); + } + } else { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); + + buffer_size = + segment_offsets + ? *((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ + : edge_partition.major_range_size(); + } + } + edge_partition_major_output_buffers.push_back( + allocate_dataframe_buffer(buffer_size, loop_stream)); + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + + for (size_t j = 0; j < loop_count; ++j) { + if (process_local_edges[j]) { + auto partition_idx = i + j; + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, partition_idx) + : thrust::nullopt; + size_t num_streams_per_loop{1}; + if (stream_pool_indices) { + assert((*stream_pool_indices).size() >= num_concurrent_loops); + num_streams_per_loop = (*stream_pool_indices).size() / num_concurrent_loops; + } + auto edge_partition_stream_pool_indices = + stream_pool_indices + ? std::make_optional>( + (*stream_pool_indices).data() + j * num_streams_per_loop, num_streams_per_loop) + : std::nullopt; + + T major_init{}; + T major_identity_element{}; + if constexpr (update_major) { + if constexpr (std::is_same_v>) { // if any edge has a non-init value, + // one of the non-init values will + // be selected. + major_init = init; + major_identity_element = init; + } else { + major_init = ReduceOp::identity_element; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + major_init = (static_cast(partition_idx) == minor_comm_rank) + ? 
init + : ReduceOp::identity_element; + } else { + major_init = init; + } + major_identity_element = ReduceOp::identity_element; + } + } + + std::optional> key_segment_offsets{std::nullopt}; + if constexpr (use_input_key) { + if (key_segment_offset_vectors) { + key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + if constexpr (filter_input_key) { + if (edge_partition_hypersparse_key_offset_vectors) { + (*key_segment_offsets).back() = + size_dataframe_buffer(edge_partition_major_output_buffers[j]); + *((*key_segment_offsets).rbegin() + 1) = (*key_segment_offsets).back(); + } + } + } + } else { + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); + if (segment_offsets) { + key_segment_offsets = std::vector((*segment_offsets).size()); + std::transform((*segment_offsets).begin(), + (*segment_offsets).end(), + (*key_segment_offsets).begin(), + [](vertex_t offset) { return static_cast(offset); }); + } + } + + edge_partition_src_input_device_view_t edge_partition_src_value_input{}; + edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; + if constexpr (GraphViewType::is_storage_transposed) { + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input, partition_idx); + } else { + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input, partition_idx); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input); + } + auto edge_partition_e_value_input = + edge_partition_e_input_device_view_t(edge_value_input, partition_idx); + + std::conditional_t, + edge_partition_minor_output_device_view_t>, + VertexValueOutputIterator> + output_buffer{}; + if constexpr (GraphViewType::is_multi_gpu) { + if constexpr (update_major) { + output_buffer = get_dataframe_buffer_begin(edge_partition_major_output_buffers[j]); + } else { + output_buffer = + edge_partition_minor_output_device_view_t(minor_tmp_buffer->mutable_view()); + } + } else { + output_buffer = tmp_vertex_value_output_first; + } + + bool processed{false}; + if constexpr (try_bitmap) { + auto const& keys = edge_partition_key_buffers[j]; + if (keys.index() == 0) { + auto edge_partition_key_first = thrust::make_transform_iterator( + std::get<0>(keys).begin(), + cuda::proclaim_return_type( + [range_first = local_v_list_range_firsts[partition_idx]] __device__( + uint32_t v_offset) { return range_first + static_cast(v_offset); })); + per_v_transform_reduce_e_edge_partition( + handle, + edge_partition, + edge_partition_key_first, + edge_partition_key_first + std::get<0>(keys).size(), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_buffer, + e_op, + major_init, + major_identity_element, + reduce_op, + pred_op, + key_segment_offsets ? 
std::make_optional<raft::host_span<size_t const>>(
+                                      (*key_segment_offsets).data(), (*key_segment_offsets).size())
+                                  : std::nullopt,
+            edge_partition_stream_pool_indices);
+          processed = true;
+        }
+      }
+      if (!processed) {
+        auto edge_partition_key_first = sorted_unique_key_first;
+        auto edge_partition_key_last  = sorted_unique_nzd_key_last;
+        if constexpr (GraphViewType::is_multi_gpu && use_input_key) {
+          auto const& keys = edge_partition_key_buffers[j];
+          if constexpr (try_bitmap) {
+            edge_partition_key_first = get_dataframe_buffer_begin(std::get<1>(keys));
+            edge_partition_key_last  = get_dataframe_buffer_end(std::get<1>(keys));
+          } else {
+            edge_partition_key_first = get_dataframe_buffer_begin(keys);
+            edge_partition_key_last  = get_dataframe_buffer_end(keys);
+          }
+        }
+
+        per_v_transform_reduce_e_edge_partition(
+          handle,
+          edge_partition,
+          edge_partition_key_first,
+          edge_partition_key_last,
+          edge_partition_src_value_input,
+          edge_partition_dst_value_input,
+          edge_partition_e_value_input,
+          edge_partition_e_mask,
+          output_buffer,
+          e_op,
+          major_init,
+          major_identity_element,
+          reduce_op,
+          pred_op,
+          key_segment_offsets ? std::make_optional<raft::host_span<size_t const>>(
+                                  (*key_segment_offsets).data(), (*key_segment_offsets).size())
+                              : std::nullopt,
+          edge_partition_stream_pool_indices);
+      }
+    }
+  }
+  if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); }
+
+  if constexpr (GraphViewType::is_multi_gpu && update_major) {
+    auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+    auto const minor_comm_rank = minor_comm.get_rank();
+    auto const minor_comm_size = minor_comm.get_size();
+
+    if constexpr (use_input_key) {
+      edge_partition_key_buffers.clear();
+      edge_partition_key_buffers.shrink_to_fit();
+    }
+
+    if constexpr (std::is_same_v<ReduceOp, reduce_op::any<T>>) {
+      std::conditional_t<
+        filter_input_key,
+        std::optional<std::vector<
+          std::variant<raft::device_span<uint32_t const>, raft::device_span<size_t const>>>>,
+        std::byte /* dummy */>
+        edge_partition_hypersparse_non_deg1_key_offset_spans{};
+      if constexpr (filter_input_key) {
+        if (edge_partition_hypersparse_key_offset_vectors) {
+          edge_partition_hypersparse_non_deg1_key_offset_spans = std::vector<
+            std::variant<raft::device_span<uint32_t const>, raft::device_span<size_t const>>>(
+            loop_count);
+        }
+      }
+
+      std::vector<size_t> edge_partition_allreduce_sizes(loop_count);
+      std::vector<size_t> edge_partition_allreduce_displacements(loop_count);
+      std::vector<size_t> edge_partition_contiguous_sizes(loop_count);
+
+      for (size_t j = 0; j < loop_count; ++j) {
+        auto partition_idx        = i + j;
+        auto const& output_buffer = edge_partition_major_output_buffers[j];
+
+        size_t allreduce_size{};
+        size_t contiguous_size{};
+        if constexpr (filter_input_key) {
+          allreduce_size = local_key_list_sizes[partition_idx];
+          if (local_key_list_deg1_sizes) {
+            allreduce_size -= (*local_key_list_deg1_sizes)[partition_idx];
+          }
+          if (key_segment_offset_vectors) {
+            auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx];
+            contiguous_size                 = key_segment_offsets[3];
+          } else {
+            contiguous_size = local_key_list_sizes[partition_idx];
+          }
+        } else {
+          static_assert(!use_input_key);
+          auto hypersparse_degree_offsets =
+            graph_view.local_edge_partition_hypersparse_degree_offsets(partition_idx);
+          allreduce_size = size_dataframe_buffer(output_buffer);
+          if (hypersparse_degree_offsets) {
+            allreduce_size -= *((*hypersparse_degree_offsets).rbegin()) -
+                              *((*hypersparse_degree_offsets).rbegin() + 1);
+          }
+          contiguous_size = size_dataframe_buffer(output_buffer);
+        }
+        edge_partition_allreduce_sizes[j]  = allreduce_size;
+        edge_partition_contiguous_sizes[j] = contiguous_size;
+      }
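+      // Lay the per-partition priority values out in a single aggregate buffer (offsets come
+      // from an exclusive scan of the per-partition allreduce sizes) so that one allreduce can
+      // cover every edge partition handled in this round; priorities are stored as uint8_t when
+      // minor_comm_size fits, and as uint32_t otherwise.
+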
std::exclusive_scan(edge_partition_allreduce_sizes.begin(), + edge_partition_allreduce_sizes.end(), + edge_partition_allreduce_displacements.begin(), + size_t{0}); + std::variant, rmm::device_uvector> + aggregate_priorities = rmm::device_uvector(0, handle.get_stream()); + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + std::get<0>(aggregate_priorities) + .resize( + edge_partition_allreduce_displacements.back() + edge_partition_allreduce_sizes.back(), + handle.get_stream()); + } else { // priority == uint32_t + aggregate_priorities = rmm::device_uvector( + edge_partition_allreduce_displacements.back() + edge_partition_allreduce_sizes.back(), + handle.get_stream()); + } + if (loop_stream_pool_indices) { handle.sync_stream(); } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + std::optional< + std::variant, raft::device_span>> + hypersparse_non_deg1_key_offsets{std::nullopt}; + if constexpr (filter_input_key) { + if (edge_partition_hypersparse_key_offset_vectors) { + auto const& offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; + + if (offsets.index() == 0) { + hypersparse_non_deg1_key_offsets = raft::device_span( + std::get<0>(offsets).data(), + std::get<0>(offsets).size() - + (edge_partition_deg1_hypersparse_key_offset_counts + ? (*edge_partition_deg1_hypersparse_key_offset_counts)[j] + : size_t{0})); + } else { + hypersparse_non_deg1_key_offsets = raft::device_span( + std::get<1>(offsets).data(), + std::get<1>(offsets).size() - + (edge_partition_deg1_hypersparse_key_offset_counts + ? (*edge_partition_deg1_hypersparse_key_offset_counts)[j] + : size_t{0})); + } + (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j] = + *hypersparse_non_deg1_key_offsets; + } + } + + auto const& output_buffer = edge_partition_major_output_buffers[j]; + + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + compute_priorities( + minor_comm, + get_dataframe_buffer_begin(output_buffer), + raft::device_span(std::get<0>(aggregate_priorities).data() + + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]), + hypersparse_non_deg1_key_offsets, + edge_partition_contiguous_sizes[j], + static_cast(partition_idx), + subgroup_size, + init, + process_local_edges[j] ? false : true /* ignore_local_values */, + loop_stream); + } else { // priority == uint32_t + compute_priorities( + minor_comm, + get_dataframe_buffer_begin(output_buffer), + raft::device_span(std::get<1>(aggregate_priorities).data() + + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]), + hypersparse_non_deg1_key_offsets, + edge_partition_contiguous_sizes[j], + static_cast(partition_idx), + subgroup_size, + init, + process_local_edges[j] ? 
false : true /* ignore_local_values */, + loop_stream); + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + device_allreduce(minor_comm, + std::get<0>(aggregate_priorities).data(), + std::get<0>(aggregate_priorities).data(), + std::get<0>(aggregate_priorities).size(), + raft::comms::op_t::MIN, + handle.get_stream()); + } else { // priority == uint32_t + device_allreduce(minor_comm, + std::get<1>(aggregate_priorities).data(), + std::get<1>(aggregate_priorities).data(), + std::get<1>(aggregate_priorities).size(), + raft::comms::op_t::MIN, + handle.get_stream()); + } + if (loop_stream_pool_indices) { handle.sync_stream(); } + + std::vector< + std::variant, rmm::device_uvector>, + std::optional>>> + edge_partition_selected_ranks_or_flags{}; + edge_partition_selected_ranks_or_flags.reserve(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto const& output_buffer = edge_partition_major_output_buffers[j]; + std::optional< + std::variant, raft::device_span>> + hypersparse_non_deg1_key_offsets{std::nullopt}; + if constexpr (filter_input_key) { + if (edge_partition_hypersparse_key_offset_vectors) { + hypersparse_non_deg1_key_offsets = + (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j]; + } + } + + auto contiguous_size = edge_partition_contiguous_sizes[j]; + + std::variant, rmm::device_uvector>, + std::optional>> + selected_ranks_or_flags = + std::variant, rmm::device_uvector>( + rmm::device_uvector(0, loop_stream)); + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + auto priorities = raft::device_span( + std::get<0>(aggregate_priorities).data() + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]); + auto tmp = compute_selected_ranks_from_priorities( + minor_comm, + priorities, + hypersparse_non_deg1_key_offsets, + contiguous_size, + static_cast(partition_idx), + subgroup_size, + process_local_edges[j] ? false : true /* ignore_local_values */, + loop_stream); + if (tmp.index() == 0) { + selected_ranks_or_flags = + std::variant, rmm::device_uvector>( + std::move(std::get<0>(tmp))); + } else { + selected_ranks_or_flags = std::move(std::get<1>(tmp)); + } + } else { // priority_t == uint32_t + auto priorities = raft::device_span( + std::get<1>(aggregate_priorities).data() + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]); + auto tmp = compute_selected_ranks_from_priorities( + minor_comm, + priorities, + hypersparse_non_deg1_key_offsets, + contiguous_size, + static_cast(partition_idx), + subgroup_size, + process_local_edges[j] ? 
false : true /* ignore_local_values */, + loop_stream); + if (tmp.index() == 0) { + selected_ranks_or_flags = + std::variant, rmm::device_uvector>( + std::move(std::get<0>(tmp))); + } else { + selected_ranks_or_flags = std::move(std::get<1>(tmp)); + } + } + edge_partition_selected_ranks_or_flags.push_back(std::move(selected_ranks_or_flags)); + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + std::get<0>(aggregate_priorities).resize(0, handle.get_stream()); + std::get<0>(aggregate_priorities).shrink_to_fit(handle.get_stream()); + } else { + std::get<1>(aggregate_priorities).resize(0, handle.get_stream()); + std::get<1>(aggregate_priorities).shrink_to_fit(handle.get_stream()); + } + if (loop_stream_pool_indices) { handle.sync_stream(); } + + std::vector> edge_partition_values{}; + edge_partition_values.reserve(loop_count); + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto& output_buffer = edge_partition_major_output_buffers[j]; + + auto values = allocate_dataframe_buffer( + process_local_edges[j] ? size_dataframe_buffer(output_buffer) : size_t{0}, loop_stream); + if (process_local_edges[j]) { + if (minor_comm_rank == static_cast(partition_idx)) { + assert(!use_input_key); + assert(edge_partition_selected_ranks_or_flags[j].index() == 0); + auto const& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); + if (selected_ranks.index() == 0) { + copy_if_nosync( + get_dataframe_buffer_begin(output_buffer), + get_dataframe_buffer_begin(output_buffer) + edge_partition_allreduce_sizes[j], + thrust::make_transform_iterator( + std::get<0>(selected_ranks).begin(), + cuda::proclaim_return_type([minor_comm_rank] __device__(auto rank) { + return static_cast(rank) == minor_comm_rank; + })), + get_dataframe_buffer_begin(values), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + copy_if_nosync( + get_dataframe_buffer_begin(output_buffer), + get_dataframe_buffer_begin(output_buffer) + edge_partition_allreduce_sizes[j], + thrust::make_transform_iterator( + std::get<1>(selected_ranks).begin(), + cuda::proclaim_return_type( + [minor_comm_rank] __device__(auto rank) { return rank == minor_comm_rank; })), + get_dataframe_buffer_begin(values), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } else { + assert(edge_partition_selected_ranks_or_flags[j].index() == 1); + auto& keep_flags = std::get<1>(edge_partition_selected_ranks_or_flags[j]); + size_t input_end_offset{}; + if constexpr (filter_input_key) { + input_end_offset = edge_partition_contiguous_sizes[j]; + if (edge_partition_hypersparse_non_deg1_key_offset_spans) { + auto const& span = (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j]; + if (span.index() == 0) { + input_end_offset += std::get<0>(span).size(); + } else { + input_end_offset += std::get<1>(span).size(); + } + } + } else { + input_end_offset = edge_partition_allreduce_sizes[j]; + } + copy_if_nosync( + get_dataframe_buffer_begin(output_buffer), + get_dataframe_buffer_begin(output_buffer) + input_end_offset, + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [keep_flags = raft::device_span( + (*keep_flags).data(), (*keep_flags).size())] __device__(size_t offset) 
{ + auto word = keep_flags[packed_bool_offset(offset)]; + return ((word & packed_bool_mask(offset)) != packed_bool_empty_mask()); + })), + get_dataframe_buffer_begin(values), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + (*keep_flags).resize(0, loop_stream); + (*keep_flags).shrink_to_fit(loop_stream); + } + } + + edge_partition_values.push_back(std::move(values)); + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + + std::vector copy_sizes(loop_count); + raft::update_host(copy_sizes.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); + + std::optional< + std::vector, rmm::device_uvector>>> + edge_partition_deg1_hypersparse_output_offset_vectors{}; + + if (graph_view.use_dcs()) { + edge_partition_deg1_hypersparse_output_offset_vectors = + std::vector, rmm::device_uvector>>{}; + (*edge_partition_deg1_hypersparse_output_offset_vectors).reserve(loop_count); + + for (size_t j = 0; j < loop_count; ++j) { + auto loop_stream = + loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto& output_buffer = edge_partition_major_output_buffers[j]; + std::variant, rmm::device_uvector> + output_offsets = rmm::device_uvector(0, loop_stream); + if (!uint32_key_output_offset) { + output_offsets = rmm::device_uvector(0, loop_stream); + } + + if (process_local_edges[j]) { + auto& values = edge_partition_values[j]; + + size_t output_offset_buf_size{0}; + if constexpr (filter_input_key) { + output_offset_buf_size = (*edge_partition_deg1_hypersparse_key_offset_counts)[j]; + } else { + assert(!use_input_key); + output_offset_buf_size = + size_dataframe_buffer(output_buffer) - edge_partition_allreduce_sizes[j]; + } + + if (output_offsets.index() == 0) { + std::get<0>(output_offsets).resize(output_offset_buf_size, loop_stream); + } else { + output_offsets = rmm::device_uvector(output_offset_buf_size, loop_stream); + } + + size_t input_start_offset{}; + if constexpr (filter_input_key) { + auto span = (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j]; + input_start_offset = + edge_partition_contiguous_sizes[j] + + (span.index() == 0 ? 
std::get<0>(span).size() : std::get<1>(span).size()); + } else { + static_assert(!use_input_key); + input_start_offset = edge_partition_allreduce_sizes[j]; + } + auto flag_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(output_buffer) + input_start_offset, + cuda::proclaim_return_type( + [init] __device__(auto val) { return val != init; })); + + if constexpr (filter_input_key) { + auto& hypersparse_key_offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; + auto span = (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j]; + if (hypersparse_key_offsets.index() == 0) { + assert(output_offsets.index() == 0); + auto input_pair_first = thrust::make_zip_iterator( + get_dataframe_buffer_begin(output_buffer) + input_start_offset, + std::get<0>(hypersparse_key_offsets).begin() + std::get<0>(span).size()); + copy_if_nosync( + input_pair_first, + input_pair_first + (*edge_partition_deg1_hypersparse_key_offset_counts)[j], + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<0>(output_offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + std::get<0>(hypersparse_key_offsets).resize(0, loop_stream); + std::get<0>(hypersparse_key_offsets).shrink_to_fit(loop_stream); + } else { + assert(output_offsets.index() == 1); + auto input_pair_first = thrust::make_zip_iterator( + get_dataframe_buffer_begin(output_buffer) + input_start_offset, + std::get<1>(hypersparse_key_offsets).begin() + std::get<1>(span).size()); + copy_if_nosync( + input_pair_first, + input_pair_first + (*edge_partition_deg1_hypersparse_key_offset_counts)[j], + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<1>(output_offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + std::get<1>(hypersparse_key_offsets).resize(0, loop_stream); + std::get<1>(hypersparse_key_offsets).shrink_to_fit(loop_stream); + } + } else { + static_assert(!use_input_key); + assert(process_local_edges[j]); + if (output_offsets.index() == 0) { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(output_buffer), + thrust::make_counting_iterator(uint32_t{0})); + copy_if_nosync( + input_pair_first + input_start_offset, + input_pair_first + size_dataframe_buffer(output_buffer), + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<0>(output_offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(output_buffer), + thrust::make_counting_iterator(size_t{0})); + copy_if_nosync( + input_pair_first + input_start_offset, + input_pair_first + size_dataframe_buffer(output_buffer), + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<1>(output_offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } + } + + (*edge_partition_deg1_hypersparse_output_offset_vectors) + .push_back(std::move(output_offsets)); + + resize_dataframe_buffer(output_buffer, 0, loop_stream); + shrink_to_fit_dataframe_buffer(output_buffer, loop_stream); + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + + std::vector deg1_copy_sizes(loop_count); + raft::update_host( + deg1_copy_sizes.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); + + for 
(size_t j = 0; j < loop_count; ++j) { + if (process_local_edges[j]) { + copy_sizes[j] += deg1_copy_sizes[j]; + auto& offsets = (*edge_partition_deg1_hypersparse_output_offset_vectors)[j]; + if (offsets.index() == 0) { + std::get<0>(offsets).resize(deg1_copy_sizes[j], handle.get_stream()); + } else { + assert(offsets.index() == 1); + std::get<1>(offsets).resize(deg1_copy_sizes[j], handle.get_stream()); + } + // skip shrink_to_fit() to cut execution time + } + } + } + + for (size_t j = 0; j < loop_count; ++j) { + if (process_local_edges[j]) { + resize_dataframe_buffer(edge_partition_values[j], copy_sizes[j], handle.get_stream()); + // skip shrink_to_fit() to cut execution time + } + } + + size_t min_element_size{cache_line_size}; + if constexpr (std::is_arithmetic_v) { + min_element_size = std::min(sizeof(T), min_element_size); + } else { + static_assert(is_thrust_tuple_of_arithmetic::value); + min_element_size = + std::min(cugraph::min_thrust_tuple_element_sizes(), min_element_size); + } + assert((cache_line_size % min_element_size) == 0); + size_t value_alignment = cache_line_size / min_element_size; + + size_t offset_alignment = 1; + if (graph_view.use_dcs()) { + static_assert(((cache_line_size % sizeof(uint32_t)) == 0) && + ((cache_line_size % sizeof(size_t)) == 0)); + offset_alignment = + cache_line_size / (uint32_key_output_offset ? sizeof(uint32_t) : sizeof(size_t)); + } + + std::optional> rx_value_sizes{}; + std::optional> rx_value_displs{}; + std::optional> rx_values{}; + + std::optional> rx_offset_sizes{}; + std::optional> rx_offset_displs{}; + std::optional, rmm::device_uvector>> + rx_offsets{}; + { + auto size_per_rank = + loop_count * (graph_view.use_dcs() ? 2 /* value buffer size, offset buffer size */ + : 1 /* value buffer size */); + rmm::device_uvector d_aggregate_buffer_sizes(minor_comm_size * size_per_rank, + handle.get_stream()); + std::vector h_buffer_sizes(size_per_rank); + for (size_t j = 0; j < loop_count; ++j) { + h_buffer_sizes[j] = size_dataframe_buffer(edge_partition_values[j]); + if (graph_view.use_dcs()) { + auto const& offsets = (*edge_partition_deg1_hypersparse_output_offset_vectors)[j]; + if (offsets.index() == 0) { + h_buffer_sizes[loop_count + j] = std::get<0>(offsets).size(); + } else { + assert(offsets.index() == 1); + h_buffer_sizes[loop_count + j] = std::get<1>(offsets).size(); + } + } + } + raft::update_device(d_aggregate_buffer_sizes.data() + minor_comm_rank * size_per_rank, + h_buffer_sizes.data(), + h_buffer_sizes.size(), + handle.get_stream()); + device_allgather(minor_comm, + d_aggregate_buffer_sizes.data() + minor_comm_rank * size_per_rank, + d_aggregate_buffer_sizes.data(), + size_per_rank, + handle.get_stream()); + if (static_cast(minor_comm_rank / num_concurrent_loops) == + (i / num_concurrent_loops)) { + std::vector h_aggregate_buffer_sizes(d_aggregate_buffer_sizes.size()); + raft::update_host(h_aggregate_buffer_sizes.data(), + d_aggregate_buffer_sizes.data(), + d_aggregate_buffer_sizes.size(), + handle.get_stream()); + handle.sync_stream(); + auto j = static_cast(minor_comm_rank % num_concurrent_loops); + rx_value_sizes = std::vector(minor_comm_size); + rx_value_displs = std::vector(minor_comm_size); + if (graph_view.use_dcs()) { + rx_offset_sizes = std::vector(minor_comm_size); + rx_offset_displs = std::vector(minor_comm_size); + } + for (int k = 0; k < minor_comm_size; ++k) { + (*rx_value_sizes)[k] = h_aggregate_buffer_sizes[k * size_per_rank + j]; + if (graph_view.use_dcs()) { + (*rx_offset_sizes)[k] = + h_aggregate_buffer_sizes[k * 
size_per_rank + loop_count + j]; + } + } + + std::vector aligned_sizes(minor_comm_size); + for (int k = 0; k < minor_comm_size; ++k) { + if (k == (minor_comm_size - 1)) { + aligned_sizes[k] = (*rx_value_sizes)[k]; + } else { + aligned_sizes[k] = raft::round_up_safe((*rx_value_sizes)[k], value_alignment); + } + } + std::exclusive_scan( + aligned_sizes.begin(), aligned_sizes.end(), (*rx_value_displs).begin(), size_t{0}); + + if (graph_view.use_dcs()) { + for (int k = 0; k < minor_comm_size; ++k) { + if (k == (minor_comm_size - 1)) { + aligned_sizes[k] = (*rx_offset_sizes)[k]; + } else { + aligned_sizes[k] = raft::round_up_safe((*rx_offset_sizes)[k], offset_alignment); + } + } + std::exclusive_scan( + aligned_sizes.begin(), aligned_sizes.end(), (*rx_offset_displs).begin(), size_t{0}); + } + + rx_values = allocate_dataframe_buffer( + (*rx_value_displs).back() + (*rx_value_sizes).back(), handle.get_stream()); + if (graph_view.use_dcs()) { + if (uint32_key_output_offset) { + rx_offsets = rmm::device_uvector( + (*rx_offset_displs).back() + (*rx_offset_sizes).back(), handle.get_stream()); + } else { + rx_offsets = rmm::device_uvector( + (*rx_offset_displs).back() + (*rx_offset_sizes).back(), handle.get_stream()); + } + } + } + } + + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto& values = edge_partition_values[j]; + + if (minor_comm_rank == static_cast(partition_idx)) { + device_gatherv(minor_comm, + get_dataframe_buffer_begin(values), + get_dataframe_buffer_begin(*rx_values), + values.size(), + *rx_value_sizes, + *rx_value_displs, + static_cast(partition_idx), + handle.get_stream()); + } else { + device_gatherv(minor_comm, + get_dataframe_buffer_begin(values), + dataframe_buffer_iterator_type_t{}, + values.size(), + std::vector{}, + std::vector{}, + static_cast(partition_idx), + handle.get_stream()); + } + } + device_group_end(minor_comm); + if (graph_view.use_dcs()) { + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto& values = edge_partition_values[j]; + + auto const& offsets = (*edge_partition_deg1_hypersparse_output_offset_vectors)[j]; + if (offsets.index() == 0) { + if (minor_comm_rank == static_cast(partition_idx)) { + device_gatherv(minor_comm, + std::get<0>(offsets).data(), + std::get<0>(*rx_offsets).data(), + std::get<0>(offsets).size(), + *rx_offset_sizes, + *rx_offset_displs, + static_cast(partition_idx), + handle.get_stream()); + } else { + device_gatherv(minor_comm, + std::get<0>(offsets).data(), + static_cast(nullptr), + std::get<0>(offsets).size(), + std::vector{}, + std::vector{}, + static_cast(partition_idx), + handle.get_stream()); + } + } else { + assert(offsets.index() == 1); + if (minor_comm_rank == static_cast(partition_idx)) { + device_gatherv(minor_comm, + std::get<1>(offsets).data(), + std::get<1>(*rx_offsets).data(), + std::get<1>(offsets).size(), + *rx_offset_sizes, + *rx_offset_displs, + static_cast(partition_idx), + handle.get_stream()); + } else { + device_gatherv(minor_comm, + std::get<1>(offsets).data(), + static_cast(nullptr), + std::get<1>(offsets).size(), + std::vector{}, + std::vector{}, + static_cast(partition_idx), + handle.get_stream()); + } + } + } + device_group_end(minor_comm); + } + handle.sync_stream(); // this is required before edge_partition_values.clear(); + edge_partition_values.clear(); + if (loop_stream_pool_indices) { + handle.sync_stream_pool(*loop_stream_pool_indices); + } // to ensure that memory is freed + + if 
(rx_values && (size_dataframe_buffer(*rx_values) > 0)) { + auto j = static_cast(minor_comm_rank % num_concurrent_loops); + auto partition_idx = i + j; + + { // remove gaps introduced to enforce alignment + rmm::device_uvector bitmap( + packed_bool_size(size_dataframe_buffer(*rx_values)), handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); + rmm::device_uvector d_displs((*rx_value_displs).size(), handle.get_stream()); + rmm::device_uvector d_sizes((*rx_value_sizes).size(), handle.get_stream()); + raft::update_device(d_displs.data(), + (*rx_value_displs).data(), + (*rx_value_displs).size(), + handle.get_stream()); + raft::update_device(d_sizes.data(), + (*rx_value_sizes).data(), + (*rx_value_sizes).size(), + handle.get_stream()); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(static_cast(minor_comm_size - 1) * + value_alignment), + [bitmap = raft::device_span(bitmap.data(), bitmap.size()), + displs = raft::device_span(d_displs.data(), d_displs.size()), + sizes = raft::device_span(d_sizes.data(), d_sizes.size()), + alignment = value_alignment] __device__(size_t i) { + auto rank = static_cast(i / alignment); + auto first = displs[rank] + sizes[rank]; + auto last = displs[rank + 1]; + if ((i % alignment) < (last - first)) { + auto offset = first + (i % alignment); + cuda::atomic_ref word( + bitmap[packed_bool_offset(offset)]); + word.fetch_or(packed_bool_mask(offset), cuda::std::memory_order_relaxed); + } + }); + resize_dataframe_buffer( + *rx_values, + thrust::distance( + get_dataframe_buffer_begin(*rx_values), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(*rx_values), + get_dataframe_buffer_end(*rx_values), + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [bitmap = raft::device_span( + bitmap.data(), bitmap.size())] __device__(size_t i) { + return (bitmap[packed_bool_offset(i)] & + packed_bool_mask(i)) == packed_bool_mask(i); + })), + thrust::identity{})), + handle.get_stream()); + // skip shrink_to_fit() to cut execution time + std::exclusive_scan((*rx_value_sizes).begin(), + (*rx_value_sizes).end(), + (*rx_value_displs).begin(), + size_t{0}); // now gaps are removed + + if (rx_offsets) { + size_t num_offsets = ((*rx_offsets).index() == 0) + ? 
size_dataframe_buffer(std::get<0>(*rx_offsets)) + : size_dataframe_buffer(std::get<1>(*rx_offsets)); + bitmap.resize(packed_bool_size(num_offsets), handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); + d_displs.resize((*rx_offset_displs).size(), handle.get_stream()); + d_sizes.resize((*rx_offset_sizes).size(), handle.get_stream()); + raft::update_device(d_displs.data(), + (*rx_offset_displs).data(), + (*rx_offset_displs).size(), + handle.get_stream()); + raft::update_device(d_sizes.data(), + (*rx_offset_sizes).data(), + (*rx_offset_sizes).size(), + handle.get_stream()); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(static_cast(minor_comm_size - 1) * + offset_alignment), + [bitmap = raft::device_span(bitmap.data(), bitmap.size()), + displs = raft::device_span(d_displs.data(), d_displs.size()), + sizes = raft::device_span(d_sizes.data(), d_sizes.size()), + alignment = offset_alignment] __device__(size_t i) { + auto rank = static_cast(i / alignment); + auto first = displs[rank] + sizes[rank]; + auto last = displs[rank + 1]; + if ((i % alignment) < (last - first)) { + auto offset = first + (i % alignment); + cuda::atomic_ref word( + bitmap[packed_bool_offset(offset)]); + word.fetch_or(packed_bool_mask(offset), cuda::std::memory_order_relaxed); + } + }); + if ((*rx_offsets).index() == 0) { + resize_dataframe_buffer( + std::get<0>(*rx_offsets), + thrust::distance( + get_dataframe_buffer_begin(std::get<0>(*rx_offsets)), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(std::get<0>(*rx_offsets)), + get_dataframe_buffer_end(std::get<0>(*rx_offsets)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [bitmap = raft::device_span( + bitmap.data(), bitmap.size())] __device__(size_t i) { + return (bitmap[packed_bool_offset(i)] & + packed_bool_mask(i)) == packed_bool_mask(i); + })), + thrust::identity{})), + handle.get_stream()); + // skip shrink_to_fit() to cut execution time + } else { + resize_dataframe_buffer( + std::get<1>(*rx_offsets), + thrust::distance( + get_dataframe_buffer_begin(std::get<1>(*rx_offsets)), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(std::get<1>(*rx_offsets)), + get_dataframe_buffer_end(std::get<1>(*rx_offsets)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [bitmap = raft::device_span( + bitmap.data(), bitmap.size())] __device__(size_t i) { + return (bitmap[packed_bool_offset(i)] & + packed_bool_mask(i)) == packed_bool_mask(i); + })), + thrust::identity{})), + handle.get_stream()); + // skip shrink_to_fit() to cut execution time + } + std::exclusive_scan((*rx_offset_sizes).begin(), + (*rx_offset_sizes).end(), + (*rx_offset_displs).begin(), + size_t{0}); // now gaps are removed + } + } + + size_t output_range_size{}; + if constexpr (filter_input_key) { + output_range_size = local_key_list_sizes[partition_idx]; + } else { + auto const& segment_offsets = graph_view.local_vertex_partition_segment_offsets(); + output_range_size = + segment_offsets + ? 
*((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ + : graph_view.local_vertex_partition_range_size(); + } + auto& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); + if (selected_ranks.index() == 0) { + auto old_size = std::get<0>(selected_ranks).size(); + std::get<0>(selected_ranks).resize(output_range_size, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + std::get<0>(selected_ranks).begin() + old_size, + std::get<0>(selected_ranks).end(), + static_cast(minor_comm_size)); + } else { + assert(selected_ranks.index() == 1); + auto old_size = std::get<1>(selected_ranks).size(); + std::get<1>(selected_ranks).resize(output_range_size, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + std::get<1>(selected_ranks).begin() + old_size, + std::get<1>(selected_ranks).end(), + minor_comm_size); + } + if (rx_offsets) { + rmm::device_uvector lasts((*rx_offset_displs).size(), handle.get_stream()); + raft::update_device(lasts.data(), + (*rx_offset_displs).data() + 1, + (*rx_offset_displs).size() - 1, + handle.get_stream()); + auto num_elements = (*rx_offset_displs).back() + (*rx_offset_sizes).back(); + lasts.set_element_async(lasts.size() - 1, num_elements, handle.get_stream()); + handle.sync_stream(); // this is necessary before num_elements becomes out-of-scope + + if ((*rx_offsets).index() == 0) { + auto& offsets = std::get<0>(*rx_offsets); + if (selected_ranks.index() == 0) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + std::get<0>(selected_ranks).data(), + std::get<0>(selected_ranks).size())] __device__(auto i) { + auto minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + } else { + assert(selected_ranks.index() == 1); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + std::get<1>(selected_ranks).data(), + std::get<1>(selected_ranks).size())] __device__(auto i) { + auto minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + } + offsets.resize(0, handle.get_stream()); + offsets.shrink_to_fit(handle.get_stream()); + } else { + assert((*rx_offsets).index() == 1); + auto& offsets = std::get<1>(*rx_offsets); + if (selected_ranks.index() == 0) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + std::get<0>(selected_ranks).data(), + std::get<0>(selected_ranks).size())] __device__(auto i) { + auto minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + } else { + 
assert(selected_ranks.index() == 1); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + std::get<1>(selected_ranks).data(), + std::get<1>(selected_ranks).size())] __device__(auto i) { + auto minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + } + offsets.resize(0, handle.get_stream()); + offsets.shrink_to_fit(handle.get_stream()); + } + } + + size_t num_positions = (selected_ranks.index() == 0) ? std::get<0>(selected_ranks).size() + : std::get<1>(selected_ranks).size(); + if (num_positions <= static_cast(std::numeric_limits::max())) { + rmm::device_uvector rx_positions(num_positions, handle.get_stream()); + thrust::sequence( + handle.get_thrust_policy(), rx_positions.begin(), rx_positions.end(), uint32_t{0}); + if (selected_ranks.index() == 0) { + thrust::stable_sort_by_key(handle.get_thrust_policy(), + std::get<0>(selected_ranks).begin(), + std::get<0>(selected_ranks).end(), + rx_positions.begin()); + } else { + assert(selected_ranks.index() == 1); + thrust::stable_sort_by_key(handle.get_thrust_policy(), + std::get<1>(selected_ranks).begin(), + std::get<1>(selected_ranks).end(), + rx_positions.begin()); + } + // selected_ranks[] == minor_comm_size if no GPU in minor_comm has a non-init value + rx_positions.resize((*rx_value_displs).back() + (*rx_value_sizes).back(), + handle.get_stream()); + thrust::scatter(handle.get_thrust_policy(), + get_dataframe_buffer_begin(*rx_values), + get_dataframe_buffer_end(*rx_values), + rx_positions.begin(), + tmp_vertex_value_output_first); + } else { + rmm::device_uvector rx_positions(num_positions, handle.get_stream()); + thrust::sequence( + handle.get_thrust_policy(), rx_positions.begin(), rx_positions.end(), size_t{0}); + if (selected_ranks.index() == 0) { + thrust::stable_sort_by_key(handle.get_thrust_policy(), + std::get<0>(selected_ranks).begin(), + std::get<0>(selected_ranks).end(), + rx_positions.begin()); + } else { + assert(selected_ranks.index() == 1); + thrust::stable_sort_by_key(handle.get_thrust_policy(), + std::get<1>(selected_ranks).begin(), + std::get<1>(selected_ranks).end(), + rx_positions.begin()); + } + // selected_ranks[] == minor_comm_size if no GPU in minor_comm has a non-init value + rx_positions.resize((*rx_value_displs).back() + (*rx_value_sizes).back(), + handle.get_stream()); + thrust::scatter(handle.get_thrust_policy(), + get_dataframe_buffer_begin(*rx_values), + get_dataframe_buffer_end(*rx_values), + rx_positions.begin(), + tmp_vertex_value_output_first); + } + } + handle.sync_stream(); + } else { + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + device_reduce(minor_comm, + get_dataframe_buffer_begin(edge_partition_major_output_buffers[j]), + tmp_vertex_value_output_first, + size_dataframe_buffer(edge_partition_major_output_buffers[j]), + ReduceOp::compatible_raft_comms_op, + static_cast(partition_idx), + handle.get_stream()); + } + device_group_end(minor_comm); + if (loop_stream_pool_indices) { handle.sync_stream(); } + } + } + } + + // 10. 
communication + + if constexpr (GraphViewType::is_multi_gpu && !update_major) { + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + auto view = minor_tmp_buffer->view(); + if (view.keys()) { // applying the initial value is deferred to here + vertex_t max_vertex_partition_size{0}; + for (int i = 0; i < major_comm_size; ++i) { + auto this_segment_vertex_partition_id = + compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + max_vertex_partition_size = + std::max(max_vertex_partition_size, + graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)); + } + auto tx_buffer = allocate_dataframe_buffer(max_vertex_partition_size, handle.get_stream()); + auto tx_buffer_first = get_dataframe_buffer_begin(tx_buffer); + std::optional> minor_key_offsets{}; + if constexpr (GraphViewType::is_storage_transposed) { + minor_key_offsets = graph_view.local_sorted_unique_edge_src_vertex_partition_offsets(); + } else { + minor_key_offsets = graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets(); + } + for (int i = 0; i < major_comm_size; ++i) { + auto minor_init = (major_comm_rank == i) ? init : ReduceOp::identity_element; + auto this_segment_vertex_partition_id = + compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + thrust::fill(handle.get_thrust_policy(), + tx_buffer_first, + tx_buffer_first + + graph_view.vertex_partition_range_size(this_segment_vertex_partition_id), + minor_init); + auto value_first = thrust::make_transform_iterator( + view.value_first(), + cuda::proclaim_return_type( + [reduce_op, minor_init] __device__(auto val) { return reduce_op(val, minor_init); })); + thrust::scatter(handle.get_thrust_policy(), + value_first + (*minor_key_offsets)[i], + value_first + (*minor_key_offsets)[i + 1], + thrust::make_transform_iterator( + (*(view.keys())).begin() + (*minor_key_offsets)[i], + cuda::proclaim_return_type( + [key_first = graph_view.vertex_partition_range_first( + this_segment_vertex_partition_id)] __device__(auto key) { + return key - key_first; + })), + tx_buffer_first); + device_reduce(major_comm, + tx_buffer_first, + tmp_vertex_value_output_first, + static_cast( + graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)), + ReduceOp::compatible_raft_comms_op, + i, + handle.get_stream()); + } + } else { + auto first_segment_vertex_partition_id = + compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(0); + vertex_t minor_range_first = + graph_view.vertex_partition_range_first(first_segment_vertex_partition_id); + for (int i = 0; i < major_comm_size; ++i) { + auto this_segment_vertex_partition_id = + compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + auto offset = graph_view.vertex_partition_range_first(this_segment_vertex_partition_id) - + minor_range_first; + device_reduce(major_comm, + 
view.value_first() + offset, + tmp_vertex_value_output_first, + static_cast( + graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)), + ReduceOp::compatible_raft_comms_op, + i, + handle.get_stream()); + } + } + } +} + +} // namespace detail + +} // namespace cugraph diff --git a/cpp/src/prims/detail/prim_functors.cuh b/cpp/src/prims/detail/prim_functors.cuh index f426cd993e..a166f37906 100644 --- a/cpp/src/prims/detail/prim_functors.cuh +++ b/cpp/src/prims/detail/prim_functors.cuh @@ -21,6 +21,23 @@ namespace cugraph { namespace detail { +template +struct const_true_e_op_t { + __device__ auto operator()(std::conditional_t key_or_src, + std::conditional_t key_or_dst, + src_value_t, + dst_value_t, + e_value_t) const + { + return true; + } +}; + template +struct call_const_true_e_op_t { + __device__ auto operator()(edge_t i) const { return true; } +}; + template +#include + +namespace cugraph { + +namespace detail { + +template +__host__ __device__ priority_t +rank_to_priority(int rank, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + int comm_size, + vertex_t offset /* to evenly distribute traffic */) +{ + static_assert(sizeof(priority_t) == 1 || sizeof(priority_t) == 2 || sizeof(priority_t) == 4); + using cast_t = std::conditional_t< + sizeof(priority_t) == 1, + int16_t, + std::conditional_t>; // to prevent overflow + + if (rank == root) { + return priority_t{0}; + } else if (rank / subgroup_size == + root / subgroup_size) { // intra-subgroup communication is sufficient (priorities in + // [1, subgroup_size) + auto rank_dist = + static_cast(((static_cast(rank) + subgroup_size) - root) % subgroup_size); + int modulo = subgroup_size - 1; + return static_cast(1 + (static_cast(rank_dist - 1) + (offset % modulo)) % + modulo); + } else { // inter-subgroup communication is necessary (priorities in [subgroup_size, comm_size) + auto subgroup_dist = + static_cast(((static_cast(rank / subgroup_size) + (comm_size / subgroup_size)) - + (root / subgroup_size)) % + (comm_size / subgroup_size)); + auto intra_subgroup_rank_dist = static_cast( + ((static_cast(rank % subgroup_size) + subgroup_size) - (root % subgroup_size)) % + subgroup_size); + auto rank_dist = subgroup_dist * subgroup_size + intra_subgroup_rank_dist; + int modulo = comm_size - subgroup_size; + return static_cast( + subgroup_size + + (static_cast(rank_dist - subgroup_size) + (offset % modulo)) % modulo); + } +} + +template +__host__ __device__ int priority_to_rank( + priority_t priority, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + int comm_size, + vertex_t offset /* to evenly distribute traffict */) +{ + static_assert(sizeof(priority_t) == 1 || sizeof(priority_t) == 2 || sizeof(priority_t) == 4); + using cast_t = std::conditional_t< + sizeof(priority_t) == 1, + int16_t, + std::conditional_t>; // to prevent overflow + + if (priority == priority_t{0}) { + return root; + } else if (priority < static_cast(subgroup_size)) { + int modulo = subgroup_size - 1; + auto rank_dist = static_cast( + 1 + ((static_cast(priority - 1) + modulo) - (offset % modulo)) % modulo); + return static_cast((root - (root % subgroup_size)) + + ((static_cast(root) + rank_dist) % subgroup_size)); + } else { + int modulo = comm_size - subgroup_size; + auto rank_dist = static_cast( + subgroup_size + + ((static_cast(priority) - subgroup_size) + (modulo - (offset % modulo))) % modulo); + auto subgroup_dist = rank_dist / subgroup_size; + auto intra_subgroup_rank_dist = rank_dist % 
subgroup_size; + return static_cast( + ((static_cast((root / subgroup_size) * subgroup_size) + + subgroup_dist * subgroup_size) + + (static_cast(root) + intra_subgroup_rank_dist) % subgroup_size) % + comm_size); + } +} + +} // namespace detail + +} // namespace cugraph diff --git a/cpp/src/prims/extract_transform_e.cuh b/cpp/src/prims/extract_transform_e.cuh index d51e03628e..5741c98d90 100644 --- a/cpp/src/prims/extract_transform_e.cuh +++ b/cpp/src/prims/extract_transform_e.cuh @@ -116,8 +116,8 @@ extract_transform_e(raft::handle_t const& handle, thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last())); auto value_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(std::ignore, value_buffer) = detail:: - extract_transform_v_frontier_e( + std::tie(std::ignore, value_buffer) = + detail::extract_transform_v_frontier_e( handle, graph_view, frontier, diff --git a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh index 7ad033b93c..ba227b263b 100644 --- a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh +++ b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh @@ -64,13 +64,13 @@ namespace cugraph { * @return Dataframe buffer object storing extracted and accumulated valid @p e_op return values. */ template decltype(allocate_dataframe_buffer< - typename detail::edge_op_result_type(size_t{0}, handle.get_stream()); std::tie(std::ignore, value_buffer) = - detail::extract_transform_v_frontier_e(handle, - graph_view, - frontier, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - do_expensive_check); + detail::extract_transform_v_frontier_e(handle, + graph_view, + frontier, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + do_expensive_check); return value_buffer; } diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 58dbf7e74a..a36cf332eb 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -15,6 +15,8 @@ */ #pragma once +#include "prims/vertex_frontier.cuh" + #include #include #include @@ -129,8 +131,8 @@ template void fill_edge_major_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, EdgeMajorPropertyOutputWrapper edge_major_property_output, T input) { @@ -153,12 +155,12 @@ void fill_edge_major_property(raft::handle_t const& handle, auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); - auto rx_counts = - host_scalar_allgather(minor_comm, - static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); - auto max_rx_size = - std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { + auto local_v_list_sizes = host_scalar_allgather( + minor_comm, + static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)), + handle.get_stream()); + auto max_rx_size = std::reduce( + local_v_list_sizes.begin(), local_v_list_sizes.end(), size_t{0}, [](auto lhs, auto rhs) { return std::max(lhs, rhs); }); rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); @@ -169,14 +171,18 @@ void fill_edge_major_property(raft::handle_t const& handle, edge_partition_device_view_t( graph_view.local_edge_partition_view(i)); - 
device_bcast( - minor_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(minor_comm, + sorted_unique_vertex_first, + rx_vertices.begin(), + local_v_list_sizes[i], + i, + handle.get_stream()); if (edge_partition_keys) { thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_counts[i]), + thrust::make_counting_iterator(local_v_list_sizes[i]), [rx_vertex_first = rx_vertices.begin(), input, edge_partition_key_first = ((*edge_partition_keys)[i]).begin(), @@ -199,7 +205,7 @@ void fill_edge_major_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(rx_counts[i])), + thrust::make_counting_iterator(static_cast(local_v_list_sizes[i])), [edge_partition, rx_vertex_first = rx_vertices.begin(), input, @@ -219,7 +225,7 @@ void fill_edge_major_property(raft::handle_t const& handle, // directly scatters from the internal buffer) thrust::scatter(handle.get_thrust_policy(), val_first, - val_first + rx_counts[i], + val_first + local_v_list_sizes[i], map_first, edge_partition_value_firsts[i]); } @@ -232,17 +238,18 @@ void fill_edge_major_property(raft::handle_t const& handle, assert(edge_partition_value_firsts.size() == size_t{1}); if constexpr (contains_packed_bool_element) { thrust::for_each(handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [input, output_value_first = edge_partition_value_firsts[0]] __device__( auto v) { packed_bool_atomic_set(output_value_first, v, input); }); } else { auto val_first = thrust::make_constant_iterator(input); - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + thrust::distance(vertex_first, vertex_last), - vertex_first, - edge_partition_value_firsts[0]); + thrust::scatter( + handle.get_thrust_policy(), + val_first, + val_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + sorted_unique_vertex_first, + edge_partition_value_firsts[0]); } } } @@ -286,8 +293,8 @@ template void fill_edge_minor_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, EdgeMinorPropertyOutputWrapper edge_minor_property_output, T input) { @@ -300,22 +307,269 @@ void fill_edge_minor_property(raft::handle_t const& handle, using edge_t = typename GraphViewType::edge_type; auto edge_partition_value_first = edge_minor_property_output.value_first(); + vertex_t minor_range_first{}; + if constexpr (GraphViewType::is_storage_transposed) { + minor_range_first = graph_view.local_edge_partition_src_range_first(); + } else { + minor_range_first = graph_view.local_edge_partition_dst_range_first(); + } + if constexpr (GraphViewType::is_multi_gpu) { auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); auto const major_comm_rank = major_comm.get_rank(); auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); - auto rx_counts = - host_scalar_allgather(major_comm, - 
static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); - auto max_rx_size = - std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { - return std::max(lhs, rhs); - }); - rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); + constexpr size_t packed_bool_word_bcast_alignment = + 128 / + sizeof( + uint32_t); // 128B cache line alignment (unaligned ncclBroadcast operations are slower) + + std::vector max_tmp_buffer_sizes{}; + std::vector local_v_list_sizes{}; + std::vector local_v_list_range_firsts{}; + std::vector local_v_list_range_lasts{}; + { + auto v_list_size = static_cast( + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)); + rmm::device_uvector d_aggregate_tmps(major_comm_size * size_t{4}, + handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + d_aggregate_tmps.begin() + major_comm_rank * size_t{4}, + d_aggregate_tmps.begin() + (major_comm_rank + 1) * size_t{4}, + [max_tmp_buffer_size = static_cast( + static_cast(handle.get_device_properties().totalGlobalMem) * 0.05), + sorted_unique_vertex_first, + v_list_size, + vertex_partition_range_first = + graph_view.local_vertex_partition_range_first()] __device__(size_t i) { + if (i == 0) { + return max_tmp_buffer_size; + } else if (i == 1) { + return static_cast(v_list_size); + } else if (i == 2) { + vertex_t first{}; + if (v_list_size > 0) { + first = *sorted_unique_vertex_first; + } else { + first = vertex_partition_range_first; + } + assert(static_cast(static_cast(first)) == first); + return static_cast(first); + } else { + vertex_t last{}; + if (v_list_size > 0) { + last = *(sorted_unique_vertex_first + (v_list_size - 1)) + 1; + } else { + last = vertex_partition_range_first; + } + assert(static_cast(static_cast(last)) == last); + return static_cast(last); + } + }); + + if (major_comm_size > 1) { // allgather max_tmp_buffer_size, v_list_size, v_list_range_first + // (inclusive), v_list_range_last (exclusive) + device_allgather(major_comm, + d_aggregate_tmps.data() + major_comm_rank * size_t{4}, + d_aggregate_tmps.data(), + size_t{4}, + handle.get_stream()); + } + + std::vector h_aggregate_tmps(d_aggregate_tmps.size()); + raft::update_host(h_aggregate_tmps.data(), + d_aggregate_tmps.data(), + d_aggregate_tmps.size(), + handle.get_stream()); + handle.sync_stream(); + max_tmp_buffer_sizes = std::vector(major_comm_size); + local_v_list_sizes = std::vector(major_comm_size); + local_v_list_range_firsts = std::vector(major_comm_size); + local_v_list_range_lasts = std::vector(major_comm_size); + for (int i = 0; i < major_comm_size; ++i) { + max_tmp_buffer_sizes[i] = h_aggregate_tmps[i * size_t{4}]; + local_v_list_sizes[i] = static_cast(h_aggregate_tmps[i * size_t{4} + 1]); + local_v_list_range_firsts[i] = static_cast(h_aggregate_tmps[i * size_t{4} + 2]); + local_v_list_range_lasts[i] = static_cast(h_aggregate_tmps[i * size_t{4} + 3]); + } + } + + auto edge_partition_keys = edge_minor_property_output.keys(); + + std::optional> v_list_bitmap{std::nullopt}; + std::optional> compressed_v_list{std::nullopt}; + if (major_comm_size > 1) { + bool v_compressible{false}; + if constexpr (sizeof(vertex_t) > sizeof(uint32_t)) { + vertex_t local_v_list_max_range_size{0}; + for (int i = 0; i < major_comm_size; ++i) { + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + local_v_list_max_range_size = std::max(range_size, local_v_list_max_range_size); + } + if (local_v_list_max_range_size <= + std::numeric_limits::max()) { 
// broadcast 32bit offset values instead of 64 + // bit vertex IDs + v_compressible = true; + } + } + + double avg_fill_ratio{0.0}; + for (int i = 0; i < major_comm_size; ++i) { + auto num_keys = static_cast(local_v_list_sizes[i]); + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + avg_fill_ratio += (range_size > 0) + ? (static_cast(num_keys) / static_cast(range_size)) + : double{0.0}; + } + avg_fill_ratio /= static_cast(major_comm_size); + double threshold_ratio = + 1.0 / static_cast((v_compressible ? sizeof(uint32_t) : sizeof(vertex_t)) * 8); + auto avg_v_list_size = std::reduce(local_v_list_sizes.begin(), local_v_list_sizes.end()) / + static_cast(major_comm_size); + + if ((avg_fill_ratio > threshold_ratio) && + (static_cast(avg_v_list_size) > packed_bool_word_bcast_alignment)) { + if (is_packed_bool() && + !edge_partition_keys) { // directly update edge_minor_property_output (with special + // care for unaligned boundaries) + rmm::device_uvector boundary_words( + packed_bool_word_bcast_alignment, + handle.get_stream()); // for unaligned boundaries + auto leading_boundary_words = + (packed_bool_word_bcast_alignment - + packed_bool_offset(local_v_list_range_firsts[major_comm_rank] - minor_range_first) % + packed_bool_word_bcast_alignment) % + packed_bool_word_bcast_alignment; + if ((leading_boundary_words == 0) && + (packed_bool_offset(local_v_list_range_firsts[major_comm_rank] - minor_range_first) == + packed_bool_offset(graph_view.local_vertex_partition_range_first() - + minor_range_first)) && + (((local_v_list_range_firsts[major_comm_rank] - minor_range_first) % + packed_bools_per_word()) != + 0)) { // there are unaligned bits (fewer than packed_bools_per_word()) in the vertex + // partition boundary + leading_boundary_words = packed_bool_word_bcast_alignment; + } + thrust::fill(handle.get_thrust_policy(), + boundary_words.begin(), + boundary_words.begin() + leading_boundary_words, + packed_bool_empty_mask()); + if (local_v_list_range_firsts[major_comm_rank] < + local_v_list_range_lasts[major_comm_rank]) { + auto word_offset_first = + packed_bool_offset(local_v_list_range_firsts[major_comm_rank] - minor_range_first); + auto word_offset_last = + packed_bool_offset((local_v_list_range_lasts[major_comm_rank] - 1) - + minor_range_first) + + 1; + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(word_offset_first), + thrust::make_counting_iterator(word_offset_last), + [sorted_unique_vertex_first, + sorted_unique_vertex_last, + input, + minor_range_first, + leading_boundary_words, + word_offset_first, + vertex_partition_range_last = graph_view.local_vertex_partition_range_last(), + output_value_first = edge_partition_value_first, + boundary_words = raft::device_span( + boundary_words.data(), boundary_words.size())] __device__(auto i) { + auto& word = ((i - word_offset_first) < leading_boundary_words) + ? boundary_words[i - word_offset_first] + : *(output_value_first + i); + auto word_v_first = + minor_range_first + static_cast(i * packed_bools_per_word()); + auto word_v_last = + ((vertex_partition_range_last - word_v_first) <= packed_bools_per_word()) + ? 
vertex_partition_range_last + : (word_v_first + static_cast(packed_bools_per_word())); + auto it = thrust::lower_bound( + thrust::seq, sorted_unique_vertex_first, sorted_unique_vertex_last, word_v_first); + while ((it != sorted_unique_vertex_last) && (*it < word_v_last)) { + auto v_offset = *it - minor_range_first; + if (input) { + word |= packed_bool_mask(v_offset); + } else { + word &= ~packed_bool_mask(v_offset); + } + ++it; + } + }); + } + rmm::device_uvector aggregate_boundary_words( + major_comm_size * packed_bool_word_bcast_alignment, handle.get_stream()); + device_allgather(major_comm, + boundary_words.data(), + aggregate_boundary_words.data(), + packed_bool_word_bcast_alignment, + handle.get_stream()); + v_list_bitmap = std::move(aggregate_boundary_words); + } else { + v_list_bitmap = + compute_vertex_list_bitmap_info(sorted_unique_vertex_first, + sorted_unique_vertex_last, + local_v_list_range_firsts[major_comm_rank], + local_v_list_range_lasts[major_comm_rank], + handle.get_stream()); + } + } else if (v_compressible) { + rmm::device_uvector tmps(local_v_list_sizes[major_comm_rank], + handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + sorted_unique_vertex_first, + sorted_unique_vertex_last, + tmps.begin(), + cuda::proclaim_return_type( + [range_first = local_v_list_range_firsts[major_comm_rank]] __device__( + auto v) { return static_cast(v - range_first); })); + compressed_v_list = std::move(tmps); + } + } + + std::optional> stream_pool_indices{std::nullopt}; + size_t num_concurrent_bcasts{1}; + { + size_t tmp_buffer_size_per_loop{}; + for (int i = 0; i < major_comm_size; ++i) { + if (is_packed_bool() && + !edge_partition_keys && v_list_bitmap) { + tmp_buffer_size_per_loop += 0; + } else if (v_list_bitmap) { + tmp_buffer_size_per_loop += + packed_bool_size(local_v_list_range_lasts[i] - local_v_list_range_firsts[i]) * + sizeof(uint32_t) + + static_cast(local_v_list_sizes[i]) * sizeof(vertex_t); + } else { + tmp_buffer_size_per_loop += static_cast(local_v_list_sizes[i]) * sizeof(vertex_t); + } + } + tmp_buffer_size_per_loop /= major_comm_size; + size_t max_streams = + static_cast(major_comm_size); // to allow setting num_concurrent_bcasts above + // hnadle.get_stream_pool_size() + stream_pool_indices = init_stream_pool_indices( + std::reduce(max_tmp_buffer_sizes.begin(), max_tmp_buffer_sizes.end()) / + static_cast(major_comm_size), + tmp_buffer_size_per_loop, + major_comm_size, + 1, + max_streams); + num_concurrent_bcasts = (*stream_pool_indices).size(); + if ((*stream_pool_indices).size() > handle.get_stream_pool_size()) { + (*stream_pool_indices).resize(handle.get_stream_pool_size()); + } + if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; } + } std::optional> key_offsets{}; if constexpr (GraphViewType::is_storage_transposed) { @@ -324,88 +578,417 @@ void fill_edge_minor_property(raft::handle_t const& handle, key_offsets = graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets(); } - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(size_t{0})); - auto edge_partition_keys = edge_minor_property_output.keys(); - for (int i = 0; i < major_comm_size; ++i) { - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - device_bcast( - major_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + for (size_t i = 0; i < static_cast(major_comm_size); i += num_concurrent_bcasts) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); 
+ auto sub0 = std::chrono::steady_clock::now(); + auto loop_count = std::min(num_concurrent_bcasts, static_cast(major_comm_size) - i); + + if (is_packed_bool() && + !edge_partition_keys && v_list_bitmap) { + std::vector leading_boundary_word_counts(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto leading_boundary_words = + (packed_bool_word_bcast_alignment - + packed_bool_offset(local_v_list_range_firsts[partition_idx] - minor_range_first) % + packed_bool_word_bcast_alignment) % + packed_bool_word_bcast_alignment; + auto vertex_partition_id = + partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks( + major_comm_size, minor_comm_size, partition_idx, minor_comm_rank); + if ((leading_boundary_words == 0) && + (packed_bool_offset(local_v_list_range_firsts[partition_idx] - minor_range_first) == + packed_bool_offset(graph_view.vertex_partition_range_first(vertex_partition_id) - + minor_range_first)) && + (((local_v_list_range_firsts[partition_idx] - minor_range_first) % + packed_bools_per_word()) != 0)) { + leading_boundary_words = packed_bool_word_bcast_alignment; + } + leading_boundary_word_counts[j] = leading_boundary_words; + } + device_group_start(major_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + size_t bcast_size{0}; + vertex_t packed_bool_offset_first{0}; + if (local_v_list_range_firsts[partition_idx] < local_v_list_range_lasts[partition_idx]) { + auto leading_boundary_words = leading_boundary_word_counts[j]; + packed_bool_offset_first = + packed_bool_offset(local_v_list_range_firsts[partition_idx] - minor_range_first) + + static_cast(leading_boundary_words); + auto packed_bool_offset_last = + packed_bool_offset(local_v_list_range_lasts[partition_idx] - 1 - minor_range_first); + if (packed_bool_offset_first <= packed_bool_offset_last) { + bcast_size = (packed_bool_offset_last - packed_bool_offset_first) + 1; + } + } + + device_bcast(major_comm, + edge_partition_value_first + packed_bool_offset_first, + edge_partition_value_first + packed_bool_offset_first, + bcast_size, + static_cast(partition_idx), + handle.get_stream()); + } + device_group_end(major_comm); + + rmm::device_uvector d_leading_boundary_word_counts( + leading_boundary_word_counts.size(), handle.get_stream()); + raft::update_device(d_leading_boundary_word_counts.data(), + leading_boundary_word_counts.data(), + leading_boundary_word_counts.size(), + handle.get_stream()); + + rmm::device_uvector d_local_v_list_range_firsts(loop_count, handle.get_stream()); + raft::update_device(d_local_v_list_range_firsts.data(), + local_v_list_range_firsts.data() + i, + loop_count, + handle.get_stream()); - if (edge_partition_keys) { thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_counts[i]), - [rx_vertex_first = rx_vertices.begin(), - input, - subrange_key_first = (*edge_partition_keys).begin() + (*key_offsets)[i], - subrange_key_last = (*edge_partition_keys).begin() + (*key_offsets)[i + 1], - edge_partition_value_first = edge_partition_value_first, - subrange_start_offset = (*key_offsets)[i]] __device__(auto i) { - auto minor = *(rx_vertex_first + i); - auto it = - thrust::lower_bound(thrust::seq, subrange_key_first, subrange_key_last, minor); - if ((it != subrange_key_last) && (*it == minor)) { - auto subrange_offset = thrust::distance(subrange_key_first, it); - if constexpr (contains_packed_bool_element) { - fill_scalar_or_thrust_tuple( - 
edge_partition_value_first, subrange_start_offset + subrange_offset, input); - } else { - *(edge_partition_value_first + subrange_start_offset + subrange_offset) = input; + thrust::make_counting_iterator(loop_count * packed_bool_word_bcast_alignment), + [input, + minor_range_first, + leading_boundary_word_counts = raft::device_span( + d_leading_boundary_word_counts.data(), d_leading_boundary_word_counts.size()), + local_v_list_range_firsts = raft::device_span( + d_local_v_list_range_firsts.data(), d_local_v_list_range_firsts.size()), + aggregate_boundary_words = raft::device_span( + (*v_list_bitmap).data() + i * packed_bool_word_bcast_alignment, + loop_count * packed_bool_word_bcast_alignment), + output_value_first = edge_partition_value_first] __device__(size_t i) { + auto j = i / packed_bool_word_bcast_alignment; + auto leading_boundary_words = leading_boundary_word_counts[j]; + if ((i % packed_bool_word_bcast_alignment) < leading_boundary_words) { + auto boundary_word = aggregate_boundary_words[i]; + if (boundary_word != packed_bool_empty_mask()) { + auto word_offset = + packed_bool_offset(local_v_list_range_firsts[j] - minor_range_first) + + (i % packed_bool_word_bcast_alignment); + cuda::atomic_ref word( + *(output_value_first + word_offset)); + if (input) { + word.fetch_or(aggregate_boundary_words[i], cuda::std::memory_order_relaxed); + } else { + word.fetch_and(~aggregate_boundary_words[i], cuda::std::memory_order_relaxed); + } } } }); } else { - if constexpr (contains_packed_bool_element) { - thrust::for_each(handle.get_thrust_policy(), - thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(rx_counts[i])), - [edge_partition, - rx_vertex_first = rx_vertices.begin(), - input, - output_value_first = edge_partition_value_first] __device__(auto i) { - auto rx_vertex = *(rx_vertex_first + i); - auto minor_offset = - edge_partition.minor_offset_from_minor_nocheck(rx_vertex); - fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); - }); - } else { - auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), - cuda::proclaim_return_type([edge_partition] __device__(auto v) { - return edge_partition.minor_offset_from_minor_nocheck(v); - })); - auto val_first = thrust::make_constant_iterator(input); - // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and - // directly scatters from the internal buffer) - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + rx_counts[i], - map_first, - edge_partition_value_first); + std::vector, rmm::device_uvector>> + edge_partition_v_buffers{}; + edge_partition_v_buffers.reserve(loop_count); + rmm::device_uvector dummy_counters(loop_count, handle.get_stream()); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + std::variant, rmm::device_uvector> v_buffer = + rmm::device_uvector(0, handle.get_stream()); + if (v_list_bitmap) { + v_buffer = rmm::device_uvector( + packed_bool_size(local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]), + handle.get_stream()); + } else if (compressed_v_list) { + v_buffer = + rmm::device_uvector(local_v_list_sizes[partition_idx], handle.get_stream()); + } else { + std::get<0>(v_buffer).resize(local_v_list_sizes[partition_idx], handle.get_stream()); + } + edge_partition_v_buffers.push_back(std::move(v_buffer)); + } + + device_group_start(major_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + auto& v_buffer = 
edge_partition_v_buffers[j]; + if (v_list_bitmap) { + device_bcast(major_comm, + (*v_list_bitmap).data(), + std::get<1>(v_buffer).data(), + std::get<1>(v_buffer).size(), + static_cast(partition_idx), + handle.get_stream()); + } else if (compressed_v_list) { + device_bcast(major_comm, + (*compressed_v_list).data(), + std::get<1>(v_buffer).data(), + std::get<1>(v_buffer).size(), + static_cast(partition_idx), + handle.get_stream()); + } else { + device_bcast(major_comm, + (static_cast(partition_idx) == major_comm_rank) + ? sorted_unique_vertex_first + : static_cast(nullptr), + std::get<0>(v_buffer).data(), + std::get<0>(v_buffer).size(), + static_cast(partition_idx), + handle.get_stream()); + } + } + device_group_end(major_comm); + bool kernel_fusion = + !edge_partition_keys && !v_list_bitmap && (loop_count > 1) && + (static_cast(std::reduce(local_v_list_sizes.begin() + i, + local_v_list_sizes.begin() + (i + loop_count))) < + size_t{256 * 1024} /* tuning parameter (binary search vs kernel launch overhead) */ * + loop_count); // FIXME: kernle fusion can be useful even when + // edge_partition_keys.has_value() is true + + if (!kernel_fusion) { + if (stream_pool_indices) { handle.sync_stream(); } + } + + if (!kernel_fusion) { + size_t stream_pool_size{0}; + if (stream_pool_indices) { stream_pool_size = (*stream_pool_indices).size(); } + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + stream_pool_indices + ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j % stream_pool_size]) + : handle.get_stream(); + + if (v_list_bitmap) { + auto const& rx_bitmap = std::get<1>(edge_partition_v_buffers[j]); + rmm::device_uvector rx_vertices(local_v_list_sizes[partition_idx], + loop_stream); + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + rx_vertices.begin(), + raft::device_span(dummy_counters.data() + j, size_t{1}), + local_v_list_range_firsts[partition_idx], + local_v_list_range_lasts[partition_idx], + loop_stream); + edge_partition_v_buffers[j] = std::move(rx_vertices); + } + + if (edge_partition_keys) { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(local_v_list_sizes[partition_idx]), + [rx_vertex_first = compressed_v_list + ? static_cast(nullptr) + : std::get<0>(edge_partition_v_buffers[j]).data(), + rx_compressed_vertex_first = compressed_v_list + ? 
std::get<1>(edge_partition_v_buffers[j]).data() + : static_cast(nullptr), + range_first = local_v_list_range_firsts[partition_idx], + input, + subrange_key_first = + (*edge_partition_keys).begin() + (*key_offsets)[partition_idx], + subrange_key_last = + (*edge_partition_keys).begin() + (*key_offsets)[partition_idx + 1], + edge_partition_value_first = edge_partition_value_first, + subrange_start_offset = (*key_offsets)[partition_idx]] __device__(auto i) { + vertex_t minor{}; + if (rx_vertex_first != nullptr) { + minor = *(rx_vertex_first + i); + } else { + minor = range_first + *(rx_compressed_vertex_first + i); + } + auto it = + thrust::lower_bound(thrust::seq, subrange_key_first, subrange_key_last, minor); + if ((it != subrange_key_last) && (*it == minor)) { + auto subrange_offset = thrust::distance(subrange_key_first, it); + if constexpr (contains_packed_bool_element) { + fill_scalar_or_thrust_tuple( + edge_partition_value_first, subrange_start_offset + subrange_offset, input); + } else { + *(edge_partition_value_first + subrange_start_offset + subrange_offset) = + input; + } + } + }); + } else { + if constexpr (contains_packed_bool_element) { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(local_v_list_sizes[partition_idx]), + [minor_range_first, + rx_vertex_first = compressed_v_list + ? static_cast(nullptr) + : std::get<0>(edge_partition_v_buffers[j]).data(), + rx_compressed_vertex_first = compressed_v_list + ? std::get<1>(edge_partition_v_buffers[j]).data() + : static_cast(nullptr), + range_first = local_v_list_range_firsts[partition_idx], + input, + output_value_first = edge_partition_value_first] __device__(auto i) { + vertex_t minor{}; + if (rx_vertex_first != nullptr) { + minor = *(rx_vertex_first + i); + } else { + minor = range_first + *(rx_compressed_vertex_first + i); + } + auto minor_offset = minor - minor_range_first; + fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); + }); + } else { + if (compressed_v_list) { + auto map_first = thrust::make_transform_iterator( + std::get<1>(edge_partition_v_buffers[j]).begin(), + cuda::proclaim_return_type( + [minor_range_first, + range_first = + local_v_list_range_firsts[partition_idx]] __device__(auto v_offset) { + return static_cast(v_offset + (range_first - minor_range_first)); + })); + auto val_first = thrust::make_constant_iterator(input); + thrust::scatter(rmm::exec_policy_nosync(loop_stream), + val_first, + val_first + local_v_list_sizes[partition_idx], + map_first, + edge_partition_value_first); + } else { + auto map_first = thrust::make_transform_iterator( + std::get<0>(edge_partition_v_buffers[j]).begin(), + cuda::proclaim_return_type( + [minor_range_first] __device__(auto v) { return v - minor_range_first; })); + auto val_first = thrust::make_constant_iterator(input); + thrust::scatter(rmm::exec_policy_nosync(loop_stream), + val_first, + val_first + local_v_list_sizes[partition_idx], + map_first, + edge_partition_value_first); + } + } + } + } + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + } else { // kernel fusion + std::vector h_vertex_vars(loop_count /* range_first values */ + + (loop_count + 1) /* loop offsets */); + std::copy(local_v_list_range_firsts.begin() + i, + local_v_list_range_firsts.begin() + (i + loop_count), + h_vertex_vars.begin()); + h_vertex_vars[loop_count] = 0; + std::inclusive_scan(local_v_list_sizes.begin() + i, + local_v_list_sizes.begin() + (i + loop_count), + 
h_vertex_vars.begin() + (loop_count + 1)); + std::vector h_ptrs(loop_count); + if (compressed_v_list) { + for (size_t j = 0; j < loop_count; ++j) { + h_ptrs[j] = static_cast(std::get<1>(edge_partition_v_buffers[j]).data()); + } + } else { + for (size_t j = 0; j < loop_count; ++j) { + h_ptrs[j] = static_cast(std::get<0>(edge_partition_v_buffers[j]).data()); + } + } + rmm::device_uvector d_vertex_vars(h_vertex_vars.size(), handle.get_stream()); + rmm::device_uvector d_ptrs(h_ptrs.size(), handle.get_stream()); + raft::update_device( + d_vertex_vars.data(), h_vertex_vars.data(), h_vertex_vars.size(), handle.get_stream()); + raft::update_device(d_ptrs.data(), h_ptrs.data(), h_ptrs.size(), handle.get_stream()); + + raft::device_span range_firsts(d_vertex_vars.data(), loop_count); + raft::device_span loop_offsets(d_vertex_vars.data() + loop_count, + loop_count + 1); + if constexpr (contains_packed_bool_element) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(h_vertex_vars.back()), + [range_firsts, + loop_offsets, + minor_range_first, + input, + rx_firsts = raft::device_span(d_ptrs.data(), d_ptrs.size()), + output_value_first = edge_partition_value_first, + compressed = compressed_v_list.has_value()] __device__(auto i) { + auto loop_idx = + thrust::distance(loop_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, loop_offsets.begin() + 1, loop_offsets.end(), i)); + auto rx_first = rx_firsts[loop_idx]; + vertex_t minor{}; + if (compressed) { + minor = range_firsts[loop_idx] + + *(static_cast(rx_first) + (i - loop_offsets[loop_idx])); + } else { + minor = *(static_cast(rx_first) + (i - loop_offsets[loop_idx])); + } + auto minor_offset = minor - minor_range_first; + fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); + }); + } else { + auto val_first = thrust::make_constant_iterator(input); + if (compressed_v_list) { + auto map_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type( + [range_firsts, + loop_offsets, + rx_firsts = raft::device_span(d_ptrs.data(), d_ptrs.size()), + minor_range_first] __device__(auto i) { + auto loop_idx = thrust::distance( + loop_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, loop_offsets.begin() + 1, loop_offsets.end(), i)); + auto minor = + range_firsts[loop_idx] + *(static_cast(rx_firsts[loop_idx]) + + (i - loop_offsets[loop_idx])); + return minor - minor_range_first; + })); + thrust::scatter(handle.get_thrust_policy(), + val_first, + val_first + h_vertex_vars.back(), + map_first, + edge_partition_value_first); + } else { + auto map_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type( + [loop_offsets, + rx_firsts = raft::device_span(d_ptrs.data(), d_ptrs.size()), + minor_range_first] __device__(auto i) { + auto loop_idx = thrust::distance( + loop_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, loop_offsets.begin() + 1, loop_offsets.end(), i)); + auto minor = *(static_cast(rx_firsts[loop_idx]) + + (i - loop_offsets[loop_idx])); + return minor - minor_range_first; + })); + thrust::scatter(handle.get_thrust_policy(), + val_first, + val_first + h_vertex_vars.back(), + map_first, + edge_partition_value_first); + } + } } } } } else { assert(graph_view.local_vertex_partition_range_size() == - graph_view.local_edge_partition_src_range_size()); + (GraphViewType::is_storage_transposed + ? 
graph_view.local_edge_partition_src_range_size()
+              : graph_view.local_edge_partition_dst_range_size()));
     if constexpr (contains_packed_bool_element) {
       thrust::for_each(handle.get_thrust_policy(),
-                       vertex_first,
-                       vertex_last,
+                       sorted_unique_vertex_first,
+                       sorted_unique_vertex_last,
                        [input, output_value_first = edge_partition_value_first] __device__(auto v) {
                          fill_scalar_or_thrust_tuple(output_value_first, v, input);
                        });
     } else {
       auto val_first = thrust::make_constant_iterator(input);
-      thrust::scatter(handle.get_thrust_policy(),
-                      val_first,
-                      val_first + thrust::distance(vertex_first, vertex_last),
-                      vertex_first,
-                      edge_partition_value_first);
+      thrust::scatter(
+        handle.get_thrust_policy(),
+        val_first,
+        val_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last),
+        sorted_unique_vertex_first,
+        edge_partition_value_first);
     }
   }
 }
@@ -451,8 +1034,8 @@ void fill_edge_src_property(raft::handle_t const& handle,
 /**
  * @brief Fill graph edge source property values to the input value.
  *
- * This version fills only a subset of graph edge source property values. [@p vertex_first,
- * @p vertex_last) specifies the vertices to be filled.
+ * This version fills only a subset of graph edge source property values. [@p
+ * sorted_unique_vertex_first, @p sorted_unique_vertex_last) specifies the vertices to be filled.
 *
 * @tparam GraphViewType Type of the passed non-owning graph object.
 * @tparam VertexIterator Type of the iterator for vertex identifiers.
@@ -461,10 +1044,12 @@ void fill_edge_src_property(raft::handle_t const& handle,
 * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
 * handles to various CUDA libraries) to run graph algorithms.
 * @param graph_view Non-owning graph object.
- * @param vertex_first Iterator pointing to the first (inclusive) vertex with a value to be filled.
- * v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex partition
- * assigned to this process in multi-GPU), otherwise undefined behavior.
- * @param vertex_last Iterator pointing to the last (exclusive) vertex with a value to be filled.
+ * @param sorted_unique_vertex_first Iterator pointing to the first (inclusive) vertex with a value
+ * to be filled. v in [sorted_unique_vertex_first, sorted_unique_vertex_last) should be sorted &
+ * distinct (and should belong to the vertex partition assigned to this process in multi-GPU),
+ * otherwise undefined behavior.
+ * @param sorted_unique_vertex_last Iterator pointing to the last (exclusive) vertex with a value to
+ * be filled.
 * @param edge_src_property_output edge_src_property_view_t class object to store source property
 * values (for the edge source assigned to this process in multi-GPU).
 * @param input Edge source property values will be set to @p input. 
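As a reading aid for the doc comment above, the following is a minimal usage sketch and not part of the patch itself. It assumes the internal include path "prims/fill_edge_src_dst_property.cuh", an edge_src_property_t<GraphViewType, bool> object created by the caller, and a hypothetical helper name mark_first_k_local_srcs; the vertex range passed in must be sorted, unique, and local to this GPU, as the renamed sorted_unique_vertex_first/last parameters require.

#include "prims/fill_edge_src_dst_property.cuh"  // internal prim header (assumed include path)

#include <cugraph/edge_property.hpp>
#include <raft/core/handle.hpp>
#include <rmm/device_uvector.hpp>
#include <thrust/sequence.h>

// Hypothetical helper: mark the first k vertices of this GPU's local vertex partition
// as "visited" on the edge-source side of every local edge partition.
template <typename GraphViewType>
void mark_first_k_local_srcs(raft::handle_t const& handle,
                             GraphViewType const& graph_view,
                             cugraph::edge_src_property_t<GraphViewType, bool>& edge_src_visited,
                             typename GraphViewType::vertex_type k)
{
  using vertex_t = typename GraphViewType::vertex_type;

  // consecutive vertex IDs starting at the local partition's first vertex:
  // sorted & unique by construction, and guaranteed to be local to this GPU.
  rmm::device_uvector<vertex_t> vertices(static_cast<size_t>(k), handle.get_stream());
  thrust::sequence(handle.get_thrust_policy(),
                   vertices.begin(),
                   vertices.end(),
                   graph_view.local_vertex_partition_range_first());

  // subset overload documented above: fills only the listed vertices to the input value (true).
  cugraph::fill_edge_src_property(handle,
                                  graph_view,
                                  vertices.begin(),
                                  vertices.end(),
                                  edge_src_visited.mutable_view(),
                                  true /* input */);
}

A consecutive range starting at the local partition's first vertex trivially satisfies the sorted & unique precondition, which is why thrust::sequence is sufficient here; an arbitrary vertex list would need to be sorted and de-duplicated first.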
@@ -476,8 +1061,8 @@ template void fill_edge_src_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, EdgeSrcValueOutputWrapper edge_src_property_output, T input, bool do_expensive_check = false) @@ -486,8 +1071,8 @@ void fill_edge_src_property(raft::handle_t const& handle, if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), local_vertex_partition_range_last = graph_view.local_vertex_partition_range_last()] __device__(auto v) { @@ -498,17 +1083,25 @@ void fill_edge_src_property(raft::handle_t const& handle, num_invalids = host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); } - CUGRAPH_EXPECTS( - num_invalids == 0, - "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in " + "[sorted_unique_vertex_first, sorted_unique_vertex_last)."); } if constexpr (GraphViewType::is_storage_transposed) { - detail::fill_edge_minor_property( - handle, graph_view, vertex_first, vertex_last, edge_src_property_output, input); + detail::fill_edge_minor_property(handle, + graph_view, + sorted_unique_vertex_first, + sorted_unique_vertex_last, + edge_src_property_output, + input); } else { - detail::fill_edge_major_property( - handle, graph_view, vertex_first, vertex_last, edge_src_property_output, input); + detail::fill_edge_major_property(handle, + graph_view, + sorted_unique_vertex_first, + sorted_unique_vertex_last, + edge_src_property_output, + input); } } @@ -552,8 +1145,8 @@ void fill_edge_dst_property(raft::handle_t const& handle, /** * @brief Fill graph edge destination property values to the input value. * - * This version fills only a subset of graph edge destination property values. [@p vertex_first, - * @p vertex_last) specifies the vertices to be filled. + * This version fills only a subset of graph edge destination property values. [@p + * sorted_unique_vertex_first, @p sorted_unique_vertex_last) specifies the vertices to be filled. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. @@ -563,10 +1156,12 @@ void fill_edge_dst_property(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param vertex_first Iterator pointing to the first (inclusive) vertex with a value to be filled. - * v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex partition - * assigned to this process in multi-GPU), otherwise undefined behavior. - * @param vertex_last Iterator pointing to the last (exclusive) vertex with a value to be filled. + * @param sorted_unique_vertex_first Iterator pointing to the first (inclusive) vertex with a value + * to be filled. v in [sorted_unique_vertex_first, sorted_unique_vertex_last) should be sorted & + * distinct (and should belong to the vertex partition assigned to this process in multi-GPU), + * otherwise undefined behavior. 
+ * @param sorted_unique_vertex_last Iterator pointing to the last (exclusive) vertex with a value to + * be filled. * @param edge_dst_property_output edge_dst_property_view_t class object to store destination * property values (for the edge destinations assigned to this process in multi-GPU). * @param input Edge destination property values will be set to @p input. @@ -578,8 +1173,8 @@ template void fill_edge_dst_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, EdgeDstValueOutputWrapper edge_dst_property_output, T input, bool do_expensive_check = false) @@ -588,8 +1183,8 @@ void fill_edge_dst_property(raft::handle_t const& handle, if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), local_vertex_partition_range_last = graph_view.local_vertex_partition_range_last()] __device__(auto v) { @@ -600,17 +1195,25 @@ void fill_edge_dst_property(raft::handle_t const& handle, num_invalids = host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); } - CUGRAPH_EXPECTS( - num_invalids == 0, - "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in " + "[sorted_unique_vertex_first, sorted_unique_vertex_last)."); } if constexpr (GraphViewType::is_storage_transposed) { - detail::fill_edge_major_property( - handle, graph_view, vertex_first, vertex_last, edge_dst_property_output, input); + detail::fill_edge_major_property(handle, + graph_view, + sorted_unique_vertex_first, + sorted_unique_vertex_last, + edge_dst_property_output, + input); } else { - detail::fill_edge_minor_property( - handle, graph_view, vertex_first, vertex_last, edge_dst_property_output, input); + detail::fill_edge_minor_property(handle, + graph_view, + sorted_unique_vertex_first, + sorted_unique_vertex_last, + edge_dst_property_output, + input); } } diff --git a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh index ce5e5d3e8c..f03e8f54fb 100644 --- a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh +++ b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh @@ -250,11 +250,14 @@ void per_v_pair_transform_dst_nbr_intersection( } auto num_input_pairs = static_cast(thrust::distance(vertex_pair_first, vertex_pair_last)); - std::optional> unique_vertices{std::nullopt}; + std::optional> sorted_unique_vertices{std::nullopt}; std::optional(size_t{0}, rmm::cuda_stream_view{}))> - property_buffer_for_unique_vertices{std::nullopt}; + property_buffer_for_sorted_unique_vertices{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { - unique_vertices = rmm::device_uvector(num_input_pairs * 2, handle.get_stream()); + auto& comm = handle.get_comms(); + + sorted_unique_vertices = + rmm::device_uvector(num_input_pairs * 2, handle.get_stream()); auto elem0_first = thrust::make_transform_iterator( vertex_pair_first, cugraph::thrust_tuple_get::value_type, @@ -262,7 +265,7 @@ void per_v_pair_transform_dst_nbr_intersection( thrust::copy(handle.get_thrust_policy(), elem0_first, elem0_first + num_input_pairs, - 
(*unique_vertices).begin()); + (*sorted_unique_vertices).begin()); auto elem1_first = thrust::make_transform_iterator( vertex_pair_first, cugraph::thrust_tuple_get::value_type, @@ -270,25 +273,25 @@ void per_v_pair_transform_dst_nbr_intersection( thrust::copy(handle.get_thrust_policy(), elem1_first, elem1_first + num_input_pairs, - (*unique_vertices).begin() + num_input_pairs); - thrust::sort(handle.get_thrust_policy(), (*unique_vertices).begin(), (*unique_vertices).end()); - (*unique_vertices) - .resize(thrust::distance((*unique_vertices).begin(), + (*sorted_unique_vertices).begin() + num_input_pairs); + thrust::sort(handle.get_thrust_policy(), + (*sorted_unique_vertices).begin(), + (*sorted_unique_vertices).end()); + (*sorted_unique_vertices) + .resize(thrust::distance((*sorted_unique_vertices).begin(), thrust::unique(handle.get_thrust_policy(), - (*unique_vertices).begin(), - (*unique_vertices).end())), + (*sorted_unique_vertices).begin(), + (*sorted_unique_vertices).end())), handle.get_stream()); - std::tie(unique_vertices, property_buffer_for_unique_vertices) = - collect_values_for_unique_int_vertices(handle, - std::move(*unique_vertices), - vertex_value_input_first, - graph_view.vertex_partition_range_lasts()); - thrust::sort_by_key( - handle.get_thrust_policy(), - (*unique_vertices).begin(), - (*unique_vertices).end(), - (*property_buffer_for_unique_vertices).begin()); // necessary for binary search + property_buffer_for_sorted_unique_vertices = collect_values_for_sorted_unique_int_vertices( + comm, + raft::device_span((*sorted_unique_vertices).data(), + (*sorted_unique_vertices).size()), + vertex_value_input_first, + graph_view.vertex_partition_range_lasts(), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); } rmm::device_uvector vertex_pair_indices(num_input_pairs, handle.get_stream()); @@ -412,32 +415,32 @@ void per_v_pair_transform_dst_nbr_intersection( do_expensive_check); } - if (unique_vertices) { - auto vertex_value_input_for_unique_vertices_first = - get_dataframe_buffer_begin(*property_buffer_for_unique_vertices); - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(this_chunk_size), - detail::call_intersection_op_t< - GraphViewType, - decltype(vertex_value_input_for_unique_vertices_first), - typename decltype(r_nbr_intersection_property_values0)::const_pointer, - IntersectionOp, - decltype(chunk_vertex_pair_index_first), - VertexPairIterator, - VertexPairValueOutputIterator>{edge_partition, - thrust::make_optional>( - (*unique_vertices).data(), (*unique_vertices).size()), - vertex_value_input_for_unique_vertices_first, - intersection_op, - intersection_offsets.data(), - intersection_indices.data(), - r_nbr_intersection_property_values0.data(), - r_nbr_intersection_property_values1.data(), - chunk_vertex_pair_index_first, - vertex_pair_first, - vertex_pair_value_output_first}); + if (sorted_unique_vertices) { + auto vertex_value_input_for_sorted_unique_vertices_first = + get_dataframe_buffer_begin(*property_buffer_for_sorted_unique_vertices); + thrust::for_each(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(this_chunk_size), + detail::call_intersection_op_t< + GraphViewType, + decltype(vertex_value_input_for_sorted_unique_vertices_first), + typename decltype(r_nbr_intersection_property_values0)::const_pointer, + IntersectionOp, + decltype(chunk_vertex_pair_index_first), + VertexPairIterator, + 
VertexPairValueOutputIterator>{ + edge_partition, + thrust::make_optional>( + (*sorted_unique_vertices).data(), (*sorted_unique_vertices).size()), + vertex_value_input_for_sorted_unique_vertices_first, + intersection_op, + intersection_offsets.data(), + intersection_indices.data(), + r_nbr_intersection_property_values0.data(), + r_nbr_intersection_property_values1.data(), + chunk_vertex_pair_index_first, + vertex_pair_first, + vertex_pair_value_output_first}); } else { thrust::for_each(handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index dd34d4b06a..30706632ad 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -206,7 +206,7 @@ struct return_value_compute_offset_t { template >, decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> per_v_random_select_transform_e(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& key_list, EdgeBiasSrcValueInputWrapper edge_bias_src_value_input, EdgeBiasDstValueInputWrapper edge_bias_dst_value_input, EdgeBiasValueInputWrapper edge_bias_value_input, @@ -237,7 +237,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, { using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; - using key_t = typename VertexFrontierBucketType::key_type; + using key_t = typename KeyBucketType::key_type; using key_buffer_t = dataframe_buffer_type_t; using edge_partition_src_input_device_view_t = std::conditional_t< @@ -286,15 +286,15 @@ per_v_random_select_transform_e(raft::handle_t const& handle, if (do_expensive_check) { // FIXME: better re-factor this check function? 
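For reference, sorted_unique_vertices in the hunk above is built with the usual sort-then-unique idiom, and the new collect_values_for_sorted_unique_int_vertices returns property values already aligned with that sorted span, which is why the follow-up thrust::sort_by_key (previously kept so the values could be located by binary search) is dropped. A minimal standalone sketch of the idiom, with illustrative names only:

  #include <rmm/cuda_stream_view.hpp>
  #include <rmm/device_uvector.hpp>
  #include <rmm/exec_policy.hpp>
  #include <thrust/distance.h>
  #include <thrust/sort.h>
  #include <thrust/unique.h>

  // Sort a device vector of vertex ids and drop duplicates in place.
  template <typename vertex_t>
  void sort_and_uniquify(rmm::device_uvector<vertex_t>& vertices, rmm::cuda_stream_view stream)
  {
    thrust::sort(rmm::exec_policy(stream), vertices.begin(), vertices.end());
    auto unique_last = thrust::unique(rmm::exec_policy(stream), vertices.begin(), vertices.end());
    vertices.resize(thrust::distance(vertices.begin(), unique_last), stream);
    vertices.shrink_to_fit(stream);
  }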
- auto frontier_vertex_first = - thrust_tuple_get_or_identity(frontier.begin()); - auto frontier_vertex_last = - thrust_tuple_get_or_identity(frontier.end()); + auto key_list_vertex_first = + thrust_tuple_get_or_identity(key_list.begin()); + auto key_list_vertex_last = + thrust_tuple_get_or_identity(key_list.end()); auto num_invalid_keys = - frontier.size() - + key_list.size() - thrust::count_if(handle.get_thrust_policy(), - frontier_vertex_first, - frontier_vertex_last, + key_list_vertex_first, + key_list_vertex_last, check_in_range_t{graph_view.local_vertex_partition_range_first(), graph_view.local_vertex_partition_range_last()}); if constexpr (GraphViewType::is_multi_gpu) { @@ -302,35 +302,35 @@ per_v_random_select_transform_e(raft::handle_t const& handle, handle.get_comms(), num_invalid_keys, raft::comms::op_t::SUM, handle.get_stream()); } CUGRAPH_EXPECTS(num_invalid_keys == size_t{0}, - "Invalid input argument: frontier includes out-of-range keys."); + "Invalid input argument: key_list includes out-of-range keys."); } - std::vector local_frontier_sizes{}; + std::vector local_key_list_sizes{}; if (minor_comm_size > 1) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - local_frontier_sizes = host_scalar_allgather(minor_comm, frontier.size(), handle.get_stream()); + local_key_list_sizes = host_scalar_allgather(minor_comm, key_list.size(), handle.get_stream()); } else { - local_frontier_sizes = std::vector{frontier.size()}; + local_key_list_sizes = std::vector{key_list.size()}; } - std::vector local_frontier_displacements(local_frontier_sizes.size()); - std::exclusive_scan(local_frontier_sizes.begin(), - local_frontier_sizes.end(), - local_frontier_displacements.begin(), + std::vector local_key_list_displacements(local_key_list_sizes.size()); + std::exclusive_scan(local_key_list_sizes.begin(), + local_key_list_sizes.end(), + local_key_list_displacements.begin(), size_t{0}); - // 1. aggregate frontier + // 1. aggregate key_list - std::optional aggregate_local_frontier{std::nullopt}; + std::optional aggregate_local_key_list{std::nullopt}; if (minor_comm_size > 1) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - aggregate_local_frontier = allocate_dataframe_buffer( - local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream()); + aggregate_local_key_list = allocate_dataframe_buffer( + local_key_list_displacements.back() + local_key_list_sizes.back(), handle.get_stream()); device_allgatherv(minor_comm, - frontier.begin(), - get_dataframe_buffer_begin(*aggregate_local_frontier), - local_frontier_sizes, - local_frontier_displacements, + key_list.begin(), + get_dataframe_buffer_begin(*aggregate_local_key_list), + local_key_list_sizes, + local_key_list_displacements, handle.get_stream()); } @@ -339,66 +339,66 @@ per_v_random_select_transform_e(raft::handle_t const& handle, rmm::device_uvector sample_local_nbr_indices(0, handle.get_stream()); std::optional> sample_key_indices{std::nullopt}; - std::vector local_frontier_sample_offsets{}; + std::vector local_key_list_sample_offsets{}; if constexpr (std::is_same_v>) { - std::tie(sample_local_nbr_indices, sample_key_indices, local_frontier_sample_offsets) = + std::tie(sample_local_nbr_indices, sample_key_indices, local_key_list_sample_offsets) = uniform_sample_and_compute_local_nbr_indices( handle, graph_view, - (minor_comm_size > 1) ? 
get_dataframe_buffer_cbegin(*aggregate_local_frontier) - : frontier.begin(), - local_frontier_displacements, - local_frontier_sizes, + (minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_key_list) + : key_list.begin(), + local_key_list_displacements, + local_key_list_sizes, rng_state, K, with_replacement); } else { - std::tie(sample_local_nbr_indices, sample_key_indices, local_frontier_sample_offsets) = + std::tie(sample_local_nbr_indices, sample_key_indices, local_key_list_sample_offsets) = biased_sample_and_compute_local_nbr_indices( handle, graph_view, - (minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_frontier) - : frontier.begin(), + (minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_key_list) + : key_list.begin(), edge_bias_src_value_input, edge_bias_dst_value_input, edge_bias_value_input, e_bias_op, - local_frontier_displacements, - local_frontier_sizes, + local_key_list_displacements, + local_key_list_sizes, rng_state, K, with_replacement, do_expensive_check); } - std::vector local_frontier_sample_counts(minor_comm_size); - std::adjacent_difference(local_frontier_sample_offsets.begin() + 1, - local_frontier_sample_offsets.end(), - local_frontier_sample_counts.begin()); + std::vector local_key_list_sample_counts(minor_comm_size); + std::adjacent_difference(local_key_list_sample_offsets.begin() + 1, + local_key_list_sample_offsets.end(), + local_key_list_sample_counts.begin()); // 3. transform auto sample_e_op_results = - allocate_dataframe_buffer(local_frontier_sample_offsets.back(), handle.get_stream()); + allocate_dataframe_buffer(local_key_list_sample_offsets.back(), handle.get_stream()); for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { auto edge_partition = edge_partition_device_view_t( graph_view.local_edge_partition_view(i)); - auto edge_partition_frontier_key_first = - ((minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_frontier) - : frontier.begin()) + - local_frontier_displacements[i]; + auto edge_partition_key_list_first = + ((minor_comm_size > 1) ? 
get_dataframe_buffer_cbegin(*aggregate_local_key_list) + : key_list.begin()) + + local_key_list_displacements[i]; auto edge_partition_sample_local_nbr_index_first = - sample_local_nbr_indices.begin() + local_frontier_sample_offsets[i]; + sample_local_nbr_indices.begin() + local_key_list_sample_offsets[i]; auto edge_partition_sample_e_op_result_first = - get_dataframe_buffer_begin(sample_e_op_results) + local_frontier_sample_offsets[i]; + get_dataframe_buffer_begin(sample_e_op_results) + local_key_list_sample_offsets[i]; edge_partition_src_input_device_view_t edge_partition_src_value_input{}; edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; @@ -415,14 +415,14 @@ per_v_random_select_transform_e(raft::handle_t const& handle, if (sample_key_indices) { auto edge_partition_sample_key_index_first = - (*sample_key_indices).begin() + local_frontier_sample_offsets[i]; + (*sample_key_indices).begin() + local_key_list_sample_offsets[i]; thrust::transform( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(local_frontier_sample_counts[i]), + thrust::make_counting_iterator(local_key_list_sample_counts[i]), edge_partition_sample_e_op_result_first, transform_local_nbr_indices_t{ edge_partition, thrust::make_optional(edge_partition_sample_key_index_first), - edge_partition_frontier_key_first, + edge_partition_key_list_first, edge_partition_sample_local_nbr_index_first, edge_partition_src_value_input, edge_partition_dst_value_input, @@ -444,10 +444,10 @@ per_v_random_select_transform_e(raft::handle_t const& handle, thrust::transform( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(frontier.size() * K), + thrust::make_counting_iterator(key_list.size() * K), edge_partition_sample_e_op_result_first, transform_local_nbr_indices_t{edge_partition, thrust::nullopt, - edge_partition_frontier_key_first, + edge_partition_key_list_first, edge_partition_sample_local_nbr_index_first, edge_partition_src_value_input, edge_partition_dst_value_input, @@ -466,13 +466,13 @@ per_v_random_select_transform_e(raft::handle_t const& handle, K}); } } - aggregate_local_frontier = std::nullopt; + aggregate_local_key_list = std::nullopt; // 4. shuffle randomly selected & transformed results and update sample_offsets auto sample_offsets = invalid_value ? 
std::nullopt : std::make_optional>( - frontier.size() + 1, handle.get_stream()); + key_list.size() + 1, handle.get_stream()); assert(K <= std::numeric_limits::max()); if (minor_comm_size > 1) { sample_local_nbr_indices.resize(0, handle.get_stream()); @@ -483,12 +483,12 @@ per_v_random_select_transform_e(raft::handle_t const& handle, std::tie(sample_e_op_results, std::ignore) = shuffle_values(minor_comm, get_dataframe_buffer_begin(sample_e_op_results), - local_frontier_sample_counts, + local_key_list_sample_counts, handle.get_stream()); std::tie(sample_key_indices, std::ignore) = shuffle_values( - minor_comm, (*sample_key_indices).begin(), local_frontier_sample_counts, handle.get_stream()); + minor_comm, (*sample_key_indices).begin(), local_key_list_sample_counts, handle.get_stream()); - rmm::device_uvector sample_counts(frontier.size(), handle.get_stream()); + rmm::device_uvector sample_counts(key_list.size(), handle.get_stream()); thrust::fill( handle.get_thrust_policy(), sample_counts.begin(), sample_counts.end(), int32_t{0}); auto sample_intra_partition_displacements = @@ -504,7 +504,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, sample_counts.resize(0, handle.get_stream()); sample_counts.shrink_to_fit(handle.get_stream()); - resize_dataframe_buffer(tmp_sample_e_op_results, frontier.size() * K, handle.get_stream()); + resize_dataframe_buffer(tmp_sample_e_op_results, key_list.size() * K, handle.get_stream()); thrust::fill(handle.get_thrust_policy(), get_dataframe_buffer_begin(tmp_sample_e_op_results), get_dataframe_buffer_end(tmp_sample_e_op_results), @@ -553,7 +553,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, sample_e_op_results = std::move(tmp_sample_e_op_results); } else { if (!invalid_value) { - rmm::device_uvector sample_counts(frontier.size(), handle.get_stream()); + rmm::device_uvector sample_counts(key_list.size(), handle.get_stream()); thrust::tabulate( handle.get_thrust_policy(), sample_counts.begin(), @@ -597,8 +597,8 @@ per_v_random_select_transform_e(raft::handle_t const& handle, * @brief Randomly select and transform the input (tagged-)vertices' outgoing edges with biases. * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam VertexFrontierBucketType Type of the vertex frontier bucket class which abstracts the - * current (tagged-)vertex frontier. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. @@ -609,8 +609,8 @@ per_v_random_select_transform_e(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param frontier VertexFrontierBucketType class object to store the (tagged-)vertex list to sample - * outgoing edges. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to sample outgoing + * edges. * @param edge_src_value_input Wrapper used to access source input property values (for the edge * sources assigned to this process in multi-GPU). 
Use either cugraph::edge_src_property_t::view() * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() @@ -647,11 +647,11 @@ per_v_random_select_transform_e(raft::handle_t const& handle, * @return std::tuple Tuple of an optional offset vector of type * std::optional> and a dataframe buffer storing the output values of * type @p T from the selected edges. If @p invalid_value is std::nullopt, the offset vector is - * valid and has the size of @p frontier.size() + 1. If @p invalid_value.has_value() is true, - * std::nullopt is returned (the dataframe buffer will store @p frontier.size() * @p K elements). + * valid and has the size of @p key_list.size() + 1. If @p invalid_value.has_value() is true, + * std::nullopt is returned (the dataframe buffer will store @p key_list.size() * @p K elements). */ template >, decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& key_list, EdgeBiasSrcValueInputWrapper edge_bias_src_value_input, EdgeBiasDstValueInputWrapper edge_bias_dst_value_input, EdgeBiasValueInputWrapper edge_bias_value_input, @@ -682,7 +682,7 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, { return detail::per_v_random_select_transform_e(handle, graph_view, - frontier, + key_list, edge_bias_src_value_input, edge_bias_dst_value_input, edge_bias_value_input, @@ -705,8 +705,8 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, * (uniform neighbor sampling). * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam VertexFrontierBucketType Type of the vertex frontier bucket class which abstracts the - * current (tagged-)vertex frontier. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. @@ -715,8 +715,8 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param frontier VertexFrontierBucketType class object to store the (tagged-)vertex list to sample - * outgoing edges. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to sample outgoing + * edges. * @param edge_src_value_input Wrapper used to access source input property values (for the edge * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() @@ -747,11 +747,11 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, * @return std::tuple Tuple of an optional offset vector of type * std::optional> and a dataframe buffer storing the output values of * type @p T from the selected edges. If @p invalid_value is std::nullopt, the offset vector is - * valid and has the size of @p frontier.size() + 1. 
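The @return contract above admits two layouts for the sampled values: a CSR-style offset vector of length key_list.size() + 1 when @p invalid_value is std::nullopt, or exactly K padded slots per key when @p invalid_value is set. A minimal host-side sketch, assuming the offset vector and the dataframe buffer have already been copied back into std::vector objects (names here are illustrative, not cugraph API):

  #include <cstddef>
  #include <optional>
  #include <vector>

  // Count the valid samples each key received, under either documented return shape.
  template <typename value_t>
  std::vector<std::size_t> per_key_sample_counts(
    std::optional<std::vector<std::size_t>> const& offsets,  // size num_keys + 1 when present
    std::vector<value_t> const& results,
    std::size_t num_keys,
    std::size_t K,
    std::optional<value_t> invalid_value)
  {
    std::vector<std::size_t> counts(num_keys, 0);
    if (offsets) {  // invalid_value was std::nullopt: key i owns results[(*offsets)[i], (*offsets)[i + 1])
      for (std::size_t i = 0; i < num_keys; ++i) { counts[i] = (*offsets)[i + 1] - (*offsets)[i]; }
    } else {  // invalid_value was set: key i owns exactly K slots, padded with *invalid_value
      for (std::size_t i = 0; i < num_keys; ++i) {
        for (std::size_t j = i * K; j < (i + 1) * K; ++j) {
          if (results[j] != *invalid_value) { ++counts[i]; }
        }
      }
    }
    return counts;
  }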
If @p invalid_value.has_value() is true, - * std::nullopt is returned (the dataframe buffer will store @p frontier.size() * @p K elements). + * valid and has the size of @p key_list.size() + 1. If @p invalid_value.has_value() is true, + * std::nullopt is returned (the dataframe buffer will store @p key_list.size() * @p K elements). */ template >, decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& key_list, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, @@ -775,7 +775,7 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, return detail::per_v_random_select_transform_e( handle, graph_view, - frontier, + key_list, edge_src_dummy_property_t{}.view(), edge_dst_dummy_property_t{}.view(), edge_dummy_property_t{}.view(), @@ -783,7 +783,7 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, detail::edge_endpoint_dummy_property_view_t, detail::edge_endpoint_dummy_property_view_t, edge_dummy_property_view_t, - typename VertexFrontierBucketType::key_type>{}, + typename KeyBucketType::key_type>{}, edge_src_value_input, edge_dst_value_input, edge_value_input, diff --git a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh index 5a5e933209..c13816242b 100644 --- a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh @@ -924,11 +924,12 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( auto values_for_unique_keys = allocate_dataframe_buffer(0, handle.get_stream()); std::tie(unique_minor_keys, values_for_unique_keys) = - collect_values_for_unique_keys(handle, + collect_values_for_unique_keys(comm, kv_store_view, std::move(unique_minor_keys), cugraph::detail::compute_gpu_id_from_ext_vertex_t{ - comm_size, major_comm_size, minor_comm_size}); + comm_size, major_comm_size, minor_comm_size}, + handle.get_stream()); if constexpr (KVStoreViewType::binary_search) { multi_gpu_minor_key_value_map_ptr = diff --git a/cpp/src/prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh new file mode 100644 index 0000000000..1e0d366429 --- /dev/null +++ b/cpp/src/prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "prims/detail/per_v_transform_reduce_e.cuh" +#include "prims/vertex_frontier.cuh" + +#include +#include +#include + +#include + +#include +#include + +namespace cugraph { + +/** + * @brief Iterate over every vertex's incoming edges to update vertex properties. 
+ * + * This function is inspired by thrust::transform_reduce. In addition, this function excludes the + * edges that return false when the predicate @p pred_op is applied. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam PredOp Type of the quinary predicate operator. + * @tparam T Type of the initial value for per-vertex reduction. + * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to + * fill the wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. + * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator takes edge source, edge destination, property values for the source, + * destination, and edge and returns a value to be reduced. + * @param init Initial value to be added to the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 outgoing edges. + * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. + * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @param pred_op Quinary operator takes edge source, edge destination, property values for the + * source, destination, and edge and returns whether this edge should be included (if true is + * returned) or excluded. 
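A minimal sketch of calling the function documented above, counting for each local vertex the incoming edges whose weight exceeds a threshold. It assumes an existing handle, graph_view, and edge weight view, an int32_t/float edge count/weight type, and the pre-defined plus reducer from reduce_op.cuh; only the argument order and the quinary e_op/pred_op shapes are taken from this hunk:

  using edge_t   = int32_t;  // assumed; use the graph's edge_type in real code
  using weight_t = float;    // assumed edge weight type
  weight_t constexpr threshold = 0.5;

  rmm::device_uvector<edge_t> heavy_in_degrees(graph_view.local_vertex_partition_range_size(),
                                               handle.get_stream());
  cugraph::per_v_transform_reduce_if_incoming_e(
    handle,
    graph_view,
    cugraph::edge_src_dummy_property_t{}.view(),  // source properties are not used
    cugraph::edge_dst_dummy_property_t{}.view(),  // destination properties are not used
    edge_weight_view,                             // assumed edge_property_t<...>::view() holding weights
    [] __device__(auto, auto, auto, auto, auto) { return edge_t{1}; },                 // e_op
    edge_t{0},                                                                         // init
    cugraph::reduce_op::plus<edge_t>{},           // assumed pre-defined reducer
    [threshold] __device__(auto, auto, auto, auto, auto w) { return w > threshold; },  // pred_op
    heavy_in_degrees.begin());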
+ * @param vertex_value_output_first Iterator pointing to the vertex property variables for the first + * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last` + * (exclusive) is deduced as @p vertex_value_output_first + @p + * graph_view.local_vertex_partition_range_size(). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void per_v_transform_reduce_if_incoming_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = true; + + detail::per_v_transform_reduce_e(handle, + graph_view, + static_cast(nullptr), + static_cast(nullptr), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + pred_op, + vertex_value_output_first); +} + +/** + * @brief For each (tagged-)vertex in the input (tagged-)vertex list, iterate over the incoming + * edges to update (tagged-)vertex properties. + * + * This function is inspired by thrust::transform_reduce(). In addition, this function excludes the + * edges that return false when the predicate @p pred_op is applied. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam PredOp Type of the quinary predicate operator. + * @tparam T Type of the initial value for per-vertex reduction. + * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to update + * (tagged-)vertex properties. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. 
+ * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator takes edge source, edge destination, property values for the source, + * destination, and edge and returns a value to be reduced. + * @param init Initial value to be reduced with the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 incoming edges. + * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. + * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @param pred_op Quinary operator takes edge source, edge destination, property values for the + * source, destination, and edge and returns whether this edge should be included (if true is + * returned) or excluded. + * @param vertex_value_output_first Iterator pointing to the (tagged-)vertex property variables for + * the first (inclusive) (tagged-)vertex in @p key_list. `vertex_value_output_last` (exclusive) is + * deduced as @p vertex_value_output_first + @p key_list.size(). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void per_v_transform_reduce_if_incoming_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& key_list, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + static_assert(GraphViewType::is_storage_transposed); + + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = true; + + detail::per_v_transform_reduce_e(handle, + graph_view, + key_list.begin(), + key_list.end(), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + pred_op, + vertex_value_output_first); +} + +/** + * @brief Iterate over every vertex's outgoing edges to update vertex properties. + * + * This function is inspired by thrust::transform_reduce(). In addition, this function excludes the + * edges that return false when the predicate @p pred_op is applied. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam PredOp Type of the quinary predicate operator. 
+ * @tparam T Type of the initial value for per-vertex reduction.
+ * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Non-owning graph object.
+ * @param edge_src_value_input Wrapper used to access source input property values (for the edge
+ * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view()
+ * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view()
+ * (if @p e_op does not access source property values). Use update_edge_src_property to fill the
+ * wrapper.
+ * @param edge_dst_value_input Wrapper used to access destination input property values (for the
+ * edge destinations assigned to this process in multi-GPU). Use either
+ * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or
+ * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property
+ * values). Use update_edge_dst_property to fill the wrapper.
+ * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned
+ * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to
+ * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not
+ * access edge property values).
+ * @param e_op Quinary operator takes edge source, edge destination, property values for the source,
+ * destination, and edge and returns a value to be reduced.
+ * @param init Initial value to be added to the reduced @p e_op return values for each vertex.
+ * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the
+ * (tagged-)vertices with 0 outgoing edges.
+ * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one.
+ * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is
+ * recommended to use the pre-defined reduction operators whenever possible as the current (and
+ * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has
+ * known member variables) to take a more optimized code path. See the documentation in the
+ * reduce_op.cuh file for instructions on writing custom reduction operators.
+ * @param pred_op Quinary operator takes edge source, edge destination, property values for the
+ * source, destination, and edge and returns whether this edge should be included (if true is
+ * returned) or excluded.
+ * @param vertex_value_output_first Iterator pointing to the vertex property variables for the
+ * first (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last`
+ * (exclusive) is deduced as @p vertex_value_output_first + @p
+ * graph_view.local_vertex_partition_range_size().
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ */ +template +void per_v_transform_reduce_if_outgoing_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = false; + + detail::per_v_transform_reduce_e(handle, + graph_view, + static_cast(nullptr), + static_cast(nullptr), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + pred_op, + vertex_value_output_first); +} + +/** + * @brief For each (tagged-)vertex in the input (tagged-)vertex list, iterate over the outgoing + * edges to update (tagged-)vertex properties. + * + * This function is inspired by thrust::transform_reduce(). In addition, this function excludes the + * edges that return false when the predicate @p pred_op is applied. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam PredOp Type of the quinary predicate operator. + * @tparam T Type of the initial value for per-vertex reduction. + * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to update + * (tagged-)vertex properties. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. + * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator takes edge source, edge destination, property values for the source, + * destination, and edge and returns a value to be reduced. 
+ * @param init Initial value to be reduced with the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 outgoing edges. + * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. + * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @param pred_op Quinary operator takes edge source, edge destination, property values for the + * source, destination, and edge and returns whether this edge should be included (if true is + * returned) or excluded. + * @param vertex_value_output_first Iterator pointing to the (tagged-)vertex property variables for + * the first (inclusive) (tagged-)vertex in @p key_list. `vertex_value_output_last` (exclusive) is + * deduced as @p vertex_value_output_first + @p key_list.size(). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void per_v_transform_reduce_if_outgoing_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& key_list, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + static_assert(!GraphViewType::is_storage_transposed); + static_assert(KeyBucketType::is_sorted_unique); + + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = false; + + detail::per_v_transform_reduce_e(handle, + graph_view, + key_list.begin(), + key_list.end(), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + pred_op, + vertex_value_output_first); +} + +} // namespace cugraph diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index 027ef1f662..5ba7edec89 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -15,558 +15,165 @@ */ #pragma once -#include "detail/graph_partition_utils.cuh" -#include "prims/detail/prim_functors.cuh" -#include "prims/fill_edge_src_dst_property.cuh" -#include "prims/property_op_utils.cuh" -#include "prims/reduce_op.cuh" +#include "prims/detail/per_v_transform_reduce_e.cuh" +#include "prims/vertex_frontier.cuh" -#include -#include -#include #include #include -#include -#include -#include #include -#include -#include #include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include #include namespace cugraph { -namespace detail { - -int32_t constexpr per_v_transform_reduce_e_kernel_block_size = 512; - -template -struct transform_and_atomic_reduce_t { - edge_partition_device_view_t const& edge_partition{}; - result_t identity_element{}; - vertex_t const* indices{nullptr}; - 
TransformOp const& transform_op{}; - ResultValueOutputIteratorOrWrapper& result_value_output{}; - - __device__ void operator()(edge_t i) const - { - auto e_op_result = transform_op(i); - if (e_op_result != identity_element) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - if constexpr (multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } -}; - -template -__device__ void update_result_value_output( - edge_partition_device_view_t const& edge_partition, - vertex_t const* indices, - edge_t local_degree, - TransformOp const& transform_op, - result_t init, - ReduceOp const& reduce_op, - size_t output_idx /* relevent only when update_major === true */, - result_t identity_element, - ResultValueOutputIteratorOrWrapper& result_value_output) -{ - if constexpr (update_major) { - *(result_value_output + output_idx) = - thrust::transform_reduce(thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - transform_op, - init, - reduce_op); - } else { - thrust::for_each( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - transform_and_atomic_reduce_t{ - edge_partition, identity_element, indices, transform_op, result_value_output}); - } -} - -template -__global__ static void per_v_transform_reduce_e_hypersparse( - edge_partition_device_view_t edge_partition, - EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, - EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, - ResultValueOutputIteratorOrWrapper result_value_output, - EdgeOp e_op, - T init /* relevant only if update_major == true */, - T identity_element /* relevant only if update_major == true */, - ReduceOp reduce_op) -{ - static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< - ReduceOp>); // atomic_reduce is defined only when - // has_compatible_raft_comms_op_t is true - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - auto major_start_offset = static_cast(*(edge_partition.major_hypersparse_first()) - - edge_partition.major_range_first()); - auto idx = static_cast(tid); - - auto dcs_nzd_vertex_count = *(edge_partition.dcs_nzd_vertex_count()); - - while (idx < static_cast(dcs_nzd_vertex_count)) { - auto major = - *(edge_partition.major_from_major_hypersparse_idx_nocheck(static_cast(idx))); - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - auto major_idx = - major_start_offset + idx; // major_offset != major_idx in the hypersparse region - vertex_t const* indices{nullptr}; - edge_t edge_offset{}; - edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = - edge_partition.local_edges(static_cast(major_idx)); - - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - major, - major_offset, - indices, - edge_offset}; - - if (edge_partition_e_mask) { - auto transform_op = - [&edge_partition_e_mask, &call_e_op, identity_element, edge_offset] __device__(auto i) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - return call_e_op(i); - } 
else { - return identity_element; - } - }; - - update_result_value_output(edge_partition, - indices, - local_degree, - transform_op, - init, - reduce_op, - major - *(edge_partition).major_hypersparse_first(), - identity_element, - result_value_output); - } else { - update_result_value_output(edge_partition, - indices, - local_degree, - call_e_op, - init, - reduce_op, - major - *(edge_partition).major_hypersparse_first(), - identity_element, - result_value_output); - } - idx += gridDim.x * blockDim.x; - } -} - -template -__global__ static void per_v_transform_reduce_e_low_degree( - edge_partition_device_view_t edge_partition, - typename GraphViewType::vertex_type major_range_first, - typename GraphViewType::vertex_type major_range_last, - EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, - EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, - ResultValueOutputIteratorOrWrapper result_value_output, - EdgeOp e_op, - T init /* relevant only if update_major == true */, - T identity_element /* relevant only if update_major == true */, - ReduceOp reduce_op) -{ - static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< - ReduceOp>); // atomic_reduce is defined only when - // has_compatible_raft_comms_op_t is true - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - auto major_start_offset = - static_cast(major_range_first - edge_partition.major_range_first()); - auto idx = static_cast(tid); - - while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = static_cast(major_start_offset + idx); - auto major = edge_partition.major_from_major_offset_nocheck(major_offset); - vertex_t const* indices{nullptr}; - edge_t edge_offset{}; - edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = - edge_partition.local_edges(static_cast(major_offset)); - - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - major, - major_offset, - indices, - edge_offset}; - - if (edge_partition_e_mask) { - auto transform_op = - [&edge_partition_e_mask, &call_e_op, identity_element, edge_offset] __device__(auto i) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - return call_e_op(i); - } else { - return identity_element; - } - }; - - update_result_value_output(edge_partition, - indices, - local_degree, - transform_op, - init, - reduce_op, - idx, - identity_element, - result_value_output); - } else { - update_result_value_output(edge_partition, - indices, - local_degree, - call_e_op, - init, - reduce_op, - idx, - identity_element, - result_value_output); - } - idx += gridDim.x * blockDim.x; - } -} - -template -__global__ static void per_v_transform_reduce_e_mid_degree( - edge_partition_device_view_t edge_partition, - typename GraphViewType::vertex_type major_range_first, - typename GraphViewType::vertex_type major_range_last, - EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, - EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, - ResultValueOutputIteratorOrWrapper result_value_output, - EdgeOp e_op, - T init /* relevant only if update_major == true */, - T 
identity_element /* relevant only if update_major == true */, - ReduceOp reduce_op) + typename T, + typename VertexValueOutputIterator> +void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) { - static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< - ReduceOp>); // atomic_reduce is defined only when - // has_compatible_raft_comms_op_t is true - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using e_op_result_t = T; - - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - static_assert(per_v_transform_reduce_e_kernel_block_size % raft::warp_size() == 0); - auto const lane_id = tid % raft::warp_size(); - auto major_start_offset = - static_cast(major_range_first - edge_partition.major_range_first()); - auto idx = static_cast(tid / raft::warp_size()); - - using WarpReduce = cub::WarpReduce; - [[maybe_unused]] __shared__ typename WarpReduce::TempStorage - temp_storage[per_v_transform_reduce_e_kernel_block_size / - raft::warp_size()]; // relevant only if update_major == true - - while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = static_cast(major_start_offset + idx); - auto major = edge_partition.major_from_major_offset_nocheck(major_offset); - vertex_t const* indices{nullptr}; - edge_t edge_offset{}; - edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); - - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - major, - major_offset, - indices, - edge_offset}; - - [[maybe_unused]] auto reduced_e_op_result = - lane_id == 0 ? 
init : identity_element; // relevant only if update_major == true - if (edge_partition_e_mask) { - for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } - } - } else { - for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } - } - - if constexpr (update_major) { - reduced_e_op_result = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) - .Reduce(reduced_e_op_result, reduce_op); - if (lane_id == 0) { *(result_value_output + idx) = reduced_e_op_result; } - } - - idx += gridDim.x * (blockDim.x / raft::warp_size()); + if (do_expensive_check) { + // currently, nothing to do } -} - -template -__global__ static void per_v_transform_reduce_e_high_degree( - edge_partition_device_view_t edge_partition, - typename GraphViewType::vertex_type major_range_first, - typename GraphViewType::vertex_type major_range_last, - EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, - EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, - ResultValueOutputIteratorOrWrapper result_value_output, - EdgeOp e_op, - T init /* relevant only if update_major == true */, - T identity_element /* relevant only if update_major == true */, - ReduceOp reduce_op) -{ - static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< - ReduceOp>); // atomic_reduce is defined only when - // has_compatible_raft_comms_op_t is true - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using e_op_result_t = T; - auto major_start_offset = - static_cast(major_range_first - edge_partition.major_range_first()); - auto idx = static_cast(blockIdx.x); - - using BlockReduce = cub::BlockReduce; - [[maybe_unused]] __shared__ - typename BlockReduce::TempStorage temp_storage; // relevant only if update_major == true - - while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = static_cast(major_start_offset + idx); - auto major = edge_partition.major_from_major_offset_nocheck(major_offset); - vertex_t const* indices{nullptr}; - edge_t edge_offset{}; - edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); - - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - major, - major_offset, - indices, - edge_offset}; - - [[maybe_unused]] auto reduced_e_op_result = - threadIdx.x == 0 ? 
init : identity_element; // relevant only if update_major == true - if (edge_partition_e_mask) { - for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } - } - } else { - for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } - } - - if constexpr (update_major) { - reduced_e_op_result = BlockReduce(temp_storage).Reduce(reduced_e_op_result, reduce_op); - if (threadIdx.x == 0) { *(result_value_output + idx) = reduced_e_op_result; } - } - - idx += gridDim.x; - } + constexpr bool incoming = true; + + detail::per_v_transform_reduce_e( + handle, + graph_view, + static_cast(nullptr), + static_cast(nullptr), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + detail::const_true_e_op_t{}, + vertex_value_output_first); } -template -void per_v_transform_reduce_e(raft::handle_t const& handle, - GraphViewType const& graph_view, - EdgeSrcValueInputWrapper edge_src_value_input, - EdgeDstValueInputWrapper edge_dst_value_input, - EdgeValueInputWrapper edge_value_input, - EdgeOp e_op, - T init, - ReduceOp reduce_op, - VertexValueOutputIterator vertex_value_output_first) +void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& key_list, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) { - static_assert(ReduceOp::pure_function && reduce_op::has_compatible_raft_comms_op_v && - reduce_op::has_identity_element_v); // current restriction, to support - // general reduction, we may need to - // take a less efficient code path - - constexpr auto update_major = (incoming == GraphViewType::is_storage_transposed); - [[maybe_unused]] constexpr auto max_segments = - detail::num_sparse_segments_per_vertex_partition + size_t{1}; - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; + static_assert(GraphViewType::is_storage_transposed); - using edge_partition_src_input_device_view_t = std::conditional_t< - std::is_same_v, - detail::edge_partition_endpoint_dummy_property_device_view_t, - detail::edge_partition_endpoint_property_device_view_t< - vertex_t, - typename EdgeSrcValueInputWrapper::value_iterator, - typename EdgeSrcValueInputWrapper::value_type>>; - using edge_partition_dst_input_device_view_t = std::conditional_t< - std::is_same_v, - 
detail::edge_partition_endpoint_dummy_property_device_view_t, - detail::edge_partition_endpoint_property_device_view_t< - vertex_t, - typename EdgeDstValueInputWrapper::value_iterator, - typename EdgeDstValueInputWrapper::value_type>>; - using edge_partition_e_input_device_view_t = std::conditional_t< - std::is_same_v, - detail::edge_partition_edge_dummy_property_device_view_t, - detail::edge_partition_edge_property_device_view_t< - edge_t, - typename EdgeValueInputWrapper::value_iterator, - typename EdgeValueInputWrapper::value_type>>; - - static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - - using minor_tmp_buffer_type = std::conditional_t, - edge_dst_property_t>; - [[maybe_unused]] std::unique_ptr minor_tmp_buffer{}; - if constexpr (GraphViewType::is_multi_gpu && !update_major) { - minor_tmp_buffer = std::make_unique(handle, graph_view); - } - - using edge_partition_minor_output_device_view_t = - std::conditional_tmutable_view().value_first())>, - void /* dummy */>; - - if constexpr (update_major) { - size_t partition_idx = 0; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - partition_idx = static_cast(minor_comm_rank); - } - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); - if (segment_offsets) { // no vertices in the zero degree segment are visited - thrust::fill(handle.get_thrust_policy(), - vertex_value_output_first + *((*segment_offsets).rbegin() + 1), - vertex_value_output_first + *((*segment_offsets).rbegin()), - init); - } - } else { - if constexpr (GraphViewType::is_multi_gpu) { - auto minor_init = init; - auto view = minor_tmp_buffer->view(); - if (view.keys()) { // defer applying the initial value to the end as minor_tmp_buffer may not - // store values for the entire minor range - minor_init = ReduceOp::identity_element; - } else { - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_rank = major_comm.get_rank(); - minor_init = (major_comm_rank == 0) ? init : ReduceOp::identity_element; - } - fill_edge_minor_property(handle, graph_view, minor_tmp_buffer->mutable_view(), minor_init); - } else { - thrust::fill(handle.get_thrust_policy(), - vertex_value_output_first, - vertex_value_output_first + graph_view.local_vertex_partition_range_size(), - init); - } - } - - std::optional> stream_pool_indices{std::nullopt}; - if constexpr (GraphViewType::is_multi_gpu) { - if ((graph_view.local_edge_partition_segment_offsets(0)) && - (handle.get_stream_pool_size() >= max_segments)) { - for (size_t i = 1; i < graph_view.number_of_local_edge_partitions(); ++i) { - assert(graph_view.local_edge_partition_segment_offsets(i)); - } - - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - - // memory footprint vs parallelism trade-off - // peak memory requirement per loop is - // update_major ? 
V / comm_size * sizeof(T) : 0 - // and limit memory requirement to (E / comm_size) * sizeof(vertex_t) - - size_t num_streams = - std::min(static_cast(minor_comm_size) * max_segments, - raft::round_down_safe(handle.get_stream_pool_size(), max_segments)); - if constexpr (update_major) { - size_t value_size{0}; - if constexpr (is_thrust_tuple_of_arithmetic::value) { - auto elem_sizes = compute_thrust_tuple_element_sizes{}(); - value_size = std::reduce(elem_sizes.begin(), elem_sizes.end()); - } else { - value_size = sizeof(T); - } - - auto avg_vertex_degree = - graph_view.number_of_vertices() > 0 - ? (static_cast(graph_view.compute_number_of_edges(handle)) / - static_cast(graph_view.number_of_vertices())) - : double{0.0}; - - num_streams = - std::min(static_cast(avg_vertex_degree * (static_cast(sizeof(vertex_t)) / - static_cast(value_size))) * - max_segments, - num_streams); - } - - if (num_streams >= max_segments) { - stream_pool_indices = std::vector(num_streams); - std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); - handle.sync_stream(); - } - } - } - - std::vector(0, rmm::cuda_stream_view{}))> - major_tmp_buffers{}; - if constexpr (GraphViewType::is_multi_gpu && update_major) { - std::vector major_tmp_buffer_sizes(graph_view.number_of_local_edge_partitions(), - size_t{0}); - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - if (segment_offsets) { - major_tmp_buffer_sizes[i] = - *((*segment_offsets).rbegin() + 1); // exclude the zero degree segment - } else { - if constexpr (GraphViewType::is_storage_transposed) { - major_tmp_buffer_sizes[i] = graph_view.local_edge_partition_dst_range_size(i); - } else { - major_tmp_buffer_sizes[i] = graph_view.local_edge_partition_src_range_size(i); - } - } - } - if (stream_pool_indices) { - auto num_concurrent_loops = (*stream_pool_indices).size() / max_segments; - major_tmp_buffers.reserve(num_concurrent_loops); - for (size_t i = 0; i < num_concurrent_loops; ++i) { - size_t max_size{0}; - for (size_t j = i; j < graph_view.number_of_local_edge_partitions(); - j += num_concurrent_loops) { - max_size = std::max(major_tmp_buffer_sizes[j], max_size); - } - major_tmp_buffers.push_back(allocate_dataframe_buffer(max_size, handle.get_stream())); - } - } else { - major_tmp_buffers.reserve(1); - major_tmp_buffers.push_back(allocate_dataframe_buffer( - *std::max_element(major_tmp_buffer_sizes.begin(), major_tmp_buffer_sizes.end()), - handle.get_stream())); - } - } else { // dummy - major_tmp_buffers.reserve(1); - major_tmp_buffers.push_back(allocate_dataframe_buffer(size_t{0}, handle.get_stream())); - } - - if (stream_pool_indices) { handle.sync_stream(); } - - auto edge_mask_view = graph_view.edge_mask_view(); - - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(i)); - auto edge_partition_e_mask = - edge_mask_view - ? thrust::make_optional< - detail::edge_partition_edge_property_device_view_t>( - *edge_mask_view, i) - : thrust::nullopt; - - auto major_init = ReduceOp::identity_element; - if constexpr (update_major) { - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - major_init = (static_cast(i) == minor_comm_rank) ? 
init : ReduceOp::identity_element; - } else { - major_init = init; - } - } - - edge_partition_src_input_device_view_t edge_partition_src_value_input{}; - edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; - if constexpr (GraphViewType::is_storage_transposed) { - edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input); - edge_partition_dst_value_input = - edge_partition_dst_input_device_view_t(edge_dst_value_input, i); - } else { - edge_partition_src_value_input = - edge_partition_src_input_device_view_t(edge_src_value_input, i); - edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input); - } - auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, i); - - auto major_buffer_first = - get_dataframe_buffer_begin(major_tmp_buffers[i % major_tmp_buffers.size()]); - - std::conditional_t, - VertexValueOutputIterator> - output_buffer{}; - if constexpr (GraphViewType::is_multi_gpu) { - if constexpr (update_major) { - output_buffer = major_buffer_first; - } else { - output_buffer = edge_partition_minor_output_device_view_t(minor_tmp_buffer->mutable_view()); - } - } else { - output_buffer = vertex_value_output_first; - } - - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - if (segment_offsets) { - static_assert(detail::num_sparse_segments_per_vertex_partition == 3); - - // FIXME: we may further improve performance by 1) individually tuning block sizes for - // different segments; and 2) adding one more segment for very high degree vertices and - // running segmented reduction - if (edge_partition.dcs_nzd_vertex_count()) { - auto exec_stream = - stream_pool_indices - ? handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) - : handle.get_stream(); - - if constexpr (update_major) { // this is necessary as we don't visit every vertex in the - // hypersparse segment - thrust::fill(rmm::exec_policy(exec_stream), - output_buffer + (*segment_offsets)[3], - output_buffer + (*segment_offsets)[4], - major_init); - } - - if (*(edge_partition.dcs_nzd_vertex_count()) > 0) { - raft::grid_1d_thread_t update_grid(*(edge_partition.dcs_nzd_vertex_count()), - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[3]; } - detail::per_v_transform_reduce_e_hypersparse - <<>>( - edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - segment_output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op); - } - } - if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { - auto exec_stream = stream_pool_indices - ? 
handle.get_stream_from_stream_pool((i * max_segments + 1) % - (*stream_pool_indices).size()) - : handle.get_stream(); - raft::grid_1d_thread_t update_grid((*segment_offsets)[3] - (*segment_offsets)[2], - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[2]; } - detail::per_v_transform_reduce_e_low_degree - <<>>( - edge_partition, - edge_partition.major_range_first() + (*segment_offsets)[2], - edge_partition.major_range_first() + (*segment_offsets)[3], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - segment_output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op); - } - if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { - auto exec_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((i * max_segments + 2) % - (*stream_pool_indices).size()) - : handle.get_stream(); - raft::grid_1d_warp_t update_grid((*segment_offsets)[2] - (*segment_offsets)[1], - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[1]; } - detail::per_v_transform_reduce_e_mid_degree - <<>>( - edge_partition, - edge_partition.major_range_first() + (*segment_offsets)[1], - edge_partition.major_range_first() + (*segment_offsets)[2], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - segment_output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op); - } - if ((*segment_offsets)[1] > 0) { - auto exec_stream = stream_pool_indices - ? 
handle.get_stream_from_stream_pool((i * max_segments + 3) % - (*stream_pool_indices).size()) - : handle.get_stream(); - raft::grid_1d_block_t update_grid((*segment_offsets)[1], - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - detail::per_v_transform_reduce_e_high_degree - <<>>( - edge_partition, - edge_partition.major_range_first(), - edge_partition.major_range_first() + (*segment_offsets)[1], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op); - } - } else { - if (edge_partition.major_range_size() > 0) { - raft::grid_1d_thread_t update_grid(edge_partition.major_range_size(), - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - detail::per_v_transform_reduce_e_low_degree - <<>>( - edge_partition, - edge_partition.major_range_first(), - edge_partition.major_range_last(), - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op); - } - } - - if constexpr (GraphViewType::is_multi_gpu && update_major) { - auto& comm = handle.get_comms(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); - - if (segment_offsets && stream_pool_indices) { - if (edge_partition.dcs_nzd_vertex_count()) { - device_reduce( - minor_comm, - major_buffer_first + (*segment_offsets)[3], - vertex_value_output_first + (*segment_offsets)[3], - (*segment_offsets)[4] - (*segment_offsets)[3], - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size())); - } - if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { - device_reduce(minor_comm, - major_buffer_first + (*segment_offsets)[2], - vertex_value_output_first + (*segment_offsets)[2], - (*segment_offsets)[3] - (*segment_offsets)[2], - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream_from_stream_pool((i * max_segments + 1) % - (*stream_pool_indices).size())); - } - if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { - device_reduce(minor_comm, - major_buffer_first + (*segment_offsets)[1], - vertex_value_output_first + (*segment_offsets)[1], - (*segment_offsets)[2] - (*segment_offsets)[1], - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream_from_stream_pool((i * max_segments + 2) % - (*stream_pool_indices).size())); - } - if ((*segment_offsets)[1] > 0) { - device_reduce(minor_comm, - major_buffer_first, - vertex_value_output_first, - (*segment_offsets)[1], - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream_from_stream_pool((i * max_segments + 3) % - (*stream_pool_indices).size())); - } - } else { - size_t reduction_size = static_cast( - segment_offsets ? 
*((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ - : edge_partition.major_range_size()); - device_reduce(minor_comm, - major_buffer_first, - vertex_value_output_first, - reduction_size, - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream()); - } - } - - if (stream_pool_indices && ((i + 1) % major_tmp_buffers.size() == 0)) { - handle.sync_stream_pool( - *stream_pool_indices); // to prevent buffer over-write (this can happen as *segment_offsets - // do not necessarily coincide in different edge partitions). - } + if (do_expensive_check) { + // currently, nothing to do } - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } - - if constexpr (GraphViewType::is_multi_gpu && !update_major) { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_rank = major_comm.get_rank(); - auto const major_comm_size = major_comm.get_size(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); - - auto view = minor_tmp_buffer->view(); - if (view.keys()) { // applying the initial value is deferred to here - vertex_t max_vertex_partition_size{0}; - for (int i = 0; i < major_comm_size; ++i) { - auto this_segment_vertex_partition_id = - compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - max_vertex_partition_size = - std::max(max_vertex_partition_size, - graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)); - } - auto tx_buffer = allocate_dataframe_buffer(max_vertex_partition_size, handle.get_stream()); - auto tx_buffer_first = get_dataframe_buffer_begin(tx_buffer); - std::optional> minor_key_offsets{}; - if constexpr (GraphViewType::is_storage_transposed) { - minor_key_offsets = graph_view.local_sorted_unique_edge_src_vertex_partition_offsets(); - } else { - minor_key_offsets = graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets(); - } - for (int i = 0; i < major_comm_size; ++i) { - auto minor_init = (major_comm_rank == i) ? 
init : ReduceOp::identity_element; - auto this_segment_vertex_partition_id = - compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - thrust::fill(handle.get_thrust_policy(), - tx_buffer_first, - tx_buffer_first + - graph_view.vertex_partition_range_size(this_segment_vertex_partition_id), - minor_init); - auto value_first = thrust::make_transform_iterator( - view.value_first(), - cuda::proclaim_return_type( - [reduce_op, minor_init] __device__(auto val) { return reduce_op(val, minor_init); })); - thrust::scatter(handle.get_thrust_policy(), - value_first + (*minor_key_offsets)[i], - value_first + (*minor_key_offsets)[i + 1], - thrust::make_transform_iterator( - (*(view.keys())).begin() + (*minor_key_offsets)[i], - cuda::proclaim_return_type( - [key_first = graph_view.vertex_partition_range_first( - this_segment_vertex_partition_id)] __device__(auto key) { - return key - key_first; - })), - tx_buffer_first); - device_reduce(major_comm, - tx_buffer_first, - vertex_value_output_first, - static_cast( - graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)), - ReduceOp::compatible_raft_comms_op, - i, - handle.get_stream()); - } - } else { - auto first_segment_vertex_partition_id = - compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(0); - vertex_t minor_range_first = - graph_view.vertex_partition_range_first(first_segment_vertex_partition_id); - for (int i = 0; i < major_comm_size; ++i) { - auto this_segment_vertex_partition_id = - compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - auto offset = graph_view.vertex_partition_range_first(this_segment_vertex_partition_id) - - minor_range_first; - device_reduce(major_comm, - view.value_first() + offset, - vertex_value_output_first, - static_cast( - graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)), - ReduceOp::compatible_raft_comms_op, - i, - handle.get_stream()); - } - } - } + constexpr bool incoming = true; + + detail::per_v_transform_reduce_e( + handle, + graph_view, + key_list.begin(), + key_list.end(), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + detail::const_true_e_op_t{}, + vertex_value_output_first); } -} // namespace detail - /** - * @brief Iterate over every vertex's incoming edges to update vertex properties. + * @brief Iterate over every vertex's outgoing edges to update vertex properties. * - * This function is inspired by thrust::transform_reduce. + * This function is inspired by thrust::transform_reduce(). * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. @@ -1131,8 +240,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, * @param edge_src_value_input Wrapper used to access source input property values (for the edge * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() - * (if @p e_op does not access source property values). Use update_edge_src_property to - * fill the wrapper. + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. 
* @param edge_dst_value_input Wrapper used to access destination input property values (for the * edge destinations assigned to this process in multi-GPU). Use either * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or @@ -1145,14 +254,16 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, * @param e_op Quinary operator takes edge source, edge destination, property values for the source, * destination, and edge and returns a value to be reduced. * @param init Initial value to be added to the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 outgoing edges. * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is * recommended to use the pre-defined reduction operators whenever possible as the current (and * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has * known member variables) to take a more optimized code path. See the documentation in the * reduce_op.cuh file for instructions on writing custom reduction operators. - * @param vertex_value_output_first Iterator pointing to the vertex property variables for the first - * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last` + * @param vertex_value_output_first Iterator pointing to the vertex property variables for the + * first (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last` * (exclusive) is deduced as @p vertex_value_output_first + @p * graph_view.local_vertex_partition_range_size(). * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). @@ -1165,7 +276,7 @@ template -void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, +void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle, GraphViewType const& graph_view, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, @@ -1180,23 +291,37 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, // currently, nothing to do } - detail::per_v_transform_reduce_e(handle, - graph_view, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - init, - reduce_op, - vertex_value_output_first); + constexpr bool incoming = false; + + detail::per_v_transform_reduce_e( + handle, + graph_view, + static_cast(nullptr), + static_cast(nullptr), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + detail::const_true_e_op_t{}, + vertex_value_output_first); } /** - * @brief Iterate over every vertex's outgoing edges to update vertex properties. + * @brief For each (tagged-)vertex in the input (tagged-)vertex list, iterate over the outgoing + * edges to update (tagged-)vertex properties. * * This function is inspired by thrust::transform_reduce(). * * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. 
@@ -1207,6 +332,8 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to update + * (tagged-)vertex properties. * @param edge_src_value_input Wrapper used to access source input property values (for the edge * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() @@ -1223,20 +350,22 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, * access edge property values). * @param e_op Quinary operator takes edge source, edge destination, property values for the source, * destination, and edge and returns a value to be reduced. - * @param init Initial value to be added to the reduced @p e_op return values for each vertex. + * @param init Initial value to be reduced with the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 outgoing edges. * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is * recommended to use the pre-defined reduction operators whenever possible as the current (and * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has * known member variables) to take a more optimized code path. See the documentation in the * reduce_op.cuh file for instructions on writing custom reduction operators. - * @param vertex_value_output_first Iterator pointing to the vertex property variables for the - * first (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last` - * (exclusive) is deduced as @p vertex_value_output_first + @p - * graph_view.local_vertex_partition_range_size(). + * @param vertex_value_output_first Iterator pointing to the (tagged-)vertex property variables for + * the first (inclusive) (tagged-)vertex in @p key_list. `vertex_value_output_last` (exclusive) is + * deduced as @p vertex_value_output_first + @p key_list.size(). * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). 
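 *
 * As an informal illustration only (not part of this header): the sketch below computes, for
 * each (tagged-)vertex in a frontier bucket, the sum of its outgoing edge weights.
 * `frontier_bucket`, `edge_weight_view`, `weight_t`, and `out_weight_sums` are hypothetical
 * placeholder names used only for this sketch.
 *
 *   per_v_transform_reduce_outgoing_e(
 *     handle,
 *     graph_view,
 *     frontier_bucket,  // KeyBucketType object holding the (tagged-)vertex list
 *     cugraph::edge_src_dummy_property_t{}.view(),
 *     cugraph::edge_dst_dummy_property_t{}.view(),
 *     edge_weight_view,
 *     [] __device__(auto src, auto dst, auto, auto, weight_t w) { return w; },
 *     weight_t{0},  // init; returned as-is for vertices with no outgoing edges
 *     cugraph::reduce_op::plus<weight_t>{},
 *     out_weight_sums.begin());  // one output value per element of frontier_bucket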
*/ template void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle, GraphViewType const& graph_view, + KeyBucketType const& key_list, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, @@ -1255,19 +385,33 @@ void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle, VertexValueOutputIterator vertex_value_output_first, bool do_expensive_check = false) { + static_assert(!GraphViewType::is_storage_transposed); + static_assert(KeyBucketType::is_sorted_unique); + if (do_expensive_check) { // currently, nothing to do } - detail::per_v_transform_reduce_e(handle, - graph_view, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - init, - reduce_op, - vertex_value_output_first); + constexpr bool incoming = false; + + detail::per_v_transform_reduce_e( + handle, + graph_view, + key_list.begin(), + key_list.end(), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + detail::const_true_e_op_t{}, + vertex_value_output_first); } } // namespace cugraph diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh new file mode 100644 index 0000000000..87f590f571 --- /dev/null +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh @@ -0,0 +1,1196 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "detail/graph_partition_utils.cuh" +#include "prims/detail/extract_transform_v_frontier_e.cuh" +#include "prims/detail/prim_utils.cuh" +#include "prims/property_op_utils.cuh" +#include "prims/reduce_op.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace cugraph { + +namespace detail { + +int32_t constexpr update_v_frontier_from_outgoing_e_kernel_block_size = 512; + +template +struct transform_reduce_v_frontier_call_e_op_t { + EdgeOp e_op{}; + + __device__ thrust::optional< + std::conditional_t && !std::is_same_v, + thrust::tuple, + std::conditional_t, key_t, payload_t>>> + operator()(key_t key, vertex_t dst, src_value_t sv, dst_value_t dv, e_value_t ev) const + { + auto e_op_result = e_op(key, dst, sv, dv, ev); + if (e_op_result.has_value()) { + auto reduce_by = dst; + if constexpr (std::is_same_v && std::is_same_v) { + return reduce_by; + } else if constexpr (std::is_same_v && !std::is_same_v) { + return thrust::make_tuple(reduce_by, *e_op_result); + } else if constexpr (!std::is_same_v && std::is_same_v) { + return thrust::make_tuple(reduce_by, *e_op_result); + } else { + return thrust::make_tuple(thrust::make_tuple(reduce_by, thrust::get<0>(*e_op_result)), + thrust::get<1>(*e_op_result)); + } + } else { + return thrust::nullopt; + } + } +}; + +template +struct update_keep_flag_t { + using input_key_t = + typename thrust::iterator_traits::value_type; // uint32_t (compressed) or + // key_t (i.e. 
vertex_t) + + raft::device_span bitmap{}; + raft::device_span keep_flags{}; + key_t v_range_first{}; + InputKeyIterator input_key_first{}; + thrust::optional invalid_input_key{}; + + __device__ void operator()(size_t i) const + { + auto v = *(input_key_first + i); + if (invalid_input_key && (v == *invalid_input_key)) { + return; // just discard + } + input_key_t v_offset{}; + if constexpr ((sizeof(key_t) == 8) && std::is_same_v) { + v_offset = v; + } else { + v_offset = v - v_range_first; + } + cuda::atomic_ref bitmap_word( + bitmap[packed_bool_offset(v_offset)]); + auto old = bitmap_word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + if ((old & packed_bool_mask(v_offset)) == packed_bool_empty_mask()) { + cuda::atomic_ref keep_flag_word( + keep_flags[packed_bool_offset(i)]); + keep_flag_word.fetch_or(packed_bool_mask(i), cuda::std::memory_order_relaxed); + } + } +}; + +template +std::tuple, optional_dataframe_buffer_type_t> +filter_buffer_elements( + raft::handle_t const& handle, + rmm::device_uvector&& + unique_v_buffer, // assumes that buffer elements are locally reduced first and unique + optional_dataframe_buffer_type_t&& payload_buffer, + raft::device_span vertex_partition_range_offsets, // size = major_comm_size + 1 + vertex_t allreduce_count_per_rank, + int subgroup_size) +{ + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + auto const major_comm_size = major_comm.get_size(); + + rmm::device_uvector priorities(allreduce_count_per_rank * major_comm_size, + handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + priorities.begin(), + priorities.end(), + std::numeric_limits::max()); + thrust::for_each( + handle.get_thrust_policy(), + unique_v_buffer.begin(), + unique_v_buffer.end(), + [offsets = vertex_partition_range_offsets, + priorities = raft::device_span(priorities.data(), priorities.size()), + allreduce_count_per_rank, + subgroup_size, + major_comm_rank, + major_comm_size] __device__(auto v) { + auto root = + thrust::distance(offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), v)); + auto v_offset = v - offsets[root]; + if (v_offset < allreduce_count_per_rank) { + priorities[allreduce_count_per_rank * root + v_offset] = + rank_to_priority( + major_comm_rank, root, subgroup_size, major_comm_size, v_offset); + } + }); + device_allreduce(major_comm, + priorities.data(), + priorities.data(), + priorities.size(), + raft::comms::op_t::MIN, + handle.get_stream()); + if constexpr (std::is_same_v) { + unique_v_buffer.resize( + thrust::distance( + unique_v_buffer.begin(), + thrust::remove_if( + handle.get_thrust_policy(), + unique_v_buffer.begin(), + unique_v_buffer.end(), + unique_v_buffer.begin(), + [offsets = vertex_partition_range_offsets, + priorities = raft::device_span(priorities.data(), priorities.size()), + allreduce_count_per_rank, + subgroup_size, + major_comm_rank, + major_comm_size] __device__(auto v) { + auto root = thrust::distance( + offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), v)); + auto v_offset = v - offsets[root]; + if (v_offset < allreduce_count_per_rank) { + auto selected_rank = priority_to_rank( + priorities[allreduce_count_per_rank * root + v_offset], + root, + subgroup_size, + major_comm_size, + v_offset); + return major_comm_rank != selected_rank; + } else { + return false; + } + })), + handle.get_stream()); + } else { + auto kv_pair_first = 
thrust::make_zip_iterator(unique_v_buffer.begin(), + get_dataframe_buffer_begin(payload_buffer)); + unique_v_buffer.resize( + thrust::distance( + kv_pair_first, + thrust::remove_if( + handle.get_thrust_policy(), + kv_pair_first, + kv_pair_first + unique_v_buffer.size(), + unique_v_buffer.begin(), + [offsets = vertex_partition_range_offsets, + priorities = raft::device_span(priorities.data(), priorities.size()), + allreduce_count_per_rank, + subgroup_size, + major_comm_rank, + major_comm_size] __device__(auto v) { + auto root = thrust::distance( + offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), v)); + auto v_offset = v - offsets[root]; + if (v_offset < allreduce_count_per_rank) { + auto selected_rank = priority_to_rank( + priorities[allreduce_count_per_rank * root + v_offset], + root, + subgroup_size, + major_comm_size, + v_offset); + return major_comm_rank != selected_rank; + } else { + return false; + } + })), + handle.get_stream()); + resize_dataframe_buffer(payload_buffer, unique_v_buffer.size(), handle.get_stream()); + } + + return std::make_tuple(std::move(unique_v_buffer), std::move(payload_buffer)); +} + +template +std::tuple, optional_dataframe_buffer_type_t> +sort_and_reduce_buffer_elements( + raft::handle_t const& handle, + dataframe_buffer_type_t&& key_buffer, + optional_dataframe_buffer_type_t&& payload_buffer, + ReduceOp reduce_op, + std::conditional_t, std::tuple, std::byte /* dummy */> + vertex_range, + std::optional invalid_key /* drop (key, (payload)) pairs with invalid key */) +{ + constexpr bool compressed = + std::is_integral_v && (sizeof(key_t) == 8) && + std::is_same_v; // we currently compress only when key_t is an integral + // type (i.e. vertex_t) + static_assert(compressed || std::is_same_v); + + if constexpr (std::is_integral_v && + (std::is_same_v || + std::is_same_v>)) { // try to use + // bitmap for + // filtering + key_t range_size = std::get<1>(vertex_range) - std::get<0>(vertex_range); + if (static_cast(size_dataframe_buffer(key_buffer)) >= + static_cast(range_size) * + 0.125 /* tuning parameter */) { // use bitmap for filtering + rmm::device_uvector bitmap(packed_bool_size(range_size), handle.get_stream()); + rmm::device_uvector keep_flags(packed_bool_size(size_dataframe_buffer(key_buffer)), + handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); + thrust::fill( + handle.get_thrust_policy(), keep_flags.begin(), keep_flags.end(), packed_bool_empty_mask()); + thrust::for_each(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(size_dataframe_buffer(key_buffer)), + update_keep_flag_t{ + raft::device_span(bitmap.data(), bitmap.size()), + raft::device_span(keep_flags.data(), keep_flags.size()), + std::get<0>(vertex_range), + get_dataframe_buffer_begin(key_buffer), + to_thrust_optional(invalid_key)}); + auto stencil_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [keep_flags = raft::device_span(keep_flags.data(), + keep_flags.size())] __device__(size_t i) { + return (keep_flags[packed_bool_offset(i)] & packed_bool_mask(i)) != + packed_bool_empty_mask(); + })); + if constexpr (std::is_same_v) { + resize_dataframe_buffer( + key_buffer, + thrust::distance(get_dataframe_buffer_begin(key_buffer), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + 
stencil_first, + is_not_equal_t{true})), + handle.get_stream()); + shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); + thrust::sort(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer)); + } else { + static_assert(std::is_same_v>); + auto pair_first = thrust::make_zip_iterator(get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_begin(payload_buffer)); + resize_dataframe_buffer( + key_buffer, + thrust::distance(pair_first, + thrust::remove_if(handle.get_thrust_policy(), + pair_first, + pair_first + size_dataframe_buffer(key_buffer), + stencil_first, + is_not_equal_t{true})), + handle.get_stream()); + resize_dataframe_buffer( + payload_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); + shrink_to_fit_dataframe_buffer(payload_buffer, handle.get_stream()); + thrust::sort_by_key(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + get_dataframe_buffer_begin(payload_buffer)); + } + + if constexpr (compressed) { + rmm::device_uvector output_key_buffer(key_buffer.size(), handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + key_buffer.begin(), + key_buffer.end(), + output_key_buffer.begin(), + cuda::proclaim_return_type( + [v_first = std::get<0>(vertex_range)] __device__(uint32_t v_offset) { + return static_cast(v_first + v_offset); + })); + return std::make_tuple(std::move(output_key_buffer), std::move(payload_buffer)); + } else { + return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); + } + } + } + + if constexpr (std::is_same_v) { + thrust::sort(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer)); + } else { + thrust::sort_by_key(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + get_optional_dataframe_buffer_begin(payload_buffer)); + } + + auto output_key_buffer = allocate_dataframe_buffer(0, handle.get_stream()); + if constexpr (std::is_same_v) { + if constexpr (compressed) { + resize_dataframe_buffer( + output_key_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + auto input_key_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(key_buffer), + cuda::proclaim_return_type( + [v_first = std::get<0>(vertex_range)] __device__(auto v_offset) { + return static_cast(v_first + v_offset); + })); + resize_dataframe_buffer( + output_key_buffer, + thrust::distance( + get_dataframe_buffer_begin(output_key_buffer), + thrust::copy_if(handle.get_thrust_policy(), + input_key_first, + input_key_first + size_dataframe_buffer(key_buffer), + thrust::make_counting_iterator(size_t{0}), + get_dataframe_buffer_begin(output_key_buffer), + cuda::proclaim_return_type( + [key_first = get_dataframe_buffer_begin(key_buffer), + invalid_key = to_thrust_optional(invalid_key)] __device__(size_t i) { + auto key = *(key_first + i); + if (invalid_key && (key == *invalid_key)) { + return false; + } else if ((i != 0) && (key == *(key_first + (i - 1)))) { + return false; + } else { + return true; + } + }))), + handle.get_stream()); + } else { + resize_dataframe_buffer( + key_buffer, + thrust::distance( + get_dataframe_buffer_begin(key_buffer), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + thrust::make_counting_iterator(size_t{0}), + 
cuda::proclaim_return_type( + [key_first = get_dataframe_buffer_begin(key_buffer), + invalid_key = to_thrust_optional(invalid_key)] __device__(size_t i) { + auto key = *(key_first + i); + if (invalid_key && (key == *invalid_key)) { + return true; + } else if ((i != 0) && (key == *(key_first + (i - 1)))) { + return true; + } else { + return false; + } + }))), + handle.get_stream()); + output_key_buffer = std::move(key_buffer); + } + shrink_to_fit_dataframe_buffer(output_key_buffer, handle.get_stream()); + } else if constexpr (std::is_same_v>) { + if constexpr (compressed) { + resize_dataframe_buffer( + output_key_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + auto input_key_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(key_buffer), + cuda::proclaim_return_type( + [v_first = std::get<0>(vertex_range)] __device__(auto v_offset) { + return static_cast(v_first + v_offset); + })); + auto tmp_payload_buffer = allocate_dataframe_buffer( + size_dataframe_buffer(payload_buffer), handle.get_stream()); + auto input_pair_first = + thrust::make_zip_iterator(input_key_first, get_dataframe_buffer_begin(payload_buffer)); + auto output_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(output_key_buffer), + get_dataframe_buffer_begin(tmp_payload_buffer)); + resize_dataframe_buffer( + output_key_buffer, + thrust::distance( + output_pair_first, + thrust::copy_if(handle.get_thrust_policy(), + input_pair_first, + input_pair_first + size_dataframe_buffer(key_buffer), + thrust::make_counting_iterator(size_t{0}), + output_pair_first, + cuda::proclaim_return_type( + [key_first = get_dataframe_buffer_begin(key_buffer), + invalid_key = to_thrust_optional(invalid_key)] __device__(size_t i) { + auto key = *(key_first + i); + if (invalid_key && (key == *invalid_key)) { + return false; + } else if ((i != 0) && (key == *(key_first + (i - 1)))) { + return false; + } else { + return true; + } + }))), + handle.get_stream()); + resize_dataframe_buffer( + tmp_payload_buffer, size_dataframe_buffer(output_key_buffer), handle.get_stream()); + payload_buffer = std::move(tmp_payload_buffer); + } else { + auto pair_first = thrust::make_zip_iterator(get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_begin(payload_buffer)); + resize_dataframe_buffer( + key_buffer, + thrust::distance( + pair_first, + thrust::remove_if(handle.get_thrust_policy(), + pair_first, + pair_first + size_dataframe_buffer(key_buffer), + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [key_first = get_dataframe_buffer_begin(key_buffer), + invalid_key = to_thrust_optional(invalid_key)] __device__(size_t i) { + auto key = *(key_first + i); + if (invalid_key && (key == *invalid_key)) { + return true; + } else if ((i != 0) && (key == *(key_first + (i - 1)))) { + return true; + } else { + return false; + } + }))), + handle.get_stream()); + resize_dataframe_buffer( + payload_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + output_key_buffer = std::move(key_buffer); + } + shrink_to_fit_dataframe_buffer(output_key_buffer, handle.get_stream()); + shrink_to_fit_dataframe_buffer(payload_buffer, handle.get_stream()); + } else { + if (invalid_key) { + auto pair_first = thrust::make_zip_iterator(get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_begin(payload_buffer)); + resize_dataframe_buffer( + key_buffer, + thrust::distance(pair_first, + thrust::remove_if(handle.get_thrust_policy(), + pair_first, + pair_first + size_dataframe_buffer(key_buffer), 
+ cuda::proclaim_return_type( + [invalid_key = *invalid_key] __device__(auto kv) { + auto key = thrust::get<0>(kv); + return key == invalid_key; + }))), + handle.get_stream()); + resize_dataframe_buffer( + payload_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + } + auto num_uniques = + thrust::count_if(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(size_dataframe_buffer(key_buffer)), + is_first_in_run_t{ + get_dataframe_buffer_begin(key_buffer)}); + + auto new_key_buffer = allocate_dataframe_buffer(num_uniques, handle.get_stream()); + auto new_payload_buffer = + allocate_dataframe_buffer(num_uniques, handle.get_stream()); + + if constexpr (compressed) { + auto input_key_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(key_buffer), + cuda::proclaim_return_type( + [v_first = std::get<0>(vertex_range)] __device__(auto v_offset) { + return static_cast(v_first + v_offset); + })); + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + size_dataframe_buffer(key_buffer), + get_optional_dataframe_buffer_begin(payload_buffer), + get_dataframe_buffer_begin(new_key_buffer), + get_dataframe_buffer_begin(new_payload_buffer), + thrust::equal_to(), + reduce_op); + } else { + thrust::reduce_by_key(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + get_optional_dataframe_buffer_begin(payload_buffer), + get_dataframe_buffer_begin(new_key_buffer), + get_dataframe_buffer_begin(new_payload_buffer), + thrust::equal_to(), + reduce_op); + } + + output_key_buffer = std::move(new_key_buffer); + payload_buffer = std::move(new_payload_buffer); + } + + return std::make_tuple(std::move(output_key_buffer), std::move(payload_buffer)); +} + +template +std::conditional_t< + !std::is_same_v, + std::tuple( + 0, rmm::cuda_stream_view{})), + decltype(detail::allocate_optional_dataframe_buffer( + 0, rmm::cuda_stream_view{}))>, + decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{}))> +transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& frontier, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + ReduceOp reduce_op, + bool do_expensive_check = false) +{ + static_assert(!GraphViewType::is_storage_transposed, + "GraphViewType should support the push model."); + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = typename KeyBucketType::key_type; + using payload_t = typename ReduceOp::value_type; + + if (do_expensive_check) { + // currently, nothing to do + } + + // 1. fill the buffer + + detail::transform_reduce_v_frontier_call_e_op_t + e_op_wrapper{e_op}; + + auto [key_buffer, payload_buffer] = + detail::extract_transform_v_frontier_e(handle, + graph_view, + frontier, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op_wrapper, + do_expensive_check); + // 2. 
reduce the buffer + + std::vector vertex_partition_range_offsets{}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + vertex_partition_range_offsets = std::vector(major_comm_size + 1); + for (int i = 0; i < major_comm_size; ++i) { + auto vertex_partition_id = + detail::compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + vertex_partition_range_offsets[i] = + graph_view.vertex_partition_range_first(vertex_partition_id); + } + vertex_partition_range_offsets.back() = graph_view.local_edge_partition_dst_range_last(); + } else { + vertex_partition_range_offsets = + std::vector{graph_view.local_edge_partition_dst_range_first(), + graph_view.local_edge_partition_dst_range_last()}; + } + std::conditional_t, std::tuple, std::byte /* dummy */> + vertex_range{}; + if constexpr (std::is_integral_v) { + vertex_range = std::make_tuple(vertex_partition_range_offsets.front(), + vertex_partition_range_offsets.back()); + } + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + reduce_op, + vertex_range, + std::nullopt); + if constexpr (GraphViewType::is_multi_gpu) { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + if (major_comm_size > 1) { + size_t local_key_buffer_size = size_dataframe_buffer(key_buffer); + auto avg_key_buffer_size = + host_scalar_allreduce( + major_comm, local_key_buffer_size, raft::comms::op_t::SUM, handle.get_stream()) / + major_comm_size; + + rmm::device_uvector d_vertex_partition_range_offsets( + vertex_partition_range_offsets.size(), handle.get_stream()); + raft::update_device(d_vertex_partition_range_offsets.data(), + vertex_partition_range_offsets.data(), + vertex_partition_range_offsets.size(), + handle.get_stream()); + + constexpr bool try_compression = (sizeof(vertex_t) == 8) && std::is_same_v; + std::conditional_t + max_vertex_partition_size{}; + if constexpr (try_compression) { + for (int i = 0; i < major_comm_size; ++i) { + max_vertex_partition_size = + std::max(vertex_partition_range_offsets[i + 1] - vertex_partition_range_offsets[i], + max_vertex_partition_size); + } + } + + if constexpr (std::is_same_v && + std::is_same_v>) { + vertex_t min_vertex_partition_size = std::numeric_limits::max(); + for (int i = 0; i < major_comm_size; ++i) { + min_vertex_partition_size = + std::min(vertex_partition_range_offsets[i + 1] - vertex_partition_range_offsets[i], + min_vertex_partition_size); + } + + auto segment_offsets = graph_view.local_vertex_partition_segment_offsets(); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + if (segment_offsets && + (static_cast(avg_key_buffer_size) > + static_cast(graph_view.number_of_vertices() / comm_size) * + double{0.2})) { // duplicates expected for high in-degree vertices (and we assume + // correlation between in-degrees & out-degrees) // FIXME: we need + // a better criterion + size_t key_size{0}; + size_t payload_size{0}; + if constexpr (try_compression) { + 
if (max_vertex_partition_size <= std::numeric_limits::max()) { + key_size = sizeof(uint32_t); + } else { + key_size = sizeof(key_t); + } + } else { + if constexpr (std::is_arithmetic_v) { + key_size = sizeof(key_t); + } else { + key_size = sum_thrust_tuple_element_sizes(); + } + } + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + payload_size = sizeof(payload_t); + } else { + payload_size = sum_thrust_tuple_element_sizes(); + } + } + + int subgroup_size{}; + int num_gpus_per_node{}; + RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); + if (comm_size <= num_gpus_per_node) { + subgroup_size = major_comm_size; + } else { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + subgroup_size = partition_manager::map_major_comm_to_gpu_row_comm + ? std::min(major_comm_size, num_gpus_per_node) + : std::max(num_gpus_per_node / minor_comm_size, int{1}); + } + + auto p2p_size_per_rank = avg_key_buffer_size * (key_size + payload_size); + auto p2p_size_per_node = p2p_size_per_rank * std::min(num_gpus_per_node, comm_size); + auto allreduce_size_per_node = p2p_size_per_node / 16 /* tuning parameter */; + auto allreduce_size_per_rank = + allreduce_size_per_node / (major_comm_size * (num_gpus_per_node / subgroup_size)); + + if (major_comm_size <= std::numeric_limits::max()) { // priority = uint8_t + std::tie(key_buffer, payload_buffer) = + filter_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + raft::device_span(d_vertex_partition_range_offsets.data(), + d_vertex_partition_range_offsets.size()), + std::min(static_cast(allreduce_size_per_rank / sizeof(uint8_t)), + min_vertex_partition_size), + subgroup_size); + } else { // priority = uint32_t + std::tie(key_buffer, payload_buffer) = + filter_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + raft::device_span(d_vertex_partition_range_offsets.data(), + d_vertex_partition_range_offsets.size()), + std::min(static_cast(allreduce_size_per_rank / sizeof(uint32_t)), + min_vertex_partition_size), + subgroup_size); + } + } + } + + rmm::device_uvector d_tx_buffer_last_boundaries(major_comm_size, handle.get_stream()); + auto key_v_first = + thrust_tuple_get_or_identity( + get_dataframe_buffer_begin(key_buffer)); + thrust::lower_bound(handle.get_thrust_policy(), + key_v_first, + key_v_first + size_dataframe_buffer(key_buffer), + d_vertex_partition_range_offsets.begin() + 1, + d_vertex_partition_range_offsets.end(), + d_tx_buffer_last_boundaries.begin()); + std::conditional_t>, + std::byte /* dummy */> + compressed_v_buffer{}; + if constexpr (try_compression) { + if (max_vertex_partition_size <= std::numeric_limits::max()) { + compressed_v_buffer = + rmm::device_uvector(size_dataframe_buffer(key_buffer), handle.get_stream()); + thrust::transform( + handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + (*compressed_v_buffer).begin(), + cuda::proclaim_return_type( + [firsts = raft::device_span(d_vertex_partition_range_offsets.data(), + static_cast(major_comm_size)), + lasts = raft::device_span( + d_vertex_partition_range_offsets.data() + 1, + static_cast(major_comm_size))] __device__(auto v) { + auto major_comm_rank = thrust::distance( + lasts.begin(), thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), v)); + return static_cast(v - firsts[major_comm_rank]); + })); + resize_dataframe_buffer(key_buffer, 0, 
handle.get_stream()); + shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); + } + } + std::vector h_tx_buffer_last_boundaries(d_tx_buffer_last_boundaries.size()); + raft::update_host(h_tx_buffer_last_boundaries.data(), + d_tx_buffer_last_boundaries.data(), + d_tx_buffer_last_boundaries.size(), + handle.get_stream()); + handle.sync_stream(); + std::vector tx_counts(h_tx_buffer_last_boundaries.size()); + std::adjacent_difference( + h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); + + size_t min_element_size{cache_line_size}; + if constexpr (std::is_same_v) { + if constexpr (try_compression) { + if (compressed_v_buffer) { + min_element_size = std::min(sizeof(uint32_t), min_element_size); + } else { + min_element_size = std::min(sizeof(key_t), min_element_size); + } + } else { + min_element_size = std::min(sizeof(key_t), min_element_size); + } + } else { + static_assert(is_thrust_tuple_of_arithmetic::value); + min_element_size = + std::min(cugraph::min_thrust_tuple_element_sizes(), min_element_size); + } + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + min_element_size = std::min(sizeof(payload_t), min_element_size); + } else { + static_assert(is_thrust_tuple_of_arithmetic::value); + min_element_size = + std::min(cugraph::min_thrust_tuple_element_sizes(), min_element_size); + } + } + assert((cache_line_size % min_element_size) == 0); + auto alignment = cache_line_size / min_element_size; + std::optional, key_t>> + invalid_key{std::nullopt}; + + if (avg_key_buffer_size >= alignment * size_t{128} /* 128 tuning parameter */) { + if constexpr (std::is_same_v) { + if constexpr (try_compression) { + if (compressed_v_buffer) { + invalid_key = std::numeric_limits::max(); + } else { + invalid_key = invalid_vertex_id_v; + } + } else { + invalid_key = invalid_vertex_id_v; + } + } else { + invalid_key = key_t{}; + thrust::get<0>(*invalid_key) = invalid_vertex_id_v; + } + + if constexpr (try_compression) { + if (compressed_v_buffer) { + auto rx_compressed_v_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_compressed_v_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = shuffle_values(major_comm, + get_dataframe_buffer_begin(*compressed_v_buffer), + tx_counts, + alignment, + std::make_optional(std::get<1>(*invalid_key)), + handle.get_stream()); + compressed_v_buffer = std::move(rx_compressed_v_buffer); + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = shuffle_values(major_comm, + get_dataframe_buffer_begin(key_buffer), + tx_counts, + alignment, + std::make_optional(std::get<0>(*invalid_key)), + handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + } + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = shuffle_values(major_comm, + get_dataframe_buffer_begin(key_buffer), + tx_counts, + alignment, + invalid_key, + handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + } + if constexpr (!std::is_same_v) { + auto rx_payload_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_payload_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = 
shuffle_values(major_comm, + get_dataframe_buffer_begin(payload_buffer), + tx_counts, + alignment, + std::nullopt, + handle.get_stream()); + payload_buffer = std::move(rx_payload_buffer); + } + } else { + if constexpr (try_compression) { + if (compressed_v_buffer) { + auto rx_compressed_v_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_compressed_v_buffer, std::ignore) = + shuffle_values(major_comm, + get_dataframe_buffer_begin(*compressed_v_buffer), + tx_counts, + handle.get_stream()); + compressed_v_buffer = std::move(rx_compressed_v_buffer); + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, std::ignore) = shuffle_values( + major_comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + } + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, std::ignore) = shuffle_values( + major_comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + } + + if constexpr (!std::is_same_v) { + auto rx_payload_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_payload_buffer, std::ignore) = shuffle_values( + major_comm, get_dataframe_buffer_begin(payload_buffer), tx_counts, handle.get_stream()); + payload_buffer = std::move(rx_payload_buffer); + } + } + + if constexpr (std::is_integral_v) { + vertex_range = std::make_tuple(graph_view.local_vertex_partition_range_first(), + graph_view.local_vertex_partition_range_last()); + } + if constexpr (try_compression) { + if (compressed_v_buffer) { + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, + std::move(*compressed_v_buffer), + std::move(payload_buffer), + reduce_op, + vertex_range, + invalid_key ? std::make_optional(std::get<1>(*invalid_key)) : std::nullopt); + } else { + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + reduce_op, + vertex_range, + invalid_key ? 
std::make_optional(std::get<0>(*invalid_key)) : std::nullopt); + } + } else { + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + reduce_op, + vertex_range, + invalid_key); + } + } + } + + if constexpr (!std::is_same_v) { + return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); + } else { + return std::move(key_buffer); + } +} + +} // namespace detail + +template +size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& frontier) +{ + static_assert(!GraphViewType::is_storage_transposed, + "GraphViewType should support the push model."); + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = typename KeyBucketType::key_type; + + size_t ret{0}; + + auto local_frontier_vertex_first = + thrust_tuple_get_or_identity(frontier.begin()); + + std::vector local_frontier_sizes{}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + local_frontier_sizes = host_scalar_allgather(minor_comm, frontier.size(), handle.get_stream()); + } else { + local_frontier_sizes = std::vector{static_cast(frontier.size())}; + } + + auto edge_mask_view = graph_view.edge_mask_view(); + + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(i)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, i) + : thrust::nullopt; + + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + + rmm::device_uvector edge_partition_frontier_vertices(local_frontier_sizes[i], + handle.get_stream()); + device_bcast(minor_comm, + local_frontier_vertex_first, + edge_partition_frontier_vertices.data(), + local_frontier_sizes[i], + static_cast(i), + handle.get_stream()); + + if (edge_partition_e_mask) { + ret += + edge_partition.compute_number_of_edges_with_mask((*edge_partition_e_mask).value_first(), + edge_partition_frontier_vertices.begin(), + edge_partition_frontier_vertices.end(), + handle.get_stream()); + } else { + ret += edge_partition.compute_number_of_edges(edge_partition_frontier_vertices.begin(), + edge_partition_frontier_vertices.end(), + handle.get_stream()); + } + } else { + assert(i == 0); + if (edge_partition_e_mask) { + ret += edge_partition.compute_number_of_edges_with_mask( + (*edge_partition_e_mask).value_first(), + local_frontier_vertex_first, + local_frontier_vertex_first + frontier.size(), + handle.get_stream()); + } else { + ret += edge_partition.compute_number_of_edges(local_frontier_vertex_first, + local_frontier_vertex_first + frontier.size(), + handle.get_stream()); + } + } + } + + return ret; +} + +/** + * @brief Iterate over outgoing edges from the current vertex frontier and reduce valid edge functor + * outputs by (tagged-)destination ID. + * + * Edge functor outputs are thrust::optional objects and invalid if thrust::nullopt. Vertices are + * assumed to be tagged if KeyBucketType::key_type is a tuple of a vertex type and a tag + * type (KeyBucketType::key_type is identical to a vertex type otherwise). 
+ *
+ * @tparam GraphViewType Type of the passed non-owning graph object.
+ * @tparam KeyBucketType Type of the vertex frontier bucket class which abstracts the
+ * current (tagged-)vertex frontier.
+ * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values.
+ * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values.
+ * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values.
+ * @tparam EdgeOp Type of the quinary edge operator.
+ * @tparam ReduceOp Type of the binary reduction operator.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Non-owning graph object.
+ * @param frontier KeyBucketType class object for the current vertex frontier.
+ * @param edge_src_value_input Wrapper used to access source input property values (for the edge
+ * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view()
+ * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view()
+ * (if @p e_op does not access source property values). Use update_edge_src_property to fill the
+ * wrapper.
+ * @param edge_dst_value_input Wrapper used to access destination input property values (for the
+ * edge destinations assigned to this process in multi-GPU). Use either
+ * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or
+ * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property
+ * values). Use update_edge_dst_property to fill the wrapper.
+ * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned
+ * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to
+ * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not
+ * access edge property values).
+ * @param e_op Quinary operator that takes an edge (tagged-)source, edge destination, and property
+ * values for the source, destination, and edge, and returns 1) thrust::nullopt (if invalid and to
+ * be discarded); 2) a dummy (but valid) thrust::optional object (e.g.
+ * thrust::optional<std::byte>{std::byte{0}}, if vertices are not tagged and ReduceOp::value_type
+ * is void); 3) a tag (if vertices are tagged and ReduceOp::value_type is void); 4) a value to be
+ * reduced using the @p reduce_op (if vertices are not tagged and ReduceOp::value_type is not
+ * void); or 5) a tuple of a tag and a value to be reduced (if vertices are tagged and
+ * ReduceOp::value_type is not void).
+ * @param reduce_op Binary operator that takes two input arguments and reduces the two values to
+ * one. There are pre-defined reduction operators in prims/reduce_op.cuh. It is recommended to use
+ * the pre-defined reduction operators whenever possible as the current (and future)
+ * implementations of graph primitives may check whether @p ReduceOp is a known type (or has known
+ * member variables) to take a more optimized code path. See the documentation in the reduce_op.cuh
+ * file for instructions on writing custom reduction operators.
+ * @return Tuple of key values and payload values (if ReduceOp::value_type is not void) or just key
+ * values (if ReduceOp::value_type is void). Keys in the return values are sorted in ascending order
+ * using a vertex ID as the primary key and a tag (if relevant) as the secondary key.
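+ *
+ * @note Illustrative usage sketch (editorial addition, not part of this PR). The names
+ * `handle`, `graph_view`, `vertex_frontier` (a cugraph::vertex_frontier_t), and `vertex_t` are
+ * assumed to be defined by the caller. A BFS-style traversal can record a predecessor per newly
+ * reached destination by returning the (tagged-)source as the payload and reducing with
+ * reduce_op::any:
+ * @code
+ *   auto new_frontier_key_and_predecessor_buffer =
+ *     cugraph::transform_reduce_v_frontier_outgoing_e_by_dst(
+ *       handle,
+ *       graph_view,
+ *       vertex_frontier.bucket(0),                      // current frontier bucket
+ *       cugraph::edge_src_dummy_property_t{}.view(),    // e_op ignores source properties
+ *       cugraph::edge_dst_dummy_property_t{}.view(),    // e_op ignores destination properties
+ *       cugraph::edge_dummy_property_t{}.view(),        // e_op ignores edge properties
+ *       [] __device__(vertex_t src, vertex_t dst, auto, auto, auto) {
+ *         return thrust::optional<vertex_t>{src};       // payload: predecessor candidate
+ *       },
+ *       cugraph::reduce_op::any<vertex_t>());
+ * @endcode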
+ */ +template +std::conditional_t< + !std::is_same_v, + std::tuple( + 0, rmm::cuda_stream_view{})), + decltype(detail::allocate_optional_dataframe_buffer( + 0, rmm::cuda_stream_view{}))>, + decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{}))> +transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& frontier, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + ReduceOp reduce_op, + bool do_expensive_check = false) +{ + return detail::transform_reduce_v_frontier_outgoing_e_by_dst(handle, + graph_view, + frontier, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + reduce_op, + do_expensive_check); +} + +} // namespace cugraph diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh deleted file mode 100644 index e58ab08fa9..0000000000 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh +++ /dev/null @@ -1,585 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include "detail/graph_partition_utils.cuh" -#include "prims/detail/extract_transform_v_frontier_e.cuh" -#include "prims/property_op_utils.cuh" -#include "prims/reduce_op.cuh" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -namespace cugraph { - -namespace detail { - -int32_t constexpr update_v_frontier_from_outgoing_e_kernel_block_size = 512; - -template -struct transform_reduce_v_frontier_call_e_op_t { - EdgeOp e_op{}; - - __device__ thrust::optional< - std::conditional_t && !std::is_same_v, - thrust::tuple, - std::conditional_t, key_t, payload_t>>> - operator()(key_t key, vertex_t dst, src_value_t sv, dst_value_t dv, e_value_t ev) const - { - auto e_op_result = e_op(key, dst, sv, dv, ev); - if (e_op_result.has_value()) { - auto reduce_by = reduce_by_src ? 
thrust_tuple_get_or_identity(key) : dst; - if constexpr (std::is_same_v && std::is_same_v) { - return reduce_by; - } else if constexpr (std::is_same_v && !std::is_same_v) { - return thrust::make_tuple(reduce_by, *e_op_result); - } else if constexpr (!std::is_same_v && std::is_same_v) { - return thrust::make_tuple(reduce_by, *e_op_result); - } else { - return thrust::make_tuple(thrust::make_tuple(reduce_by, thrust::get<0>(*e_op_result)), - thrust::get<1>(*e_op_result)); - } - } else { - return thrust::nullopt; - } - } -}; - -template -auto sort_and_reduce_buffer_elements( - raft::handle_t const& handle, - decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{}))&& key_buffer, - decltype(allocate_optional_dataframe_buffer(0, - rmm::cuda_stream_view{}))&& payload_buffer, - ReduceOp reduce_op) -{ - if constexpr (std::is_same_v) { - thrust::sort(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer)); - } else { - thrust::sort_by_key(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer), - get_optional_dataframe_buffer_begin(payload_buffer)); - } - - if constexpr (std::is_same_v) { - auto it = thrust::unique(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer)); - resize_dataframe_buffer( - key_buffer, - static_cast(thrust::distance(get_dataframe_buffer_begin(key_buffer), it)), - handle.get_stream()); - shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); - } else if constexpr (std::is_same_v>) { - auto it = thrust::unique_by_key(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer), - get_optional_dataframe_buffer_begin(payload_buffer)); - resize_dataframe_buffer(key_buffer, - static_cast(thrust::distance( - get_dataframe_buffer_begin(key_buffer), thrust::get<0>(it))), - handle.get_stream()); - resize_dataframe_buffer(payload_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); - shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); - shrink_to_fit_dataframe_buffer(payload_buffer, handle.get_stream()); - } else { - auto num_uniques = - thrust::count_if(handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(size_dataframe_buffer(key_buffer)), - is_first_in_run_t{ - get_dataframe_buffer_begin(key_buffer)}); - - auto new_key_buffer = allocate_dataframe_buffer(num_uniques, handle.get_stream()); - auto new_payload_buffer = - allocate_dataframe_buffer(num_uniques, handle.get_stream()); - - thrust::reduce_by_key(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer), - get_optional_dataframe_buffer_begin(payload_buffer), - get_dataframe_buffer_begin(new_key_buffer), - get_dataframe_buffer_begin(new_payload_buffer), - thrust::equal_to(), - reduce_op); - - key_buffer = std::move(new_key_buffer); - payload_buffer = std::move(new_payload_buffer); - } - - return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); -} - -template -std::conditional_t< - !std::is_same_v, - std::tuple( - 0, rmm::cuda_stream_view{})), - decltype(detail::allocate_optional_dataframe_buffer( - 0, rmm::cuda_stream_view{}))>, - decltype(allocate_dataframe_buffer( - 0, rmm::cuda_stream_view{}))> -transform_reduce_v_frontier_outgoing_e_by_src_dst(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, - 
EdgeSrcValueInputWrapper edge_src_value_input, - EdgeDstValueInputWrapper edge_dst_value_input, - EdgeValueInputWrapper edge_value_input, - EdgeOp e_op, - ReduceOp reduce_op, - bool do_expensive_check = false) -{ - static_assert(!GraphViewType::is_storage_transposed, - "GraphViewType should support the push model."); - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using key_t = typename VertexFrontierBucketType::key_type; - using payload_t = typename ReduceOp::value_type; - - if (do_expensive_check) { - // currently, nothing to do - } - - // 1. fill the buffer - - detail::transform_reduce_v_frontier_call_e_op_t - e_op_wrapper{e_op}; - - bool constexpr max_one_e_per_frontier_key = - reduce_by_src && std::is_same_v>; - auto [key_buffer, payload_buffer] = - detail::extract_transform_v_frontier_e( - handle, - graph_view, - frontier, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op_wrapper, - do_expensive_check); - - // 2. reduce the buffer - - std::tie(key_buffer, payload_buffer) = - detail::sort_and_reduce_buffer_elements( - handle, std::move(key_buffer), std::move(payload_buffer), reduce_op); - if constexpr (GraphViewType::is_multi_gpu) { - // FIXME: this step is unnecessary if major_comm_size== 1 - auto& comm = handle.get_comms(); - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_rank = major_comm.get_rank(); - auto const major_comm_size = major_comm.get_size(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); - - std::vector h_vertex_lasts(reduce_by_src ? minor_comm_size : major_comm_size); - for (size_t i = 0; i < h_vertex_lasts.size(); ++i) { - auto vertex_partition_id = - reduce_by_src - ? detail::compute_local_edge_partition_major_range_vertex_partition_id_t{major_comm_size, - minor_comm_size, - major_comm_rank, - minor_comm_rank}( - i) - : detail::compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - h_vertex_lasts[i] = graph_view.vertex_partition_range_last(vertex_partition_id); - } - - rmm::device_uvector d_vertex_lasts(h_vertex_lasts.size(), handle.get_stream()); - raft::update_device( - d_vertex_lasts.data(), h_vertex_lasts.data(), h_vertex_lasts.size(), handle.get_stream()); - rmm::device_uvector d_tx_buffer_last_boundaries(d_vertex_lasts.size(), - handle.get_stream()); - auto reduce_by_first = - thrust_tuple_get_or_identity( - get_dataframe_buffer_begin(key_buffer)); - thrust::lower_bound(handle.get_thrust_policy(), - reduce_by_first, - reduce_by_first + size_dataframe_buffer(key_buffer), - d_vertex_lasts.begin(), - d_vertex_lasts.end(), - d_tx_buffer_last_boundaries.begin()); - std::vector h_tx_buffer_last_boundaries(d_tx_buffer_last_boundaries.size()); - raft::update_host(h_tx_buffer_last_boundaries.data(), - d_tx_buffer_last_boundaries.data(), - d_tx_buffer_last_boundaries.size(), - handle.get_stream()); - handle.sync_stream(); - std::vector tx_counts(h_tx_buffer_last_boundaries.size()); - std::adjacent_difference( - h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); - - auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(rx_key_buffer, std::ignore) = shuffle_values(reduce_by_src ? 
minor_comm : major_comm, - get_dataframe_buffer_begin(key_buffer), - tx_counts, - handle.get_stream()); - key_buffer = std::move(rx_key_buffer); - - if constexpr (!std::is_same_v) { - auto rx_payload_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(rx_payload_buffer, std::ignore) = - shuffle_values(reduce_by_src ? minor_comm : major_comm, - get_dataframe_buffer_begin(payload_buffer), - tx_counts, - handle.get_stream()); - payload_buffer = std::move(rx_payload_buffer); - } - - std::tie(key_buffer, payload_buffer) = - detail::sort_and_reduce_buffer_elements( - handle, std::move(key_buffer), std::move(payload_buffer), reduce_op); - } - - if constexpr (!std::is_same_v) { - return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); - } else { - return std::move(key_buffer); - } -} - -} // namespace detail - -template -size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier) -{ - static_assert(!GraphViewType::is_storage_transposed, - "GraphViewType should support the push model."); - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using key_t = typename VertexFrontierBucketType::key_type; - - size_t ret{0}; - - auto local_frontier_vertex_first = - thrust_tuple_get_or_identity(frontier.begin()); - - std::vector local_frontier_sizes{}; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - local_frontier_sizes = host_scalar_allgather(minor_comm, frontier.size(), handle.get_stream()); - } else { - local_frontier_sizes = std::vector{static_cast(frontier.size())}; - } - - auto edge_mask_view = graph_view.edge_mask_view(); - - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(i)); - auto edge_partition_e_mask = - edge_mask_view - ? 
thrust::make_optional< - detail::edge_partition_edge_property_device_view_t>( - *edge_mask_view, i) - : thrust::nullopt; - - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - - rmm::device_uvector edge_partition_frontier_vertices(local_frontier_sizes[i], - handle.get_stream()); - device_bcast(minor_comm, - local_frontier_vertex_first, - edge_partition_frontier_vertices.data(), - local_frontier_sizes[i], - static_cast(i), - handle.get_stream()); - - if (edge_partition_e_mask) { - ret += - edge_partition.compute_number_of_edges_with_mask((*edge_partition_e_mask).value_first(), - edge_partition_frontier_vertices.begin(), - edge_partition_frontier_vertices.end(), - handle.get_stream()); - } else { - ret += edge_partition.compute_number_of_edges(edge_partition_frontier_vertices.begin(), - edge_partition_frontier_vertices.end(), - handle.get_stream()); - } - } else { - assert(i == 0); - if (edge_partition_e_mask) { - ret += edge_partition.compute_number_of_edges_with_mask( - (*edge_partition_e_mask).value_first(), - local_frontier_vertex_first, - local_frontier_vertex_first + frontier.size(), - handle.get_stream()); - } else { - ret += edge_partition.compute_number_of_edges(local_frontier_vertex_first, - local_frontier_vertex_first + frontier.size(), - handle.get_stream()); - } - } - } - - return ret; -} - -/** - * @brief Iterate over outgoing edges from the current vertex frontier and reduce valid edge functor - * outputs by (tagged-)source ID. - * - * Edge functor outputs are thrust::optional objects and invalid if thrust::nullopt. Vertices are - * assumed to be tagged if VertexFrontierBucketType::key_type is a tuple of a vertex type and a tag - * type (VertexFrontierBucketType::key_type is identical to a vertex type otherwise). - * - * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam VertexFrontierBucketType Type of the vertex frontier bucket class which abstracts the - * current (tagged-)vertex frontier. - * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. - * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. - * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. - * @tparam EdgeOp Type of the quinary edge operator. - * @tparam ReduceOp Type of the binary reduction operator. - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param graph_view Non-owning graph object. - * @param frontier VertexFrontierBucketType class object for the current vertex frontier. - * @param edge_src_value_input Wrapper used to access source input property values (for the edge - * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() - * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() - * (if @p e_op does not access source property values). Use update_edge_src_property to fill the - * wrapper. - * @param edge_dst_value_input Wrapper used to access destination input property values (for the - * edge destinations assigned to this process in multi-GPU). 
Use either - * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or - * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property - * values). Use update_edge_dst_property to fill the wrapper. - * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned - * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to - * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not - * access edge property values). - * @param e_op Quinary operator takes edge (tagged-)source, edge destination, property values for - * the source, destination, and edge and returns 1) thrust::nullopt (if invalid and to be - * discarded); 2) dummy (but valid) thrust::optional object (e.g. - * thrust::optional{std::byte{0}}, if vertices are not tagged and ReduceOp::value_type is - * void); 3) a tag (if vertices are tagged and ReduceOp::value_type is void); 4) a value to be - * reduced using the @p reduce_op (if vertices are not tagged and ReduceOp::value_type is not void); - * or 5) a tuple of a tag and a value to be reduced (if vertices are tagged and ReduceOp::value_type - * is not void). - * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. - * There are pre-defined reduction operators in prims/reduce_op.cuh. It is - * recommended to use the pre-defined reduction operators whenever possible as the current (and - * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has - * known member variables) to take a more optimized code path. See the documentation in the - * reduce_op.cuh file for instructions on writing custom reduction operators. - * @return Tuple of key values and payload values (if ReduceOp::value_type is not void) or just key - * values (if ReduceOp::value_type is void). Keys in the return values are sorted in ascending order - * using a vertex ID as the primary key and a tag (if relevant) as the secondary key. - */ -template -std::conditional_t< - !std::is_same_v, - std::tuple( - 0, rmm::cuda_stream_view{})), - decltype(detail::allocate_optional_dataframe_buffer( - 0, rmm::cuda_stream_view{}))>, - decltype(allocate_dataframe_buffer( - 0, rmm::cuda_stream_view{}))> -transform_reduce_v_frontier_outgoing_e_by_src(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, - EdgeSrcValueInputWrapper edge_src_value_input, - EdgeDstValueInputWrapper edge_dst_value_input, - EdgeValueInputWrapper edge_value_input, - EdgeOp e_op, - ReduceOp reduce_op, - bool do_expensive_check = false) -{ - return detail::transform_reduce_v_frontier_outgoing_e_by_src_dst(handle, - graph_view, - frontier, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - reduce_op, - do_expensive_check); -} - -/** - * @brief Iterate over outgoing edges from the current vertex frontier and reduce valid edge functor - * outputs by (tagged-)destination ID. - * - * Edge functor outputs are thrust::optional objects and invalid if thrust::nullopt. Vertices are - * assumed to be tagged if VertexFrontierBucketType::key_type is a tuple of a vertex type and a tag - * type (VertexFrontierBucketType::key_type is identical to a vertex type otherwise). - * - * @tparam GraphViewType Type of the passed non-owning graph object. 
- * @tparam VertexFrontierBucketType Type of the vertex frontier bucket class which abstracts the
- * current (tagged-)vertex frontier.
- * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values.
- * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values.
- * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values.
- * @tparam EdgeOp Type of the quinary edge operator.
- * @tparam ReduceOp Type of the binary reduction operator.
- * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
- * handles to various CUDA libraries) to run graph algorithms.
- * @param graph_view Non-owning graph object.
- * @param frontier VertexFrontierBucketType class object for the current vertex frontier.
- * @param edge_src_value_input Wrapper used to access source input property values (for the edge
- * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view()
- * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view()
- * (if @p e_op does not access source property values). Use update_edge_src_property to fill the
- * wrapper.
- * @param edge_dst_value_input Wrapper used to access destination input property values (for the
- * edge destinations assigned to this process in multi-GPU). Use either
- * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or
- * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property
- * values). Use update_edge_dst_property to fill the wrapper.
- * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned
- * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to
- * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not
- * access edge property values).
- * @param e_op Quinary operator takes edge (tagged-)source, edge destination, property values for
- * the source, destination, and edge and returns 1) thrust::nullopt (if invalid and to be
- * discarded); 2) dummy (but valid) thrust::optional object (e.g.
- * thrust::optional<std::byte>{std::byte{0}}, if vertices are not tagged and ReduceOp::value_type is
- * void); 3) a tag (if vertices are tagged and ReduceOp::value_type is void); 4) a value to be
- * reduced using the @p reduce_op (if vertices are not tagged and ReduceOp::value_type is not void);
- * or 5) a tuple of a tag and a value to be reduced (if vertices are tagged and ReduceOp::value_type
- * is not void).
- * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one.
- * There are pre-defined reduction operators in prims/reduce_op.cuh. It is
- * recommended to use the pre-defined reduction operators whenever possible as the current (and
- * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has
- * known member variables) to take a more optimized code path. See the documentation in the
- * reduce_op.cuh file for instructions on writing custom reduction operators.
- * @return Tuple of key values and payload values (if ReduceOp::value_type is not void) or just key
- * values (if ReduceOp::value_type is void). Keys in the return values are sorted in ascending order
- * using a vertex ID as the primary key and a tag (if relevant) as the secondary key.
- */ -template -std::conditional_t< - !std::is_same_v, - std::tuple( - 0, rmm::cuda_stream_view{})), - decltype(detail::allocate_optional_dataframe_buffer( - 0, rmm::cuda_stream_view{}))>, - decltype(allocate_dataframe_buffer( - 0, rmm::cuda_stream_view{}))> -transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, - EdgeSrcValueInputWrapper edge_src_value_input, - EdgeDstValueInputWrapper edge_dst_value_input, - EdgeValueInputWrapper edge_value_input, - EdgeOp e_op, - ReduceOp reduce_op, - bool do_expensive_check = false) -{ - return detail::transform_reduce_v_frontier_outgoing_e_by_src_dst(handle, - graph_view, - frontier, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - reduce_op, - do_expensive_check); -} - -} // namespace cugraph diff --git a/cpp/src/prims/update_edge_src_dst_property.cuh b/cpp/src/prims/update_edge_src_dst_property.cuh index 1bfdc23c66..2f842f710c 100644 --- a/cpp/src/prims/update_edge_src_dst_property.cuh +++ b/cpp/src/prims/update_edge_src_dst_property.cuh @@ -16,6 +16,7 @@ #pragma once #include "detail/graph_partition_utils.cuh" +#include "prims/vertex_frontier.cuh" #include #include @@ -265,8 +266,8 @@ template void update_edge_major_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, VertexPropertyInputIterator vertex_property_input_first, EdgeMajorPropertyOutputWrapper edge_major_property_output) { @@ -288,12 +289,12 @@ void update_edge_major_property(raft::handle_t const& handle, auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); - auto rx_counts = - host_scalar_allgather(minor_comm, - static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); - auto max_rx_size = - std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { + auto local_v_list_sizes = host_scalar_allgather( + minor_comm, + static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)), + handle.get_stream()); + auto max_rx_size = std::reduce( + local_v_list_sizes.begin(), local_v_list_sizes.end(), size_t{0}, [](auto lhs, auto rhs) { return std::max(lhs, rhs); }); rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); @@ -317,7 +318,7 @@ void update_edge_major_property(raft::handle_t const& handle, graph_view.local_vertex_partition_view()); if constexpr (contains_packed_bool_element) { auto bool_first = thrust::make_transform_iterator( - vertex_first, + sorted_unique_vertex_first, cuda::proclaim_return_type([vertex_property_input_first, vertex_partition] __device__(auto v) { auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); @@ -325,34 +326,41 @@ void update_edge_major_property(raft::handle_t const& handle, *(vertex_property_input_first + packed_bool_offset(v_offset)) & packed_bool_mask(v_offset)); })); - pack_bools(handle, - bool_first, - bool_first + thrust::distance(vertex_first, vertex_last), - rx_value_first); + pack_bools( + handle, + bool_first, + bool_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + rx_value_first); } else { auto map_first = thrust::make_transform_iterator( - vertex_first, + sorted_unique_vertex_first, cuda::proclaim_return_type([vertex_partition] __device__(auto v) { return 
vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); })); // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a // permutation iterator (and directly gathers to the internal buffer) - thrust::gather(handle.get_thrust_policy(), - map_first, - map_first + thrust::distance(vertex_first, vertex_last), - vertex_property_input_first, - rx_value_first); + thrust::gather( + handle.get_thrust_policy(), + map_first, + map_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + vertex_property_input_first, + rx_value_first); } } // FIXME: these broadcast operations can be placed between ncclGroupStart() and // ncclGroupEnd() - device_bcast( - minor_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(minor_comm, + sorted_unique_vertex_first, + rx_vertices.begin(), + local_v_list_sizes[i], + i, + handle.get_stream()); device_bcast(minor_comm, rx_value_first, rx_value_first, - contains_packed_bool_element ? packed_bool_size(rx_counts[i]) : rx_counts[i], + contains_packed_bool_element ? packed_bool_size(local_v_list_sizes[i]) + : local_v_list_sizes[i], i, handle.get_stream()); @@ -360,7 +368,7 @@ void update_edge_major_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_counts[i]), + thrust::make_counting_iterator(local_v_list_sizes[i]), [rx_vertex_first = rx_vertices.begin(), rx_value_first, edge_partition_key_first = ((*edge_partition_keys)[i]).begin(), @@ -386,7 +394,7 @@ void update_edge_major_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(rx_counts[i])), + thrust::make_counting_iterator(static_cast(local_v_list_sizes[i])), [edge_partition, rx_vertex_first = rx_vertices.begin(), rx_value_first, @@ -407,7 +415,7 @@ void update_edge_major_property(raft::handle_t const& handle, // directly scatters from the internal buffer) thrust::scatter(handle.get_thrust_policy(), rx_value_first, - rx_value_first + rx_counts[i], + rx_value_first + local_v_list_sizes[i], map_first, edge_partition_value_firsts[i]); } @@ -420,20 +428,22 @@ void update_edge_major_property(raft::handle_t const& handle, assert(edge_partition_value_firsts.size() == size_t{1}); if constexpr (contains_packed_bool_element) { thrust::for_each(handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [vertex_property_input_first, output_value_first = edge_partition_value_firsts[0]] __device__(auto v) { bool val = static_cast(*(vertex_property_input_first + v)); packed_bool_atomic_set(output_value_first, v, val); }); } else { - auto val_first = thrust::make_permutation_iterator(vertex_property_input_first, vertex_first); - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + thrust::distance(vertex_first, vertex_last), - vertex_first, - edge_partition_value_firsts[0]); + auto val_first = + thrust::make_permutation_iterator(vertex_property_input_first, sorted_unique_vertex_first); + thrust::scatter( + handle.get_thrust_policy(), + val_first, + val_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + sorted_unique_vertex_first, + edge_partition_value_firsts[0]); } } } @@ -455,13 +465,11 @@ void update_edge_minor_property(raft::handle_t const& handle, auto edge_partition_value_first = 
edge_minor_property_output.value_first(); if constexpr (GraphViewType::is_multi_gpu) { - using vertex_t = typename GraphViewType::vertex_type; - using bcast_buffer_type = - decltype(allocate_dataframe_buffer< - std::conditional_t>( - size_t{0}, handle.get_stream())); + using vertex_t = typename GraphViewType::vertex_type; + using bcast_buffer_type = dataframe_buffer_type_t< + std::conditional_t>; auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); @@ -487,8 +495,8 @@ void update_edge_minor_property(raft::handle_t const& handle, (static_cast(graph_view.compute_number_of_edges(handle) / comm_size) * sizeof(vertex_t)) / std::max(bcast_size, size_t{1}); - num_concurrent_bcasts = std::max(num_concurrent_bcasts, size_t{1}); - num_concurrent_bcasts = std::min(num_concurrent_bcasts, static_cast(major_comm_size)); + num_concurrent_bcasts = + std::min(std::max(num_concurrent_bcasts, size_t{1}), static_cast(major_comm_size)); auto num_rounds = (static_cast(major_comm_size) + num_concurrent_bcasts - size_t{1}) / num_concurrent_bcasts; @@ -532,15 +540,17 @@ void update_edge_minor_property(raft::handle_t const& handle, *(graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets()); } } else { - std::vector rx_counts(major_comm_size, size_t{0}); + std::vector local_v_list_sizes(major_comm_size, size_t{0}); for (int i = 0; i < major_comm_size; ++i) { auto minor_range_vertex_partition_id = compute_local_edge_partition_minor_range_vertex_partition_id_t{ major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - rx_counts[i] = graph_view.vertex_partition_range_size(minor_range_vertex_partition_id); + local_v_list_sizes[i] = + graph_view.vertex_partition_range_size(minor_range_vertex_partition_id); } std::vector rx_displacements(major_comm_size, size_t{0}); - std::exclusive_scan(rx_counts.begin(), rx_counts.end(), rx_displacements.begin(), size_t{0}); + std::exclusive_scan( + local_v_list_sizes.begin(), local_v_list_sizes.end(), rx_displacements.begin(), size_t{0}); key_offsets_or_rx_displacements = std::move(rx_displacements); } @@ -683,8 +693,8 @@ template void update_edge_minor_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, VertexPropertyInputIterator vertex_property_input_first, EdgeMinorPropertyOutputWrapper edge_minor_property_output) { @@ -706,22 +716,49 @@ void update_edge_minor_property(raft::handle_t const& handle, auto const major_comm_rank = major_comm.get_rank(); auto const major_comm_size = major_comm.get_size(); - auto rx_counts = - host_scalar_allgather(major_comm, - static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); - auto max_rx_size = - std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { - return std::max(lhs, rhs); - }); - rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); - auto rx_tmp_buffer = allocate_dataframe_buffer< - std::conditional_t>( - contains_packed_bool_element ? 
packed_bool_size(max_rx_size) : max_rx_size, - handle.get_stream()); - auto rx_value_first = get_dataframe_buffer_begin(rx_tmp_buffer); + auto v_list_size = + static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)); + std::array v_list_range = {vertex_t{0}, vertex_t{0}}; + if (v_list_size > 0) { + rmm::device_uvector tmps(2, handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + tmps.begin(), + tmps.end(), + [sorted_unique_vertex_first, v_list_size] __device__(size_t i) { + return (i == 0) ? *sorted_unique_vertex_first + : (*(sorted_unique_vertex_first + (v_list_size - 1)) + 1); + }); + raft::update_host(v_list_range.data(), tmps.data(), 2, handle.get_stream()); + handle.sync_stream(); + } + + auto local_v_list_sizes = host_scalar_allgather(major_comm, v_list_size, handle.get_stream()); + auto local_v_list_range_firsts = + host_scalar_allgather(major_comm, v_list_range[0], handle.get_stream()); + auto local_v_list_range_lasts = + host_scalar_allgather(major_comm, v_list_range[1], handle.get_stream()); + + std::optional> v_list_bitmap{std::nullopt}; + if (major_comm_size > 1) { + double avg_fill_ratio{0.0}; + for (int i = 0; i < major_comm_size; ++i) { + auto num_keys = static_cast(local_v_list_sizes[i]); + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + avg_fill_ratio += + (range_size > 0) ? (num_keys / static_cast(range_size)) : double{0.0}; + } + avg_fill_ratio /= static_cast(major_comm_size); + + constexpr double threshold_ratio = + 0.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); + if (avg_fill_ratio > threshold_ratio) { + v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_vertex_first, + sorted_unique_vertex_last, + local_v_list_range_firsts[major_comm_rank], + local_v_list_range_lasts[major_comm_rank], + handle.get_stream()); + } + } std::optional> key_offsets{}; if constexpr (GraphViewType::is_storage_transposed) { @@ -735,13 +772,23 @@ void update_edge_minor_property(raft::handle_t const& handle, graph_view.local_edge_partition_view(size_t{0})); auto edge_partition_keys = edge_minor_property_output.keys(); for (int i = 0; i < major_comm_size; ++i) { + rmm::device_uvector rx_vertices(local_v_list_sizes[i], handle.get_stream()); + auto rx_tmp_buffer = allocate_dataframe_buffer< + std::conditional_t>( + contains_packed_bool_element ? 
packed_bool_size(local_v_list_sizes[i]) + : local_v_list_sizes[i], + handle.get_stream()); + auto rx_value_first = get_dataframe_buffer_begin(rx_tmp_buffer); + if (i == major_comm_rank) { auto vertex_partition = vertex_partition_device_view_t( graph_view.local_vertex_partition_view()); if constexpr (contains_packed_bool_element) { auto bool_first = thrust::make_transform_iterator( - vertex_first, + sorted_unique_vertex_first, cuda::proclaim_return_type([vertex_property_input_first, vertex_partition] __device__(auto v) { auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); @@ -749,34 +796,53 @@ void update_edge_minor_property(raft::handle_t const& handle, *(vertex_property_input_first + packed_bool_offset(v_offset)) & packed_bool_mask(v_offset)); })); - pack_bools(handle, - bool_first, - bool_first + thrust::distance(vertex_first, vertex_last), - rx_value_first); + pack_bools( + handle, + bool_first, + bool_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + rx_value_first); } else { auto map_first = thrust::make_transform_iterator( - vertex_first, + sorted_unique_vertex_first, cuda::proclaim_return_type([vertex_partition] __device__(auto v) { return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); })); // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a // permutation iterator (and directly gathers to the internal buffer) - thrust::gather(handle.get_thrust_policy(), - map_first, - map_first + thrust::distance(vertex_first, vertex_last), - vertex_property_input_first, - rx_value_first); + thrust::gather( + handle.get_thrust_policy(), + map_first, + map_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + vertex_property_input_first, + rx_value_first); } } // FIXME: these broadcast operations can be placed between ncclGroupStart() and // ncclGroupEnd() - device_bcast( - major_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + std::variant, decltype(sorted_unique_vertex_first)> + v_list{}; + if (v_list_bitmap) { + v_list = + (i == major_comm_rank) + ? raft::device_span((*v_list_bitmap).data(), (*v_list_bitmap).size()) + : raft::device_span(static_cast(nullptr), size_t{0}); + } else { + v_list = sorted_unique_vertex_first; + } + device_bcast_vertex_list(major_comm, + v_list, + rx_vertices.begin(), + local_v_list_range_firsts[i], + local_v_list_range_lasts[i], + local_v_list_sizes[i], + i, + handle.get_stream()); device_bcast(major_comm, rx_value_first, rx_value_first, - contains_packed_bool_element ? packed_bool_size(rx_counts[i]) : rx_counts[i], + contains_packed_bool_element ? 
packed_bool_size(local_v_list_sizes[i]) + : local_v_list_sizes[i], i, handle.get_stream()); @@ -784,7 +850,7 @@ void update_edge_minor_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_counts[i]), + thrust::make_counting_iterator(local_v_list_sizes[i]), [rx_vertex_first = rx_vertices.begin(), rx_value_first, subrange_key_first = (*edge_partition_keys).begin() + (*key_offsets)[i], @@ -812,7 +878,7 @@ void update_edge_minor_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(rx_counts[i])), + thrust::make_counting_iterator(static_cast(local_v_list_sizes[i])), [edge_partition, rx_vertex_first = rx_vertices.begin(), rx_value_first, @@ -833,7 +899,7 @@ void update_edge_minor_property(raft::handle_t const& handle, // directly scatters from the internal buffer) thrust::scatter(handle.get_thrust_policy(), rx_value_first, - rx_value_first + rx_counts[i], + rx_value_first + local_v_list_sizes[i], map_first, edge_partition_value_first); } @@ -844,20 +910,22 @@ void update_edge_minor_property(raft::handle_t const& handle, graph_view.local_edge_partition_src_range_size()); if constexpr (contains_packed_bool_element) { thrust::for_each(handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [vertex_property_input_first, output_value_first = edge_partition_value_first] __device__(auto v) { bool val = static_cast(*(vertex_property_input_first + v)); packed_bool_atomic_set(output_value_first, v, val); }); } else { - auto val_first = thrust::make_permutation_iterator(vertex_property_input_first, vertex_first); - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + thrust::distance(vertex_first, vertex_last), - vertex_first, - edge_partition_value_first); + auto val_first = + thrust::make_permutation_iterator(vertex_property_input_first, sorted_unique_vertex_first); + thrust::scatter( + handle.get_thrust_policy(), + val_first, + val_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + sorted_unique_vertex_first, + edge_partition_value_first); } } } @@ -909,8 +977,9 @@ void update_edge_src_property(raft::handle_t const& handle, /** * @brief Update graph edge source property values from the input vertex property values. * - * This version updates only a subset of graph edge source property values. [@p vertex_first, @p - * vertex_last) specifies the vertices with new property values to be updated. + * This version updates only a subset of graph edge source property values. [@p + * sorted_unique_vertex_first, @p sorted_unique_vertex_last) specifies the vertices with new + * property values to be updated. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. @@ -919,10 +988,12 @@ void update_edge_src_property(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param vertex_first Iterator pointing to the first (inclusive) vertex with a new value to be - * updated. 
v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex - * partition assigned to this process in multi-GPU), otherwise undefined behavior. - * @param vertex_last Iterator pointing to the last (exclusive) vertex with a new value. + * @param sorted_unique_vertex_first Iterator pointing to the first (inclusive) vertex with a new + * value to be updated. v in [sorted_unique_vertex_first, sorted_unique_vertex_last) should be + * sorted & distinct (and should belong to the vertex partition assigned to this process in + * multi-GPU), otherwise undefined behavior. + * @param sorted_unique_vertex_last Iterator pointing to the last (exclusive) vertex with a new + * value. * @param vertex_property_input_first Iterator pointing to the vertex property value for the first * (inclusive) vertex (of the vertex partition assigned to this process in multi-GPU). * `vertex_property_input_last` (exclusive) is deduced as @p vertex_property_input_first + @p @@ -937,8 +1008,8 @@ template void update_edge_src_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, VertexPropertyInputIterator vertex_property_input_first, EdgeSrcValueOutputWrapper edge_src_property_output, bool do_expensive_check = false) @@ -946,8 +1017,8 @@ void update_edge_src_property(raft::handle_t const& handle, if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), local_vertex_partition_range_last = graph_view.local_vertex_partition_range_last()] __device__(auto v) { @@ -958,23 +1029,23 @@ void update_edge_src_property(raft::handle_t const& handle, num_invalids = host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); } - CUGRAPH_EXPECTS( - num_invalids == 0, - "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in " + "[sorted_unique_vertex_first, sorted_unique_vertex_last)."); } if constexpr (GraphViewType::is_storage_transposed) { detail::update_edge_minor_property(handle, graph_view, - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, vertex_property_input_first, edge_src_property_output); } else { detail::update_edge_major_property(handle, graph_view, - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, vertex_property_input_first, edge_src_property_output); } @@ -1026,8 +1097,9 @@ void update_edge_dst_property(raft::handle_t const& handle, /** * @brief Update graph edge destination property values from the input vertex property values. * - * This version updates only a subset of graph edge destination property values. [@p vertex_first, - * @p vertex_last) specifies the vertices with new property values to be updated. + * This version updates only a subset of graph edge destination property values. [@p + * sorted_unique_vertex_first, @p sorted_unique_vertex_last) specifies the vertices with new + * property values to be updated. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. 
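+ *
+ * @note Illustrative usage sketch (editorial addition, not part of this PR). The names `handle`,
+ * `graph_view`, `frontier_vertices` (a sorted & unique rmm::device_uvector of local vertices),
+ * `vertex_values`, and `edge_dst_prop` (a cugraph::edge_dst_property_t) are assumed to be defined
+ * by the caller:
+ * @code
+ *   // Push per-vertex values for only the frontier vertices to the edge destination property.
+ *   cugraph::update_edge_dst_property(handle,
+ *                                     graph_view,
+ *                                     frontier_vertices.begin(),  // sorted & distinct, local
+ *                                     frontier_vertices.end(),
+ *                                     vertex_values.begin(),
+ *                                     edge_dst_prop.mutable_view());
+ * @endcode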
@@ -1037,10 +1109,12 @@ void update_edge_dst_property(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param vertex_first Iterator pointing to the first (inclusive) vertex with a new value to be - * updated. v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex - * partition assigned to this process in multi-GPU), otherwise undefined behavior. - * @param vertex_last Iterator pointing to the last (exclusive) vertex with a new value. + * @param sorted_unique_vertex_first Iterator pointing to the first (inclusive) vertex with a new + * value to be updated. v in [sorted_unique_vertex_first, sorted_unique_vertex_last) should be + * sorted & distinct (and should belong to the vertex partition assigned to this process in + * multi-GPU), otherwise undefined behavior. + * @param sorted_unique_vertex_last Iterator pointing to the last (exclusive) vertex with a new + * value. * @param vertex_property_input_first Iterator pointing to the vertex property value for the first * (inclusive) vertex (of the vertex partition assigned to this process in multi-GPU). * `vertex_property_input_last` (exclusive) is deduced as @p vertex_property_input_first + @p @@ -1055,8 +1129,8 @@ template void update_edge_dst_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, VertexPropertyInputIterator vertex_property_input_first, EdgeDstValueOutputWrapper edge_dst_property_output, bool do_expensive_check = false) @@ -1064,8 +1138,8 @@ void update_edge_dst_property(raft::handle_t const& handle, if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), local_vertex_partition_range_last = graph_view.local_vertex_partition_range_last()] __device__(auto v) { @@ -1076,23 +1150,23 @@ void update_edge_dst_property(raft::handle_t const& handle, num_invalids = host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); } - CUGRAPH_EXPECTS( - num_invalids == 0, - "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in " + "[sorted_unique_vertex_first, sorted_unique_vertex_last)."); } if constexpr (GraphViewType::is_storage_transposed) { detail::update_edge_major_property(handle, graph_view, - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, vertex_property_input_first, edge_dst_property_output); } else { detail::update_edge_minor_property(handle, graph_view, - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, vertex_property_input_first, edge_dst_property_output); } diff --git a/cpp/src/prims/vertex_frontier.cuh b/cpp/src/prims/vertex_frontier.cuh index b13e6bfd45..6e7d8515be 100644 --- a/cpp/src/prims/vertex_frontier.cuh +++ b/cpp/src/prims/vertex_frontier.cuh @@ -15,15 +15,24 @@ */ #pragma once +#include "prims/detail/multi_stream_utils.cuh" + +#include #include #include +#include +#include #include +#include 
#include #include #include +#include +#include +#include #include #include #include @@ -48,6 +57,191 @@ namespace cugraph { +template +KeyIterator compute_key_lower_bound(KeyIterator sorted_unique_key_first, + KeyIterator sorted_unique_key_last, + vertex_t v_threshold, + rmm::cuda_stream_view stream_view) +{ + using key_t = typename thrust::iterator_traits::value_type; + + if constexpr (std::is_same_v) { + return thrust::lower_bound( + rmm::exec_policy(stream_view), sorted_unique_key_first, sorted_unique_key_last, v_threshold); + } else { + key_t k_threshold{}; + thrust::get<0>(k_threshold) = v_threshold; + return thrust::lower_bound( + rmm::exec_policy(stream_view), + sorted_unique_key_first, + sorted_unique_key_last, + k_threshold, + [] __device__(auto lhs, auto rhs) { return thrust::get<0>(lhs) < thrust::get<0>(rhs); }); + } +} + +template +std::vector compute_key_segment_offsets(KeyIterator sorted_key_first, + KeyIterator sorted_key_last, + raft::host_span segment_offsets, + vertex_t vertex_range_first, + rmm::cuda_stream_view stream_view) +{ + using key_t = typename thrust::iterator_traits::value_type; + + std::vector h_thresholds(segment_offsets.size() - 2); + for (size_t i = 0; i < h_thresholds.size(); ++i) { + h_thresholds[i] = vertex_range_first + segment_offsets[i + 1]; + } + + rmm::device_uvector d_thresholds(h_thresholds.size(), stream_view); + raft::update_device(d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), stream_view); + + rmm::device_uvector d_offsets(d_thresholds.size(), stream_view); + if constexpr (std::is_same_v) { + thrust::lower_bound(rmm::exec_policy_nosync(stream_view), + sorted_key_first, + sorted_key_last, + d_thresholds.begin(), + d_thresholds.end(), + d_offsets.begin()); + } else { + auto sorted_vertex_first = + thrust::make_transform_iterator(sorted_key_first, thrust_tuple_get{}); + thrust::lower_bound(rmm::exec_policy_nosync(stream_view), + sorted_vertex_first, + sorted_vertex_first + thrust::distance(sorted_key_first, sorted_key_last), + d_thresholds.begin(), + d_thresholds.end(), + d_offsets.begin()); + } + + std::vector h_offsets(d_offsets.size() + 2); + raft::update_host(h_offsets.data() + 1, d_offsets.data(), d_offsets.size(), stream_view); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view)); + h_offsets[0] = size_t{0}; + h_offsets.back() = static_cast(thrust::distance(sorted_key_first, sorted_key_last)); + + return h_offsets; +} + +template +rmm::device_uvector compute_vertex_list_bitmap_info( + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, + typename thrust::iterator_traits::value_type vertex_range_first, + typename thrust::iterator_traits::value_type vertex_range_last, + rmm::cuda_stream_view stream_view) +{ + using vertex_t = typename thrust::iterator_traits::value_type; + + auto bitmap = rmm::device_uvector( + packed_bool_size(vertex_range_last - vertex_range_first), stream_view); + rmm::device_uvector lasts(bitmap.size(), stream_view); + auto bdry_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{1}), + cuda::proclaim_return_type( + [vertex_range_first, + vertex_range_size = vertex_range_last - vertex_range_first] __device__(vertex_t i) { + return vertex_range_first + + static_cast( + std::min(packed_bools_per_word() * i, static_cast(vertex_range_size))); + })); + thrust::lower_bound(rmm::exec_policy_nosync(stream_view), + sorted_unique_vertex_first, + sorted_unique_vertex_last, + bdry_first, + bdry_first + bitmap.size(), + lasts.begin()); + thrust::tabulate( 
+ rmm::exec_policy_nosync(stream_view), + bitmap.begin(), + bitmap.end(), + cuda::proclaim_return_type( + [sorted_unique_vertex_first, + vertex_range_first, + lasts = raft::device_span(lasts.data(), lasts.size())] __device__(size_t i) { + auto offset_first = (i != 0) ? lasts[i - 1] : vertex_t{0}; + auto offset_last = lasts[i]; + auto ret = packed_bool_empty_mask(); + for (auto j = offset_first; j < offset_last; ++j) { + auto v_offset = *(sorted_unique_vertex_first + j) - vertex_range_first; + ret |= packed_bool_mask(v_offset); + } + return ret; + })); + + return bitmap; +} + +template +void device_bcast_vertex_list( + raft::comms::comms_t const& comm, + std::variant, InputVertexIterator> v_list, + OutputVertexIterator output_v_first, + typename thrust::iterator_traits::value_type vertex_range_first, + typename thrust::iterator_traits::value_type vertex_range_last, + size_t v_list_size, + int root, + rmm::cuda_stream_view stream_view) +{ + using vertex_t = typename thrust::iterator_traits::value_type; + + static_assert( + std::is_same_v::value_type, vertex_t>); + + if (v_list.index() == 0) { // bitmap + rmm::device_uvector tmp_bitmap( + packed_bool_size(vertex_range_last - vertex_range_first), stream_view); + assert((comm.get_rank() != root) || (std::get<0>(v_list).size() == tmp_bitmap.size())); + device_bcast( + comm, std::get<0>(v_list).data(), tmp_bitmap.data(), tmp_bitmap.size(), root, stream_view); + rmm::device_scalar dummy(size_t{0}, stream_view); // we already know the count + detail::copy_if_nosync( + thrust::make_counting_iterator(vertex_range_first), + thrust::make_counting_iterator(vertex_range_last), + thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type( + [bitmap = raft::device_span( + tmp_bitmap.data(), tmp_bitmap.size())] __device__(vertex_t v_offset) { + return ((bitmap[packed_bool_offset(v_offset)] & packed_bool_mask(v_offset)) != + packed_bool_empty_mask()); + })), + output_v_first, + raft::device_span(dummy.data(), size_t{1}), + stream_view); + } else { + device_bcast(comm, std::get<1>(v_list), output_v_first, v_list_size, root, stream_view); + } +} + +template +void retrieve_vertex_list_from_bitmap( + raft::device_span bitmap, + OutputVertexIterator output_v_first, + raft::device_span count /* size = 1 */, + typename thrust::iterator_traits::value_type vertex_range_first, + typename thrust::iterator_traits::value_type vertex_range_last, + rmm::cuda_stream_view stream_view) +{ + using vertex_t = typename thrust::iterator_traits::value_type; + + assert((comm.get_rank() != root) || + (bitmap.size() >= packed_bool_size(vertex_range_last - vertex_ragne_first))); + detail::copy_if_nosync(thrust::make_counting_iterator(vertex_range_first), + thrust::make_counting_iterator(vertex_range_last), + thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type([bitmap] __device__(vertex_t v_offset) { + return ((bitmap[packed_bool_offset(v_offset)] & + packed_bool_mask(v_offset)) != packed_bool_empty_mask()); + })), + output_v_first, + count, + stream_view); +} + // key type is either vertex_t (tag_t == void) or thrust::tuple (tag_t != void) // if sorted_unique is true, stores unique key objects in the sorted (non-descending) order. // if false, there can be duplicates and the elements may not be sorted. @@ -328,20 +522,6 @@ class key_bucket_t { } } - auto const begin() const - { - if constexpr (std::is_same_v) { - return vertices_.index() == 0 ? 
std::get<0>(vertices_).begin() - : std::get<1>(vertices_).begin(); - } else { - return vertices_.index() == 0 - ? thrust::make_zip_iterator( - thrust::make_tuple(std::get<0>(vertices_).begin(), std::get<0>(tags_).begin())) - : thrust::make_zip_iterator( - thrust::make_tuple(std::get<1>(vertices_).begin(), std::get<1>(tags_).begin())); - } - } - auto begin() { CUGRAPH_EXPECTS( @@ -355,12 +535,22 @@ class key_bucket_t { } } - auto const end() const + auto const cbegin() const { - return begin() + - (vertices_.index() == 0 ? std::get<0>(vertices_).size() : std::get<1>(vertices_).size()); + if constexpr (std::is_same_v) { + return vertices_.index() == 0 ? std::get<0>(vertices_).begin() + : std::get<1>(vertices_).begin(); + } else { + return vertices_.index() == 0 + ? thrust::make_zip_iterator( + thrust::make_tuple(std::get<0>(vertices_).begin(), std::get<0>(tags_).begin())) + : thrust::make_zip_iterator( + thrust::make_tuple(std::get<1>(vertices_).begin(), std::get<1>(tags_).begin())); + } } + auto const begin() const { return cbegin(); } + auto end() { CUGRAPH_EXPECTS( @@ -369,15 +559,13 @@ class key_bucket_t { return begin() + std::get<0>(vertices_).size(); } - auto const vertex_begin() const + auto const cend() const { - return vertices_.index() == 0 ? std::get<0>(vertices_).begin() : std::get<1>(vertices_).begin(); + return begin() + + (vertices_.index() == 0 ? std::get<0>(vertices_).size() : std::get<1>(vertices_).size()); } - auto const vertex_end() const - { - return vertices_.index() == 0 ? std::get<0>(vertices_).end() : std::get<1>(vertices_).end(); - } + auto const end() const { return cend(); } auto vertex_begin() { @@ -387,6 +575,13 @@ class key_bucket_t { return std::get<0>(vertices_).begin(); } + auto const vertex_cbegin() const + { + return vertices_.index() == 0 ? std::get<0>(vertices_).begin() : std::get<1>(vertices_).begin(); + } + + auto const vertex_begin() const { return vertex_cbegin(); } + auto vertex_end() { CUGRAPH_EXPECTS( @@ -395,6 +590,13 @@ class key_bucket_t { return std::get<0>(vertices_).end(); } + auto const vertex_cend() const + { + return vertices_.index() == 0 ? 
std::get<0>(vertices_).end() : std::get<1>(vertices_).end(); + } + + auto const vertex_end() const { return vertex_cend(); } + bool is_owning() { return (vertices_.index() == 0); } private: diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index 9796ddd60a..cd98db3165 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -44,6 +44,7 @@ #include #include +#include namespace cugraph { @@ -299,6 +300,121 @@ bool check_no_parallel_edge(raft::handle_t const& handle, (org_edge_first + edgelist_srcs.size()); } +template +std::vector> +split_edge_chunk_compressed_elements_to_local_edge_partitions( + raft::handle_t const& handle, + std::vector>&& edgelist_compressed_elements, + std::vector> const& edgelist_edge_offset_vectors, + std::vector const& edge_partition_edge_counts, + std::vector> const& edge_partition_intra_partition_segment_offset_vectors, + std::vector> const& + edge_partition_intra_segment_copy_output_displacement_vectors, + size_t element_size) +{ + auto num_chunks = edgelist_compressed_elements.size(); + auto num_edge_partitions = edge_partition_edge_counts.size(); + auto num_segments = edge_partition_intra_partition_segment_offset_vectors[0].size() - 1; + for (size_t i = 0; i < edge_partition_intra_partition_segment_offset_vectors.size(); ++i) { + assert(edge_partition_intra_partition_segment_offset_vectors[i].size() == (num_segments + 1)); + } + + std::vector> edge_partition_compressed_elements{}; + edge_partition_compressed_elements.reserve(num_edge_partitions); + for (size_t i = 0; i < num_edge_partitions; ++i) { + edge_partition_compressed_elements.push_back(rmm::device_uvector( + edge_partition_edge_counts[i] * element_size, handle.get_stream())); + } + + for (size_t i = 0; i < num_edge_partitions; ++i) { + for (size_t j = 0; j < num_segments; ++j) { + for (size_t k = 0; k < num_chunks; ++k) { + auto segment_offset = edgelist_edge_offset_vectors[k][i * num_segments + j]; + auto segment_size = edgelist_edge_offset_vectors[k][i * num_segments + j + 1] - + edgelist_edge_offset_vectors[k][i * num_segments + j]; + auto output_offset = + edge_partition_intra_partition_segment_offset_vectors[i][j] + + edge_partition_intra_segment_copy_output_displacement_vectors[i][j * num_chunks + k]; + thrust::copy( + handle.get_thrust_policy(), + edgelist_compressed_elements[k].begin() + segment_offset * element_size, + edgelist_compressed_elements[k].begin() + (segment_offset + segment_size) * element_size, + edge_partition_compressed_elements[i].begin() + output_offset * element_size); + } + } + } + edgelist_compressed_elements.clear(); + + return edge_partition_compressed_elements; +} + +template +std::vector> split_edge_chunk_elements_to_local_edge_partitions( + raft::handle_t const& handle, + std::vector>&& edgelist_elements, + std::vector> const& edgelist_edge_offset_vectors, + std::vector const& edge_partition_edge_counts, + std::vector> const& edge_partition_intra_partition_segment_offset_vectors, + std::vector> const& + edge_partition_intra_segment_copy_output_displacement_vectors) +{ + static_assert(std::is_arithmetic_v); // otherwise, unimplemented. 
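  // The copy loops below regroup per-chunk, per-segment runs into contiguous
  // per-partition buffers. A worked example of the index arithmetic, assuming
  // 2 input chunks, 1 edge partition, and 2 minor-range segments (all values
  // are illustrative only, not taken from the patch):
  //
  //   chunk 0 run lengths per (partition, segment): [3, 2]
  //   chunk 1 run lengths per (partition, segment): [1, 4]
  //
  //   intra-partition segment offsets (partition 0): [0, 4, 10]
  //     segment 0 holds 3 + 1 = 4 edges, segment 1 holds 2 + 4 = 6 edges
  //   intra-segment copy output displacements (partition 0), indexed by
  //   j * num_chunks + k: [0, 3, 0, 2]
  //     within each segment, chunk 0 writes first and chunk 1 follows it
  //
  //   e.g. chunk 1, segment 1 copies its 4 elements to
  //     output_offset = segment_offset(= 4) + displacement(= 2) = 6
  //   so the resulting partition buffer layout is
  //     [ c0/s0 (3) | c1/s0 (1) | c0/s1 (2) | c1/s1 (4) ]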
+ auto num_chunks = edgelist_elements.size(); + auto num_edge_partitions = edge_partition_edge_counts.size(); + auto num_segments = edge_partition_intra_partition_segment_offset_vectors[0].size() - 1; + for (size_t i = 0; i < edge_partition_intra_partition_segment_offset_vectors.size(); ++i) { + assert(edge_partition_intra_partition_segment_offset_vectors[i].size() == (num_segments + 1)); + } + + std::vector> edge_partition_elements{}; + edge_partition_elements.reserve(num_edge_partitions); + for (size_t i = 0; i < num_edge_partitions; ++i) { + edge_partition_elements.push_back( + rmm::device_uvector(edge_partition_edge_counts[i], handle.get_stream())); + } + + for (size_t i = 0; i < num_edge_partitions; ++i) { + for (size_t j = 0; j < num_segments; ++j) { + for (size_t k = 0; k < num_chunks; ++k) { + auto segment_offset = edgelist_edge_offset_vectors[k][i * num_segments + j]; + auto segment_size = edgelist_edge_offset_vectors[k][i * num_segments + j + 1] - + edgelist_edge_offset_vectors[k][i * num_segments + j]; + auto output_offset = + edge_partition_intra_partition_segment_offset_vectors[i][j] + + edge_partition_intra_segment_copy_output_displacement_vectors[i][j * num_chunks + k]; + thrust::copy(handle.get_thrust_policy(), + edgelist_elements[k].begin() + segment_offset, + edgelist_elements[k].begin() + (segment_offset + segment_size), + edge_partition_elements[i].begin() + output_offset); + } + } + } + edgelist_elements.clear(); + + return edge_partition_elements; +} + +template +void decompress_vertices(raft::handle_t const& handle, + raft::device_span compressed_vertices, + raft::device_span vertices, + size_t compressed_v_size) +{ + auto input_v_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [byte_first = compressed_vertices.begin(), compressed_v_size] __device__(size_t i) { + uint64_t v{0}; + for (size_t j = 0; j < compressed_v_size; ++j) { + auto b = *(byte_first + i * compressed_v_size + j); + v |= static_cast(b) << (8 * j); + } + return static_cast(v); + })); + thrust::copy( + handle.get_thrust_policy(), input_v_first, input_v_first + vertices.size(), vertices.begin()); +} + template >>&& edge_partition_edgelist_weights, std::optional>>&& edge_partition_edgelist_edge_ids, std::optional>>&& edge_partition_edgelist_edge_types, - std::vector> const& edgelist_intra_partition_segment_offsets, + std::vector> const& edgelist_intra_partition_segment_offset_vectors, graph_properties_t graph_properties, bool renumber) { @@ -347,14 +463,14 @@ create_graph_from_partitioned_edgelist( src_ptrs[i] = edge_partition_edgelist_srcs[i].begin(); dst_ptrs[i] = edge_partition_edgelist_dsts[i].begin(); } - auto [renumber_map_labels, meta] = - cugraph::renumber_edgelist(handle, - std::move(local_vertices), - src_ptrs, - dst_ptrs, - edgelist_edge_counts, - edgelist_intra_partition_segment_offsets, - store_transposed); + auto [renumber_map_labels, meta] = cugraph::renumber_edgelist( + handle, + std::move(local_vertices), + src_ptrs, + dst_ptrs, + edgelist_edge_counts, + edgelist_intra_partition_segment_offset_vectors, + store_transposed); auto num_segments_per_vertex_partition = static_cast(meta.edge_partition_segment_offsets.size() / minor_comm_size); @@ -369,7 +485,7 @@ create_graph_from_partitioned_edgelist( if (edge_partition_edgelist_edge_ids) { element_size += sizeof(edge_id_t); } if (edge_partition_edgelist_edge_types) { element_size += sizeof(edge_type_t); } auto constexpr mem_frugal_ratio = - 0.25; // if the expected 
temporary buffer size exceeds the mem_frugal_ratio of the + 0.05; // if the expected temporary buffer size exceeds the mem_frugal_ratio of the // total_global_mem, switch to the memory frugal approach auto mem_frugal_threshold = static_cast(static_cast(total_global_mem / element_size) * mem_frugal_ratio); @@ -684,11 +800,13 @@ create_graph_from_partitioned_edgelist( std::move(edge_partition_offsets), std::move(edge_partition_indices), std::move(edge_partition_dcs_nzd_vertices), - cugraph::graph_meta_t{meta.number_of_vertices, - meta.number_of_edges, - graph_properties, - meta.partition, - meta.edge_partition_segment_offsets}), + cugraph::graph_meta_t{ + meta.number_of_vertices, + meta.number_of_edges, + graph_properties, + meta.partition, + meta.edge_partition_segment_offsets, + meta.edge_partition_hypersparse_degree_offsets}), std::move(edge_weights), std::move(edge_ids), std::move(edge_types), @@ -790,7 +908,7 @@ create_graph_from_edgelist_impl( handle.sync_stream(); std::vector edgelist_edge_counts(minor_comm_size, edge_t{0}); - auto edgelist_intra_partition_segment_offsets = std::vector>( + auto edgelist_intra_partition_segment_offset_vectors = std::vector>( minor_comm_size, std::vector(major_comm_size + 1, edge_t{0})); for (int i = 0; i < minor_comm_size; ++i) { edgelist_edge_counts[i] = std::accumulate(h_edge_counts.begin() + major_comm_size * i, @@ -798,7 +916,7 @@ create_graph_from_edgelist_impl( edge_t{0}); std::partial_sum(h_edge_counts.begin() + major_comm_size * i, h_edge_counts.begin() + major_comm_size * (i + 1), - edgelist_intra_partition_segment_offsets[i].begin() + 1); + edgelist_intra_partition_segment_offset_vectors[i].begin() + 1); } std::vector edgelist_displacements(minor_comm_size, edge_t{0}); std::partial_sum(edgelist_edge_counts.begin(), @@ -898,7 +1016,7 @@ create_graph_from_edgelist_impl( std::move(edge_partition_edgelist_weights), std::move(edge_partition_edgelist_edge_ids), std::move(edge_partition_edgelist_edge_types), - edgelist_intra_partition_segment_offsets, + edgelist_intra_partition_segment_offset_vectors, graph_properties, renumber); } @@ -1021,30 +1139,66 @@ create_graph_from_edgelist_impl( } } - // 1. groupby each edge chunks to their target local adjacency matrix partition (and further + auto num_chunks = edgelist_srcs.size(); + + // 1. 
set whether to temporarily compress vertex IDs or not in splitting edge chunks + + size_t compressed_v_size = + sizeof(vertex_t); // if set to a value smaller than sizeof(vertex_t), temporarily store vertex + // IDs in compressed_v_size byte variables + + static_assert((sizeof(vertex_t) == 4) || (sizeof(vertex_t) == 8)); + if constexpr (sizeof(vertex_t) == 8) { // 64 bit vertex ID + static_assert(std::is_signed_v); // __clzll takes a signed integer + + auto total_global_mem = handle.get_device_properties().totalGlobalMem; + size_t element_size = sizeof(vertex_t) * 2; + if (edgelist_weights) { element_size += sizeof(weight_t); } + if (edgelist_edge_ids) { element_size += sizeof(edge_id_t); } + if (edgelist_edge_types) { element_size += sizeof(edge_type_t); } + edge_t num_edges{0}; + for (size_t i = 0; i < edgelist_srcs.size(); ++i) { + num_edges += edgelist_srcs[i].size(); + } + bool compress{false}; + if (static_cast(num_edges) * element_size > + static_cast(total_global_mem * 0.5 /* tuning parameter */)) { + compress = true; + } + + if (compress) { + size_t min_clz{sizeof(vertex_t) * 8}; + for (size_t i = 0; i < num_chunks; ++i) { + min_clz = + thrust::transform_reduce(handle.get_thrust_policy(), + edgelist_srcs[i].begin(), + edgelist_srcs[i].end(), + cuda::proclaim_return_type([] __device__(auto v) { + return static_cast(__clzll(v)); + }), + min_clz, + thrust::minimum{}); + min_clz = + thrust::transform_reduce(handle.get_thrust_policy(), + edgelist_dsts[i].begin(), + edgelist_dsts[i].end(), + cuda::proclaim_return_type([] __device__(auto v) { + return static_cast(__clzll(v)); + }), + min_clz, + thrust::minimum{}); + } + compressed_v_size = sizeof(vertex_t) - (min_clz / 8); + compressed_v_size = std::max(compressed_v_size, size_t{1}); + } + } + + // 2. groupby each edge chunks to their target local adjacency matrix partition (and further // groupby within the local partition by applying the compute_gpu_id_from_vertex_t to minor vertex // IDs). - std::vector>> edgelist_partitioned_srcs( - edgelist_srcs.size()); - std::vector>> edgelist_partitioned_dsts( - edgelist_srcs.size()); - auto edgelist_partitioned_weights = - edgelist_weights ? std::make_optional>>>( - edgelist_srcs.size()) - : std::nullopt; - auto edgelist_partitioned_edge_ids = - edgelist_edge_ids - ? std::make_optional>>>( - edgelist_srcs.size()) - : std::nullopt; - auto edgelist_partitioned_edge_types = - edgelist_edge_types - ? 
std::make_optional>>>( - edgelist_srcs.size()) - : std::nullopt; - - for (size_t i = 0; i < edgelist_srcs.size(); ++i) { // iterate over input edge chunks + std::vector> edgelist_edge_offset_vectors(num_chunks); + for (size_t i = 0; i < num_chunks; ++i) { // iterate over input edge chunks std::optional> this_chunk_weights{std::nullopt}; if (edgelist_weights) { this_chunk_weights = std::move((*edgelist_weights)[i]); } std::optional> this_chunk_edge_ids{std::nullopt}; @@ -1060,6 +1214,9 @@ create_graph_from_edgelist_impl( this_chunk_edge_ids, this_chunk_edge_types, true); + if (this_chunk_weights) { (*edgelist_weights)[i] = std::move(*this_chunk_weights); } + if (this_chunk_edge_ids) { (*edgelist_edge_ids)[i] = std::move(*this_chunk_edge_ids); } + if (this_chunk_edge_types) { (*edgelist_edge_types)[i] = std::move(*this_chunk_edge_types); } std::vector h_this_chunk_edge_counts(d_this_chunk_edge_counts.size()); raft::update_host(h_this_chunk_edge_counts.data(), @@ -1067,132 +1224,84 @@ create_graph_from_edgelist_impl( d_this_chunk_edge_counts.size(), handle.get_stream()); handle.sync_stream(); - std::vector h_this_chunk_edge_displacements(h_this_chunk_edge_counts.size()); - std::exclusive_scan(h_this_chunk_edge_counts.begin(), + std::vector h_this_chunk_edge_offsets( + h_this_chunk_edge_counts.size() + 1, + 0); // size = minor_comm_size (# local edge partitions) * major_comm_size (# segments in the + // local minor range) + std::inclusive_scan(h_this_chunk_edge_counts.begin(), h_this_chunk_edge_counts.end(), - h_this_chunk_edge_displacements.begin(), - size_t{0}); - - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_srcs(h_this_chunk_edge_counts[j], handle.get_stream()); - auto input_first = edgelist_srcs[i].begin() + h_this_chunk_edge_displacements[j]; - thrust::copy( - handle.get_thrust_policy(), input_first, input_first + tmp_srcs.size(), tmp_srcs.begin()); - edgelist_partitioned_srcs[i].push_back(std::move(tmp_srcs)); - } - edgelist_srcs[i].resize(0, handle.get_stream()); - edgelist_srcs[i].shrink_to_fit(handle.get_stream()); - - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_dsts(h_this_chunk_edge_counts[j], handle.get_stream()); - auto input_first = edgelist_dsts[i].begin() + h_this_chunk_edge_displacements[j]; - thrust::copy( - handle.get_thrust_policy(), input_first, input_first + tmp_dsts.size(), tmp_dsts.begin()); - edgelist_partitioned_dsts[i].push_back(std::move(tmp_dsts)); - } - edgelist_dsts[i].resize(0, handle.get_stream()); - edgelist_dsts[i].shrink_to_fit(handle.get_stream()); - - if (this_chunk_weights) { - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_weights(h_this_chunk_edge_counts[j], handle.get_stream()); - auto input_first = (*this_chunk_weights).begin() + h_this_chunk_edge_displacements[j]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + tmp_weights.size(), - tmp_weights.begin()); - (*edgelist_partitioned_weights)[i].push_back(std::move(tmp_weights)); - } - (*this_chunk_weights).resize(0, handle.get_stream()); - (*this_chunk_weights).shrink_to_fit(handle.get_stream()); - } - - if (this_chunk_edge_ids) { - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # 
segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_edge_ids(h_this_chunk_edge_counts[j], - handle.get_stream()); - auto input_first = (*this_chunk_edge_ids).begin() + h_this_chunk_edge_displacements[j]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + tmp_edge_ids.size(), - tmp_edge_ids.begin()); - (*edgelist_partitioned_edge_ids)[i].push_back(std::move(tmp_edge_ids)); - } - (*this_chunk_edge_ids).resize(0, handle.get_stream()); - (*this_chunk_edge_ids).shrink_to_fit(handle.get_stream()); - } + h_this_chunk_edge_offsets.begin() + 1); + edgelist_edge_offset_vectors[i] = std::move(h_this_chunk_edge_offsets); + } - if (this_chunk_edge_types) { - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_edge_types(h_this_chunk_edge_counts[j], - handle.get_stream()); - auto input_first = (*this_chunk_edge_types).begin() + h_this_chunk_edge_displacements[j]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + tmp_edge_types.size(), - tmp_edge_types.begin()); - (*edgelist_partitioned_edge_types)[i].push_back(std::move(tmp_edge_types)); - } - (*this_chunk_edge_types).resize(0, handle.get_stream()); - (*this_chunk_edge_types).shrink_to_fit(handle.get_stream()); + // 3. compress edge chunk source/destination vertices to cut intermediate peak memory requirement + + std::optional>> edgelist_compressed_srcs{std::nullopt}; + std::optional>> edgelist_compressed_dsts{std::nullopt}; + if (compressed_v_size < sizeof(vertex_t)) { + edgelist_compressed_srcs = std::vector>{}; + edgelist_compressed_dsts = std::vector>{}; + (*edgelist_compressed_srcs).reserve(num_chunks); + (*edgelist_compressed_dsts).reserve(num_chunks); + for (size_t i = 0; i < num_chunks; ++i) { // iterate over input edge chunks + // compress source values + auto tmp_srcs = rmm::device_uvector(edgelist_srcs[i].size() * compressed_v_size, + handle.get_stream()); + auto input_src_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [src_first = edgelist_srcs[i].begin(), compressed_v_size] __device__(size_t i) { + auto v = static_cast(*(src_first + (i / compressed_v_size))); + return static_cast((v >> (8 * (i % compressed_v_size))) & uint64_t{0xff}); + })); + thrust::copy(handle.get_thrust_policy(), + input_src_first, + input_src_first + edgelist_srcs[i].size() * compressed_v_size, + tmp_srcs.begin()); + edgelist_srcs[i].resize(0, handle.get_stream()); + edgelist_srcs[i].shrink_to_fit(handle.get_stream()); + (*edgelist_compressed_srcs).push_back(std::move(tmp_srcs)); + + // compress destination values + + auto tmp_dsts = rmm::device_uvector(edgelist_dsts[i].size() * compressed_v_size, + handle.get_stream()); + auto input_dst_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [dst_first = edgelist_dsts[i].begin(), compressed_v_size] __device__(size_t i) { + auto v = static_cast(*(dst_first + (i / compressed_v_size))); + return static_cast((v >> (8 * (i % compressed_v_size))) & uint64_t{0xff}); + })); + thrust::copy(handle.get_thrust_policy(), + input_dst_first, + input_dst_first + edgelist_dsts[i].size() * compressed_v_size, + tmp_dsts.begin()); + edgelist_dsts[i].resize(0, handle.get_stream()); + edgelist_dsts[i].shrink_to_fit(handle.get_stream()); + (*edgelist_compressed_dsts).push_back(std::move(tmp_dsts)); } } - edgelist_srcs.clear(); - 
edgelist_dsts.clear(); - if (edgelist_weights) { (*edgelist_weights).clear(); } - if (edgelist_edge_ids) { (*edgelist_edge_ids).clear(); } - if (edgelist_edge_types) { (*edgelist_edge_types).clear(); } - // 2. split the grouped edge chunks to local partitions + // 4. compute additional copy_offset vectors - auto edgelist_intra_partition_segment_offsets = std::vector>(minor_comm_size); - - std::vector> edge_partition_edgelist_srcs{}; - edge_partition_edgelist_srcs.reserve(minor_comm_size); - std::vector> edge_partition_edgelist_dsts{}; - edge_partition_edgelist_dsts.reserve(minor_comm_size); - auto edge_partition_edgelist_weights = - edgelist_partitioned_weights ? std::make_optional>>() - : std::nullopt; - if (edgelist_partitioned_weights) { (*edge_partition_edgelist_weights).reserve(minor_comm_size); } - auto edge_partition_edgelist_edge_ids = - edgelist_partitioned_edge_ids - ? std::make_optional>>() - : std::nullopt; - if (edgelist_partitioned_edge_ids) { - (*edge_partition_edgelist_edge_ids).reserve(minor_comm_size); - } - auto edge_partition_edgelist_edge_types = - edgelist_partitioned_edge_types - ? std::make_optional>>() - : std::nullopt; - if (edgelist_partitioned_edge_types) { - (*edge_partition_edgelist_edge_types).reserve(minor_comm_size); - } - - for (int i = 0; i < minor_comm_size; ++i) { // iterate over local edge partitions + std::vector edge_partition_edge_counts(minor_comm_size); + std::vector> edge_partition_intra_partition_segment_offset_vectors( + minor_comm_size); + std::vector> edge_partition_intra_segment_copy_output_displacement_vectors( + minor_comm_size); + for (int i = 0; i < minor_comm_size; ++i) { edge_t edge_count{0}; std::vector intra_partition_segment_sizes(major_comm_size, 0); - std::vector intra_segment_copy_output_displacements(major_comm_size * - edgelist_partitioned_srcs.size()); + std::vector intra_segment_copy_output_displacements(major_comm_size * num_chunks); for (int j = 0; j < major_comm_size /* # segments in the local minor range */; ++j) { edge_t displacement{0}; - for (size_t k = 0; k < edgelist_partitioned_srcs.size() /* # input edge chunks */; ++k) { - auto segment_size = edgelist_partitioned_srcs[k][i * major_comm_size + j].size(); + for (size_t k = 0; k < num_chunks; ++k) { + auto segment_size = edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - + edgelist_edge_offset_vectors[k][i * major_comm_size + j]; edge_count += segment_size; intra_partition_segment_sizes[j] += segment_size; - intra_segment_copy_output_displacements[j * edgelist_partitioned_srcs.size() + k] = - displacement; + intra_segment_copy_output_displacements[j * num_chunks + k] = displacement; displacement += segment_size; } } @@ -1201,93 +1310,133 @@ create_graph_from_edgelist_impl( intra_partition_segment_sizes.end(), intra_partition_segment_offsets.begin() + 1); - rmm::device_uvector tmp_srcs(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_srcs.size(); ++k) { - auto& input_buffer = edgelist_partitioned_srcs[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_srcs.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_srcs.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); - } - } - edge_partition_edgelist_srcs.push_back(std::move(tmp_srcs)); + edge_partition_edge_counts[i] = edge_count; + 
edge_partition_intra_partition_segment_offset_vectors[i] = + std::move(intra_partition_segment_offsets); + edge_partition_intra_segment_copy_output_displacement_vectors[i] = + std::move(intra_segment_copy_output_displacements); + } - rmm::device_uvector tmp_dsts(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { - auto& input_buffer = edgelist_partitioned_dsts[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_dsts.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); - } - } - edge_partition_edgelist_dsts.push_back(std::move(tmp_dsts)); + // 5. split the grouped edge chunks to local partitions - if (edge_partition_edgelist_weights) { - rmm::device_uvector tmp_weights(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { - auto& input_buffer = (*edgelist_partitioned_weights)[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_weights.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); - } - } - (*edge_partition_edgelist_weights).push_back(std::move(tmp_weights)); - } + std::vector> edge_partition_edgelist_srcs{}; + std::vector> edge_partition_edgelist_dsts{}; + std::optional>> edge_partition_edgelist_weights{ + std::nullopt}; + std::optional>> edge_partition_edgelist_edge_ids{ + std::nullopt}; + std::optional>> edge_partition_edgelist_edge_types{ + std::nullopt}; - if (edge_partition_edgelist_edge_ids) { - rmm::device_uvector tmp_edge_ids(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { - auto& input_buffer = (*edgelist_partitioned_edge_ids)[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_edge_ids.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); - } - } - (*edge_partition_edgelist_edge_ids).push_back(std::move(tmp_edge_ids)); - } + std::optional>> + edge_partition_edgelist_compressed_srcs{}; + std::optional>> + edge_partition_edgelist_compressed_dsts{}; - if (edge_partition_edgelist_edge_types) { - rmm::device_uvector tmp_edge_types(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { - auto& input_buffer = (*edgelist_partitioned_edge_types)[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_edge_types.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); - } - } - 
(*edge_partition_edgelist_edge_types).push_back(std::move(tmp_edge_types)); - } + if (compressed_v_size < sizeof(vertex_t)) { + edge_partition_edgelist_compressed_srcs = + split_edge_chunk_compressed_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_compressed_srcs), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors, + compressed_v_size); + + edge_partition_edgelist_compressed_dsts = + split_edge_chunk_compressed_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_compressed_dsts), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors, + compressed_v_size); + } else { + edge_partition_edgelist_srcs = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(edgelist_srcs), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + + edge_partition_edgelist_dsts = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(edgelist_dsts), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + } + + if (edgelist_weights) { + edge_partition_edgelist_weights = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_weights), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + } + if (edgelist_edge_ids) { + edge_partition_edgelist_edge_ids = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_edge_ids), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + } + if (edgelist_edge_types) { + edge_partition_edgelist_edge_types = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_edge_types), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + } - edgelist_intra_partition_segment_offsets[i] = std::move(intra_partition_segment_offsets); + // 6. 
decompress edge chunk source/destination vertices to cut intermediate peak memory + // requirement + + if (compressed_v_size < sizeof(vertex_t)) { + assert(edge_partition_edgelist_compressed_srcs); + assert(edge_partition_edgelist_compressed_dsts); + + edge_partition_edgelist_srcs.reserve(minor_comm_size); + edge_partition_edgelist_dsts.reserve(minor_comm_size); + + for (int i = 0; i < minor_comm_size; ++i) { + rmm::device_uvector tmp_srcs(edge_partition_edge_counts[i], handle.get_stream()); + decompress_vertices( + handle, + raft::device_span((*edge_partition_edgelist_compressed_srcs)[i].data(), + (*edge_partition_edgelist_compressed_srcs)[i].size()), + raft::device_span(tmp_srcs.data(), tmp_srcs.size()), + compressed_v_size); + edge_partition_edgelist_srcs.push_back(std::move(tmp_srcs)); + (*edge_partition_edgelist_compressed_srcs)[i].resize(0, handle.get_stream()); + (*edge_partition_edgelist_compressed_srcs)[i].shrink_to_fit(handle.get_stream()); + + rmm::device_uvector tmp_dsts(edge_partition_edge_counts[i], handle.get_stream()); + decompress_vertices( + handle, + raft::device_span((*edge_partition_edgelist_compressed_dsts)[i].data(), + (*edge_partition_edgelist_compressed_dsts)[i].size()), + raft::device_span(tmp_dsts.data(), tmp_dsts.size()), + compressed_v_size); + edge_partition_edgelist_dsts.push_back(std::move(tmp_dsts)); + (*edge_partition_edgelist_compressed_dsts)[i].resize(0, handle.get_stream()); + (*edge_partition_edgelist_compressed_dsts)[i].shrink_to_fit(handle.get_stream()); + } } return create_graph_from_partitioned_edgelist(edgelist_srcs.data(), edgelist_srcs.size()), raft::device_span(edgelist_dsts.data(), edgelist_dsts.size()))), - "Invalid input arguments: graph_properties.is_symmetric is true but the input edge list is " + "Invalid input arguments: graph_properties.is_symmetric is true but the input edge " + "list is " "not symmetric."); } @@ -1377,7 +1527,8 @@ create_graph_from_edgelist_impl( handle, raft::device_span(edgelist_srcs.data(), edgelist_srcs.size()), raft::device_span(edgelist_dsts.data(), edgelist_dsts.size())), - "Invalid input arguments: graph_properties.is_multigraph is false but the input edge list " + "Invalid input arguments: graph_properties.is_multigraph is false but the input edge " + "list " "has parallel edges."); } } @@ -1605,7 +1756,8 @@ create_graph_from_edgelist_impl( cugraph::graph_meta_t{ num_vertices, graph_properties, - renumber ? std::optional>{meta.segment_offsets} : std::nullopt}), + renumber ? 
std::optional>{meta.segment_offsets} : std::nullopt, + meta.hypersparse_degree_offsets}), std::move(edge_weights), std::move(edge_ids), std::move(edge_types), @@ -1759,15 +1911,15 @@ create_graph_from_edgelist_impl( renumber); if (graph_properties.is_symmetric) { - CUGRAPH_EXPECTS( - (check_symmetric( - handle, - raft::device_span(aggregate_edgelist_srcs.data(), - aggregate_edgelist_srcs.size()), - raft::device_span(aggregate_edgelist_dsts.data(), - aggregate_edgelist_dsts.size()))), - "Invalid input arguments: graph_properties.is_symmetric is true but the input edge list is " - "not symmetric."); + CUGRAPH_EXPECTS((check_symmetric( + handle, + raft::device_span(aggregate_edgelist_srcs.data(), + aggregate_edgelist_srcs.size()), + raft::device_span(aggregate_edgelist_dsts.data(), + aggregate_edgelist_dsts.size()))), + "Invalid input arguments: graph_properties.is_symmetric is true but the " + "input edge list is " + "not symmetric."); } if (!graph_properties.is_multigraph) { @@ -1777,7 +1929,8 @@ create_graph_from_edgelist_impl( aggregate_edgelist_srcs.size()), raft::device_span(aggregate_edgelist_dsts.data(), aggregate_edgelist_dsts.size())), - "Invalid input arguments: graph_properties.is_multigraph is false but the input edge list " + "Invalid input arguments: graph_properties.is_multigraph is false but " + "the input edge list " "has parallel edges."); } } diff --git a/cpp/src/structure/detail/structure_utils.cuh b/cpp/src/structure/detail/structure_utils.cuh index 1ef975c1de..86e3c45ca2 100644 --- a/cpp/src/structure/detail/structure_utils.cuh +++ b/cpp/src/structure/detail/structure_utils.cuh @@ -60,7 +60,8 @@ rmm::device_uvector compute_sparse_offsets( bool edgelist_major_sorted, rmm::cuda_stream_view stream_view) { - rmm::device_uvector offsets((major_range_last - major_range_first) + 1, stream_view); + rmm::device_uvector offsets(static_cast(major_range_last - major_range_first) + 1, + stream_view); if (edgelist_major_sorted) { offsets.set_element_to_zero_async(0, stream_view); thrust::upper_bound(rmm::exec_policy(stream_view), @@ -77,7 +78,9 @@ rmm::device_uvector compute_sparse_offsets( edgelist_major_first, edgelist_major_last, [offset_view, major_range_first] __device__(auto v) { - atomicAdd(&offset_view[v - major_range_first], edge_t{1}); + cuda::atomic_ref atomic_counter( + offset_view[v - major_range_first]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); }); thrust::exclusive_scan( @@ -246,30 +249,112 @@ sort_and_compress_edgelist(rmm::device_uvector&& edgelist_srcs, rmm::device_uvector offsets(0, stream_view); rmm::device_uvector indices(0, stream_view); - auto edge_first = thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_minors.begin()); if (edgelist_minors.size() > mem_frugal_threshold) { - offsets = compute_sparse_offsets(edgelist_majors.begin(), - edgelist_majors.end(), - major_range_first, - major_range_last, - false, - stream_view); + static_assert((sizeof(vertex_t) == 4) || (sizeof(vertex_t) == 8)); + if ((sizeof(vertex_t) == 8) && (static_cast(major_range_last - major_range_first) <= + static_cast(std::numeric_limits::max()))) { + rmm::device_uvector edgelist_major_offsets(edgelist_majors.size(), stream_view); + thrust::transform( + rmm::exec_policy_nosync(stream_view), + edgelist_majors.begin(), + edgelist_majors.end(), + edgelist_major_offsets.begin(), + cuda::proclaim_return_type([major_range_first] __device__(vertex_t major) { + return static_cast(major - major_range_first); + })); + edgelist_majors.resize(0, stream_view); + 
edgelist_majors.shrink_to_fit(stream_view); + + offsets = + compute_sparse_offsets(edgelist_major_offsets.begin(), + edgelist_major_offsets.end(), + uint32_t{0}, + static_cast(major_range_last - major_range_first), + false, + stream_view); + std::array pivots{}; + for (size_t i = 0; i < 3; ++i) { + pivots[i] = static_cast(thrust::distance( + offsets.begin(), + thrust::lower_bound(rmm::exec_policy(stream_view), + offsets.begin(), + offsets.end(), + static_cast((edgelist_major_offsets.size() * (i + 1)) / 4)))); + } - auto pivot = major_range_first + static_cast(thrust::distance( - offsets.begin(), - thrust::lower_bound(rmm::exec_policy(stream_view), - offsets.begin(), - offsets.end(), - edgelist_minors.size() / 2))); - auto second_first = - detail::mem_frugal_partition(edge_first, - edge_first + edgelist_minors.size(), - thrust_tuple_get, 0>{}, - pivot, - stream_view); - thrust::sort(rmm::exec_policy(stream_view), edge_first, second_first); - thrust::sort(rmm::exec_policy(stream_view), second_first, edge_first + edgelist_minors.size()); + auto pair_first = + thrust::make_zip_iterator(edgelist_major_offsets.begin(), edgelist_minors.begin()); + auto second_half_first = + detail::mem_frugal_partition(pair_first, + pair_first + edgelist_major_offsets.size(), + thrust_tuple_get, 0>{}, + pivots[1], + stream_view); + auto second_quarter_first = + detail::mem_frugal_partition(pair_first, + second_half_first, + thrust_tuple_get, 0>{}, + pivots[0], + stream_view); + auto last_quarter_first = + detail::mem_frugal_partition(second_half_first, + pair_first + edgelist_major_offsets.size(), + thrust_tuple_get, 0>{}, + pivots[2], + stream_view); + thrust::sort(rmm::exec_policy(stream_view), pair_first, second_quarter_first); + thrust::sort(rmm::exec_policy(stream_view), second_quarter_first, second_half_first); + thrust::sort(rmm::exec_policy(stream_view), second_half_first, last_quarter_first); + thrust::sort(rmm::exec_policy(stream_view), + last_quarter_first, + pair_first + edgelist_major_offsets.size()); + } else { + offsets = compute_sparse_offsets(edgelist_majors.begin(), + edgelist_majors.end(), + major_range_first, + major_range_last, + false, + stream_view); + std::array pivots{}; + for (size_t i = 0; i < 3; ++i) { + pivots[i] = + major_range_first + + static_cast(thrust::distance( + offsets.begin(), + thrust::lower_bound(rmm::exec_policy(stream_view), + offsets.begin(), + offsets.end(), + static_cast((edgelist_minors.size() * (i + 1)) / 4)))); + } + auto edge_first = thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_minors.begin()); + auto second_half_first = + detail::mem_frugal_partition(edge_first, + edge_first + edgelist_majors.size(), + thrust_tuple_get, 0>{}, + pivots[1], + stream_view); + auto second_quarter_first = + detail::mem_frugal_partition(edge_first, + second_half_first, + thrust_tuple_get, 0>{}, + pivots[0], + stream_view); + auto last_quarter_first = + detail::mem_frugal_partition(second_half_first, + edge_first + edgelist_majors.size(), + thrust_tuple_get, 0>{}, + pivots[2], + stream_view); + thrust::sort(rmm::exec_policy(stream_view), edge_first, second_quarter_first); + thrust::sort(rmm::exec_policy(stream_view), second_quarter_first, second_half_first); + thrust::sort(rmm::exec_policy(stream_view), second_half_first, last_quarter_first); + thrust::sort( + rmm::exec_policy(stream_view), last_quarter_first, edge_first + edgelist_majors.size()); + edgelist_majors.resize(0, stream_view); + edgelist_majors.shrink_to_fit(stream_view); + } } else { + auto edge_first = 
thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_minors.begin()); thrust::sort(rmm::exec_policy(stream_view), edge_first, edge_first + edgelist_minors.size()); offsets = compute_sparse_offsets(edgelist_majors.begin(), edgelist_majors.end(), @@ -277,12 +362,11 @@ sort_and_compress_edgelist(rmm::device_uvector&& edgelist_srcs, major_range_last, true, stream_view); + edgelist_majors.resize(0, stream_view); + edgelist_majors.shrink_to_fit(stream_view); } indices = std::move(edgelist_minors); - edgelist_majors.resize(0, stream_view); - edgelist_majors.shrink_to_fit(stream_view); - std::optional> dcs_nzd_vertices{std::nullopt}; if (major_hypersparse_first) { std::tie(offsets, dcs_nzd_vertices) = compress_hypersparse_offsets(std::move(offsets), diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index ef43b7b13e..6661f0488d 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -146,8 +146,7 @@ update_local_sorted_unique_edge_majors_minors( auto num_segments_per_vertex_partition = static_cast(meta.edge_partition_segment_offsets.size() / minor_comm_size); - auto use_dcs = - num_segments_per_vertex_partition > (detail::num_sparse_segments_per_vertex_partition + 2); + auto use_dcs = edge_partition_dcs_nzd_vertices.has_value(); std::optional>> local_sorted_unique_edge_majors{ std::nullopt}; @@ -166,14 +165,15 @@ update_local_sorted_unique_edge_majors_minors( // 1. Update local_sorted_unique_edge_minors & local_sorted_unique_edge_minor_offsets - { + if (detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold > 0.0) { auto [minor_range_first, minor_range_last] = meta.partition.local_edge_partition_minor_range(); auto minor_range_size = meta.partition.local_edge_partition_minor_range_size(); - rmm::device_uvector minor_bitmaps( - (minor_range_size + (sizeof(uint32_t) * 8 - 1)) / (sizeof(uint32_t) * 8), - handle.get_stream()); - thrust::fill( - handle.get_thrust_policy(), minor_bitmaps.begin(), minor_bitmaps.end(), uint32_t{0}); + rmm::device_uvector minor_bitmaps(packed_bool_size(minor_range_size), + handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + minor_bitmaps.begin(), + minor_bitmaps.end(), + packed_bool_empty_mask()); for (size_t i = 0; i < edge_partition_indices.size(); ++i) { thrust::for_each(handle.get_thrust_policy(), edge_partition_indices[i].begin(), @@ -281,92 +281,96 @@ update_local_sorted_unique_edge_majors_minors( // 2. 
Update local_sorted_unique_edge_majors & local_sorted_unique_edge_major_offsets - std::vector num_local_unique_edge_major_counts(edge_partition_offsets.size()); - for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { - num_local_unique_edge_major_counts[i] += thrust::count_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(edge_partition_offsets[i].size() - 1)), - has_nzd_t{edge_partition_offsets[i].data(), vertex_t{0}}); - } - auto num_local_unique_edge_majors = std::reduce(num_local_unique_edge_major_counts.begin(), - num_local_unique_edge_major_counts.end()); - - vertex_t aggregate_major_range_size{0}; - for (size_t i = 0; i < meta.partition.number_of_local_edge_partitions(); ++i) { - aggregate_major_range_size += meta.partition.local_edge_partition_major_range_size(i); - } - - auto max_major_properties_fill_ratio = - host_scalar_allreduce(comm, - static_cast(num_local_unique_edge_majors) / - static_cast(aggregate_major_range_size), - raft::comms::op_t::MAX, - handle.get_stream()); + if (detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold > 0.0) { + std::vector num_local_unique_edge_major_counts(edge_partition_offsets.size()); + for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { + num_local_unique_edge_major_counts[i] = thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(static_cast(edge_partition_offsets[i].size() - 1)), + has_nzd_t{edge_partition_offsets[i].data(), vertex_t{0}}); + } + auto num_local_unique_edge_majors = std::reduce(num_local_unique_edge_major_counts.begin(), + num_local_unique_edge_major_counts.end()); - if (max_major_properties_fill_ratio < - detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold) { - auto const chunk_size = - static_cast(std::min(1.0 / max_major_properties_fill_ratio, 1024.0)); + vertex_t aggregate_major_range_size{0}; + for (size_t i = 0; i < meta.partition.number_of_local_edge_partitions(); ++i) { + aggregate_major_range_size += meta.partition.local_edge_partition_major_range_size(i); + } - local_sorted_unique_edge_majors = std::vector>{}; - local_sorted_unique_edge_major_chunk_start_offsets = - std::vector>{}; + auto max_major_properties_fill_ratio = + host_scalar_allreduce(comm, + static_cast(num_local_unique_edge_majors) / + static_cast(aggregate_major_range_size), + raft::comms::op_t::MAX, + handle.get_stream()); - (*local_sorted_unique_edge_majors).reserve(edge_partition_offsets.size()); - (*local_sorted_unique_edge_major_chunk_start_offsets).reserve(edge_partition_offsets.size()); - for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { - auto [major_range_first, major_range_last] = - meta.partition.local_edge_partition_major_range(i); - auto sparse_range_last = - use_dcs - ? 
(major_range_first + - meta.edge_partition_segment_offsets[num_segments_per_vertex_partition * i + + if (max_major_properties_fill_ratio < + detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold) { + auto const chunk_size = + static_cast(std::min(1.0 / max_major_properties_fill_ratio, 1024.0)); + + local_sorted_unique_edge_majors = std::vector>{}; + local_sorted_unique_edge_major_chunk_start_offsets = + std::vector>{}; + + (*local_sorted_unique_edge_majors).reserve(edge_partition_offsets.size()); + (*local_sorted_unique_edge_major_chunk_start_offsets).reserve(edge_partition_offsets.size()); + for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { + auto [major_range_first, major_range_last] = + meta.partition.local_edge_partition_major_range(i); + auto sparse_range_last = + use_dcs + ? (major_range_first + + meta + .edge_partition_segment_offsets[num_segments_per_vertex_partition * i + detail::num_sparse_segments_per_vertex_partition]) - : major_range_last; - - rmm::device_uvector unique_edge_majors(num_local_unique_edge_major_counts[i], - handle.get_stream()); - CUGRAPH_EXPECTS( - sparse_range_last - major_range_first < std::numeric_limits::max(), - "copy_if will fail (https://github.com/NVIDIA/thrust/issues/1302), work-around required."); - auto cur_size = thrust::distance( - unique_edge_majors.begin(), - thrust::copy_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(major_range_first), - thrust::make_counting_iterator(sparse_range_last), + : major_range_last; + + rmm::device_uvector unique_edge_majors(num_local_unique_edge_major_counts[i], + handle.get_stream()); + CUGRAPH_EXPECTS(sparse_range_last - major_range_first < std::numeric_limits::max(), + "copy_if will fail (https://github.com/NVIDIA/thrust/issues/1302), " + "work-around required."); + auto cur_size = thrust::distance( unique_edge_majors.begin(), - has_nzd_t{edge_partition_offsets[i].data(), major_range_first})); - if (use_dcs) { - thrust::copy(handle.get_thrust_policy(), - (*edge_partition_dcs_nzd_vertices)[i].begin(), - (*edge_partition_dcs_nzd_vertices)[i].end(), - unique_edge_majors.begin() + cur_size); + thrust::copy_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(major_range_first), + thrust::make_counting_iterator(sparse_range_last), + unique_edge_majors.begin(), + has_nzd_t{edge_partition_offsets[i].data(), major_range_first})); + if (use_dcs) { + thrust::copy(handle.get_thrust_policy(), + (*edge_partition_dcs_nzd_vertices)[i].begin(), + (*edge_partition_dcs_nzd_vertices)[i].end(), + unique_edge_majors.begin() + cur_size); + } + + auto num_chunks = static_cast( + ((major_range_last - major_range_first) + (chunk_size - size_t{1})) / chunk_size); + rmm::device_uvector unique_edge_major_chunk_start_offsets(num_chunks + size_t{1}, + handle.get_stream()); + + auto chunk_start_vertex_first = + thrust::make_transform_iterator(thrust::make_counting_iterator(vertex_t{0}), + detail::multiply_and_add_t{ + static_cast(chunk_size), major_range_first}); + thrust::lower_bound(handle.get_thrust_policy(), + unique_edge_majors.begin(), + unique_edge_majors.end(), + chunk_start_vertex_first, + chunk_start_vertex_first + num_chunks, + unique_edge_major_chunk_start_offsets.begin()); + unique_edge_major_chunk_start_offsets.set_element( + num_chunks, static_cast(unique_edge_majors.size()), handle.get_stream()); + + (*local_sorted_unique_edge_majors).push_back(std::move(unique_edge_majors)); + (*local_sorted_unique_edge_major_chunk_start_offsets) + 
.push_back(std::move(unique_edge_major_chunk_start_offsets)); } - - auto num_chunks = static_cast( - ((major_range_last - major_range_first) + (chunk_size - size_t{1})) / chunk_size); - rmm::device_uvector unique_edge_major_chunk_start_offsets(num_chunks + size_t{1}, - handle.get_stream()); - - auto chunk_start_vertex_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(vertex_t{0}), - detail::multiply_and_add_t{static_cast(chunk_size), major_range_first}); - thrust::lower_bound(handle.get_thrust_policy(), - unique_edge_majors.begin(), - unique_edge_majors.end(), - chunk_start_vertex_first, - chunk_start_vertex_first + num_chunks, - unique_edge_major_chunk_start_offsets.begin()); - unique_edge_major_chunk_start_offsets.set_element( - num_chunks, static_cast(unique_edge_majors.size()), handle.get_stream()); - - (*local_sorted_unique_edge_majors).push_back(std::move(unique_edge_majors)); - (*local_sorted_unique_edge_major_chunk_start_offsets) - .push_back(std::move(unique_edge_major_chunk_start_offsets)); + local_sorted_unique_edge_major_chunk_size = chunk_size; } - local_sorted_unique_edge_major_chunk_size = chunk_size; } return std::make_tuple(std::move(local_sorted_unique_edge_majors), @@ -378,6 +382,50 @@ update_local_sorted_unique_edge_majors_minors( std::move(local_sorted_unique_edge_minor_vertex_partition_offsets)); } +template +std::enable_if_t>> +compute_edge_partition_dcs_nzd_range_bitmaps( + raft::handle_t const& handle, + graph_meta_t const& meta, + std::vector> const& edge_partition_dcs_nzd_vertices) +{ + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + auto num_segments_per_vertex_partition = + static_cast(meta.edge_partition_segment_offsets.size() / minor_comm_size); + + std::vector> edge_partition_dcs_nzd_range_bitmaps{}; + edge_partition_dcs_nzd_range_bitmaps.reserve(edge_partition_dcs_nzd_vertices.size()); + for (size_t i = 0; i < edge_partition_dcs_nzd_vertices.size(); ++i) { + raft::host_span segment_offsets( + meta.edge_partition_segment_offsets.data() + num_segments_per_vertex_partition * i, + num_segments_per_vertex_partition); + rmm::device_uvector bitmap( + packed_bool_size(segment_offsets[detail::num_sparse_segments_per_vertex_partition + 1] - + segment_offsets[detail::num_sparse_segments_per_vertex_partition]), + handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); + auto major_range_first = meta.partition.local_edge_partition_major_range_first(i); + auto major_hypersparse_first = + major_range_first + segment_offsets[detail::num_sparse_segments_per_vertex_partition]; + thrust::for_each(handle.get_thrust_policy(), + edge_partition_dcs_nzd_vertices[i].begin(), + edge_partition_dcs_nzd_vertices[i].end(), + [bitmap = raft::device_span(bitmap.data(), bitmap.size()), + major_hypersparse_first] __device__(auto major) { + auto offset = major - major_hypersparse_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(offset)]); + word.fetch_or(packed_bool_mask(offset), cuda::std::memory_order_relaxed); + }); + edge_partition_dcs_nzd_range_bitmaps.push_back(std::move(bitmap)); + } + + return edge_partition_dcs_nzd_range_bitmaps; +} + } // namespace template @@ -400,7 +448,8 @@ graph_t @@ -452,7 +506,8 @@ graph_t(indices.size()), meta.properties), offsets_(std::move(offsets)), indices_(std::move(indices)), - segment_offsets_(meta.segment_offsets) + 
segment_offsets_(meta.segment_offsets), + hypersparse_degree_offsets_(meta.hypersparse_degree_offsets) { } diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index f925a14273..31de9b1e5d 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -488,14 +488,18 @@ graph_view_t> const& edge_partition_indices, std::optional>> const& edge_partition_dcs_nzd_vertices, + std::optional>> const& + edge_partition_dcs_nzd_range_bitmaps, graph_view_meta_t meta) : detail::graph_base_t( meta.number_of_vertices, meta.number_of_edges, meta.properties), edge_partition_offsets_(edge_partition_offsets), edge_partition_indices_(edge_partition_indices), edge_partition_dcs_nzd_vertices_(edge_partition_dcs_nzd_vertices), + edge_partition_dcs_nzd_range_bitmaps_(edge_partition_dcs_nzd_range_bitmaps), partition_(meta.partition), edge_partition_segment_offsets_(meta.edge_partition_segment_offsets), + edge_partition_hypersparse_degree_offsets_(meta.edge_partition_hypersparse_degree_offsets), local_sorted_unique_edge_srcs_(meta.local_sorted_unique_edge_srcs), local_sorted_unique_edge_src_chunk_start_offsets_( meta.local_sorted_unique_edge_src_chunk_start_offsets), @@ -538,7 +542,8 @@ graph_view_t #include #include -#ifdef TIMING -#include -#endif #include @@ -127,10 +122,6 @@ extract_induced_subgraphs( raft::device_span subgraph_vertices, bool do_expensive_check) { -#ifdef TIMING - HighResTimer hr_timer; - hr_timer.start("extract_induced_subgraphs"); -#endif // 1. check input arguments if (do_expensive_check) { @@ -281,10 +272,6 @@ extract_induced_subgraphs( true, handle.get_stream()); -#ifdef TIMING - hr_timer.stop(); - hr_timer.display_and_clear(std::cout); -#endif return std::make_tuple(std::move(edge_majors), std::move(edge_minors), std::move(edge_weights), diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 41f81d72ab..bd7d48ac31 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -51,6 +51,8 @@ #include #include +#include + #include #include #include @@ -233,128 +235,299 @@ std::optional find_locally_unused_ext_vertex_id( : std::nullopt /* if the entire range of vertex_t is used */; } -// returns renumber map and segment_offsets +// returns renumber map, segment_offsets, and hypersparse_degree_offsets template -std::tuple, std::vector, vertex_t> compute_renumber_map( - raft::handle_t const& handle, - std::optional>&& local_vertices, - std::vector const& edgelist_majors, - std::vector const& edgelist_minors, - std::vector const& edgelist_edge_counts) +std::tuple, + std::vector, + std::optional>, + vertex_t> +compute_renumber_map(raft::handle_t const& handle, + std::optional>&& local_vertices, + std::vector const& edgelist_majors, + std::vector const& edgelist_minors, + std::vector const& edgelist_edge_counts) { - rmm::device_uvector sorted_local_vertices(0, handle.get_stream()); - - edge_t num_local_edges = std::reduce(edgelist_edge_counts.begin(), edgelist_edge_counts.end()); - - // 1. if local_vertices.has_value() is false, find unique vertices from edge majors (to construct - // local_vertices) + // 1. 
if local_vertices.has_value() is false, find unique vertices from edge majors & minors (to + // construct local_vertices) - rmm::device_uvector sorted_unique_majors(0, handle.get_stream()); + rmm::device_uvector sorted_local_vertices(0, handle.get_stream()); if (!local_vertices) { - sorted_unique_majors.resize(num_local_edges, handle.get_stream()); - size_t major_offset{0}; - for (size_t i = 0; i < edgelist_majors.size(); ++i) { - thrust::copy(handle.get_thrust_policy(), - edgelist_majors[i], - edgelist_majors[i] + edgelist_edge_counts[i], - sorted_unique_majors.begin() + major_offset); - thrust::sort(handle.get_thrust_policy(), - sorted_unique_majors.begin() + major_offset, - sorted_unique_majors.begin() + major_offset + edgelist_edge_counts[i]); - major_offset += static_cast(thrust::distance( - sorted_unique_majors.begin() + major_offset, - thrust::unique(handle.get_thrust_policy(), - sorted_unique_majors.begin() + major_offset, - sorted_unique_majors.begin() + major_offset + edgelist_edge_counts[i]))); + constexpr size_t num_bins{ + 8}; // increase the number of bins to cut peak memory usage (at the expense of additional + // computing), limit the maximum temporary memory usage to "size of local edge list + // majors|minors * 2 / # bins" + constexpr uint32_t hash_seed = + 1; // shouldn't be 0 (in that case this hash function will coincide with the hash function + // used to map vertices to GPUs, and we may not see the expected randomization) + + auto edge_major_count_vectors = num_bins > 1 + ? std::make_optional>>( + edgelist_majors.size(), std::vector(num_bins)) + : std::nullopt; + if (edge_major_count_vectors) { + for (size_t i = 0; i < edgelist_majors.size(); ++i) { + rmm::device_uvector d_edge_major_counts(num_bins, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + d_edge_major_counts.begin(), + d_edge_major_counts.end(), + edge_t{0}); + thrust::for_each( + handle.get_thrust_policy(), + edgelist_majors[i], + edgelist_majors[i] + edgelist_edge_counts[i], + [counts = raft::device_span(d_edge_major_counts.data(), + d_edge_major_counts.size())] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + cuda::atomic_ref atomic_counter( + counts[hash_func(v) % num_bins]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); + }); + raft::update_host((*edge_major_count_vectors)[i].data(), + d_edge_major_counts.data(), + d_edge_major_counts.size(), + handle.get_stream()); + } } - sorted_unique_majors.resize(major_offset, handle.get_stream()); - if (edgelist_majors.size() > 1) { - thrust::sort( - handle.get_thrust_policy(), sorted_unique_majors.begin(), sorted_unique_majors.end()); + auto edge_minor_count_vectors = num_bins > 1 + ? 
std::make_optional>>( + edgelist_minors.size(), std::vector(num_bins)) + : std::nullopt; + if (edge_minor_count_vectors) { + for (size_t i = 0; i < edgelist_minors.size(); ++i) { + rmm::device_uvector d_edge_minor_counts(num_bins, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + d_edge_minor_counts.begin(), + d_edge_minor_counts.end(), + edge_t{0}); + thrust::for_each( + handle.get_thrust_policy(), + edgelist_minors[i], + edgelist_minors[i] + edgelist_edge_counts[i], + [counts = raft::device_span(d_edge_minor_counts.data(), + d_edge_minor_counts.size())] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + cuda::atomic_ref atomic_counter( + counts[hash_func(v) % num_bins]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); + }); + raft::update_host((*edge_minor_count_vectors)[i].data(), + d_edge_minor_counts.data(), + d_edge_minor_counts.size(), + handle.get_stream()); + } } - sorted_unique_majors.shrink_to_fit(handle.get_stream()); - } - - // 2. if local_vertices.has_value() is false, find unique vertices from edge minors (to construct - // local_vertices) - rmm::device_uvector sorted_unique_minors(0, handle.get_stream()); - if (!local_vertices) { - sorted_unique_minors.resize(num_local_edges, handle.get_stream()); - size_t minor_offset{0}; - for (size_t i = 0; i < edgelist_minors.size(); ++i) { - thrust::copy(handle.get_thrust_policy(), - edgelist_minors[i], - edgelist_minors[i] + edgelist_edge_counts[i], - sorted_unique_minors.begin() + minor_offset); - thrust::sort(handle.get_thrust_policy(), - sorted_unique_minors.begin() + minor_offset, - sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]); - minor_offset += static_cast(thrust::distance( - sorted_unique_minors.begin() + minor_offset, - thrust::unique(handle.get_thrust_policy(), - sorted_unique_minors.begin() + minor_offset, - sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]))); - } - sorted_unique_minors.resize(minor_offset, handle.get_stream()); - if (edgelist_minors.size() > 1) { - thrust::sort( - handle.get_thrust_policy(), sorted_unique_minors.begin(), sorted_unique_minors.end()); - sorted_unique_minors.resize(thrust::distance(sorted_unique_minors.begin(), - thrust::unique(handle.get_thrust_policy(), - sorted_unique_minors.begin(), - sorted_unique_minors.end())), - handle.get_stream()); - } - sorted_unique_minors.shrink_to_fit(handle.get_stream()); - } + handle.sync_stream(); - // 3. update sorted_local_vertices. 
- // if local_vertices.has_value() is false, reconstruct local_vertices first + for (size_t i = 0; i < num_bins; ++i) { + rmm::device_uvector this_bin_sorted_unique_majors(0, handle.get_stream()); + { + std::vector> edge_partition_tmp_majors{}; // for bin "i" + edge_partition_tmp_majors.reserve(edgelist_majors.size()); + for (size_t j = 0; j < edgelist_majors.size(); ++j) { + rmm::device_uvector tmp_majors(0, handle.get_stream()); + if (num_bins > 1) { + tmp_majors.resize((*edge_major_count_vectors)[j][i], handle.get_stream()); + thrust::copy_if(handle.get_thrust_policy(), + edgelist_majors[j], + edgelist_majors[j] + edgelist_edge_counts[j], + tmp_majors.begin(), + [i] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + return (static_cast(hash_func(v) % num_bins) == i); + }); + } else { + tmp_majors.resize(edgelist_edge_counts[j], handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edgelist_majors[j], + edgelist_majors[j] + edgelist_edge_counts[j], + tmp_majors.begin()); + } + thrust::sort(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end()); + tmp_majors.resize( + thrust::distance( + tmp_majors.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end())), + handle.get_stream()); + tmp_majors.shrink_to_fit(handle.get_stream()); + + edge_partition_tmp_majors.push_back(std::move(tmp_majors)); + } + if constexpr (multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + std::vector tx_counts(minor_comm_size); + for (int j = 0; j < minor_comm_size; ++j) { + tx_counts[j] = edge_partition_tmp_majors[j].size(); + } + this_bin_sorted_unique_majors.resize(std::reduce(tx_counts.begin(), tx_counts.end()), + handle.get_stream()); + size_t output_offset{0}; + for (size_t j = 0; j < edge_partition_tmp_majors.size(); ++j) { + thrust::copy(handle.get_thrust_policy(), + edge_partition_tmp_majors[j].begin(), + edge_partition_tmp_majors[j].end(), + this_bin_sorted_unique_majors.begin() + output_offset); + output_offset += edge_partition_tmp_majors[j].size(); + } + this_bin_sorted_unique_majors = shuffle_and_unique_segment_sorted_values( + minor_comm, this_bin_sorted_unique_majors.begin(), tx_counts, handle.get_stream()); + } else { + this_bin_sorted_unique_majors = std::move(edge_partition_tmp_majors[0]); + } + } else { + this_bin_sorted_unique_majors = std::move(edge_partition_tmp_majors[0]); + } + } - if (local_vertices) { + rmm::device_uvector this_bin_sorted_unique_minors(0, handle.get_stream()); + { + std::vector> edge_partition_tmp_minors{}; // for bin "i" + edge_partition_tmp_minors.reserve(edgelist_minors.size()); + for (size_t j = 0; j < edgelist_minors.size(); ++j) { + rmm::device_uvector tmp_minors(0, handle.get_stream()); + if (num_bins > 1) { + tmp_minors.resize((*edge_minor_count_vectors)[j][i], handle.get_stream()); + thrust::copy_if(handle.get_thrust_policy(), + edgelist_minors[j], + edgelist_minors[j] + edgelist_edge_counts[j], + tmp_minors.begin(), + [i] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + return (static_cast(hash_func(v) % num_bins) == i); + }); + } else { + tmp_minors.resize(edgelist_edge_counts[j], handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edgelist_minors[j], + edgelist_minors[j] + edgelist_edge_counts[j], + tmp_minors.begin()); + } + thrust::sort(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end()); + 
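// note (illustrative comment): the sort above and the unique()/resize below deduplicate this
// bin's minors within a single edge partition only; duplicates that remain across edge
// partitions are removed after the per-partition results are concatenated, re-sorted, and
// uniqued further below.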
tmp_minors.resize( + thrust::distance( + tmp_minors.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end())), + handle.get_stream()); + tmp_minors.shrink_to_fit(handle.get_stream()); + + edge_partition_tmp_minors.push_back(std::move(tmp_minors)); + } + if (edge_partition_tmp_minors.size() == 1) { + this_bin_sorted_unique_minors = std::move(edge_partition_tmp_minors[0]); + } else { + edge_t aggregate_size{0}; + for (size_t j = 0; j < edge_partition_tmp_minors.size(); ++j) { + aggregate_size += edge_partition_tmp_minors[j].size(); + } + this_bin_sorted_unique_minors.resize(aggregate_size, handle.get_stream()); + size_t output_offset{0}; + for (size_t j = 0; j < edge_partition_tmp_minors.size(); ++j) { + thrust::copy(handle.get_thrust_policy(), + edge_partition_tmp_minors[j].begin(), + edge_partition_tmp_minors[j].end(), + this_bin_sorted_unique_minors.begin() + output_offset); + output_offset += edge_partition_tmp_minors[j].size(); + } + edge_partition_tmp_minors.clear(); + thrust::sort(handle.get_thrust_policy(), + this_bin_sorted_unique_minors.begin(), + this_bin_sorted_unique_minors.end()); + this_bin_sorted_unique_minors.resize( + thrust::distance(this_bin_sorted_unique_minors.begin(), + thrust::unique(handle.get_thrust_policy(), + this_bin_sorted_unique_minors.begin(), + this_bin_sorted_unique_minors.end())), + handle.get_stream()); + this_bin_sorted_unique_minors.shrink_to_fit(handle.get_stream()); + } + if constexpr (multi_gpu) { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + if (major_comm_size > 1) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + compute_gpu_id_from_ext_vertex_t gpu_id_func{ + comm_size, major_comm_size, minor_comm_size}; + auto d_tx_counts = groupby_and_count( + this_bin_sorted_unique_minors.begin(), + this_bin_sorted_unique_minors.end(), + [major_comm_size, minor_comm_size, gpu_id_func] __device__(auto v) { + return partition_manager::compute_major_comm_rank_from_global_comm_rank( + major_comm_size, minor_comm_size, gpu_id_func(v)); + }, + major_comm_size, + std::numeric_limits::max(), + handle.get_stream()); + std::vector h_tx_counts(d_tx_counts.size()); + raft::update_host( + h_tx_counts.data(), d_tx_counts.data(), d_tx_counts.size(), handle.get_stream()); + handle.sync_stream(); + std::vector tx_displacements(h_tx_counts.size()); + std::exclusive_scan( + h_tx_counts.begin(), h_tx_counts.end(), tx_displacements.begin(), size_t{0}); + for (int j = 0; j < major_comm_size; ++j) { + thrust::sort( + handle.get_thrust_policy(), + this_bin_sorted_unique_minors.begin() + tx_displacements[j], + this_bin_sorted_unique_minors.begin() + (tx_displacements[j] + h_tx_counts[j])); + } + this_bin_sorted_unique_minors = shuffle_and_unique_segment_sorted_values( + major_comm, this_bin_sorted_unique_minors.begin(), h_tx_counts, handle.get_stream()); + } + } + } + rmm::device_uvector this_bin_sorted_unique_vertices(0, handle.get_stream()); + { + rmm::device_uvector merged_vertices( + this_bin_sorted_unique_majors.size() + this_bin_sorted_unique_minors.size(), + handle.get_stream()); + thrust::merge(handle.get_thrust_policy(), + this_bin_sorted_unique_majors.begin(), + this_bin_sorted_unique_majors.end(), + this_bin_sorted_unique_minors.begin(), + 
this_bin_sorted_unique_minors.end(), + merged_vertices.begin()); + this_bin_sorted_unique_majors.resize(0, handle.get_stream()); + this_bin_sorted_unique_majors.shrink_to_fit(handle.get_stream()); + this_bin_sorted_unique_minors.resize(0, handle.get_stream()); + this_bin_sorted_unique_minors.shrink_to_fit(handle.get_stream()); + merged_vertices.resize(thrust::distance(merged_vertices.begin(), + thrust::unique(handle.get_thrust_policy(), + merged_vertices.begin(), + merged_vertices.end())), + handle.get_stream()); + merged_vertices.shrink_to_fit(handle.get_stream()); + this_bin_sorted_unique_vertices = std::move(merged_vertices); + } + if (sorted_local_vertices.size() == 0) { + sorted_local_vertices = std::move(this_bin_sorted_unique_vertices); + } else { + rmm::device_uvector merged_vertices( + sorted_local_vertices.size() + this_bin_sorted_unique_vertices.size(), + handle.get_stream()); + thrust::merge(handle.get_thrust_policy(), + sorted_local_vertices.begin(), + sorted_local_vertices.end(), + this_bin_sorted_unique_vertices.begin(), + this_bin_sorted_unique_vertices.end(), + merged_vertices.begin()); // merging two unique sets from different hash + // bins, so the merged set can't have duplicates + sorted_local_vertices = std::move(merged_vertices); + } + } + } else { sorted_local_vertices = std::move(*local_vertices); thrust::sort( handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); - } else { - sorted_local_vertices.resize(sorted_unique_majors.size() + sorted_unique_minors.size(), - handle.get_stream()); - - thrust::merge(handle.get_thrust_policy(), - sorted_unique_majors.begin(), - sorted_unique_majors.end(), - sorted_unique_minors.begin(), - sorted_unique_minors.end(), - sorted_local_vertices.begin()); - - sorted_unique_majors.resize(0, handle.get_stream()); - sorted_unique_majors.shrink_to_fit(handle.get_stream()); - sorted_unique_minors.resize(0, handle.get_stream()); - sorted_unique_minors.shrink_to_fit(handle.get_stream()); - - sorted_local_vertices.resize(thrust::distance(sorted_local_vertices.begin(), - thrust::unique(handle.get_thrust_policy(), - sorted_local_vertices.begin(), - sorted_local_vertices.end())), - handle.get_stream()); - sorted_local_vertices.shrink_to_fit(handle.get_stream()); - - if constexpr (multi_gpu) { - sorted_local_vertices = - cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning( - handle, std::move(sorted_local_vertices)); - thrust::sort( - handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); - sorted_local_vertices.resize(thrust::distance(sorted_local_vertices.begin(), - thrust::unique(handle.get_thrust_policy(), - sorted_local_vertices.begin(), - sorted_local_vertices.end())), - handle.get_stream()); - sorted_local_vertices.shrink_to_fit(handle.get_stream()); - } } + // 2. find an unused vertex ID + auto locally_unused_vertex_id = find_locally_unused_ext_vertex_id( handle, raft::device_span(sorted_local_vertices.data(), sorted_local_vertices.size()), @@ -363,17 +536,9 @@ std::tuple, std::vector, vertex_t> compu "Invalid input arguments: there is no unused value in the entire range of " "vertex_t, increase vertex_t to 64 bit."); - // 4. compute global degrees for the sorted local vertices + // 3. 
compute global degrees for the sorted local vertices rmm::device_uvector sorted_local_vertex_degrees(0, handle.get_stream()); - std::optional> stream_pool_indices{ - std::nullopt}; // FIXME: move this inside the if statement - - auto constexpr num_chunks = size_t{ - 2}; // tuning parameter, this trade-offs # binary searches (up to num_chunks times more binary - // searches can be necessary if num_unique_majors << edgelist_edge_counts[i]) and temporary - // buffer requirement (cut by num_chunks times), currently set to 2 to avoid peak memory - // usage happening in this part (especially when minor_comm_size is small) if constexpr (multi_gpu) { auto& comm = handle.get_comms(); @@ -386,94 +551,37 @@ std::tuple, std::vector, vertex_t> compu auto edge_partition_major_range_sizes = host_scalar_allgather(minor_comm, sorted_local_vertices.size(), handle.get_stream()); - if ((minor_comm_size >= 2) && (handle.get_stream_pool_size() >= 2)) { - auto vertex_edge_counts = host_scalar_allreduce( - comm, - thrust::make_tuple(static_cast(sorted_local_vertices.size()), num_local_edges), - raft::comms::op_t::SUM, - handle.get_stream()); - // memory footprint vs parallelism trade-off - // peak memory requirement per loop is approximately - // (V/P) * (sizeof(vertex_t) + sizeof(edge_t)) + - // (E / (comm_size * minor_comm_size)) / num_chunks * sizeof(vertex_t) * 2 + - // std::min(V/P, (E / (comm_size * minor_comm_size)) / num_chunks) * (sizeof(vertex_t) + - // sizeof(edge_t)) - // and limit temporary memory requirement to (E / comm_size) * sizeof(vertex_t) - auto avg_vertex_degree = thrust::get<0>(vertex_edge_counts) > 0 - ? static_cast(thrust::get<1>(vertex_edge_counts)) / - static_cast(thrust::get<0>(vertex_edge_counts)) - : double{0.0}; - auto num_streams = static_cast( - (avg_vertex_degree * sizeof(vertex_t)) / - (static_cast(sizeof(vertex_t) + sizeof(edge_t)) + - (((avg_vertex_degree / minor_comm_size) / num_chunks) * sizeof(vertex_t) * 2) + - (std::min(1.0, ((avg_vertex_degree / minor_comm_size) / num_chunks)) * - (sizeof(vertex_t) + sizeof(edge_t))))); - if (num_streams >= 2) { - stream_pool_indices = std::vector(num_streams); - std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); - handle.sync_stream(); - } - } - for (int i = 0; i < minor_comm_size; ++i) { - auto loop_stream = stream_pool_indices - ? 
handle.get_stream_from_stream_pool(i % (*stream_pool_indices).size()) - : handle.get_stream(); - - rmm::device_uvector sorted_majors(edge_partition_major_range_sizes[i], loop_stream); + rmm::device_uvector sorted_majors(edge_partition_major_range_sizes[i], + handle.get_stream()); device_bcast(minor_comm, sorted_local_vertices.data(), sorted_majors.data(), edge_partition_major_range_sizes[i], i, - loop_stream); + handle.get_stream()); - rmm::device_uvector sorted_major_degrees(sorted_majors.size(), loop_stream); - thrust::fill(rmm::exec_policy(loop_stream), + rmm::device_uvector sorted_major_degrees(sorted_majors.size(), handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), sorted_major_degrees.begin(), sorted_major_degrees.end(), edge_t{0}); - rmm::device_uvector tmp_majors(0, loop_stream); - tmp_majors.reserve( - (static_cast(edgelist_edge_counts[i]) + (num_chunks - 1)) / num_chunks, - loop_stream); - size_t offset{0}; - for (size_t j = 0; j < num_chunks; ++j) { - size_t this_chunk_size = - std::min(tmp_majors.capacity(), static_cast(edgelist_edge_counts[i]) - offset); - tmp_majors.resize(this_chunk_size, loop_stream); - thrust::copy(rmm::exec_policy(loop_stream), - edgelist_majors[i] + offset, - edgelist_majors[i] + offset + tmp_majors.size(), - tmp_majors.begin()); - thrust::sort(rmm::exec_policy(loop_stream), tmp_majors.begin(), tmp_majors.end()); - auto num_unique_majors = - thrust::count_if(rmm::exec_policy(loop_stream), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(tmp_majors.size()), - is_first_in_run_t{tmp_majors.data()}); - rmm::device_uvector tmp_keys(num_unique_majors, loop_stream); - rmm::device_uvector tmp_values(num_unique_majors, loop_stream); - thrust::reduce_by_key(rmm::exec_policy(loop_stream), - tmp_majors.begin(), - tmp_majors.end(), - thrust::make_constant_iterator(edge_t{1}), - tmp_keys.begin(), - tmp_values.begin()); - - auto kv_pair_first = - thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); - thrust::for_each(rmm::exec_policy(loop_stream), - kv_pair_first, - kv_pair_first + tmp_keys.size(), - search_and_increment_degree_t{ - sorted_majors.data(), - static_cast(sorted_majors.size()), - sorted_major_degrees.data()}); - offset += this_chunk_size; - } + thrust::for_each( + handle.get_thrust_policy(), + edgelist_majors[i], + edgelist_majors[i] + edgelist_edge_counts[i], + [sorted_majors = + raft::device_span(sorted_majors.data(), sorted_majors.size()), + sorted_major_degrees = raft::device_span( + sorted_major_degrees.data(), sorted_major_degrees.size())] __device__(auto major) { + auto it = + thrust::lower_bound(thrust::seq, sorted_majors.begin(), sorted_majors.end(), major); + assert((it != sorted_majors.end()) && (*it == major)); + cuda::atomic_ref atomic_counter( + sorted_major_degrees[thrust::distance(sorted_majors.begin(), it)]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); + }); device_reduce(minor_comm, sorted_major_degrees.begin(), @@ -481,11 +589,9 @@ std::tuple, std::vector, vertex_t> compu edge_partition_major_range_sizes[i], raft::comms::op_t::SUM, i, - loop_stream); + handle.get_stream()); if (i == minor_comm_rank) { sorted_local_vertex_degrees = std::move(sorted_major_degrees); } } - - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } } else { assert(edgelist_majors.size() == 1); @@ -495,47 +601,24 @@ std::tuple, std::vector, vertex_t> compu sorted_local_vertex_degrees.end(), edge_t{0}); - rmm::device_uvector tmp_majors(0, 
handle.get_stream()); - tmp_majors.reserve(static_cast(edgelist_edge_counts[0] + (num_chunks - 1)) / num_chunks, - handle.get_stream()); - size_t offset{0}; - for (size_t i = 0; i < num_chunks; ++i) { - size_t this_chunk_size = - std::min(tmp_majors.capacity(), static_cast(edgelist_edge_counts[0]) - offset); - tmp_majors.resize(this_chunk_size, handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - edgelist_majors[0] + offset, - edgelist_majors[0] + offset + tmp_majors.size(), - tmp_majors.begin()); - thrust::sort(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end()); - auto num_unique_majors = - thrust::count_if(handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(tmp_majors.size()), - is_first_in_run_t{tmp_majors.data()}); - rmm::device_uvector tmp_keys(num_unique_majors, handle.get_stream()); - rmm::device_uvector tmp_values(num_unique_majors, handle.get_stream()); - thrust::reduce_by_key(handle.get_thrust_policy(), - tmp_majors.begin(), - tmp_majors.end(), - thrust::make_constant_iterator(edge_t{1}), - tmp_keys.begin(), - tmp_values.begin()); - - auto kv_pair_first = - thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); - thrust::for_each(handle.get_thrust_policy(), - kv_pair_first, - kv_pair_first + tmp_keys.size(), - search_and_increment_degree_t{ - sorted_local_vertices.data(), - static_cast(sorted_local_vertices.size()), - sorted_local_vertex_degrees.data()}); - offset += this_chunk_size; - } + thrust::for_each(handle.get_thrust_policy(), + edgelist_majors[0], + edgelist_majors[0] + edgelist_edge_counts[0], + [sorted_majors = raft::device_span( + sorted_local_vertices.data(), sorted_local_vertices.size()), + sorted_major_degrees = raft::device_span( + sorted_local_vertex_degrees.data(), + sorted_local_vertex_degrees.size())] __device__(auto major) { + auto it = thrust::lower_bound( + thrust::seq, sorted_majors.begin(), sorted_majors.end(), major); + assert((it != sorted_majors.end()) && (*it == major)); + cuda::atomic_ref atomic_counter( + sorted_major_degrees[thrust::distance(sorted_majors.begin(), it)]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); + }); } - // 4. sort local vertices by degree (descending) + // 5. sort local vertices by degree (descending) thrust::sort_by_key(handle.get_thrust_policy(), sorted_local_vertex_degrees.begin(), @@ -543,7 +626,7 @@ std::tuple, std::vector, vertex_t> compu sorted_local_vertices.begin(), thrust::greater()); - // 5. compute segment_offsets + // 6. 
compute segment_offsets static_assert(detail::num_sparse_segments_per_vertex_partition == 3); static_assert((detail::low_degree_threshold <= detail::mid_degree_threshold) && @@ -553,57 +636,85 @@ std::tuple, std::vector, vertex_t> compu (detail::hypersparse_threshold_ratio <= 1.0)); size_t mid_degree_threshold{detail::mid_degree_threshold}; size_t low_degree_threshold{detail::low_degree_threshold}; - size_t hypersparse_degree_threshold{0}; + size_t hypersparse_degree_threshold{1}; if (multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); mid_degree_threshold *= minor_comm_size; low_degree_threshold *= minor_comm_size; - hypersparse_degree_threshold = - static_cast(minor_comm_size * detail::hypersparse_threshold_ratio); + hypersparse_degree_threshold = std::max( + static_cast(minor_comm_size * detail::hypersparse_threshold_ratio), size_t{1}); } - auto num_segments_per_vertex_partition = - detail::num_sparse_segments_per_vertex_partition + - (hypersparse_degree_threshold > 0 ? size_t{2} : size_t{1}); // last is 0-degree segment - rmm::device_uvector d_thresholds(num_segments_per_vertex_partition - 1, - handle.get_stream()); - auto h_thresholds = - hypersparse_degree_threshold > 0 - ? std::vector{static_cast(mid_degree_threshold), - static_cast(low_degree_threshold), - static_cast(hypersparse_degree_threshold), - std::min(static_cast(hypersparse_degree_threshold), edge_t{1})} - : std::vector{static_cast(mid_degree_threshold), - static_cast(low_degree_threshold), - edge_t{1}}; - raft::update_device( - d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), handle.get_stream()); - - rmm::device_uvector d_segment_offsets(num_segments_per_vertex_partition + 1, - handle.get_stream()); - auto vertex_count = static_cast(sorted_local_vertices.size()); - d_segment_offsets.set_element_to_zero_async(0, handle.get_stream()); - d_segment_offsets.set_element( - num_segments_per_vertex_partition, vertex_count, handle.get_stream()); + std::vector h_segment_offsets{}; + std::optional> h_hypersparse_degree_offsets{}; + { + auto num_partitions = detail::num_sparse_segments_per_vertex_partition /* high, mid, low */ + + (hypersparse_degree_threshold > 1 + ? 
hypersparse_degree_threshold - size_t{1} + /* one partition per each global degree in the hypersparse region */ + : size_t{0}) + + size_t{1} /* zero */; + rmm::device_uvector d_thresholds(num_partitions - 1, handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + d_thresholds.begin(), + d_thresholds.end(), + [mid_degree_threshold, + low_degree_threshold, + hypersparse_degree_threshold] __device__(size_t i) { + if (i == 0) { + return mid_degree_threshold; // high,mid boundary + } else if (i == 1) { + return low_degree_threshold; // mid, low boundary + } else { + assert(hypersparse_degree_threshold > (i - 2)); + return hypersparse_degree_threshold - (i - 2); + } + }); + rmm::device_uvector d_offsets(num_partitions + 1, handle.get_stream()); + d_offsets.set_element_to_zero_async(0, handle.get_stream()); + auto vertex_count = static_cast(sorted_local_vertices.size()); + d_offsets.set_element(num_partitions, vertex_count, handle.get_stream()); + thrust::upper_bound(handle.get_thrust_policy(), + sorted_local_vertex_degrees.begin(), + sorted_local_vertex_degrees.end(), + d_thresholds.begin(), + d_thresholds.end(), + d_offsets.begin() + 1, + thrust::greater{}); + std::vector h_offsets(d_offsets.size()); + raft::update_host(h_offsets.data(), d_offsets.data(), d_offsets.size(), handle.get_stream()); + handle.sync_stream(); - thrust::upper_bound(handle.get_thrust_policy(), - sorted_local_vertex_degrees.begin(), - sorted_local_vertex_degrees.end(), - d_thresholds.begin(), - d_thresholds.end(), - d_segment_offsets.begin() + 1, - thrust::greater{}); - - std::vector h_segment_offsets(d_segment_offsets.size()); - raft::update_host(h_segment_offsets.data(), - d_segment_offsets.data(), - d_segment_offsets.size(), - handle.get_stream()); - handle.sync_stream(); + auto num_segments_per_vertex_partition = + detail::num_sparse_segments_per_vertex_partition + + (hypersparse_degree_threshold > 1 ? 
size_t{2} : size_t{1}); // last is 0-degree segment + h_segment_offsets.resize(num_segments_per_vertex_partition + 1); + std::copy(h_offsets.begin(), + h_offsets.begin() + num_sparse_segments_per_vertex_partition + 1, + h_segment_offsets.begin()); + *(h_segment_offsets.rbegin()) = *(h_offsets.rbegin()); + if (hypersparse_degree_threshold > 1) { + *(h_segment_offsets.rbegin() + 1) = *(h_offsets.rbegin() + 1); + + h_hypersparse_degree_offsets = std::vector(hypersparse_degree_threshold); + std::copy(h_offsets.begin() + num_sparse_segments_per_vertex_partition, + h_offsets.begin() + num_sparse_segments_per_vertex_partition + + (hypersparse_degree_threshold - 1), + (*h_hypersparse_degree_offsets).begin()); + auto shift = (*h_hypersparse_degree_offsets)[0]; + std::transform((*h_hypersparse_degree_offsets).begin(), + (*h_hypersparse_degree_offsets).end(), + (*h_hypersparse_degree_offsets).begin(), + [shift](auto offset) { return offset - shift; }); + *((*h_hypersparse_degree_offsets).rbegin()) = *(h_offsets.rbegin() + 1); + } + } - return std::make_tuple( - std::move(sorted_local_vertices), h_segment_offsets, *locally_unused_vertex_id); + return std::make_tuple(std::move(sorted_local_vertices), + h_segment_offsets, + h_hypersparse_degree_offsets, + *locally_unused_vertex_id); } template @@ -789,32 +900,28 @@ void expensive_check_edgelist( } template -std::vector aggregate_segment_offsets(raft::handle_t const& handle, - std::vector const& segment_offsets) +std::vector aggregate_offset_vectors(raft::handle_t const& handle, + std::vector const& offsets) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); - rmm::device_uvector d_segment_offsets(segment_offsets.size(), handle.get_stream()); - raft::update_device( - d_segment_offsets.data(), segment_offsets.data(), segment_offsets.size(), handle.get_stream()); - rmm::device_uvector d_aggregate_segment_offsets( - minor_comm_size * d_segment_offsets.size(), handle.get_stream()); - minor_comm.allgather(d_segment_offsets.data(), - d_aggregate_segment_offsets.data(), - d_segment_offsets.size(), - handle.get_stream()); - - std::vector h_aggregate_segment_offsets(d_aggregate_segment_offsets.size(), - vertex_t{0}); - raft::update_host(h_aggregate_segment_offsets.data(), - d_aggregate_segment_offsets.data(), - d_aggregate_segment_offsets.size(), + rmm::device_uvector d_offsets(offsets.size(), handle.get_stream()); + raft::update_device(d_offsets.data(), offsets.data(), offsets.size(), handle.get_stream()); + rmm::device_uvector d_aggregate_offset_vectors(minor_comm_size * d_offsets.size(), + handle.get_stream()); + minor_comm.allgather( + d_offsets.data(), d_aggregate_offset_vectors.data(), d_offsets.size(), handle.get_stream()); + + std::vector h_aggregate_offset_vectors(d_aggregate_offset_vectors.size(), vertex_t{0}); + raft::update_host(h_aggregate_offset_vectors.data(), + d_aggregate_offset_vectors.data(), + d_aggregate_offset_vectors.size(), handle.get_stream()); handle.sync_stream(); // this is necessary as h_aggregate_offsets can be used right after return. 
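// illustrative example (hypothetical values): with minor_comm_size = 2 and per-rank offset
// vectors {0, 3, 5} and {0, 4, 6}, the allgather above yields {0, 3, 5, 0, 4, 6}, i.e. the
// per-rank offset vectors concatenated in minor_comm rank order.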
- return h_aggregate_segment_offsets; + return h_aggregate_offset_vectors; } } // namespace detail @@ -857,10 +964,10 @@ renumber_edgelist( (*edgelist_intra_partition_segment_offsets).size() == static_cast(minor_comm_size), "Invalid input arguments: erroneous (*edgelist_intra_partition_segment_offsets).size()."); for (size_t i = 0; i < edgelist_majors.size(); ++i) { - CUGRAPH_EXPECTS( - (*edgelist_intra_partition_segment_offsets)[i].size() == - static_cast(major_comm_size + 1), - "Invalid input arguments: erroneous (*edgelist_intra_partition_segment_offsets)[].size()."); + CUGRAPH_EXPECTS((*edgelist_intra_partition_segment_offsets)[i].size() == + static_cast(major_comm_size + 1), + "Invalid input arguments: erroneous " + "(*edgelist_intra_partition_segment_offsets)[].size()."); CUGRAPH_EXPECTS( std::is_sorted((*edgelist_intra_partition_segment_offsets)[i].begin(), (*edgelist_intra_partition_segment_offsets)[i].end()), @@ -868,8 +975,8 @@ renumber_edgelist( CUGRAPH_EXPECTS( ((*edgelist_intra_partition_segment_offsets)[i][0] == 0) && ((*edgelist_intra_partition_segment_offsets)[i].back() == edgelist_edge_counts[i]), - "Invalid input arguments: (*edgelist_intra_partition_segment_offsets)[][0] should be 0 and " - "(*edgelist_intra_partition_segment_offsets)[].back() should coincide with " + "Invalid input arguments: (*edgelist_intra_partition_segment_offsets)[][0] should be 0 " + "and (*edgelist_intra_partition_segment_offsets)[].back() should coincide with " "edgelist_edge_counts[]."); } } @@ -893,7 +1000,10 @@ renumber_edgelist( // 1. compute renumber map - auto [renumber_map_labels, vertex_partition_segment_offsets, locally_unused_vertex_id] = + auto [renumber_map_labels, + vertex_partition_segment_offsets, + vertex_partition_hypersparse_degree_offsets, + locally_unused_vertex_id] = detail::compute_renumber_map(handle, std::move(local_vertices), edgelist_const_majors, @@ -966,11 +1076,16 @@ renumber_edgelist( } } - if ((static_cast(partition.local_edge_partition_minor_range_size() * - 2.5 /* tuning parameter */) >= - static_cast(number_of_edges / comm_size)) && - edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) - // part than the O(E/P) part + double approx_mem_requirements = + static_cast(partition.local_edge_partition_minor_range_size()) * + (static_cast( + sizeof(vertex_t)) /* rmm::device_uvector renumber_map_minor_labels */ + + + static_cast(sizeof(vertex_t) * 2) * + 2.5 /* kv_store_t renumber_map, * 2.5 to consider load factor */); + if ((approx_mem_requirements > + static_cast(handle.get_device_properties().totalGlobalMem) * 0.05) && + edgelist_intra_partition_segment_offsets) { vertex_t max_segment_size{0}; for (int i = 0; i < major_comm_size; ++i) { auto minor_range_vertex_partition_id = @@ -1020,10 +1135,10 @@ renumber_edgelist( recvcounts[i] = partition.vertex_partition_range_size(minor_range_vertex_partition_id); } std::vector displacements(recvcounts.size(), 0); - std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + std::exclusive_scan(recvcounts.begin(), recvcounts.end(), displacements.begin(), size_t{0}); device_allgatherv(major_comm, - renumber_map_labels.begin(), - renumber_map_minor_labels.begin(), + renumber_map_labels.data(), + renumber_map_minor_labels.data(), recvcounts, displacements, handle.get_stream()); @@ -1045,12 +1160,20 @@ renumber_edgelist( } auto edge_partition_segment_offsets = - detail::aggregate_segment_offsets(handle, vertex_partition_segment_offsets); + 
detail::aggregate_offset_vectors(handle, vertex_partition_segment_offsets); + auto edge_partition_hypersparse_degree_offsets = + vertex_partition_hypersparse_degree_offsets + ? std::make_optional( + detail::aggregate_offset_vectors(handle, *vertex_partition_hypersparse_degree_offsets)) + : std::nullopt; return std::make_tuple( std::move(renumber_map_labels), - renumber_meta_t{ - number_of_vertices, number_of_edges, partition, edge_partition_segment_offsets}); + renumber_meta_t{number_of_vertices, + number_of_edges, + partition, + edge_partition_segment_offsets, + edge_partition_hypersparse_degree_offsets}); } template @@ -1078,7 +1201,10 @@ renumber_edgelist(raft::handle_t const& handle, std::nullopt); } - auto [renumber_map_labels, segment_offsets, locally_unused_vertex_id] = + auto [renumber_map_labels, + segment_offsets, + hypersparse_degree_offsets, + locally_unused_vertex_id] = detail::compute_renumber_map( handle, std::move(vertices), @@ -1099,8 +1225,9 @@ renumber_edgelist(raft::handle_t const& handle, renumber_map_view.find( edgelist_minors, edgelist_minors + num_edgelist_edges, edgelist_minors, handle.get_stream()); - return std::make_tuple(std::move(renumber_map_labels), - renumber_meta_t{segment_offsets}); + return std::make_tuple( + std::move(renumber_map_labels), + renumber_meta_t{segment_offsets, hypersparse_degree_offsets}); } } // namespace cugraph diff --git a/cpp/src/structure/renumber_utils_impl.cuh b/cpp/src/structure/renumber_utils_impl.cuh index 3efa58d963..69c7c556bd 100644 --- a/cpp/src/structure/renumber_utils_impl.cuh +++ b/cpp/src/structure/renumber_utils_impl.cuh @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -363,7 +364,7 @@ void renumber_ext_vertices(raft::handle_t const& handle, } std::unique_ptr> renumber_map_ptr{nullptr}; - if (multi_gpu) { + if constexpr (multi_gpu) { auto& comm = handle.get_comms(); auto const comm_size = comm.get_size(); auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); @@ -402,11 +403,12 @@ void renumber_ext_vertices(raft::handle_t const& handle, rmm::device_uvector int_vertices_for_sorted_unique_ext_vertices(0, handle.get_stream()); auto [unique_ext_vertices, int_vertices_for_unique_ext_vertices] = - collect_values_for_unique_keys(handle, + collect_values_for_unique_keys(comm, local_renumber_map.view(), std::move(sorted_unique_ext_vertices), detail::compute_gpu_id_from_ext_vertex_t{ - comm_size, major_comm_size, minor_comm_size}); + comm_size, major_comm_size, minor_comm_size}, + handle.get_stream()); renumber_map_ptr = std::make_unique>( unique_ext_vertices.begin(), @@ -573,7 +575,6 @@ void unrenumber_int_vertices(raft::handle_t const& handle, auto local_int_vertex_first = vertex_partition_id == 0 ? 
vertex_t{0} : vertex_partition_range_lasts[vertex_partition_id - 1]; - auto local_int_vertex_last = vertex_partition_range_lasts[vertex_partition_id]; rmm::device_uvector sorted_unique_int_vertices(num_vertices, handle.get_stream()); sorted_unique_int_vertices.resize( @@ -595,16 +596,20 @@ void unrenumber_int_vertices(raft::handle_t const& handle, sorted_unique_int_vertices.end())), handle.get_stream()); - auto [unique_int_vertices, ext_vertices_for_unique_int_vertices] = - collect_values_for_unique_int_vertices(handle, - std::move(sorted_unique_int_vertices), - renumber_map_labels, - vertex_partition_range_lasts); + auto ext_vertices_for_sorted_unique_int_vertices = + collect_values_for_sorted_unique_int_vertices( + comm, + raft::device_span(sorted_unique_int_vertices.data(), + sorted_unique_int_vertices.size()), + renumber_map_labels, + vertex_partition_range_lasts, + local_int_vertex_first, + handle.get_stream()); kv_store_t renumber_map( - unique_int_vertices.begin(), - unique_int_vertices.begin() + unique_int_vertices.size(), - ext_vertices_for_unique_int_vertices.begin(), + sorted_unique_int_vertices.begin(), + sorted_unique_int_vertices.end(), + ext_vertices_for_sorted_unique_int_vertices.begin(), invalid_vertex_id::value, invalid_vertex_id::value, handle.get_stream()); diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 8a18dedd2a..ba40db1f08 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -16,8 +16,9 @@ #pragma once #include "prims/fill_edge_src_dst_property.cuh" +#include "prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh" #include "prims/reduce_op.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -51,6 +52,24 @@ namespace cugraph { namespace { +template +struct direction_optimizing_info_t { + rmm::device_uvector + approx_out_degrees; // if graph_view.local_vertex_partition_segment_offsets().has_value() is + // true, holds approximate degrees only for the high and mid degree + // segments; otherwise, exact + rmm::device_uvector visited_bitmap; + std::optional> nzd_unvisited_vertices{ + std::nullopt}; // valid only during bottom-up iterations + std::optional num_nzd_unvisited_low_degree_vertices{ + std::nullopt}; // to decide between topdown vs bottomup, relevant only when + // graph_view.local_vertex_partition_segment_offsets().has_value() is true + std::optional num_nzd_unvisited_hypersparse_vertices{ + std::nullopt}; // to decide between topdown vs bottomup, relevant only when + // graph_view.local_vertex_partition_segment_offsets().has_value() && + // graph_view.use_dcs() are both true +}; + template struct topdown_e_op_t { detail::edge_partition_endpoint_property_device_view_t @@ -69,18 +88,25 @@ struct topdown_e_op_t { } }; -template +template struct bottomup_e_op_t { - detail::edge_partition_endpoint_property_device_view_t + __device__ vertex_t operator()( + vertex_t src, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const + { + return dst; + } +}; + +template +struct bottomup_pred_op_t { + detail::edge_partition_endpoint_property_device_view_t prev_visited_flags{}; // visited in the previous iterations vertex_t dst_first{}; - __device__ thrust::optional operator()( + __device__ bool operator()( vertex_t src, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const { - auto dst_offset = 
dst - dst_first; - auto old = prev_visited_flags.get(dst_offset); - return old ? thrust::optional{dst} : thrust::nullopt; + return prev_visited_flags.get(dst - dst_first); } }; @@ -144,14 +170,27 @@ void bfs(raft::handle_t const& handle, if constexpr (GraphViewType::is_multi_gpu) { is_sorted = static_cast(host_scalar_allreduce(handle.get_comms(), static_cast(is_sorted), - raft::comms::op_t::SUM, + raft::comms::op_t::MIN, handle.get_stream())); } - CUGRAPH_EXPECTS( is_sorted, "Invalid input arguments: input sources should be sorted in the non-descending order."); + bool no_duplicates = (static_cast(thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(n_sources), + is_first_in_run_t{sources})) == n_sources); + if constexpr (GraphViewType::is_multi_gpu) { + no_duplicates = static_cast(host_scalar_allreduce(handle.get_comms(), + static_cast(no_duplicates), + raft::comms::op_t::MIN, + handle.get_stream())); + } + CUGRAPH_EXPECTS(no_duplicates, + "Invalid input arguments: input sources should not have duplicates."); + auto num_invalid_vertices = thrust::count_if(handle.get_thrust_policy(), sources, @@ -189,34 +228,119 @@ void bfs(raft::handle_t const& handle, // 3. update meta data for direction optimizing BFS - constexpr edge_t direction_optimizing_alpha = 14; - constexpr vertex_t direction_optimizing_beta = 24; + auto segment_offsets = graph_view.local_vertex_partition_segment_offsets(); + + double direction_optimizing_alpha = + (graph_view.number_of_vertices() > 0) + ? ((static_cast(graph_view.compute_number_of_edges(handle)) / + static_cast(graph_view.number_of_vertices())) * + (1.0 / 3.75) /* tuning parametger */) + : double{1.0}; + constexpr vertex_t direction_optimizing_beta = 24; // tuning parameter - std::optional> out_degrees{std::nullopt}; - std::optional> nzd_unvisited_vertices{std::nullopt}; + std::optional> aux_info{std::nullopt}; if (direction_optimizing) { - out_degrees = graph_view.compute_out_degrees(handle); - nzd_unvisited_vertices = rmm::device_uvector( - graph_view.local_vertex_partition_range_size(), handle.get_stream()); - (*nzd_unvisited_vertices) - .resize(thrust::distance( - (*nzd_unvisited_vertices).begin(), - thrust::copy_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), - thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last()), - (*nzd_unvisited_vertices).begin(), - [vertex_partition, - sources = raft::device_span(sources, n_sources), - out_degrees = raft::device_span( - (*out_degrees).data(), (*out_degrees).size())] __device__(vertex_t v) { - auto v_offset = - vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - return (out_degrees[v_offset] > edge_t{0}) && - !thrust::binary_search(thrust::seq, sources.begin(), sources.end(), v); - })), - handle.get_stream()); - (*nzd_unvisited_vertices).shrink_to_fit(handle.get_stream()); + rmm::device_uvector approx_out_degrees(0, handle.get_stream()); + if (segment_offsets) { // exploit internal knowedge for exhaustive performance optimization for + // large-scale benchmarking (the else path is sufficient for small + // clusters with few tens of GPUs) + size_t partition_idx{0}; + size_t partition_size{1}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + 
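// descriptive note: sample a single edge partition (indexed by this GPU's minor_comm rank)
// and later scale its local degrees by minor_comm_size ("local_degrees => approximate global
// degrees" below); this approximation avoids the exact, all-partition
// graph_view.compute_out_degrees() call used in the else branch.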
partition_idx = static_cast(minor_comm_rank); + partition_size = static_cast(minor_comm_size); + } + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto edge_mask_view = graph_view.edge_mask_view(); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, partition_idx) + : thrust::nullopt; + auto high_and_mid_degree_segment_size = + (*segment_offsets)[2]; // compute local degrees for high & mid degree segments only, for + // low & hypersparse segments, use low_degree_threshold * + // partition_size * 0.5 & partition_size * + // hypersparse_threshold_ratio * 0.5 as approximate out degrees + if (edge_partition_e_mask) { + approx_out_degrees = edge_partition.compute_local_degrees_with_mask( + (*edge_partition_e_mask).value_first(), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()) + + high_and_mid_degree_segment_size, + handle.get_stream()); + } else { + approx_out_degrees = edge_partition.compute_local_degrees( + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()) + + high_and_mid_degree_segment_size, + handle.get_stream()); + } + thrust::transform(handle.get_thrust_policy(), + approx_out_degrees.begin(), + approx_out_degrees.end(), + approx_out_degrees.begin(), + multiplier_t{static_cast( + partition_size)}); // local_degrees => approximate global degrees + } else { + approx_out_degrees = graph_view.compute_out_degrees(handle); // exact + } + + rmm::device_uvector visited_bitmap( + packed_bool_size(graph_view.local_vertex_partition_range_size()), handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + visited_bitmap.begin(), + visited_bitmap.end(), + packed_bool_empty_mask()); + thrust::for_each( + handle.get_thrust_policy(), + sources, + sources + n_sources, + [bitmap = raft::device_span(visited_bitmap.data(), visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(v_offset)]); + word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + }); + + std::optional num_nzd_unvisited_low_degree_vertices{std::nullopt}; + std::optional num_nzd_unvisited_hypersparse_vertices{std::nullopt}; + if (segment_offsets) { + num_nzd_unvisited_low_degree_vertices = (*segment_offsets)[3] - (*segment_offsets)[2]; + if (graph_view.use_dcs()) { + num_nzd_unvisited_hypersparse_vertices = (*segment_offsets)[4] - (*segment_offsets)[3]; + } + if (n_sources > 0) { + std::vector h_sources(n_sources); + raft::update_host(h_sources.data(), sources, n_sources, handle.get_stream()); + handle.sync_stream(); + for (size_t i = 0; i < h_sources.size(); ++i) { + auto v_offset = h_sources[i] - graph_view.local_vertex_partition_range_first(); + if ((v_offset >= (*segment_offsets)[2]) && (v_offset < (*segment_offsets)[3])) { + --(*num_nzd_unvisited_low_degree_vertices); + } else if (graph_view.use_dcs()) { + if ((v_offset >= (*segment_offsets)[3]) && (v_offset < (*segment_offsets)[4])) { + --(*num_nzd_unvisited_hypersparse_vertices); + } + } + } + } + } + + aux_info = + direction_optimizing_info_t{std::move(approx_out_degrees), + std::move(visited_bitmap), + std::nullopt, + 
num_nzd_unvisited_low_degree_vertices, + num_nzd_unvisited_hypersparse_vertices}; } // 4. initialize BFS frontier @@ -237,7 +361,6 @@ void bfs(raft::handle_t const& handle, handle, graph_view); // this may mark some vertices visited in previous iterations as unvisited // (but this is OK as we check prev_dst_visited_flags first) fill_edge_dst_property(handle, graph_view, dst_visited_flags.mutable_view(), false); - fill_edge_dst_property(handle, graph_view, vertex_frontier.bucket(bucket_idx_cur).begin(), @@ -247,12 +370,12 @@ void bfs(raft::handle_t const& handle, // 4. BFS iteration vertex_t depth{0}; - bool top_down = true; - auto cur_aggregate_vertex_frontier_size = + bool topdown = true; + auto cur_aggregate_frontier_size = static_cast(vertex_frontier.bucket(bucket_idx_cur).aggregate_size()); while (true) { - vertex_t next_aggregate_vertex_frontier_size{}; - if (top_down) { + vertex_t next_aggregate_frontier_size{}; + if (topdown) { topdown_e_op_t e_op{}; e_op.prev_visited_flags = detail::edge_partition_endpoint_property_device_view_t( @@ -263,14 +386,15 @@ void bfs(raft::handle_t const& handle, e_op.dst_first = graph_view.local_edge_partition_dst_range_first(); auto [new_frontier_vertex_buffer, predecessor_buffer] = - transform_reduce_v_frontier_outgoing_e_by_dst(handle, - graph_view, - vertex_frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - e_op, - reduce_op::any()); + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op, + reduce_op::any()); auto input_pair_first = thrust::make_zip_iterator(thrust::make_constant_iterator(depth + 1), predecessor_buffer.begin()); @@ -286,9 +410,9 @@ void bfs(raft::handle_t const& handle, key_bucket_t( handle, std::move(new_frontier_vertex_buffer)); - next_aggregate_vertex_frontier_size = + next_aggregate_frontier_size = static_cast(vertex_frontier.bucket(bucket_idx_next).aggregate_size()); - if (next_aggregate_vertex_frontier_size == 0) { break; } + if (next_aggregate_frontier_size == 0) { break; } fill_edge_dst_property(handle, graph_view, @@ -298,65 +422,146 @@ void bfs(raft::handle_t const& handle, true); if (direction_optimizing) { - auto m_f = thrust::transform_reduce( - handle.get_thrust_policy(), - vertex_frontier.bucket(bucket_idx_next).begin(), - vertex_frontier.bucket(bucket_idx_next).end(), - cuda::proclaim_return_type( - [vertex_partition, - out_degrees = raft::device_span( - (*out_degrees).data(), (*out_degrees).size())] __device__(vertex_t v) { - auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - return out_degrees[v_offset]; - }), - edge_t{0}, - thrust::plus{}); + if (vertex_frontier.bucket(bucket_idx_next).size() > 0) { + thrust::for_each( + handle.get_thrust_policy(), + vertex_frontier.bucket(bucket_idx_next).begin(), + vertex_frontier.bucket(bucket_idx_next).end(), + [bitmap = raft::device_span((*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(v_offset)]); + word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + }); + } + double m_f{0.0}; + double m_u{0.0}; { - rmm::device_uvector 
tmp_vertices((*nzd_unvisited_vertices).size(), - handle.get_stream()); - tmp_vertices.resize( - thrust::distance(tmp_vertices.begin(), - thrust::set_difference(handle.get_thrust_policy(), - (*nzd_unvisited_vertices).begin(), - (*nzd_unvisited_vertices).end(), - vertex_frontier.bucket(bucket_idx_next).begin(), - vertex_frontier.bucket(bucket_idx_next).end(), - tmp_vertices.begin())), - handle.get_stream()); - nzd_unvisited_vertices = std::move(tmp_vertices); + size_t partition_size{1}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + partition_size = static_cast(minor_comm_size); + } + + auto f_vertex_first = vertex_frontier.bucket(bucket_idx_next).begin(); + auto f_vertex_last = vertex_frontier.bucket(bucket_idx_next).end(); + + if (segment_offsets) { + // FIXME: this actually over-estimates for graphs with power-law degree distribution + auto approx_low_segment_degree = + static_cast(low_degree_threshold * partition_size) * 0.5; + auto approx_hypersparse_segment_degree = + static_cast(partition_size) * hypersparse_threshold_ratio * 0.5; + auto f_segment_offsets = compute_key_segment_offsets( + vertex_frontier.bucket(bucket_idx_next).begin(), + vertex_frontier.bucket(bucket_idx_next).end(), + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); + *((*aux_info).num_nzd_unvisited_low_degree_vertices) -= + (f_segment_offsets[3] - f_segment_offsets[2]); + if (graph_view.use_dcs()) { + *((*aux_info).num_nzd_unvisited_hypersparse_vertices) -= + (f_segment_offsets[4] - f_segment_offsets[3]); + } + f_vertex_last = f_vertex_first + f_segment_offsets[2]; + m_f = static_cast((f_segment_offsets[3] - f_segment_offsets[2])) * + approx_low_segment_degree; + if (graph_view.use_dcs()) { + m_f += static_cast(f_segment_offsets[4] - f_segment_offsets[3]) * + approx_hypersparse_segment_degree; + } + + m_u = static_cast(*((*aux_info).num_nzd_unvisited_low_degree_vertices)) * + approx_low_segment_degree; + if (graph_view.use_dcs()) { + m_u += static_cast(*((*aux_info).num_nzd_unvisited_hypersparse_vertices)) * + approx_hypersparse_segment_degree; + } + } + + m_f += static_cast(thrust::transform_reduce( + handle.get_thrust_policy(), + f_vertex_first, + f_vertex_last, + cuda::proclaim_return_type( + [out_degrees = raft::device_span((*aux_info).approx_out_degrees.data(), + (*aux_info).approx_out_degrees.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(vertex_t v) { + auto v_offset = v - v_first; + return out_degrees[v_offset]; + }), + edge_t{0}, + thrust::plus{})); + + m_u += static_cast(thrust::transform_reduce( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(segment_offsets + ? 
(*segment_offsets)[2] + : graph_view.local_vertex_partition_range_size()), + cuda::proclaim_return_type( + [out_degrees = raft::device_span((*aux_info).approx_out_degrees.data(), + (*aux_info).approx_out_degrees.size()), + bitmap = raft::device_span( + (*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size())] __device__(vertex_t v_offset) { + auto word = bitmap[packed_bool_offset(v_offset)]; + if ((word & packed_bool_mask(v_offset)) != packed_bool_empty_mask()) { // visited + return edge_t{0}; + } else { + return out_degrees[v_offset]; + } + }), + edge_t{0}, + thrust::plus{})); } - auto m_u = thrust::transform_reduce( - handle.get_thrust_policy(), - (*nzd_unvisited_vertices).begin(), - (*nzd_unvisited_vertices).end(), - cuda::proclaim_return_type( - [vertex_partition, - out_degrees = raft::device_span( - (*out_degrees).data(), (*out_degrees).size())] __device__(vertex_t v) { - auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - return out_degrees[v_offset]; - }), - edge_t{0}, - thrust::plus{}); - auto aggregate_m_f = - GraphViewType::is_multi_gpu - ? host_scalar_allreduce( - handle.get_comms(), m_f, raft::comms::op_t::SUM, handle.get_stream()) - : m_f; - auto aggregate_m_u = - GraphViewType::is_multi_gpu - ? host_scalar_allreduce( - handle.get_comms(), m_u, raft::comms::op_t::SUM, handle.get_stream()) - : m_u; + auto aggregate_m_f = m_f; + auto aggregate_m_u = m_u; + if constexpr (GraphViewType::is_multi_gpu) { + auto tmp = host_scalar_allreduce(handle.get_comms(), + thrust::make_tuple(m_f, m_u), + raft::comms::op_t::SUM, + handle.get_stream()); + aggregate_m_f = thrust::get<0>(tmp); + aggregate_m_u = thrust::get<1>(tmp); + } if ((aggregate_m_f * direction_optimizing_alpha > aggregate_m_u) && - (next_aggregate_vertex_frontier_size >= cur_aggregate_vertex_frontier_size)) { - top_down = false; + (next_aggregate_frontier_size >= cur_aggregate_frontier_size)) { + topdown = false; + (*aux_info).nzd_unvisited_vertices = rmm::device_uvector( + segment_offsets ? *((*segment_offsets).rbegin() + 1) + : graph_view.local_vertex_partition_range_size(), + handle.get_stream()); + (*((*aux_info).nzd_unvisited_vertices)) + .resize( + thrust::distance( + (*((*aux_info).nzd_unvisited_vertices)).begin(), + thrust::copy_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), + thrust::make_counting_iterator( + segment_offsets ? 
graph_view.local_vertex_partition_range_first() + + *((*segment_offsets).rbegin() + 1) + : graph_view.local_vertex_partition_range_last()), + (*((*aux_info).nzd_unvisited_vertices)).begin(), + [bitmap = raft::device_span((*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + auto word = bitmap[packed_bool_offset(v_offset)]; + return ((word & packed_bool_mask(v_offset)) == packed_bool_empty_mask()); + })), + handle.get_stream()); } } - if (top_down) { // staying in top-down + if (topdown) { // staying in top-down vertex_frontier.bucket(bucket_idx_cur) = key_bucket_t(handle); vertex_frontier.swap_buckets(bucket_idx_cur, bucket_idx_next); @@ -364,63 +569,122 @@ void bfs(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_cur) = key_bucket_t( handle, - raft::device_span((*nzd_unvisited_vertices).data(), - (*nzd_unvisited_vertices).size())); + raft::device_span((*((*aux_info).nzd_unvisited_vertices)).data(), + (*((*aux_info).nzd_unvisited_vertices)).size())); vertex_frontier.bucket(bucket_idx_next) = key_bucket_t(handle); } } else { // bottom up - bottomup_e_op_t e_op{}; - e_op.prev_visited_flags = - detail::edge_partition_endpoint_property_device_view_t( - prev_dst_visited_flags.mutable_view()); - e_op.dst_first = graph_view.local_edge_partition_dst_range_first(); - auto [new_frontier_vertex_buffer, predecessor_buffer] = - transform_reduce_v_frontier_outgoing_e_by_src(handle, - graph_view, - vertex_frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - e_op, - reduce_op::any()); + rmm::device_uvector new_frontier_vertex_buffer(0, handle.get_stream()); + { + bottomup_e_op_t e_op{}; + bottomup_pred_op_t pred_op{}; + pred_op.prev_visited_flags = + detail::edge_partition_endpoint_property_device_view_t( + prev_dst_visited_flags.view()); + pred_op.dst_first = graph_view.local_edge_partition_dst_range_first(); + + rmm::device_uvector predecessor_buffer( + vertex_frontier.bucket(bucket_idx_cur).size(), handle.get_stream()); + per_v_transform_reduce_if_outgoing_e(handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op, + invalid_vertex, + reduce_op::any(), + pred_op, + predecessor_buffer.begin(), + true); + auto input_pair_first = thrust::make_zip_iterator(thrust::make_constant_iterator(depth + 1), + predecessor_buffer.begin()); + + // FIXME: this scatter_if and the resize below can be concurrently executed. 
+ thrust::scatter_if( + handle.get_thrust_policy(), + input_pair_first, + input_pair_first + predecessor_buffer.size(), + thrust::make_transform_iterator( + vertex_frontier.bucket(bucket_idx_cur).cbegin(), + detail::shift_left_t{graph_view.local_vertex_partition_range_first()}), + predecessor_buffer.begin(), + thrust::make_zip_iterator(distances, predecessor_first), + detail::is_not_equal_t{invalid_vertex}); + + new_frontier_vertex_buffer.resize(predecessor_buffer.size(), handle.get_stream()); + new_frontier_vertex_buffer.resize( + thrust::distance(new_frontier_vertex_buffer.begin(), + thrust::copy_if(handle.get_thrust_policy(), + vertex_frontier.bucket(bucket_idx_cur).cbegin(), + vertex_frontier.bucket(bucket_idx_cur).cend(), + predecessor_buffer.begin(), + new_frontier_vertex_buffer.begin(), + detail::is_not_equal_t{invalid_vertex})), + handle.get_stream()); - auto input_pair_first = thrust::make_zip_iterator(thrust::make_constant_iterator(depth + 1), - predecessor_buffer.begin()); - thrust::scatter( - handle.get_thrust_policy(), - input_pair_first, - input_pair_first + new_frontier_vertex_buffer.size(), - thrust::make_transform_iterator( + assert(direction_optimizing); + + thrust::for_each( + handle.get_thrust_policy(), new_frontier_vertex_buffer.begin(), - detail::shift_left_t{graph_view.local_vertex_partition_range_first()}), - thrust::make_zip_iterator(distances, predecessor_first)); + new_frontier_vertex_buffer.end(), + [bitmap = raft::device_span((*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(v_offset)]); + word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + }); + (*((*aux_info).nzd_unvisited_vertices)) + .resize( + thrust::distance( + (*((*aux_info).nzd_unvisited_vertices)).begin(), + thrust::remove_if( + handle.get_thrust_policy(), + (*((*aux_info).nzd_unvisited_vertices)).begin(), + (*((*aux_info).nzd_unvisited_vertices)).end(), + [bitmap = raft::device_span((*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + auto word = bitmap[packed_bool_offset(v_offset)]; + return ((word & packed_bool_mask(v_offset)) != packed_bool_empty_mask()); + })), + handle.get_stream()); - assert(direction_optimizing); + if (segment_offsets) { + auto key_segment_offsets = compute_key_segment_offsets( + new_frontier_vertex_buffer.begin(), + new_frontier_vertex_buffer.end(), + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); + *((*aux_info).num_nzd_unvisited_low_degree_vertices) -= + key_segment_offsets[3] - key_segment_offsets[2]; + if (graph_view.use_dcs()) { + *((*aux_info).num_nzd_unvisited_hypersparse_vertices) -= + key_segment_offsets[4] - key_segment_offsets[3]; + } + } + } - { - rmm::device_uvector tmp_vertices((*nzd_unvisited_vertices).size(), - handle.get_stream()); - tmp_vertices.resize( - thrust::distance(tmp_vertices.begin(), - thrust::set_difference(handle.get_thrust_policy(), - (*nzd_unvisited_vertices).begin(), - (*nzd_unvisited_vertices).end(), - new_frontier_vertex_buffer.begin(), - new_frontier_vertex_buffer.end(), - tmp_vertices.begin())), + next_aggregate_frontier_size = static_cast(new_frontier_vertex_buffer.size()); + auto 
aggregate_nzd_unvisited_vertices = + static_cast((*((*aux_info).nzd_unvisited_vertices)).size()); + if constexpr (GraphViewType::is_multi_gpu) { + auto tmp = host_scalar_allreduce( + handle.get_comms(), + thrust::make_tuple(next_aggregate_frontier_size, aggregate_nzd_unvisited_vertices), + raft::comms::op_t::SUM, handle.get_stream()); - nzd_unvisited_vertices = std::move(tmp_vertices); + next_aggregate_frontier_size = thrust::get<0>(tmp); + aggregate_nzd_unvisited_vertices = thrust::get<1>(tmp); } - next_aggregate_vertex_frontier_size = - GraphViewType::is_multi_gpu - ? host_scalar_allreduce(handle.get_comms(), - static_cast(new_frontier_vertex_buffer.size()), - raft::comms::op_t::SUM, - handle.get_stream()) - : static_cast(new_frontier_vertex_buffer.size()); - if (next_aggregate_vertex_frontier_size == 0) { break; } + if (next_aggregate_frontier_size == 0) { break; } fill_edge_dst_property(handle, graph_view, @@ -429,21 +693,13 @@ void bfs(raft::handle_t const& handle, prev_dst_visited_flags.mutable_view(), true); - auto aggregate_nzd_unvisted_vertices = - GraphViewType::is_multi_gpu - ? host_scalar_allreduce(handle.get_comms(), - static_cast((*nzd_unvisited_vertices).size()), - raft::comms::op_t::SUM, - handle.get_stream()) - : static_cast((*nzd_unvisited_vertices).size()); - - if ((next_aggregate_vertex_frontier_size * direction_optimizing_beta < - aggregate_nzd_unvisted_vertices) && - (next_aggregate_vertex_frontier_size < cur_aggregate_vertex_frontier_size)) { - top_down = true; + if ((next_aggregate_frontier_size * direction_optimizing_beta < + aggregate_nzd_unvisited_vertices) && + (next_aggregate_frontier_size < cur_aggregate_frontier_size)) { + topdown = true; } - if (top_down) { // swithcing to top-down + if (topdown) { // swithcing to top-down vertex_frontier.bucket(bucket_idx_cur) = key_bucket_t( handle, std::move(new_frontier_vertex_buffer)); @@ -451,11 +707,11 @@ void bfs(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_cur) = key_bucket_t( handle, - raft::device_span((*nzd_unvisited_vertices).data(), - (*nzd_unvisited_vertices).size())); + raft::device_span((*((*aux_info).nzd_unvisited_vertices)).data(), + ((*(*aux_info).nzd_unvisited_vertices)).size())); } } - cur_aggregate_vertex_frontier_size = next_aggregate_vertex_frontier_size; + cur_aggregate_frontier_size = next_aggregate_frontier_size; depth++; if (depth >= depth_limit) { break; } diff --git a/cpp/src/traversal/extract_bfs_paths_impl.cuh b/cpp/src/traversal/extract_bfs_paths_impl.cuh index 40030e2e39..d228460bec 100644 --- a/cpp/src/traversal/extract_bfs_paths_impl.cuh +++ b/cpp/src/traversal/extract_bfs_paths_impl.cuh @@ -220,11 +220,15 @@ std::tuple, vertex_t> extract_bfs_paths( detail::decrement_position{}); if constexpr (multi_gpu) { - current_frontier = collect_values_for_int_vertices(handle, - current_frontier.begin(), - current_frontier.end(), - predecessors, - h_vertex_partition_range_lasts); + auto& comm = handle.get_comms(); + current_frontier = + collect_values_for_int_vertices(comm, + current_frontier.begin(), + current_frontier.end(), + predecessors, + h_vertex_partition_range_lasts, + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); } else { thrust::transform(handle.get_thrust_policy(), current_frontier.begin(), diff --git a/cpp/src/traversal/k_hop_nbrs_impl.cuh b/cpp/src/traversal/k_hop_nbrs_impl.cuh index acf3cfe8fc..44fa21a525 100644 --- a/cpp/src/traversal/k_hop_nbrs_impl.cuh +++ b/cpp/src/traversal/k_hop_nbrs_impl.cuh @@ -16,7 +16,7 @@ #pragma once #include 
"prims/reduce_op.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/vertex_frontier.cuh" #include @@ -147,15 +147,15 @@ k_hop_nbrs(raft::handle_t const& handle, rmm::device_uvector nbrs(0, handle.get_stream()); for (size_t iter = 0; iter < k; ++iter) { auto new_frontier_key_buffer = - transform_reduce_v_frontier_outgoing_e_by_dst(handle, - push_graph_view, - frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - e_op_t{}, - reduce_op::null{}, - do_expensive_check); + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst(handle, + push_graph_view, + frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op_t{}, + reduce_op::null{}, + do_expensive_check); if (iter < (k - 1)) { frontier.bucket(bucket_idx_cur).clear(); frontier.bucket(bucket_idx_cur) diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index e1b7444b92..b3cd0d57c6 100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -22,7 +22,7 @@ #include "prims/kv_store.cuh" #include "prims/reduce_op.cuh" #include "prims/transform_reduce_e.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -641,7 +641,6 @@ rmm::device_uvector od_shortest_distances( cutoff, invalid_distance}; detail::transform_reduce_v_frontier_call_e_op_t< - false, thrust::tuple, weight_t, vertex_t, @@ -653,8 +652,8 @@ rmm::device_uvector od_shortest_distances( auto new_frontier_tagged_vertex_buffer = allocate_dataframe_buffer>(0, handle.get_stream()); - std::tie(new_frontier_tagged_vertex_buffer, distance_buffer) = detail:: - extract_transform_v_frontier_e, weight_t>( + std::tie(new_frontier_tagged_vertex_buffer, distance_buffer) = + detail::extract_transform_v_frontier_e, weight_t>( handle, graph_view, vertex_frontier.bucket(bucket_idx_near), @@ -675,12 +674,14 @@ rmm::device_uvector od_shortest_distances( resize_dataframe_buffer(new_frontier_tagged_vertex_buffer, 0, handle.get_stream()); shrink_to_fit_dataframe_buffer(new_frontier_tagged_vertex_buffer, handle.get_stream()); - std::tie(new_frontier_keys, distance_buffer) = - detail::sort_and_reduce_buffer_elements>( + std::tie(new_frontier_keys, distance_buffer) = detail:: + sort_and_reduce_buffer_elements>( handle, std::move(new_frontier_keys), std::move(distance_buffer), - reduce_op::minimum()); + reduce_op::minimum(), + std::make_tuple(vertex_t{0}, graph_view.number_of_vertices()), + std::nullopt); } vertex_frontier.bucket(bucket_idx_near).clear(); diff --git a/cpp/src/traversal/sssp_impl.cuh b/cpp/src/traversal/sssp_impl.cuh index 47908524fe..3429672b15 100644 --- a/cpp/src/traversal/sssp_impl.cuh +++ b/cpp/src/traversal/sssp_impl.cuh @@ -19,7 +19,7 @@ #include "prims/fill_edge_src_dst_property.cuh" #include "prims/reduce_op.cuh" #include "prims/transform_reduce_e.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include 
"prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -197,7 +197,7 @@ void sssp(raft::handle_t const& handle, push_graph_view.local_vertex_partition_view()); auto [new_frontier_vertex_buffer, distance_predecessor_buffer] = - transform_reduce_v_frontier_outgoing_e_by_dst( + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( handle, push_graph_view, vertex_frontier.bucket(bucket_idx_cur_near), diff --git a/cpp/src/utilities/collect_comm.cuh b/cpp/src/utilities/collect_comm.cuh index 2197409fe2..dc4267aac5 100644 --- a/cpp/src/utilities/collect_comm.cuh +++ b/cpp/src/utilities/collect_comm.cuh @@ -50,79 +50,73 @@ namespace cugraph { -// for the keys in kv_store_view, key_to_gpu_id_op(key) should coincide with comm.get_rank() -template -decltype(allocate_dataframe_buffer(0, - rmm::cuda_stream_view{})) -collect_values_for_keys(raft::handle_t const& handle, - KVStoreViewType kv_store_view, - KeyIterator collect_key_first, - KeyIterator collect_key_last, - KeyToGPUIdOp key_to_gpu_id_op) +// for the keys in kv_store_view, key_to_comm_rank_op(key) should coincide with comm.get_rank() +template +dataframe_buffer_type_t collect_values_for_keys( + raft::comms::comms_t const& comm, + KVStoreViewType kv_store_view, + KeyIterator collect_key_first, + KeyIterator collect_key_last, + KeyToCommRankOp key_to_comm_rank_op, + rmm::cuda_stream_view stream_view) { using key_t = typename KVStoreViewType::key_type; static_assert(std::is_same_v::value_type, key_t>); using value_t = typename KVStoreViewType::value_type; - auto& comm = handle.get_comms(); - // 1. collect values for the unique keys in [collect_key_first, collect_key_last) rmm::device_uvector unique_keys(thrust::distance(collect_key_first, collect_key_last), - handle.get_stream()); + stream_view); thrust::copy( - handle.get_thrust_policy(), collect_key_first, collect_key_last, unique_keys.begin()); - thrust::sort(handle.get_thrust_policy(), unique_keys.begin(), unique_keys.end()); + rmm::exec_policy_nosync(stream_view), collect_key_first, collect_key_last, unique_keys.begin()); + thrust::sort(rmm::exec_policy_nosync(stream_view), unique_keys.begin(), unique_keys.end()); unique_keys.resize( thrust::distance( unique_keys.begin(), - thrust::unique(handle.get_thrust_policy(), unique_keys.begin(), unique_keys.end())), - handle.get_stream()); + thrust::unique(rmm::exec_policy(stream_view), unique_keys.begin(), unique_keys.end())), + stream_view); - auto values_for_unique_keys = allocate_dataframe_buffer(0, handle.get_stream()); + auto values_for_unique_keys = allocate_dataframe_buffer(0, stream_view); { - rmm::device_uvector rx_unique_keys(0, handle.get_stream()); + rmm::device_uvector rx_unique_keys(0, stream_view); std::vector rx_value_counts{}; std::tie(rx_unique_keys, rx_value_counts) = groupby_gpu_id_and_shuffle_values( comm, unique_keys.begin(), unique_keys.end(), - [key_to_gpu_id_op] __device__(auto val) { return key_to_gpu_id_op(val); }, - handle.get_stream()); + [key_to_comm_rank_op] __device__(auto val) { return key_to_comm_rank_op(val); }, + stream_view); auto values_for_rx_unique_keys = - allocate_dataframe_buffer(rx_unique_keys.size(), handle.get_stream()); + allocate_dataframe_buffer(rx_unique_keys.size(), stream_view); kv_store_view.find(rx_unique_keys.begin(), rx_unique_keys.end(), get_dataframe_buffer_begin(values_for_rx_unique_keys), - handle.get_stream()); + stream_view); - auto rx_values_for_unique_keys = allocate_dataframe_buffer(0, handle.get_stream()); - std::tie(rx_values_for_unique_keys, std::ignore) = - 
shuffle_values(comm, - get_dataframe_buffer_begin(values_for_rx_unique_keys), - rx_value_counts, - handle.get_stream()); + auto rx_values_for_unique_keys = allocate_dataframe_buffer(0, stream_view); + std::tie(rx_values_for_unique_keys, std::ignore) = shuffle_values( + comm, get_dataframe_buffer_begin(values_for_rx_unique_keys), rx_value_counts, stream_view); values_for_unique_keys = std::move(rx_values_for_unique_keys); } // 2. build a kv_store_t object for the k, v pairs in unique_keys, values_for_unique_keys. - kv_store_t unique_key_value_store( - handle.get_stream()); + kv_store_t unique_key_value_store(stream_view); if constexpr (KVStoreViewType::binary_search) { unique_key_value_store = kv_store_t(std::move(unique_keys), std::move(values_for_unique_keys), kv_store_view.invalid_value(), false, - handle.get_stream()); + stream_view); } else { auto kv_pair_first = thrust::make_zip_iterator( thrust::make_tuple(unique_keys.begin(), get_dataframe_buffer_begin(values_for_unique_keys))); auto valid_kv_pair_last = - thrust::remove_if(handle.get_thrust_policy(), + thrust::remove_if(rmm::exec_policy(stream_view), kv_pair_first, kv_pair_first + unique_keys.size(), [invalid_value = kv_store_view.invalid_value()] __device__(auto pair) { @@ -136,176 +130,173 @@ collect_values_for_keys(raft::handle_t const& handle, get_dataframe_buffer_begin(values_for_unique_keys), kv_store_view.invalid_key(), kv_store_view.invalid_value(), - handle.get_stream()); + stream_view); - unique_keys.resize(0, handle.get_stream()); - resize_dataframe_buffer(values_for_unique_keys, 0, handle.get_stream()); - unique_keys.shrink_to_fit(handle.get_stream()); - shrink_to_fit_dataframe_buffer(values_for_unique_keys, handle.get_stream()); + unique_keys.resize(0, stream_view); + resize_dataframe_buffer(values_for_unique_keys, 0, stream_view); + unique_keys.shrink_to_fit(stream_view); + shrink_to_fit_dataframe_buffer(values_for_unique_keys, stream_view); } auto unique_key_value_store_view = unique_key_value_store.view(); // 3. 
find values for [collect_key_first, collect_key_last) auto value_buffer = allocate_dataframe_buffer( - thrust::distance(collect_key_first, collect_key_last), handle.get_stream()); - unique_key_value_store_view.find(collect_key_first, - collect_key_last, - get_dataframe_buffer_begin(value_buffer), - handle.get_stream()); + thrust::distance(collect_key_first, collect_key_last), stream_view); + unique_key_value_store_view.find( + collect_key_first, collect_key_last, get_dataframe_buffer_begin(value_buffer), stream_view); return value_buffer; } -// for the keys in kv_store_view, key_to_gpu_id_op(key) should coincide with comm.get_rank() -template +// for the keys in kv_store_view, key_to_comm_rank_op(key) should coincide with comm.get_rank() +template std::tuple, - decltype(allocate_dataframe_buffer( - 0, cudaStream_t{nullptr}))> + dataframe_buffer_type_t> collect_values_for_unique_keys( - raft::handle_t const& handle, + raft::comms::comms_t const& comm, KVStoreViewType kv_store_view, rmm::device_uvector&& collect_unique_keys, - KeyToGPUIdOp key_to_gpu_id_op) + KeyToCommRankOp key_to_comm_rank_op, + rmm::cuda_stream_view stream_view) { using key_t = typename KVStoreViewType::key_type; using value_t = typename KVStoreViewType::value_type; - auto& comm = handle.get_comms(); - - auto values_for_collect_unique_keys = allocate_dataframe_buffer(0, handle.get_stream()); + auto values_for_collect_unique_keys = allocate_dataframe_buffer(0, stream_view); { auto [rx_unique_keys, rx_value_counts] = groupby_gpu_id_and_shuffle_values( comm, collect_unique_keys.begin(), collect_unique_keys.end(), - [key_to_gpu_id_op] __device__(auto val) { return key_to_gpu_id_op(val); }, - handle.get_stream()); + [key_to_comm_rank_op] __device__(auto val) { return key_to_comm_rank_op(val); }, + stream_view); auto values_for_rx_unique_keys = - allocate_dataframe_buffer(rx_unique_keys.size(), handle.get_stream()); + allocate_dataframe_buffer(rx_unique_keys.size(), stream_view); kv_store_view.find(rx_unique_keys.begin(), rx_unique_keys.end(), get_dataframe_buffer_begin(values_for_rx_unique_keys), - handle.get_stream()); + stream_view); - std::tie(values_for_collect_unique_keys, std::ignore) = - shuffle_values(comm, - get_dataframe_buffer_begin(values_for_rx_unique_keys), - rx_value_counts, - handle.get_stream()); + std::tie(values_for_collect_unique_keys, std::ignore) = shuffle_values( + comm, get_dataframe_buffer_begin(values_for_rx_unique_keys), rx_value_counts, stream_view); } return std::make_tuple(std::move(collect_unique_keys), std::move(values_for_collect_unique_keys)); } template -std::tuple< - rmm::device_uvector, - decltype(allocate_dataframe_buffer::value_type>( - 0, cudaStream_t{nullptr}))> -collect_values_for_unique_int_vertices(raft::handle_t const& handle, - rmm::device_uvector&& collect_unique_int_vertices, - ValueIterator local_value_first, - std::vector const& vertex_partition_range_lasts) +dataframe_buffer_type_t::value_type> +collect_values_for_sorted_unique_int_vertices( + raft::comms::comms_t const& comm, + raft::device_span collect_sorted_unique_int_vertices, + ValueIterator local_value_first, + std::vector const& comm_rank_vertex_partition_range_lasts, + vertex_t local_vertex_partition_range_first, + rmm::cuda_stream_view stream_view) { using value_t = typename thrust::iterator_traits::value_type; - auto& comm = handle.get_comms(); - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_size = major_comm.get_size(); - auto const major_comm_rank = 
major_comm.get_rank(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - auto const minor_comm_rank = minor_comm.get_rank(); + // 1.find tx_counts - // 1. groupby and shuffle internal vertices + rmm::device_uvector d_range_lasts(comm_rank_vertex_partition_range_lasts.size(), + stream_view); + raft::update_device(d_range_lasts.data(), + comm_rank_vertex_partition_range_lasts.data(), + comm_rank_vertex_partition_range_lasts.size(), + stream_view); - rmm::device_uvector d_vertex_partition_range_lasts(vertex_partition_range_lasts.size(), - handle.get_stream()); - raft::update_device(d_vertex_partition_range_lasts.data(), - vertex_partition_range_lasts.data(), - vertex_partition_range_lasts.size(), - handle.get_stream()); + rmm::device_uvector d_offsets(d_range_lasts.size() - 1, stream_view); + thrust::lower_bound(rmm::exec_policy_nosync(stream_view), + collect_sorted_unique_int_vertices.begin(), + collect_sorted_unique_int_vertices.end(), + d_range_lasts.begin(), + d_range_lasts.begin() + (d_range_lasts.size() - 1), + d_offsets.begin()); - auto [rx_int_vertices, rx_int_vertex_counts] = groupby_gpu_id_and_shuffle_values( - comm, - collect_unique_int_vertices.begin(), - collect_unique_int_vertices.end(), - detail::compute_gpu_id_from_int_vertex_t{ - raft::device_span(d_vertex_partition_range_lasts.data(), - d_vertex_partition_range_lasts.size()), - major_comm_size, - minor_comm_size}, - handle.get_stream()); - - // 2: Lookup return values - - auto vertex_partition_id = - partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks( - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank); - auto local_int_vertex_first = - vertex_partition_id == 0 ? vertex_t{0} : vertex_partition_range_lasts[vertex_partition_id - 1]; - - auto value_buffer = - allocate_dataframe_buffer(rx_int_vertices.size(), handle.get_stream()); - thrust::transform(handle.get_thrust_policy(), + std::vector h_offsets(d_offsets.size() + 2); + raft::update_host(h_offsets.data() + 1, d_offsets.data(), d_offsets.size(), stream_view); + h_offsets[0] = 0; + h_offsets.back() = collect_sorted_unique_int_vertices.size(); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view)); + + std::vector tx_counts(comm_rank_vertex_partition_range_lasts.size()); + std::adjacent_difference(h_offsets.begin() + 1, h_offsets.end(), tx_counts.begin()); + + // 2. shuffle sorted unique internal vertices to the owning ranks + + auto [rx_int_vertices, rx_counts] = + shuffle_values(comm, collect_sorted_unique_int_vertices.begin(), tx_counts, stream_view); + + // 3.Lookup return values + + auto value_buffer = allocate_dataframe_buffer(rx_int_vertices.size(), stream_view); + thrust::transform(rmm::exec_policy_nosync(stream_view), rx_int_vertices.begin(), rx_int_vertices.end(), get_dataframe_buffer_begin(value_buffer), - [local_value_first, local_int_vertex_first] __device__(auto v) { - return local_value_first[v - local_int_vertex_first]; + [local_value_first, local_vertex_partition_range_first] __device__(auto v) { + return local_value_first[v - local_vertex_partition_range_first]; }); + rx_int_vertices.resize(0, stream_view); + rx_int_vertices.shrink_to_fit(stream_view); - // 3: Shuffle results back to original GPU + // 4. 
Shuffle results back to the original ranks - std::tie(value_buffer, std::ignore) = shuffle_values( - comm, get_dataframe_buffer_begin(value_buffer), rx_int_vertex_counts, handle.get_stream()); + std::tie(value_buffer, std::ignore) = + shuffle_values(comm, get_dataframe_buffer_begin(value_buffer), rx_counts, stream_view); - return std::make_tuple(std::move(collect_unique_int_vertices), std::move(value_buffer)); + return value_buffer; } template -decltype(allocate_dataframe_buffer::value_type>( - 0, cudaStream_t{nullptr})) +dataframe_buffer_type_t::value_type> collect_values_for_int_vertices( - raft::handle_t const& handle, + raft::comms::comms_t const& comm, VertexIterator collect_vertex_first, VertexIterator collect_vertex_last, ValueIterator local_value_first, std::vector::value_type> const& - vertex_partition_range_lasts) + comm_rank_vertex_partition_range_lasts, + typename thrust::iterator_traits::value_type local_vertex_partition_range_first, + rmm::cuda_stream_view stream_view) { using vertex_t = typename thrust::iterator_traits::value_type; using value_t = typename thrust::iterator_traits::value_type; size_t input_size = thrust::distance(collect_vertex_first, collect_vertex_last); - rmm::device_uvector sorted_unique_int_vertices(input_size, handle.get_stream()); + rmm::device_uvector sorted_unique_int_vertices(input_size, stream_view); - raft::copy( - sorted_unique_int_vertices.data(), collect_vertex_first, input_size, handle.get_stream()); + raft::copy(sorted_unique_int_vertices.data(), collect_vertex_first, input_size, stream_view); - thrust::sort(handle.get_thrust_policy(), + thrust::sort(rmm::exec_policy_nosync(stream_view), sorted_unique_int_vertices.begin(), sorted_unique_int_vertices.end()); - auto last = thrust::unique(handle.get_thrust_policy(), + auto last = thrust::unique(rmm::exec_policy(stream_view), sorted_unique_int_vertices.begin(), sorted_unique_int_vertices.end()); sorted_unique_int_vertices.resize(thrust::distance(sorted_unique_int_vertices.begin(), last), - handle.get_stream()); - - auto [unique_int_vertices, tmp_value_buffer] = collect_values_for_unique_int_vertices( - handle, std::move(sorted_unique_int_vertices), local_value_first, vertex_partition_range_lasts); + stream_view); - kv_store_t kv_map(std::move(unique_int_vertices), + auto tmp_value_buffer = collect_values_for_sorted_unique_int_vertices( + comm, + raft::device_span(sorted_unique_int_vertices.data(), + sorted_unique_int_vertices.size()), + local_value_first, + comm_rank_vertex_partition_range_lasts, + local_vertex_partition_range_first, + stream_view); + + kv_store_t kv_map(std::move(sorted_unique_int_vertices), std::move(tmp_value_buffer), invalid_vertex_id::value, false, - handle.get_stream()); + stream_view); auto device_view = detail::kv_binary_search_store_device_view_t(kv_map.view()); - auto value_buffer = allocate_dataframe_buffer(input_size, handle.get_stream()); - thrust::transform(handle.get_thrust_policy(), + auto value_buffer = allocate_dataframe_buffer(input_size, stream_view); + thrust::transform(rmm::exec_policy_nosync(stream_view), collect_vertex_first, collect_vertex_last, get_dataframe_buffer_begin(value_buffer), diff --git a/cpp/src/utilities/shuffle_vertex_pairs.cuh b/cpp/src/utilities/shuffle_vertex_pairs.cuh index 70327db5ff..1cf2493cd2 100644 --- a/cpp/src/utilities/shuffle_vertex_pairs.cuh +++ b/cpp/src/utilities/shuffle_vertex_pairs.cuh @@ -61,10 +61,10 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl( (edge_ids ? sizeof(edge_t) : size_t{0}) + (edge_types ? 
sizeof(edge_type_t) : size_t{0}); auto constexpr mem_frugal_ratio = - 0.1; // if the expected temporary buffer size exceeds the mem_frugal_ratio of the - // total_global_mem, switch to the memory frugal approach (thrust::sort is used to - // group-by by default, and thrust::sort requires temporary buffer comparable to the input - // data size) + 0.05; // if the expected temporary buffer size exceeds the mem_frugal_ratio of the + // total_global_mem, switch to the memory frugal approach (thrust::sort is used to + // group-by by default, and thrust::sort requires temporary buffer comparable to the + // input data size) auto mem_frugal_threshold = static_cast(static_cast(total_global_mem / element_size) * mem_frugal_ratio); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a2eeafea8c..44963f9151 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -698,9 +698,9 @@ if(BUILD_CUGRAPH_MG_TESTS) ConfigureTestMG(MG_COUNT_IF_V_TEST prims/mg_count_if_v.cu) ############################################################################################### - # - MG PRIMS TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_SRC_DST tests -------------------------- - ConfigureTestMG(MG_TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_SRC_DST_TEST - prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu) + # - MG PRIMS TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_DST tests ------------------------------ + ConfigureTestMG(MG_TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_DST_TEST + prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu) ############################################################################################### # - MG PRIMS REDUCE_V tests ------------------------------------------------------------------- diff --git a/cpp/tests/c_api/mg_test_utils.h b/cpp/tests/c_api/mg_test_utils.h index b040d8dc52..12ca16bfe3 100644 --- a/cpp/tests/c_api/mg_test_utils.h +++ b/cpp/tests/c_api/mg_test_utils.h @@ -36,6 +36,15 @@ } \ } while (0) +#define C_NCCL_TRY(call) \ + do { \ + ncclResult_t status = call; \ + if (ncclSuccess != status) { \ + printf("NCCL call='%s' at file=%s line=%d failed.", #call, __FILE__, __LINE__); \ + exit(1); \ + } \ + } while (0) + #define C_CUDA_TRY(call) \ do { \ cudaError_t const status = call; \ diff --git a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu similarity index 74% rename from cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu rename to cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu index 9b7e24856f..085077017b 100644 --- a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu +++ b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/vertex_frontier.cuh" #include "utilities/base_fixture.hpp" #include "utilities/conversion_utilities.hpp" @@ -203,48 +203,7 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement handle_->get_comms().barrier(); - hr_timer.start("MG transform_reduce_v_frontier_outgoing_e_by_src"); - } - - auto mg_reduce_by_src_new_frontier_key_buffer = - cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); - [[maybe_unused]] auto mg_reduce_by_src_payload_buffer = - cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); - - if constexpr (std::is_same_v) { - mg_reduce_by_src_new_frontier_key_buffer = - cugraph::transform_reduce_v_frontier_outgoing_e_by_src( - *handle_, - mg_graph_view, - mg_vertex_frontier.bucket(bucket_idx_cur), - mg_src_prop.view(), - mg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::null{}); - } else { - std::tie(mg_reduce_by_src_new_frontier_key_buffer, mg_reduce_by_src_payload_buffer) = - cugraph::transform_reduce_v_frontier_outgoing_e_by_src( - *handle_, - mg_graph_view, - mg_vertex_frontier.bucket(bucket_idx_cur), - mg_src_prop.view(), - mg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::plus{}); - } - - if (cugraph::test::g_perf) { - RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - handle_->get_comms().barrier(); - hr_timer.stop(); - hr_timer.display_and_clear(std::cout); - } - if (cugraph::test::g_perf) { - RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - handle_->get_comms().barrier(); - hr_timer.start("MG transform_reduce_v_frontier_outgoing_e_by_src"); + hr_timer.start("MG transform_reduce_v_frontier_outgoing_e_by_dst"); } auto mg_reduce_by_dst_new_frontier_key_buffer = @@ -286,64 +245,27 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst // 3. 
compare SG & MG results if (prims_usecase.check_correctness) { - if constexpr (std::is_same_v) { - cugraph::unrenumber_int_vertices( - *handle_, - mg_reduce_by_src_new_frontier_key_buffer.begin(), - mg_reduce_by_src_new_frontier_key_buffer.size(), - (*mg_renumber_map).data(), - mg_graph_view.vertex_partition_range_lasts()); - - cugraph::unrenumber_int_vertices( - *handle_, - mg_reduce_by_dst_new_frontier_key_buffer.begin(), - mg_reduce_by_dst_new_frontier_key_buffer.size(), - (*mg_renumber_map).data(), - mg_graph_view.vertex_partition_range_lasts()); - } else { - cugraph::unrenumber_int_vertices( - *handle_, - std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).begin(), - std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).size(), - (*mg_renumber_map).data(), - mg_graph_view.vertex_partition_range_lasts()); - - cugraph::unrenumber_int_vertices( - *handle_, - std::get<0>(mg_reduce_by_dst_new_frontier_key_buffer).begin(), - std::get<0>(mg_reduce_by_dst_new_frontier_key_buffer).size(), - (*mg_renumber_map).data(), - mg_graph_view.vertex_partition_range_lasts()); - } - - auto mg_reduce_by_src_aggregate_new_frontier_key_buffer = - cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); - if constexpr (std::is_same_v) { - mg_reduce_by_src_aggregate_new_frontier_key_buffer = - cugraph::test::device_gatherv(*handle_, - mg_reduce_by_src_new_frontier_key_buffer.data(), - mg_reduce_by_src_new_frontier_key_buffer.size()); - } else { - std::get<0>(mg_reduce_by_src_aggregate_new_frontier_key_buffer) = - cugraph::test::device_gatherv( - *handle_, - std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).data(), - std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).size()); - std::get<1>(mg_reduce_by_src_aggregate_new_frontier_key_buffer) = - cugraph::test::device_gatherv( - *handle_, - std::get<1>(mg_reduce_by_src_new_frontier_key_buffer).data(), - std::get<1>(mg_reduce_by_src_new_frontier_key_buffer).size()); - } - auto mg_reduce_by_dst_aggregate_new_frontier_key_buffer = cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); if constexpr (std::is_same_v) { + cugraph::unrenumber_local_int_vertices(*handle_, + mg_reduce_by_dst_new_frontier_key_buffer.data(), + mg_reduce_by_dst_new_frontier_key_buffer.size(), + (*mg_renumber_map).data(), + mg_graph_view.local_vertex_partition_range_first(), + mg_graph_view.local_vertex_partition_range_last()); mg_reduce_by_dst_aggregate_new_frontier_key_buffer = cugraph::test::device_gatherv(*handle_, mg_reduce_by_dst_new_frontier_key_buffer.data(), mg_reduce_by_dst_new_frontier_key_buffer.size()); } else { + cugraph::unrenumber_local_int_vertices( + *handle_, + std::get<0>(mg_reduce_by_dst_new_frontier_key_buffer).data(), + std::get<0>(mg_reduce_by_dst_new_frontier_key_buffer).size(), + (*mg_renumber_map).data(), + mg_graph_view.local_vertex_partition_range_first(), + mg_graph_view.local_vertex_partition_range_last()); std::get<0>(mg_reduce_by_dst_aggregate_new_frontier_key_buffer) = cugraph::test::device_gatherv( *handle_, @@ -356,26 +278,6 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst std::get<1>(mg_reduce_by_dst_new_frontier_key_buffer).size()); } - [[maybe_unused]] auto mg_reduce_by_src_aggregate_payload_buffer = - cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); - if constexpr (!std::is_same_v) { - if constexpr (std::is_arithmetic_v) { - mg_reduce_by_src_aggregate_payload_buffer = - cugraph::test::device_gatherv(*handle_, - mg_reduce_by_src_payload_buffer.data(), - mg_reduce_by_src_payload_buffer.size()); - 
} else { - std::get<0>(mg_reduce_by_src_aggregate_payload_buffer) = - cugraph::test::device_gatherv(*handle_, - std::get<0>(mg_reduce_by_src_payload_buffer).data(), - std::get<0>(mg_reduce_by_src_payload_buffer).size()); - std::get<1>(mg_reduce_by_src_aggregate_payload_buffer) = - cugraph::test::device_gatherv(*handle_, - std::get<1>(mg_reduce_by_src_payload_buffer).data(), - std::get<1>(mg_reduce_by_src_payload_buffer).size()); - } - } - [[maybe_unused]] auto mg_reduce_by_dst_aggregate_payload_buffer = cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); if constexpr (!std::is_same_v) { @@ -409,22 +311,11 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst if (handle_->get_comms().get_rank() == int{0}) { if constexpr (std::is_same_v) { - thrust::sort( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(mg_reduce_by_src_aggregate_new_frontier_key_buffer)); - thrust::sort( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(mg_reduce_by_dst_aggregate_new_frontier_key_buffer), cugraph::get_dataframe_buffer_end(mg_reduce_by_dst_aggregate_new_frontier_key_buffer)); } else { - thrust::sort_by_key( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(mg_reduce_by_src_aggregate_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_payload_buffer)); - thrust::sort_by_key( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(mg_reduce_by_dst_aggregate_new_frontier_key_buffer), @@ -471,34 +362,6 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst .insert(cugraph::get_dataframe_buffer_begin(sg_key_buffer), cugraph::get_dataframe_buffer_end(sg_key_buffer)); - auto sg_reduce_by_src_new_frontier_key_buffer = - cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); - [[maybe_unused]] auto sg_reduce_by_src_payload_buffer = - cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); - if constexpr (std::is_same_v) { - sg_reduce_by_src_new_frontier_key_buffer = - cugraph::transform_reduce_v_frontier_outgoing_e_by_src( - *handle_, - sg_graph_view, - sg_vertex_frontier.bucket(bucket_idx_cur), - sg_src_prop.view(), - sg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::null{}); - } else { - std::tie(sg_reduce_by_src_new_frontier_key_buffer, sg_reduce_by_src_payload_buffer) = - cugraph::transform_reduce_v_frontier_outgoing_e_by_src( - *handle_, - sg_graph_view, - sg_vertex_frontier.bucket(bucket_idx_cur), - sg_src_prop.view(), - sg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::plus{}); - } - auto sg_reduce_by_dst_new_frontier_key_buffer = cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); [[maybe_unused]] auto sg_reduce_by_dst_payload_buffer = @@ -528,22 +391,11 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst } if constexpr (std::is_same_v) { - thrust::sort( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(sg_reduce_by_src_new_frontier_key_buffer)); - thrust::sort( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_new_frontier_key_buffer), cugraph::get_dataframe_buffer_end(sg_reduce_by_dst_new_frontier_key_buffer)); } else { - 
thrust::sort_by_key( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_payload_buffer)); - thrust::sort_by_key( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_new_frontier_key_buffer), @@ -551,14 +403,7 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_payload_buffer)); } - bool key_passed = thrust::equal( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_new_frontier_key_buffer)); - ASSERT_TRUE(key_passed); - - key_passed = thrust::equal( + auto key_passed = thrust::equal( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_new_frontier_key_buffer), cugraph::get_dataframe_buffer_end(sg_reduce_by_dst_new_frontier_key_buffer), @@ -567,13 +412,6 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst if constexpr (!std::is_same_v) { bool payload_passed = thrust::equal( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_payload_buffer), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_payload_buffer), - cugraph::get_dataframe_buffer_end(mg_reduce_by_src_aggregate_payload_buffer)); - ASSERT_TRUE(payload_passed); - - payload_passed = thrust::equal( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_payload_buffer), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_payload_buffer), diff --git a/cpp/tests/traversal/mg_bfs_test.cpp b/cpp/tests/traversal/mg_bfs_test.cpp index 4d4b83e275..3cd712798e 100644 --- a/cpp/tests/traversal/mg_bfs_test.cpp +++ b/cpp/tests/traversal/mg_bfs_test.cpp @@ -100,16 +100,6 @@ class Tests_MGBFS : public ::testing::TestWithParam initialize_mg_handle(size_t pool_size = 64); +std::unique_ptr initialize_mg_handle( + size_t pool_size = 8 /* default value of CUDA_DEVICE_MAX_CONNECTIONS */); // NCCL lazily initializes for P2P, and this enforces P2P initialization for better performance // measurements diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index 0a706d1cf8..ed96ba2391 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -24,7 +24,6 @@ #include #include #include -#include // legacy coo_to_csr #include @@ -234,7 +233,8 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { construct_edgelist(raft::handle_t const& handle, bool test_weighted, bool store_transposed, - bool multi_gpu) const + bool multi_gpu, + bool shuffle = true) const { CUGRAPH_EXPECTS( (size_t{1} << scale_) <= static_cast(std::numeric_limits::max()), @@ -246,7 +246,7 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { // cuMemAddressReserve // (https://developer.nvidia.com/blog/introducing-low-level-gpu-virtual-memory-management), we // can reduce the temporary memory requirement to (1 / num_partitions) * (original data size) - size_t constexpr num_partitions_per_gpu = 4; + size_t constexpr num_partitions_per_gpu = 8; size_t num_partitions = num_partitions_per_gpu * static_cast(multi_gpu ? 
handle.get_comms().get_size() : 1); @@ -330,7 +330,7 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { handle, std::move(tmp_src_v), std::move(tmp_dst_v), std::move(tmp_weights_v)); } - if (multi_gpu) { + if (multi_gpu && shuffle) { std::tie(store_transposed ? tmp_dst_v : tmp_src_v, store_transposed ? tmp_src_v : tmp_dst_v, tmp_weights_v, @@ -375,7 +375,7 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { translate(handle, vertex_v); - if (multi_gpu) { + if (multi_gpu && shuffle) { vertex_v = cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning( handle, std::move(vertex_v)); } @@ -391,6 +391,8 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { void set_edge_factor(size_t edge_factor) { edge_factor_ = edge_factor; } + bool undirected() const { return undirected_; } + private: size_t scale_{}; size_t edge_factor_{}; @@ -762,39 +764,5 @@ construct_graph(raft::handle_t const& handle, return std::make_tuple(std::move(graph), std::move(edge_weights), std::move(renumber_map)); } -namespace legacy { - -template -std::unique_ptr> construct_graph_csr( - raft::handle_t const& handle, input_usecase_t const& input_usecase, bool test_weighted) -{ - auto [d_src_v, d_dst_v, d_weight_v, d_vertices_v, is_symmetric] = - input_usecase.template construct_edgelist( - handle, test_weighted, false, false); - vertex_t num_vertices{}; // assuming that vertex IDs are non-negative consecutive integers - if (d_vertices_v) { - num_vertices = - max_element( - handle, raft::device_span((*d_vertices_v).data(), (*d_vertices_v).size())) + - 1; - } else { - num_vertices = - std::max( - max_element(handle, raft::device_span(d_src_v.data(), d_src_v.size())), - max_element(handle, raft::device_span(d_dst_v.data(), d_dst_v.size()))) + - 1; - } - - cugraph::legacy::GraphCOOView cooview( - d_src_v.data(), - d_dst_v.data(), - d_weight_v ? d_weight_v->data() : nullptr, - num_vertices, - static_cast(d_src_v.size())); - - return cugraph::coo_to_csr(cooview); -} - -} // namespace legacy } // namespace test } // namespace cugraph From a97775568159d054cba05ed85f885cf2ae765409 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com> Date: Mon, 25 Nov 2024 10:17:29 -0800 Subject: [PATCH 4/7] Fix debug build failure (#4774) cugraph is no longer dependent on cugraph_ops and no longer pulls files from the cugrpahops repo. But we have two `assert` statements still assuming cugrpahops files are available. These assert statements are compiled only in the debug mode. This PR fixes build errors due to these assert statements in the debug build. 
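The debug-only failure mode comes from how `assert` behaves under `NDEBUG`: in release builds the macro discards its argument, so an expression that references a header or identifier from the dropped dependency never reaches the compiler, while in debug builds the expression is compiled and the stale reference becomes a hard error. Below is a minimal, self-contained sketch (not cugraph code; the identifier named in the commented-out line is hypothetical) illustrating the mechanism:

```cpp
// Build with -DNDEBUG (release): assert expands to ((void)0), so the expression
// inside is never compiled and a stale identifier there goes unnoticed.
// Build without -DNDEBUG (debug): the expression is compiled, and a missing
// identifier is a hard compile error. Uncomment the marked line to reproduce.
#include <cassert>
#include <cstdio>

int main() {
  int x = 42;
  assert(x == 42 && "only checked in debug builds");
  // assert(x != some_constant_from_a_removed_dependency);  // breaks debug builds only
#ifdef NDEBUG
  std::puts("release build: asserts compiled out");
#else
  std::puts("debug build: asserts active");
#endif
  return 0;
}
```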
Closes #4763 Authors: - Seunghwa Kang (https://github.com/seunghwak) - Chuck Hastings (https://github.com/ChuckHastings) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) - Joseph Nke (https://github.com/jnke2016) URL: https://github.com/rapidsai/cugraph/pull/4774 --- .../prims/detail/sample_and_compute_local_nbr_indices.cuh | 7 +------ cpp/src/sampling/neighbor_sampling_impl.hpp | 2 +- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh index 5ebc3dc8ae..dd0da77851 100644 --- a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh +++ b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh @@ -597,8 +597,6 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( raft::random::RngState& rng_state, size_t K) { - assert(cugraph::invalid_edge_id_v == cugraph::ops::graph::INVALID_ID); - edge_t mid_partition_degree_range_last = static_cast(K * 10); // tuning parameter assert(mid_partition_degree_range_last > K); size_t high_partition_oversampling_K = K * 2; // tuning parameter @@ -1567,10 +1565,7 @@ uniform_sample_and_compute_local_nbr_indices( size_t K, bool with_replacement) { - using edge_t = typename GraphViewType::edge_type; - - assert(cugraph::invalid_edge_id_v == cugraph::ops::graph::INVALID_ID); - + using edge_t = typename GraphViewType::edge_type; using vertex_t = typename GraphViewType::vertex_type; using key_t = typename thrust::iterator_traits::value_type; diff --git a/cpp/src/sampling/neighbor_sampling_impl.hpp b/cpp/src/sampling/neighbor_sampling_impl.hpp index ccca71cdf2..ce580ea9b4 100644 --- a/cpp/src/sampling/neighbor_sampling_impl.hpp +++ b/cpp/src/sampling/neighbor_sampling_impl.hpp @@ -179,7 +179,7 @@ neighbor_sample_impl(raft::handle_t const& handle, ? (fan_out.size() / num_edge_types) : ((fan_out.size() / num_edge_types) + 1); - for (auto hop = 0; hop < num_hops; hop++) { + for (size_t hop = 0; hop < num_hops; ++hop) { for (auto edge_type_id = 0; edge_type_id < num_edge_types; edge_type_id++) { auto k_level = fan_out[(hop * num_edge_types) + edge_type_id]; rmm::device_uvector srcs(0, handle.get_stream()); From e155a8f2e52ca71a89ac07f7fc205ebe4e062d55 Mon Sep 17 00:00:00 2001 From: Rick Ratzel <3039903+rlratzel@users.noreply.github.com> Date: Mon, 25 Nov 2024 16:18:34 -0600 Subject: [PATCH 5/7] Updates READMEs, updates `core_number` to properly ignore `degree_type`, minor cleanup (#4776) * updates READMEs to remove outdated nx-cugraph text * updates `core_number` docs, APIs, tests to properly ignore `degree_type` due to `core_number` not supporting directed graphs which `degree_type` is intended for - `degree_type` settings will be honored when directed graphs are supported. * renames test helper function for clarity * fixes issue with datasets API to properly recreate the edgelist for MG (dask) if previously created for SG. 
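The datasets fix in the last bullet hinges on checking the cached edgelist's type before reusing it on the MG path, as the `dataset.py` hunk later in this patch shows. Below is a minimal sketch (a hypothetical `DatasetSketch` class, not cugraph's actual `Dataset` implementation) of that guard pattern:

```python
# Sketch of the guard: the MG path re-creates the edgelist whenever the cached
# object is not a dask_cudf.DataFrame, e.g. because the SG path cached a plain
# cudf.DataFrame first.
import cudf
import dask_cudf


class DatasetSketch:
    def __init__(self, csv_path):
        self._csv_path = csv_path
        self._edgelist = None  # may hold a cudf or dask_cudf DataFrame

    def get_edgelist(self):
        # SG path: caches a cudf.DataFrame.
        if self._edgelist is None:
            self._edgelist = cudf.read_csv(
                self._csv_path, names=["src", "dst", "wgt"]
            )
        return self._edgelist

    def get_dask_edgelist(self):
        # MG path: an edgelist cached by the SG path must not be reused here.
        if self._edgelist is None or not isinstance(
            self._edgelist, dask_cudf.DataFrame
        ):
            self._edgelist = dask_cudf.read_csv(
                self._csv_path, names=["src", "dst", "wgt"]
            )
        return self._edgelist
```

With this guard, calling the SG path first (which caches a `cudf.DataFrame`) no longer prevents a later MG call from building a proper `dask_cudf.DataFrame`.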
Authors: - Rick Ratzel (https://github.com/rlratzel) Approvers: - Don Acosta (https://github.com/acostadon) - Alex Barghi (https://github.com/alexbarghi-nv) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/4776 --- README.md | 10 ----- python/cugraph/cugraph/cores/core_number.py | 41 +++++++++++-------- .../cugraph/cugraph/dask/cores/core_number.py | 29 +++++++------ python/cugraph/cugraph/datasets/dataset.py | 4 +- .../cugraph/tests/core/test_core_number.py | 24 ++++++----- .../cugraph/tests/core/test_core_number_mg.py | 33 ++++++--------- .../tests/link_prediction/test_jaccard.py | 22 +++++----- .../pylibcugraph/pylibcugraph/core_number.pyx | 24 +++++------ 8 files changed, 90 insertions(+), 97 deletions(-) diff --git a/README.md b/README.md index 857406075e..f77b5702f7 100644 --- a/README.md +++ b/README.md @@ -34,16 +34,6 @@ ------ -## News - -___NEW!___ _[nx-cugraph](https://rapids.ai/nx-cugraph/)_, a NetworkX backend that provides GPU acceleration to NetworkX with zero code change. -``` -> pip install nx-cugraph-cu11 --extra-index-url https://pypi.nvidia.com -> export NETWORKX_AUTOMATIC_BACKENDS=cugraph -``` -That's it. NetworkX now leverages cuGraph for accelerated graph algorithms. - ----- ## Table of contents diff --git a/python/cugraph/cugraph/cores/core_number.py b/python/cugraph/cugraph/cores/core_number.py index 0b411c2eed..d84069ddec 100644 --- a/python/cugraph/cugraph/cores/core_number.py +++ b/python/cugraph/cugraph/cores/core_number.py @@ -23,19 +23,16 @@ def core_number(G, degree_type="bidirectional"): """ Compute the core numbers for the nodes of the graph G. A k-core of a graph - is a maximal subgraph that contains nodes of degree k or more. - A node has a core number of k if it belongs a k-core but not to k+1-core. - This call does not support a graph with self-loops and parallel - edges. + is a maximal subgraph that contains nodes of degree k or more. A node has + a core number of k if it belongs to a k-core but not to k+1-core. This + call does not support a graph with self-loops and parallel edges. Parameters ---------- G : cuGraph.Graph or networkx.Graph - The graph should contain undirected edges where undirected edges are - represented as directed edges in both directions. While this graph - can contain edge weights, they don't participate in the calculation + The current implementation only supports undirected graphs. The graph + can contain edge weights, but they don't participate in the calculation of the core numbers. - The current implementation only supports undirected graphs. .. deprecated:: 24.12 Accepting a ``networkx.Graph`` is deprecated and will be removed in a @@ -43,9 +40,10 @@ def core_number(G, degree_type="bidirectional"): the ``nx-cugraph`` backend. See: https://rapids.ai/nx-cugraph/ degree_type: str, (default="bidirectional") - This option determines if the core number computation should be based - on input, output, or both directed edges, with valid values being - "incoming", "outgoing", and "bidirectional" respectively. + This option is currently ignored. This option may eventually determine + if the core number computation should be based on input, output, or + both directed edges, with valid values being "incoming", "outgoing", + and "bidirectional" respectively. 
Returns ------- @@ -63,7 +61,13 @@ def core_number(G, degree_type="bidirectional"): >>> from cugraph.datasets import karate >>> G = karate.get_graph(download=True) >>> df = cugraph.core_number(G) - + >>> df.head() + vertex core_number + 0 33 4 + 1 0 4 + 2 32 4 + 3 2 4 + 4 1 4 """ G, isNx = ensure_cugraph_obj_for_nx(G) @@ -71,11 +75,14 @@ def core_number(G, degree_type="bidirectional"): if G.is_directed(): raise ValueError("input graph must be undirected") - if degree_type not in ["incoming", "outgoing", "bidirectional"]: - raise ValueError( - f"'degree_type' must be either incoming, " - f"outgoing or bidirectional, got: {degree_type}" - ) + # degree_type is currently ignored until libcugraph supports directed + # graphs for core_number. Once supporteed, degree_type should be checked + # like so: + # if degree_type not in ["incoming", "outgoing", "bidirectional"]: + # raise ValueError( + # f"'degree_type' must be either incoming, " + # f"outgoing or bidirectional, got: {degree_type}" + # ) vertex, core_number = pylibcugraph_core_number( resource_handle=ResourceHandle(), diff --git a/python/cugraph/cugraph/dask/cores/core_number.py b/python/cugraph/cugraph/dask/cores/core_number.py index 4ae1fb547d..3266348f73 100644 --- a/python/cugraph/cugraph/dask/cores/core_number.py +++ b/python/cugraph/cugraph/dask/cores/core_number.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -53,15 +53,15 @@ def core_number(input_graph, degree_type="bidirectional"): Parameters ---------- input_graph : cugraph.graph - cuGraph graph descriptor, should contain the connectivity information, - (edge weights are not used in this algorithm). - The current implementation only supports undirected graphs. + The current implementation only supports undirected graphs. The graph + can contain edge weights, but they don't participate in the calculation + of the core numbers. degree_type: str, (default="bidirectional") - This option determines if the core number computation should be based - on input, output, or both directed edges, with valid values being - "incoming", "outgoing", and "bidirectional" respectively. - + This option is currently ignored. This option may eventually determine + if the core number computation should be based on input, output, or + both directed edges, with valid values being "incoming", "outgoing", + and "bidirectional" respectively. Returns ------- @@ -77,11 +77,14 @@ def core_number(input_graph, degree_type="bidirectional"): if input_graph.is_directed(): raise ValueError("input graph must be undirected") - if degree_type not in ["incoming", "outgoing", "bidirectional"]: - raise ValueError( - f"'degree_type' must be either incoming, " - f"outgoing or bidirectional, got: {degree_type}" - ) + # degree_type is currently ignored until libcugraph supports directed + # graphs for core_number. 
Once supported, degree_type should be checked + # like so: + # if degree_type not in ["incoming", "outgoing", "bidirectional"]: + # raise ValueError( + # f"'degree_type' must be either incoming, " + # f"outgoing or bidirectional, got: {degree_type}" + # ) # Initialize dask client client = default_client() diff --git a/python/cugraph/cugraph/datasets/dataset.py b/python/cugraph/cugraph/datasets/dataset.py index 15c30700fc..63389cbc16 100644 --- a/python/cugraph/cugraph/datasets/dataset.py +++ b/python/cugraph/cugraph/datasets/dataset.py @@ -352,7 +352,9 @@ def get_dask_graph( If True, stores the transpose of the adjacency matrix. Required for certain algorithms. """ - if self._edgelist is None: + if self._edgelist is None or not isinstance( + self._edgelist, dask_cudf.DataFrame + ): self.get_dask_edgelist(download=download) if create_using is None: diff --git a/python/cugraph/cugraph/tests/core/test_core_number.py b/python/cugraph/cugraph/tests/core/test_core_number.py index a01b837ff6..b50e60ceb8 100644 --- a/python/cugraph/cugraph/tests/core/test_core_number.py +++ b/python/cugraph/cugraph/tests/core/test_core_number.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -32,11 +32,15 @@ def setup_function(): # ============================================================================= # Pytest fixtures # ============================================================================= -degree_type = ["incoming", "outgoing"] +# FIXME: degree_type is currently unsupported (ignored) +# degree_type = ["incoming", "outgoing"] +# fixture_params = gen_fixture_params_product( +# (UNDIRECTED_DATASETS, "graph_file"), +# (degree_type, "degree_type"), +# ) fixture_params = gen_fixture_params_product( (UNDIRECTED_DATASETS, "graph_file"), - (degree_type, "degree_type"), ) @@ -46,7 +50,9 @@ def input_combo(request): This fixture returns a dictionary containing all input params required to run a Core number algo """ - parameters = dict(zip(("graph_file", "degree_type"), request.param)) + # FIXME: degree_type is not supported so do not test with different values + # parameters = dict(zip(("graph_file", "degree_type"), request.param)) + parameters = {"graph_file": request.param[0]} graph_file = parameters["graph_file"] G = graph_file.get_graph() @@ -69,7 +75,8 @@ def input_combo(request): def test_core_number(input_combo): G = input_combo["G"] Gnx = input_combo["Gnx"] - degree_type = input_combo["degree_type"] + # FIXME: degree_type is currently unsupported (ignored) + # degree_type = input_combo["degree_type"] nx_core_number_results = cudf.DataFrame() dic_results = nx.core_number(Gnx) @@ -80,7 +87,7 @@ def test_core_number(input_combo): ) core_number_results = ( - cugraph.core_number(G, degree_type) + cugraph.core_number(G) .sort_values("vertex") .reset_index(drop=True) .rename(columns={"core_number": "cugraph_core_number"}) @@ -109,8 +116,3 @@ def test_core_number_invalid_input(input_combo): with pytest.raises(ValueError): cugraph.core_number(G) - - invalid_degree_type = "invalid" - G = input_combo["G"] - with pytest.raises(ValueError): - cugraph.core_number(G, invalid_degree_type) diff --git a/python/cugraph/cugraph/tests/core/test_core_number_mg.py b/python/cugraph/cugraph/tests/core/test_core_number_mg.py index 1138c1dc48..2c2c7e40a2 100644 ---
a/python/cugraph/cugraph/tests/core/test_core_number_mg.py +++ b/python/cugraph/cugraph/tests/core/test_core_number_mg.py @@ -17,7 +17,7 @@ import cugraph import cugraph.dask as dcg -from cugraph.datasets import karate, dolphins, karate_asymmetric +from cugraph.datasets import karate, dolphins # ============================================================================= @@ -35,7 +35,8 @@ def setup_function(): DATASETS = [karate, dolphins] -DEGREE_TYPE = ["incoming", "outgoing", "bidirectional"] +# FIXME: degree_type is currently unsupported (ignored) +# DEGREE_TYPE = ["incoming", "outgoing", "bidirectional"] # ============================================================================= @@ -43,9 +44,9 @@ def setup_function(): # ============================================================================= -def get_sg_results(dataset, degree_type): +def get_sg_results(dataset): G = dataset.get_graph(create_using=cugraph.Graph(directed=False)) - res = cugraph.core_number(G, degree_type) + res = cugraph.core_number(G) res = res.sort_values("vertex").reset_index(drop=True) return res @@ -57,23 +58,23 @@ def get_sg_results(dataset, degree_type): @pytest.mark.mg @pytest.mark.parametrize("dataset", DATASETS) -@pytest.mark.parametrize("degree_type", DEGREE_TYPE) -def test_sg_core_number(dask_client, dataset, degree_type, benchmark): +# @pytest.mark.parametrize("degree_type", DEGREE_TYPE) +def test_sg_core_number(dask_client, dataset, benchmark): # This test is only for benchmark purposes. sg_core_number_results = None G = dataset.get_graph(create_using=cugraph.Graph(directed=False)) - sg_core_number_results = benchmark(cugraph.core_number, G, degree_type) + sg_core_number_results = benchmark(cugraph.core_number, G) assert sg_core_number_results is not None @pytest.mark.mg @pytest.mark.parametrize("dataset", DATASETS) -@pytest.mark.parametrize("degree_type", DEGREE_TYPE) -def test_core_number(dask_client, dataset, degree_type, benchmark): +# @pytest.mark.parametrize("degree_type", DEGREE_TYPE) +def test_core_number(dask_client, dataset, benchmark): dataset.get_dask_edgelist(download=True) # reload with MG edgelist dg = dataset.get_dask_graph(create_using=cugraph.Graph(directed=False)) - result_core_number = benchmark(dcg.core_number, dg, degree_type) + result_core_number = benchmark(dcg.core_number, dg) result_core_number = ( result_core_number.drop_duplicates() .compute() @@ -82,7 +83,7 @@ def test_core_number(dask_client, dataset, degree_type, benchmark): .rename(columns={"core_number": "mg_core_number"}) ) - expected_output = get_sg_results(dataset, degree_type) + expected_output = get_sg_results(dataset) # Update the mg core number with sg core number results # for easy comparison using cuDF DataFrame methods. 
@@ -90,13 +91,3 @@ def test_core_number(dask_client, dataset, degree_type, benchmark): counts_diffs = result_core_number.query("mg_core_number != sg_core_number") assert len(counts_diffs) == 0 - - -@pytest.mark.mg -def test_core_number_invalid_input(): - dg = karate_asymmetric.get_graph(create_using=cugraph.Graph(directed=True)) - - invalid_degree_type = 3 - - with pytest.raises(ValueError): - dcg.core_number(dg, invalid_degree_type) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py index c9fb73babb..ed3a796121 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py @@ -156,12 +156,10 @@ def networkx_call(M, benchmark_callable=None): # FIXME: This compare is shared across several tests... it should be # a general utility -def compare(src1, dst1, val1, src2, dst2, val2): - # +def assert_results_equal(src1, dst1, val1, src2, dst2, val2): # We will do comparison computations by using dataframe # merge functions (essentially doing fast joins). We # start by making two data frames - # df1 = cudf.DataFrame() df1["src1"] = src1 df1["dst1"] = dst1 @@ -174,19 +172,18 @@ def compare(src1, dst1, val1, src2, dst2, val2): if val2 is not None: df2["val2"] = val2 - # - # Check to see if all pairs in the original data frame - # still exist in the new data frame. If we join (merge) - # the data frames where (src1[i]=src2[i]) and (dst1[i]=dst2[i]) - # then we should get exactly the same number of entries in - # the data frame if we did not lose any data. - # + # Check to see if all pairs in df1 still exist in the new (merged) data + # frame. If we join (merge) the data frames where (src1[i]=src2[i]) and + # (dst1[i]=dst2[i]) then we should get exactly the same number of entries + # in the data frame if we did not lose any data. join = df1.merge(df2, left_on=["src1", "dst1"], right_on=["src2", "dst2"]) + # Print detailed differences on test failure if len(df1) != len(join): join2 = df1.merge( df2, how="left", left_on=["src1", "dst1"], right_on=["src2", "dst2"] ) + orig_option = pd.get_option("display.max_rows") pd.set_option("display.max_rows", 500) print("df1 = \n", df1.sort_values(["src1", "dst1"])) print("df2 = \n", df2.sort_values(["src2", "dst2"])) @@ -196,6 +193,7 @@ def compare(src1, dst1, val1, src2, dst2, val2): .to_pandas() .query("src2.isnull()", engine="python"), ) + pd.set_option("display.max_rows", orig_option) assert len(df1) == len(join) @@ -485,7 +483,7 @@ def test_all_pairs_jaccard_with_topk(): worst_coeff = all_pairs_jaccard_results["jaccard_coeff"].min() better_than_k = jaccard_results[jaccard_results["jaccard_coeff"] > worst_coeff] - compare( + assert_results_equal( all_pairs_jaccard_results["first"], all_pairs_jaccard_results["second"], all_pairs_jaccard_results["jaccard_coeff"], @@ -494,7 +492,7 @@ def test_all_pairs_jaccard_with_topk(): jaccard_results["jaccard_coeff"], ) - compare( + assert_results_equal( better_than_k["first"], better_than_k["second"], better_than_k["jaccard_coeff"], diff --git a/python/pylibcugraph/pylibcugraph/core_number.pyx b/python/pylibcugraph/pylibcugraph/core_number.pyx index e754ef2c65..48d9c5de42 100644 --- a/python/pylibcugraph/pylibcugraph/core_number.pyx +++ b/python/pylibcugraph/pylibcugraph/core_number.pyx @@ -66,14 +66,14 @@ def core_number(ResourceHandle resource_handle, referencing data and running algorithms. 
graph : SGGraph or MGGraph - The input graph, for either Single or Multi-GPU operations. + The input graph, for either single or multi-GPU operations. The input + graph must be symmetric (the is_symmetric property must be True). degree_type: str - This option determines if the core number computation should be based - on input, output, or both directed edges, with valid values being - "incoming", "outgoing", and "bidirectional" respectively. - This option is currently ignored in this release, and setting it will - result in a warning. + This option is currently ignored. This option may eventually determine + if the core number computation should be based on input, output, or + both directed edges, with valid values being "incoming", "outgoing", + and "bidirectional" respectively. do_expensive_check: bool If True, performs more extensive tests on the inputs to ensure @@ -98,14 +98,14 @@ def core_number(ResourceHandle resource_handle, cdef cugraph_error_code_t error_code cdef cugraph_error_t* error_ptr - degree_type_map = { - "incoming": cugraph_k_core_degree_type_t.K_CORE_DEGREE_TYPE_IN, - "outgoing": cugraph_k_core_degree_type_t.K_CORE_DEGREE_TYPE_OUT, - "bidirectional": cugraph_k_core_degree_type_t.K_CORE_DEGREE_TYPE_INOUT} - + # When supported, degree_type string should be mapped to constant like so: + # degree_type_map = { + # "incoming": cugraph_k_core_degree_type_t.K_CORE_DEGREE_TYPE_IN, + # "outgoing": cugraph_k_core_degree_type_t.K_CORE_DEGREE_TYPE_OUT, + # "bidirectional": cugraph_k_core_degree_type_t.K_CORE_DEGREE_TYPE_INOUT} error_code = cugraph_core_number(c_resource_handle_ptr, c_graph_ptr, - degree_type_map[degree_type], + cugraph_k_core_degree_type_t.K_CORE_DEGREE_TYPE_IN, do_expensive_check, &result_ptr, &error_ptr) From 8388aca66d7beb3a4b173bdaca05685c34f49f2e Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Mon, 25 Nov 2024 16:20:11 -0600 Subject: [PATCH 6/7] Drop support for NetworkX 3.0 and 3.1 for nx-cugraph (#4766) This updates docs. Code changes are here: https://github.com/rapidsai/nx-cugraph/pull/27 See: https://github.com/rapidsai/cugraph/issues/4760 Authors: - Erik Welch (https://github.com/eriknw) Approvers: - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/4766 --- docs/cugraph/source/nx_cugraph/how-it-works.md | 2 +- docs/cugraph/source/nx_cugraph/installation.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/cugraph/source/nx_cugraph/how-it-works.md b/docs/cugraph/source/nx_cugraph/how-it-works.md index 88788f3c0c..0061b0445d 100644 --- a/docs/cugraph/source/nx_cugraph/how-it-works.md +++ b/docs/cugraph/source/nx_cugraph/how-it-works.md @@ -10,7 +10,7 @@ While NetworkX is a pure-Python implementation, backends may be written to use o ## Enabling nx-cugraph -It is recommended to use `networkx>=3.4` for optimal zero code change performance, but `nx-cugraph` will also work with `networkx 3.0+`. +It is recommended to use `networkx>=3.4` for optimal zero code change performance, but `nx-cugraph` will also work with `networkx 3.2+`. NetworkX will use `nx-cugraph` as the backend if any of the following are used: diff --git a/docs/cugraph/source/nx_cugraph/installation.md b/docs/cugraph/source/nx_cugraph/installation.md index a816801d00..9675306c47 100644 --- a/docs/cugraph/source/nx_cugraph/installation.md +++ b/docs/cugraph/source/nx_cugraph/installation.md @@ -10,7 +10,7 @@ This guide describes how to install ``nx-cugraph`` and use it in your workflows. 
- **Volta architecture or later NVIDIA GPU, with [compute capability](https://developer.nvidia.com/cuda-gpus) 7.0+** - **[CUDA](https://docs.nvidia.com/cuda/index.html) 11.2, 11.4, 11.5, 11.8, 12.0, 12.2, or 12.5** - **Python >= 3.10** - - **[NetworkX](https://networkx.org/documentation/stable/install.html#) >= 3.0 (version 3.4 or higher recommended)** + - **[NetworkX](https://networkx.org/documentation/stable/install.html#) >= 3.2 (version 3.4 or higher recommended)** More details about system requirements can be found in the [RAPIDS System Requirements Documentation](https://docs.rapids.ai/install#system-req). From 3d681cc9e6d85336822ccb6faf0d1cba5f759ead Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Mon, 25 Nov 2024 16:21:19 -0600 Subject: [PATCH 7/7] Add sphinx-lint pre-commit and some docs fixes (#4771) I noticed the `Traversals` table is not showing up in the [nx-cugraph docs](https://docs.rapids.ai/api/cugraph/stable/nx_cugraph/supported-algorithms/). `sphinx-lint` catches the underlying issue, and it also caught a couple other minor issues that this PR fixes. `sphinx-lint` is used by other notable repos such as `pandas` ([here](https://github.com/pandas-dev/pandas/blob/7fe270c8e7656c0c187260677b3b16a17a1281dc/.pre-commit-config.yaml#L92-L96)) and CPython ([here](https://github.com/python/cpython/blob/8fe1926164932f868e6e907ad72a74c2f2372b07/.pre-commit-config.yaml#L68-L73)). Authors: - Erik Welch (https://github.com/eriknw) Approvers: - Jake Awe (https://github.com/AyodeAwe) - Ralph Liu (https://github.com/nv-rliu) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/4771 --- .pre-commit-config.yaml | 5 +++++ docs/cugraph/source/api_docs/cugraph-ops/index.rst | 2 +- docs/cugraph/source/api_docs/cugraph-ops/python/index.rst | 2 +- docs/cugraph/source/nx_cugraph/index.rst | 2 +- docs/cugraph/source/nx_cugraph/supported-algorithms.rst | 2 +- docs/cugraph/source/tutorials/cugraph_blogs.rst | 4 ++-- 6 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4bb037b5fd..28f83a967c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -53,6 +53,11 @@ repos: meta[.]yaml$| setup[.]cfg$ - id: verify-alpha-spec + - repo: https://github.com/sphinx-contrib/sphinx-lint + rev: v1.0.0 + hooks: + - id: sphinx-lint + args: ["--enable=all", "--disable=line-too-long"] - repo: https://github.com/rapidsai/dependency-file-generator rev: v1.16.0 hooks: diff --git a/docs/cugraph/source/api_docs/cugraph-ops/index.rst b/docs/cugraph/source/api_docs/cugraph-ops/index.rst index 0f6a6c937d..41ae941652 100644 --- a/docs/cugraph/source/api_docs/cugraph-ops/index.rst +++ b/docs/cugraph/source/api_docs/cugraph-ops/index.rst @@ -1,7 +1,7 @@ cugraph-ops API reference ========================= -This page provides a list of all publicly accessible modules, methods and classes through `pylibcugraphops.*` namespace. +This page provides a list of all publicly accessible modules, methods and classes through ``pylibcugraphops.*`` namespace. ..
toctree:: :maxdepth: 2 diff --git a/docs/cugraph/source/api_docs/cugraph-ops/python/index.rst b/docs/cugraph/source/api_docs/cugraph-ops/python/index.rst index fb25f2fa00..f1f332cb01 100644 --- a/docs/cugraph/source/api_docs/cugraph-ops/python/index.rst +++ b/docs/cugraph/source/api_docs/cugraph-ops/python/index.rst @@ -1,7 +1,7 @@ cugraph-ops Python API reference ================================ -This page provides a list of all publicly accessible modules, methods and classes through `pylibcugraphops.*` namespace. +This page provides a list of all publicly accessible modules, methods and classes through ``pylibcugraphops.*`` namespace. .. toctree:: :maxdepth: 2 diff --git a/docs/cugraph/source/nx_cugraph/index.rst b/docs/cugraph/source/nx_cugraph/index.rst index 50565c805a..0eb8907b39 100644 --- a/docs/cugraph/source/nx_cugraph/index.rst +++ b/docs/cugraph/source/nx_cugraph/index.rst @@ -49,7 +49,7 @@ Users can have GPU-based, large-scale performance **without** changing their fam +--------------------------------------------------------------------------------------------------------+ | **Run the same code on CPU or GPU** | | | -| Nothing changes, not even your `import` statements, when going from CPU to GPU. | +| Nothing changes, not even your ``import`` statements, when going from CPU to GPU. | +--------------------------------------------------------------------------------------------------------+ diff --git a/docs/cugraph/source/nx_cugraph/supported-algorithms.rst b/docs/cugraph/source/nx_cugraph/supported-algorithms.rst index ae32bc330f..ed31ec73ae 100644 --- a/docs/cugraph/source/nx_cugraph/supported-algorithms.rst +++ b/docs/cugraph/source/nx_cugraph/supported-algorithms.rst @@ -180,7 +180,7 @@ Algorithms +---------------------------------------+ +---------------------------+ -| **Traversal** | +| **Traversal** | +===========================+ | bfs_edges | +---------------------------+ diff --git a/docs/cugraph/source/tutorials/cugraph_blogs.rst b/docs/cugraph/source/tutorials/cugraph_blogs.rst index 3665f425e3..57fa011ab5 100644 --- a/docs/cugraph/source/tutorials/cugraph_blogs.rst +++ b/docs/cugraph/source/tutorials/cugraph_blogs.rst @@ -65,9 +65,9 @@ Academic Papers * Alex Fender, Brad Rees, Joe Eaton (2022) `Massive Graph Analytics `_ Bader, D. (Editor) CRC Press - * S Kang, A. Fender, J. Eaton, B. Rees:`Computing PageRank Scores of Web Crawl Data Using DGX A100 Clusters`. In IEEE HPEC, Sep. 2020 + * S Kang, A. Fender, J. Eaton, B. Rees. `Computing PageRank Scores of Web Crawl Data Using DGX A100 Clusters `_. In IEEE HPEC, Sep. 2020 - * Hricik, T., Bader, D., & Green, O. (2020, September). `Using RAPIDS AI to accelerate graph data science workflows`. In 2020 IEEE High Performance Extreme Computing Conference (HPEC) (pp. 1-4). IEEE. + * Hricik, T., Bader, D., & Green, O. (2020, September). `Using RAPIDS AI to accelerate graph data science workflows `_. In 2020 IEEE High Performance Extreme Computing Conference (HPEC) (pp. 1-4). IEEE. * Richardson, B., Rees, B., Drabas, T., Oldridge, E., Bader, D. A., & Allen, R. (2020, August). Accelerating and Expanding End-to-End Data Science Workflows with DL/ML Interoperability Using RAPIDS. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (pp. 3503-3504).
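A brief usage note on the `sphinx-lint` hook added in the final patch above: once the hook is part of `.pre-commit-config.yaml`, it can be run locally through pre-commit before pushing documentation changes. The commands below are a minimal sketch and assume `pre-commit` is already installed in the development environment; only the hook id `sphinx-lint` and its arguments come from the config entry introduced by this patch.

```
# One-time setup: install the git hooks defined in .pre-commit-config.yaml
pre-commit install

# Run only the sphinx-lint hook against every file in the repo; this applies
# the --enable=all / --disable=line-too-long arguments from the config above
pre-commit run sphinx-lint --all-files
```

Run this way, the hook reports reStructuredText issues such as the table-width problem fixed in supported-algorithms.rst above.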