From 460d8e468a73e2d8edee625b9a9fe97ef24fa36f Mon Sep 17 00:00:00 2001 From: Mike Sarahan Date: Fri, 22 Nov 2024 16:36:57 -0600 Subject: [PATCH 1/7] add telemetry (#4740) Enables telemetry during cugraph's build process. This parses github job metadata to obtain timing information. It should have very little impact on overall build time, and should not interfere with any build tools. This implements emitting OpenTelemetry traces and spans, as described in https://github.com/rapidsai/build-infra/issues/139 Authors: - Mike Sarahan (https://github.com/msarahan) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cugraph/pull/4740 --- .github/workflows/pr.yaml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index e82342dfd9..c8bf94b098 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -10,6 +10,7 @@ concurrency: cancel-in-progress: true jobs: + # Please keep pr-builder as the top job here pr-builder: needs: - changed-files @@ -25,14 +26,24 @@ jobs: - wheel-tests-pylibcugraph - wheel-build-cugraph - wheel-tests-cugraph + - telemetry-setup - devcontainer secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12 if: always() with: needs: ${{ toJSON(needs) }} + telemetry-setup: + runs-on: ubuntu-latest + continue-on-error: true + env: + OTEL_SERVICE_NAME: "pr-cugraph" + steps: + - name: Telemetry setup + uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main changed-files: secrets: inherit + needs: telemetry-setup uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12 with: files_yaml: | @@ -63,9 +74,11 @@ jobs: - '!notebooks/**' checks: secrets: inherit + needs: telemetry-setup uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12 with: enable_check_generated_files: false + ignored_pr_jobs: telemetry-summarize conda-cpp-build: needs: checks secrets: inherit @@ -161,6 +174,7 @@ jobs: script: ci/test_wheel_cugraph.sh devcontainer: secrets: inherit + needs: telemetry-setup uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.12 with: arch: '["amd64"]' @@ -171,3 +185,17 @@ jobs: sccache -z; build-all --verbose -j$(nproc --ignore=1) -DBUILD_CUGRAPH_MG_TESTS=ON; sccache -s; + telemetry-summarize: + runs-on: ubuntu-latest + needs: pr-builder + if: always() + continue-on-error: true + steps: + - name: Load stashed telemetry env vars + uses: rapidsai/shared-actions/telemetry-dispatch-load-base-env-vars@main + with: + load_service_name: true + - name: Telemetry summarize + uses: rapidsai/shared-actions/telemetry-dispatch-write-summary@main + with: + cert_concat: "${{ secrets.OTEL_EXPORTER_OTLP_CA_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_KEY }}" From 3478fb57965ef896ba6ebc1ece69876de92b604c Mon Sep 17 00:00:00 2001 From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com> Date: Mon, 25 Nov 2024 08:09:57 -0800 Subject: [PATCH 2/7] Increase max_iterations in MG HITS TEST (#4783) Increase max_iterations in MG HITS tests (with edge masking). With masking, we may end up with different graphs with different numbers of GPUs; this results in higher iteration counts for convergence for certain GPU counts. Increase the maximum iteration count to consider this. 
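As an aside for readers unfamiliar with HITS, the sketch below (plain single-GPU C++ for illustration only — not cugraph's multi-GPU implementation; the function and variable names such as hits_converges and out_neighbors are hypothetical) shows the generic pattern the test exercises: iterate until the hub scores stop changing, or give up at an iteration cap. A structurally different graph (e.g. after edge masking, which here can vary with the GPU count) may need more iterations before the score change drops below the tolerance, so too small a cap makes the loop report non-convergence even when the algorithm is behaving correctly.

```cpp
// Illustrative sketch only: a generic HITS power iteration with a convergence
// tolerance and an iteration cap. Not cugraph code; names are hypothetical.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <utility>
#include <vector>

bool hits_converges(std::vector<std::vector<std::size_t>> const& out_neighbors,
                    double epsilon,
                    std::size_t max_iterations)
{
  std::size_t const n = out_neighbors.size();
  if (n == 0) { return true; }
  std::vector<double> hubs(n, 1.0 / static_cast<double>(n));
  std::vector<double> authorities(n, 0.0);
  for (std::size_t iter = 0; iter < max_iterations; ++iter) {
    // authority(v) = sum of hub scores over v's in-neighbors
    std::fill(authorities.begin(), authorities.end(), 0.0);
    for (std::size_t u = 0; u < n; ++u) {
      for (auto v : out_neighbors[u]) { authorities[v] += hubs[u]; }
    }
    // hub(u) = sum of authority scores over u's out-neighbors
    std::vector<double> new_hubs(n, 0.0);
    for (std::size_t u = 0; u < n; ++u) {
      for (auto v : out_neighbors[u]) { new_hubs[u] += authorities[v]; }
    }
    // L1-normalize the hub scores and measure the change since the last iteration
    double norm = 0.0;
    for (auto h : new_hubs) { norm += h; }
    if (norm == 0.0) { return true; }  // no edges: trivially converged
    double diff = 0.0;
    for (std::size_t u = 0; u < n; ++u) {
      new_hubs[u] /= norm;
      diff += std::fabs(new_hubs[u] - hubs[u]);
    }
    hubs = std::move(new_hubs);
    if (diff < epsilon) { return true; }  // converged within the cap
  }
  return false;  // cap reached first; a test asserting convergence would fail here
}
```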
Authors: - Seunghwa Kang (https://github.com/seunghwak) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) URL: https://github.com/rapidsai/cugraph/pull/4783 --- cpp/tests/link_analysis/mg_hits_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/link_analysis/mg_hits_test.cpp b/cpp/tests/link_analysis/mg_hits_test.cpp index eb2a9bcd72..83e7647226 100644 --- a/cpp/tests/link_analysis/mg_hits_test.cpp +++ b/cpp/tests/link_analysis/mg_hits_test.cpp @@ -91,7 +91,7 @@ class Tests_MGHits : public ::testing::TestWithParam d_mg_hubs(mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream()); From d71424346f7b10a507b2c7abffaaf288dafa4b0b Mon Sep 17 00:00:00 2001 From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com> Date: Mon, 25 Nov 2024 08:11:02 -0800 Subject: [PATCH 3/7] Primitives & BFS performance improvements (#4751) This PR includes multiple updates to cut peak memory usage in graph creation and improve performance of BFS on scale-free graphs. * Add a bitmap for non-zero local degree vertices in the hypersparse region; this information can be used to quickly filter out locally zero degree vertices which don't need to be processed in multiple instances. * Store (global-)degree offsets for vertices in the hypersparse region; this information can used to quickly identify the vertices with a certain global degree (e.g. for global degree 1 vertices, we can skip inter-GPU reduction as we know each vertex has only one neighbor). * Skip kernel invocations in computing edge counts if the vertex list is empty. * Add asynchronous functions to compute edge counts. This helps in preventing unnecessary serialization when we can process multiple such functions concurrently. * Replace rmm::exec_policy with rmm::exec_policy_nosync in multiple places; the former enforces stream synchronization at the end. The latter does not. * Enforce cache line alignment in NCCL communication in multiple places (NCCL communication performance is significantly affected by cache line alignment, often leading to 30-40% or more differences). * For primitives working on a subset of vertices, broadcast a vertex list using a bitmap if the vertex frontier size is large. If the vertex frontier size is small (in case vertex_t is 8B and the local vertex partition range can fit into 4B), use vertex offsets instead of vertices to cut communication volume. * Merge multiple host scalar communication function calls to a single one. * Increase multi-stream concurrency in detail::extract_transform_e & detail::per_v_transform_reduce_e * Multiple optimizations in template specialization (for update_major == true && reduce_op == any && key type is vertex && working on a subset of vertices) in detail::per_v_transform_reduce_e (this includes pre-processing vertices with non-zero local degrees; so we don't need to process such vertices using multiple GPUs, pre-filtering of zero local degree vertices, allreduce communication to reduce shuffle communication volumes, and special treatment of global degree 1 vertices, and so on). * Multiple optimizations & specializations in detail::fill_edge_minor_property that works on a subset of vertices (this includes kernel fusion, specialization for bitmap properties including direct broadcast to the property buffer and special treatments for vertex partition boundaries, and so on). 
* Added multiple optimizations & specializations in transform_reduce_v_frontier_outgoing_e (especially for reduce_op::any and to cut communication volumes and to filter out (key, value) pairs that won't contribute to the final results). * Multiple low-level optimizations in direction optimizing BFS (including approximations in determining between bottom -up and top-down). * Multiple optimizations to cut peak memory usage in graph creation. Authors: - Seunghwa Kang (https://github.com/seunghwak) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) URL: https://github.com/rapidsai/cugraph/pull/4751 --- .../cugraph/edge_partition_device_view.cuh | 139 +- cpp/include/cugraph/edge_partition_view.hpp | 10 +- cpp/include/cugraph/graph.hpp | 24 +- cpp/include/cugraph/graph_functions.hpp | 41 +- cpp/include/cugraph/graph_view.hpp | 73 +- cpp/include/cugraph/partition_manager.hpp | 26 +- .../cugraph/utilities/dataframe_buffer.hpp | 73 +- cpp/include/cugraph/utilities/device_comm.hpp | 124 +- cpp/include/cugraph/utilities/misc_utils.cuh | 2 +- .../cugraph/utilities/shuffle_comm.cuh | 289 +- .../cugraph/utilities/thrust_tuple_utils.hpp | 26 + .../betweenness_centrality_impl.cuh | 20 +- .../approx_weighted_matching_impl.cuh | 5 +- cpp/src/community/detail/common_methods.cuh | 5 +- cpp/src/community/detail/refine_impl.cuh | 15 +- .../weakly_connected_components_impl.cuh | 39 +- cpp/src/cores/core_number_impl.cuh | 19 +- cpp/src/lookup/lookup_src_dst_impl.cuh | 5 +- .../detail/extract_transform_v_frontier_e.cuh | 1549 ++++-- cpp/src/prims/detail/multi_stream_utils.cuh | 156 + .../detail/optional_dataframe_buffer.hpp | 174 +- .../prims/detail/per_v_transform_reduce_e.cuh | 4374 +++++++++++++++++ cpp/src/prims/detail/prim_functors.cuh | 22 + cpp/src/prims/detail/prim_utils.cuh | 104 + cpp/src/prims/extract_transform_e.cuh | 4 +- ...xtract_transform_v_frontier_outgoing_e.cuh | 24 +- cpp/src/prims/fill_edge_src_dst_property.cuh | 859 +++- ..._v_pair_transform_dst_nbr_intersection.cuh | 95 +- ...r_v_random_select_transform_outgoing_e.cuh | 162 +- ...m_reduce_dst_key_aggregated_outgoing_e.cuh | 5 +- ...ransform_reduce_if_incoming_outgoing_e.cuh | 421 ++ ...v_transform_reduce_incoming_outgoing_e.cuh | 1336 +---- ...rm_reduce_v_frontier_outgoing_e_by_dst.cuh | 1196 +++++ ...educe_v_frontier_outgoing_e_by_src_dst.cuh | 585 --- .../prims/update_edge_src_dst_property.cuh | 318 +- cpp/src/prims/vertex_frontier.cuh | 248 +- .../create_graph_from_edgelist_impl.cuh | 651 ++- cpp/src/structure/detail/structure_utils.cuh | 136 +- cpp/src/structure/graph_impl.cuh | 233 +- cpp/src/structure/graph_view_impl.cuh | 7 +- cpp/src/structure/induced_subgraph_impl.cuh | 13 - cpp/src/structure/renumber_edgelist_impl.cuh | 761 +-- cpp/src/structure/renumber_utils_impl.cuh | 29 +- cpp/src/traversal/bfs_impl.cuh | 582 ++- cpp/src/traversal/extract_bfs_paths_impl.cuh | 14 +- cpp/src/traversal/k_hop_nbrs_impl.cuh | 20 +- .../traversal/od_shortest_distances_impl.cuh | 15 +- cpp/src/traversal/sssp_impl.cuh | 4 +- cpp/src/utilities/collect_comm.cuh | 245 +- cpp/src/utilities/shuffle_vertex_pairs.cuh | 8 +- cpp/tests/CMakeLists.txt | 6 +- cpp/tests/c_api/mg_test_utils.h | 9 + ...rm_reduce_v_frontier_outgoing_e_by_dst.cu} | 194 +- cpp/tests/traversal/mg_bfs_test.cpp | 34 +- cpp/tests/utilities/mg_utilities.cpp | 4 +- cpp/tests/utilities/mg_utilities.hpp | 5 +- cpp/tests/utilities/test_graphs.hpp | 46 +- 57 files changed, 11526 insertions(+), 4057 deletions(-) create mode 100644 cpp/src/prims/detail/multi_stream_utils.cuh create 
mode 100644 cpp/src/prims/detail/per_v_transform_reduce_e.cuh create mode 100644 cpp/src/prims/detail/prim_utils.cuh create mode 100644 cpp/src/prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh create mode 100644 cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh delete mode 100644 cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh rename cpp/tests/prims/{mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu => mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu} (74%) diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index 583b0a3721..628c3cc10c 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -204,6 +204,7 @@ class edge_partition_device_view_t view) : detail::edge_partition_device_view_base_t(view.offsets(), view.indices()), dcs_nzd_vertices_(detail::to_thrust_optional(view.dcs_nzd_vertices())), + dcs_nzd_range_bitmap_(detail::to_thrust_optional(view.dcs_nzd_range_bitmap())), major_hypersparse_first_(detail::to_thrust_optional(view.major_hypersparse_first())), major_range_first_(view.major_range_first()), major_range_last_(view.major_range_last()), @@ -218,6 +219,7 @@ class edge_partition_device_view_t()); } + template + __host__ void compute_number_of_edges_async(MajorIterator major_first, + MajorIterator major_last, + raft::device_span count /* size = 1 */, + rmm::cuda_stream_view stream) const + { + if (thrust::distance(major_first, major_last) == 0) { + RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); + } + + rmm::device_uvector d_tmp_storage(0, stream); + size_t tmp_storage_bytes{0}; + + if (dcs_nzd_vertices_) { + auto local_degree_first = thrust::make_transform_iterator( + major_first, + detail::local_degree_op_t{ + this->offsets_, major_range_first_, *dcs_nzd_vertices_, *major_hypersparse_first_}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + } else { + auto local_degree_first = thrust::make_transform_iterator( + major_first, + detail::local_degree_op_t{ + this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + } + } + __host__ rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const { rmm::device_uvector local_degrees(this->major_range_size(), stream); if (dcs_nzd_vertices_) { assert(major_hypersparse_first_); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(this->major_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -266,7 +328,7 @@ class edge_partition_device_view_tmajor_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -284,7 +346,7 @@ class 
edge_partition_device_view_t local_degrees(thrust::distance(major_first, major_last), stream); if (dcs_nzd_vertices_) { assert(major_hypersparse_first_); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), major_first, major_last, local_degrees.begin(), @@ -295,7 +357,7 @@ class edge_partition_device_view_tmajor_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -368,7 +431,7 @@ class edge_partition_device_view_tmajor_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -394,7 +457,7 @@ class edge_partition_device_view_t> for consistency (see + // dcs_nzd_range_bitmap()) __host__ __device__ thrust::optional dcs_nzd_vertices() const { return dcs_nzd_vertices_ ? thrust::optional{(*dcs_nzd_vertices_).data()} @@ -528,10 +593,20 @@ class edge_partition_device_view_t> dcs_nzd_range_bitmap() + const + { + return dcs_nzd_range_bitmap_ + ? thrust::make_optional>( + (*dcs_nzd_range_bitmap_).data(), (*dcs_nzd_range_bitmap_).size()) + : thrust::nullopt; + } + private: // should be trivially copyable to device thrust::optional> dcs_nzd_vertices_{thrust::nullopt}; + thrust::optional> dcs_nzd_range_bitmap_{thrust::nullopt}; thrust::optional major_hypersparse_first_{thrust::nullopt}; vertex_t major_range_first_{0}; @@ -558,6 +633,7 @@ class edge_partition_device_view_t()); } + template + __host__ void compute_number_of_edges_async(MajorIterator major_first, + MajorIterator major_last, + raft::device_span count /* size = 1 */, + rmm::cuda_stream_view stream) const + { + if (thrust::distance(major_first, major_last) == 0) { + RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); + } + + rmm::device_uvector d_tmp_storage(0, stream); + size_t tmp_storage_bytes{0}; + + auto local_degree_first = thrust::make_transform_iterator( + major_first, + detail::local_degree_op_t{this->offsets_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + } + __host__ rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const { rmm::device_uvector local_degrees(this->major_range_size(), stream); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(this->major_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -595,7 +709,7 @@ class edge_partition_device_view_t local_degrees(thrust::distance(major_first, major_last), stream); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), major_first, major_last, local_degrees.begin(), @@ -613,6 +727,7 @@ class edge_partition_device_view_t local_degrees(this->major_range_size(), stream); thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(this->major_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -660,7 +775,7 @@ class edge_partition_device_view_t local_degrees(thrust::distance(major_first, major_last), stream); thrust::transform( 
- rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), major_first, major_last, local_degrees.begin(), diff --git a/cpp/include/cugraph/edge_partition_view.hpp b/cpp/include/cugraph/edge_partition_view.hpp index 4246527371..27c5705dfc 100644 --- a/cpp/include/cugraph/edge_partition_view.hpp +++ b/cpp/include/cugraph/edge_partition_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,6 +56,7 @@ class edge_partition_view_t offsets, raft::device_span indices, std::optional> dcs_nzd_vertices, + std::optional> dcs_nzd_range_bitmap, std::optional major_hypersparse_first, vertex_t major_range_first, vertex_t major_range_last, @@ -64,6 +65,7 @@ class edge_partition_view_t(offsets, indices), dcs_nzd_vertices_(dcs_nzd_vertices), + dcs_nzd_range_bitmap_(dcs_nzd_range_bitmap), major_hypersparse_first_(major_hypersparse_first), major_range_first_(major_range_first), major_range_last_(major_range_last), @@ -78,6 +80,11 @@ class edge_partition_view_t> dcs_nzd_range_bitmap() const + { + return dcs_nzd_range_bitmap_; + } + std::optional major_hypersparse_first() const { return major_hypersparse_first_; } vertex_t major_range_first() const { return major_range_first_; } @@ -90,6 +97,7 @@ class edge_partition_view_t> dcs_nzd_vertices_{std::nullopt}; + std::optional> dcs_nzd_range_bitmap_{std::nullopt}; std::optional major_hypersparse_first_{std::nullopt}; vertex_t major_range_first_{0}; diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index 0607b39153..290f4b3c4d 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -48,6 +48,7 @@ struct graph_meta_t> { partition_t partition{}; std::vector edge_partition_segment_offsets{}; + std::optional> edge_partition_hypersparse_degree_offsets{}; vertex_t num_local_unique_edge_srcs{}; vertex_t num_local_unique_edge_dsts{}; @@ -61,6 +62,7 @@ struct graph_meta_t> { // segment offsets based on vertex degree, relevant only if vertex IDs are renumbered std::optional> segment_offsets{std::nullopt}; + std::optional> hypersparse_degree_offsets{std::nullopt}; }; // graph_t is an owning graph class (note that graph_view_t is a non-owning graph class) @@ -101,6 +103,11 @@ class graph_t>>( (*edge_partition_dcs_nzd_vertices_).size()) : std::nullopt; + auto dcs_nzd_range_bitmaps = + edge_partition_dcs_nzd_range_bitmaps_ + ? 
std::make_optional>>( + (*edge_partition_dcs_nzd_range_bitmaps_).size()) + : std::nullopt; for (size_t i = 0; i < offsets.size(); ++i) { offsets[i] = raft::device_span(edge_partition_offsets_[i].data(), edge_partition_offsets_[i].size()); @@ -111,6 +118,11 @@ class graph_t((*edge_partition_dcs_nzd_vertices_)[i].data(), (*edge_partition_dcs_nzd_vertices_)[i].size()); } + if (dcs_nzd_range_bitmaps) { + (*dcs_nzd_range_bitmaps)[i] = + raft::device_span((*edge_partition_dcs_nzd_range_bitmaps_)[i].data(), + (*edge_partition_dcs_nzd_range_bitmaps_)[i].size()); + } } std::conditional_t{ this->number_of_vertices(), this->number_of_edges(), this->properties_, partition_, edge_partition_segment_offsets_, + edge_partition_hypersparse_degree_offsets_, local_sorted_unique_edge_srcs, local_sorted_unique_edge_src_chunk_start_offsets, local_sorted_unique_edge_src_chunk_size_, @@ -224,10 +238,13 @@ class graph_t>> edge_partition_dcs_nzd_vertices_{ std::nullopt}; + std::optional>> edge_partition_dcs_nzd_range_bitmaps_{ + std::nullopt}; partition_t partition_{}; // segment offsets within the vertex partition based on vertex degree std::vector edge_partition_segment_offsets_{}; + std::optional> edge_partition_hypersparse_degree_offsets_{}; // if valid, store row/column properties in key/value pairs (this saves memory if # unique edge // sources/destinations << V / major_comm_size|minor_comm_size). @@ -290,7 +307,11 @@ class graph_t(offsets_.data(), offsets_.size()), raft::device_span(indices_.data(), indices_.size()), graph_view_meta_t{ - this->number_of_vertices(), this->number_of_edges(), this->properties_, segment_offsets_}); + this->number_of_vertices(), + this->number_of_edges(), + this->properties_, + segment_offsets_, + hypersparse_degree_offsets_}); } private: @@ -299,6 +320,7 @@ class graph_t> segment_offsets_{}; + std::optional> hypersparse_degree_offsets_{}; }; template diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 866ab16ee9..6a03b9a645 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -41,11 +41,13 @@ struct renumber_meta_t> edge_t number_of_edges{}; partition_t partition{}; std::vector edge_partition_segment_offsets{}; + std::optional> edge_partition_hypersparse_degree_offsets{}; }; template struct renumber_meta_t> { std::vector segment_offsets{}; + std::optional> hypersparse_degree_offsets{}; }; /** @@ -244,7 +246,7 @@ void unrenumber_int_vertices(raft::handle_t const& handle, * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -284,7 +286,7 @@ std::enable_if_t unrenumber_local_int_edges( * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. 
* @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -346,7 +348,7 @@ void renumber_local_ext_vertices(raft::handle_t const& handle, * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam edge_type_t Type of edge types. Needs to be an integral type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -388,7 +390,7 @@ decompress_to_edgelist( * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -421,7 +423,7 @@ symmetrize_edgelist(raft::handle_t const& handle, * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -463,7 +465,7 @@ symmetrize_graph( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -505,7 +507,7 @@ transpose_graph( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. 
CUDA stream, communicator, and @@ -549,7 +551,7 @@ transpose_graph_storage( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -625,7 +627,7 @@ void relabel(raft::handle_t const& handle, * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -678,7 +680,7 @@ extract_induced_subgraphs( * @tparam edge_type_t Type of edge type. Needs to be an integral type, currently only int32_t is * supported * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -743,7 +745,7 @@ create_graph_from_edgelist(raft::handle_t const& handle, * @tparam edge_type_t Type of edge type. Needs to be an integral type, currently only int32_t is * supported * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -807,7 +809,7 @@ create_graph_from_edgelist( * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -829,7 +831,7 @@ std::tuple, rmm::device_uvector> get_two * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. 
* @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -856,7 +858,7 @@ rmm::device_uvector compute_in_weight_sums( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -883,7 +885,7 @@ rmm::device_uvector compute_out_weight_sums( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -910,7 +912,7 @@ weight_t compute_max_in_weight_sum( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -937,7 +939,7 @@ weight_t compute_max_out_weight_sum( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -963,7 +965,7 @@ weight_t compute_total_edge_weight( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. 
transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -1114,7 +1116,8 @@ shuffle_external_vertex_value_pairs(raft::handle_t const& handle, * @param edge_ids Optional list of edge ids * @param edge_types Optional list of edge types * @return Tuple of vectors storing edge sources, destinations, optional weights, - * optional edge ids, optional edge types mapped to this GPU. + * optional edge ids, optional edge types mapped to this GPU and a vector storing the + * number of edges received from each GPU. */ template std::tuple, diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index a2ff3166fa..d109fbdac9 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -165,7 +165,12 @@ class partition_t { return vertex_partition_range_last(partition_idx) - vertex_partition_range_first(partition_idx); } - size_t number_of_local_edge_partitions() const { return minor_comm_size_; } + size_t number_of_local_edge_partitions() const { return static_cast(minor_comm_size_); } + size_t coinciding_local_edge_partition_idx() const + { + return static_cast(minor_comm_rank_); + } // the major range of coinciding_local_edge_partition_idx()'th local edge partition coincides + // with the local vertex partition range // major: source of the edge partition (if not transposed) or destination of the edge partition // (if transposed). @@ -249,9 +254,13 @@ double constexpr edge_partition_src_dst_property_values_kv_pair_fill_ratio_thres // use the hypersparse format (currently, DCSR or DCSC) for the vertices with their degrees smaller // than minor_comm_size * hypersparse_threshold_ratio, should be less than 1.0 double constexpr hypersparse_threshold_ratio = 0.5; -size_t constexpr low_degree_threshold{raft::warp_size()}; -size_t constexpr mid_degree_threshold{1024}; -size_t constexpr num_sparse_segments_per_vertex_partition{3}; +size_t constexpr low_degree_threshold{ + raft::warp_size()}; // belongs to the low degree segment if the global degree is smaller than + // this value. +size_t constexpr mid_degree_threshold{ + 1024}; // belongs to the medium degree segment if the global degree is smaller than this value, + // otherwise, belongs to the high degree segment. 
+size_t constexpr num_sparse_segments_per_vertex_partition{3}; // high, mid, low // Common for both graph_view_t & graph_t and both single-GPU & multi-GPU versions template @@ -313,6 +322,7 @@ struct graph_view_meta_t edge_partition_segment_offsets{}; + std::optional> edge_partition_hypersparse_degree_offsets{}; std::conditional_t>, @@ -356,6 +366,7 @@ struct graph_view_meta_t> segment_offsets{std::nullopt}; + std::optional> hypersparse_degree_offsets{std::nullopt}; }; // graph_view_t is a non-owning graph class (note that graph_t is an owning graph class) @@ -380,6 +391,8 @@ class graph_view_t> const& edge_partition_indices, std::optional>> const& edge_partition_dcs_nzd_vertices, + std::optional>> const& + edge_partition_dcs_nzd_range_bitmaps, graph_view_meta_t meta); std::vector vertex_partition_range_offsets() const @@ -552,6 +565,12 @@ class graph_view_t> local_vertex_partition_segment_offsets() const + { + auto partition_idx = partition_.coinciding_local_edge_partition_idx(); + return local_edge_partition_segment_offsets(partition_idx); + } + std::optional> local_edge_partition_segment_offsets( size_t partition_idx) const { @@ -563,6 +582,28 @@ class graph_view_t> local_vertex_partition_hypersparse_degree_offsets() const + { + auto partition_idx = partition_.coinciding_local_edge_partition_idx(); + return local_edge_partition_hypersparse_degree_offsets(partition_idx); + } + + std::optional> local_edge_partition_hypersparse_degree_offsets( + size_t partition_idx) const + { + auto num_degrees_per_vertex_partition = + edge_partition_hypersparse_degree_offsets_ + ? ((*edge_partition_hypersparse_degree_offsets_).size() / edge_partition_offsets_.size()) + : size_t{0}; + return edge_partition_hypersparse_degree_offsets_ + ? std::make_optional>( + (*edge_partition_hypersparse_degree_offsets_).begin() + + partition_idx * num_degrees_per_vertex_partition, + (*edge_partition_hypersparse_degree_offsets_).begin() + + (partition_idx + 1) * num_degrees_per_vertex_partition) + : std::nullopt; + } + vertex_partition_view_t local_vertex_partition_view() const { return vertex_partition_view_t(this->number_of_vertices(), @@ -605,6 +646,9 @@ class graph_view_t>> edge_partition_dcs_nzd_vertices_{}; + std::optional>> + edge_partition_dcs_nzd_range_bitmaps_{}; partition_t partition_{}; // segment offsets based on vertex degree std::vector edge_partition_segment_offsets_{}; + std::optional> edge_partition_hypersparse_degree_offsets_{}; // if valid, store source/destination property values in key/value pairs (this saves memory if # // unique edge sources/destinations << V / major_comm_size|minor_comm_size). 
@@ -903,6 +950,11 @@ class graph_view_t> local_vertex_partition_segment_offsets() const + { + return local_edge_partition_segment_offsets(size_t{0}); + } + std::optional> local_edge_partition_segment_offsets( size_t partition_idx = 0) const { @@ -910,6 +962,18 @@ class graph_view_t> local_vertex_partition_hypersparse_degree_offsets() const + { + return local_edge_partition_hypersparse_degree_offsets(size_t{0}); + } + + std::optional> local_edge_partition_hypersparse_degree_offsets( + size_t partition_idx = 0) const + { + assert(partition_idx == 0); + return hypersparse_degree_offsets_; + } + vertex_partition_view_t local_vertex_partition_view() const { return vertex_partition_view_t(this->number_of_vertices()); @@ -1050,6 +1114,7 @@ class graph_view_t> segment_offsets_{std::nullopt}; + std::optional> hypersparse_degree_offsets_{std::nullopt}; std::optional> edge_mask_view_{std::nullopt}; }; diff --git a/cpp/include/cugraph/partition_manager.hpp b/cpp/include/cugraph/partition_manager.hpp index 309b169e64..2eb210fb7c 100644 --- a/cpp/include/cugraph/partition_manager.hpp +++ b/cpp/include/cugraph/partition_manager.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -71,6 +71,30 @@ class partition_manager { : (major_comm_rank * minor_comm_size + minor_comm_rank); } +#ifdef __CUDACC__ + __host__ __device__ +#endif + static int + compute_major_comm_rank_from_global_comm_rank(int major_comm_size, + int minor_comm_size, + int comm_rank) + { + return map_major_comm_to_gpu_row_comm ? comm_rank % major_comm_size + : comm_rank / minor_comm_size; + } + +#ifdef __CUDACC__ + __host__ __device__ +#endif + static int + compute_minor_comm_rank_from_global_comm_rank(int major_comm_size, + int minor_comm_size, + int comm_rank) + { + return map_major_comm_to_gpu_row_comm ? 
comm_rank / major_comm_size + : comm_rank % minor_comm_size; + } + #ifdef __CUDACC__ __host__ __device__ #endif diff --git a/cpp/include/cugraph/utilities/dataframe_buffer.hpp b/cpp/include/cugraph/utilities/dataframe_buffer.hpp index a20613c65e..6d47ec540d 100644 --- a/cpp/include/cugraph/utilities/dataframe_buffer.hpp +++ b/cpp/include/cugraph/utilities/dataframe_buffer.hpp @@ -82,6 +82,53 @@ auto allocate_dataframe_buffer(size_t buffer_size, rmm::cuda_stream_view stream_ std::make_index_sequence(), buffer_size, stream_view); } +template +struct dataframe_buffer_type { + using type = decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{})); +}; + +template +using dataframe_buffer_type_t = typename dataframe_buffer_type::type; + +template +std::optional> try_allocate_dataframe_buffer( + size_t buffer_size, rmm::cuda_stream_view stream_view) +{ + try { + return allocate_dataframe_buffer(buffer_size, stream_view); + } catch (std::exception const& e) { + return std::nullopt; + } +} + +template +struct dataframe_buffer_iterator_type { + using type = typename rmm::device_uvector::iterator; +}; + +template +struct dataframe_buffer_iterator_type> { + using type = thrust::zip_iterator::iterator...>>; +}; + +template +using dataframe_buffer_iterator_type_t = typename dataframe_buffer_iterator_type::type; + +template +struct dataframe_buffer_const_iterator_type { + using type = typename rmm::device_uvector::const_iterator; +}; + +template +struct dataframe_buffer_const_iterator_type> { + using type = + thrust::zip_iterator::const_iterator...>>; +}; + +template +using dataframe_buffer_const_iterator_type_t = + typename dataframe_buffer_const_iterator_type::type; + template void reserve_dataframe_buffer(BufferType& buffer, size_t new_buffer_capacity, @@ -206,30 +253,4 @@ auto get_dataframe_buffer_cend(BufferType& buffer) std::make_index_sequence::value>(), buffer); } -template -struct dataframe_buffer_value_type { - using type = void; -}; - -template -struct dataframe_buffer_value_type> { - using type = T; -}; - -template -struct dataframe_buffer_value_type...>> { - using type = thrust::tuple; -}; - -template -using dataframe_buffer_value_type_t = typename dataframe_buffer_value_type::type; - -template -struct dataframe_buffer_type { - using type = decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{})); -}; - -template -using dataframe_buffer_type_t = typename dataframe_buffer_type::type; - } // namespace cugraph diff --git a/cpp/include/cugraph/utilities/device_comm.hpp b/cpp/include/cugraph/utilities/device_comm.hpp index ffb0f7d9e5..07de2d0646 100644 --- a/cpp/include/cugraph/utilities/device_comm.hpp +++ b/cpp/include/cugraph/utilities/device_comm.hpp @@ -55,7 +55,7 @@ auto iter_to_raw_ptr(thrust::detail::normal_iterator> iter } template -std::enable_if_t::value, void> +std::enable_if_t, void> device_isend_impl(raft::comms::comms_t const& comm, InputIterator input_first, size_t count, @@ -76,7 +76,7 @@ std::enable_if_t::value, void> device_isend_ raft::comms::request_t* request) { static_assert( - std::is_same::value_type, OutputValueType>::value); + std::is_same_v::value_type, OutputValueType>); comm.isend(iter_to_raw_ptr(input_first), count, dst, tag, request); } @@ -136,7 +136,7 @@ device_irecv_impl(raft::comms::comms_t const& comm, { static_assert( - std::is_same::value_type>::value); + std::is_same_v::value_type>); comm.irecv(iter_to_raw_ptr(output_first), count, src, tag, request); } @@ -200,7 +200,7 @@ device_sendrecv_impl(raft::comms::comms_t const& comm, { 
using value_type = typename std::iterator_traits::value_type; static_assert( - std::is_same::value_type, value_type>::value); + std::is_same_v::value_type, value_type>); comm.device_sendrecv(iter_to_raw_ptr(input_first), tx_count, dst, @@ -286,7 +286,7 @@ device_multicast_sendrecv_impl(raft::comms::comms_t const& comm, { using value_type = typename std::iterator_traits::value_type; static_assert( - std::is_same::value_type, value_type>::value); + std::is_same_v::value_type, value_type>); comm.device_multicast_sendrecv(iter_to_raw_ptr(input_first), tx_counts, tx_offsets, @@ -379,8 +379,8 @@ device_bcast_impl(raft::comms::comms_t const& comm, int root, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.bcast( iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, root, stream_view.value()); } @@ -440,8 +440,8 @@ device_allreduce_impl(raft::comms::comms_t const& comm, raft::comms::op_t op, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.allreduce( iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, op, stream_view.value()); } @@ -503,8 +503,8 @@ device_reduce_impl(raft::comms::comms_t const& comm, int root, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.reduce(iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, @@ -548,6 +548,62 @@ struct device_reduce_tuple_iterator_element_impl +std::enable_if_t::value, void> +device_allgather_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) +{ + // no-op +} + +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allgather_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) +{ + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); + comm.allgather( + iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), sendcount, stream_view.value()); +} + +template +struct device_allgather_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) const + { + device_allgather_impl(comm, + thrust::get(input_first.get_iterator_tuple()), + thrust::get(output_first.get_iterator_tuple()), + sendcount, + stream_view); + device_allgather_tuple_iterator_element_impl().run( + comm, input_first, output_first, sendcount, stream_view); + } +}; + +template +struct device_allgather_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) const + { + } +}; + template std::enable_if_t::value, void> device_allgatherv_impl(raft::comms::comms_t const& comm, @@ -571,8 +627,8 @@ device_allgatherv_impl(raft::comms::comms_t const& comm, std::vector const& displacements, 
rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.allgatherv(iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), recvcounts.data(), @@ -639,8 +695,8 @@ device_gatherv_impl(raft::comms::comms_t const& comm, int root, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.gatherv(iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), sendcount, @@ -1000,6 +1056,44 @@ device_reduce(raft::comms::comms_t const& comm, .run(comm, input_first, output_first, count, op, root, stream_view); } +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allgather(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) +{ + detail::device_allgather_impl(comm, input_first, output_first, sendcount, stream_view); +} + +template +std::enable_if_t< + is_thrust_tuple_of_arithmetic::value_type>::value && + is_thrust_tuple::value_type>::value, + void> +device_allgather(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) +{ + static_assert( + thrust::tuple_size::value_type>::value == + thrust::tuple_size::value_type>::value); + + size_t constexpr tuple_size = + thrust::tuple_size::value_type>::value; + + detail::device_allgather_tuple_iterator_element_impl() + .run(comm, input_first, output_first, sendcount, stream_view); +} + template std::enable_if_t< std::is_arithmetic::value_type>::value, diff --git a/cpp/include/cugraph/utilities/misc_utils.cuh b/cpp/include/cugraph/utilities/misc_utils.cuh index 633dabe5b4..91a349007d 100644 --- a/cpp/include/cugraph/utilities/misc_utils.cuh +++ b/cpp/include/cugraph/utilities/misc_utils.cuh @@ -81,7 +81,7 @@ std::tuple, std::vector> compute_offset_aligned_ return std::make_tuple(h_chunk_offsets, h_element_offsets); } else { - return std::make_tuple(std::vector{{0, offsets.size() - 1}}, + return std::make_tuple(std::vector{{0, static_cast(offsets.size() - 1)}}, std::vector{{0, num_elements}}); } } diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index 3cbd35b4bc..98fa2cb170 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -50,6 +51,8 @@ namespace cugraph { namespace detail { +constexpr size_t cache_line_size = 128; + template struct compute_group_id_count_pair_t { GroupIdIterator group_id_first{}; @@ -76,6 +79,7 @@ inline std::tuple, std::vector> compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const& comm, rmm::device_uvector const& d_tx_value_counts, + bool drop_empty_ranks, rmm::cuda_stream_view stream_view) { auto const comm_size = comm.get_size(); @@ -111,28 +115,30 @@ compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const& comm, std::partial_sum(tx_counts.begin(), tx_counts.end() - 1, tx_offsets.begin() + 1); std::partial_sum(rx_counts.begin(), rx_counts.end() - 1, rx_offsets.begin() + 1); - int num_tx_dst_ranks{0}; - int num_rx_src_ranks{0}; - for (int i = 0; i < comm_size; ++i) 
{ - if (tx_counts[i] != 0) { - tx_counts[num_tx_dst_ranks] = tx_counts[i]; - tx_offsets[num_tx_dst_ranks] = tx_offsets[i]; - tx_dst_ranks[num_tx_dst_ranks] = tx_dst_ranks[i]; - ++num_tx_dst_ranks; - } - if (rx_counts[i] != 0) { - rx_counts[num_rx_src_ranks] = rx_counts[i]; - rx_offsets[num_rx_src_ranks] = rx_offsets[i]; - rx_src_ranks[num_rx_src_ranks] = rx_src_ranks[i]; - ++num_rx_src_ranks; + if (drop_empty_ranks) { + int num_tx_dst_ranks{0}; + int num_rx_src_ranks{0}; + for (int i = 0; i < comm_size; ++i) { + if (tx_counts[i] != 0) { + tx_counts[num_tx_dst_ranks] = tx_counts[i]; + tx_offsets[num_tx_dst_ranks] = tx_offsets[i]; + tx_dst_ranks[num_tx_dst_ranks] = tx_dst_ranks[i]; + ++num_tx_dst_ranks; + } + if (rx_counts[i] != 0) { + rx_counts[num_rx_src_ranks] = rx_counts[i]; + rx_offsets[num_rx_src_ranks] = rx_offsets[i]; + rx_src_ranks[num_rx_src_ranks] = rx_src_ranks[i]; + ++num_rx_src_ranks; + } } + tx_counts.resize(num_tx_dst_ranks); + tx_offsets.resize(num_tx_dst_ranks); + tx_dst_ranks.resize(num_tx_dst_ranks); + rx_counts.resize(num_rx_src_ranks); + rx_offsets.resize(num_rx_src_ranks); + rx_src_ranks.resize(num_rx_src_ranks); } - tx_counts.resize(num_tx_dst_ranks); - tx_offsets.resize(num_tx_dst_ranks); - tx_dst_ranks.resize(num_tx_dst_ranks); - rx_counts.resize(num_rx_src_ranks); - rx_offsets.resize(num_rx_src_ranks); - rx_src_ranks.resize(num_rx_src_ranks); return std::make_tuple(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks); } @@ -823,6 +829,8 @@ auto shuffle_values(raft::comms::comms_t const& comm, std::vector const& tx_value_counts, rmm::cuda_stream_view stream_view) { + using value_t = typename thrust::iterator_traits::value_type; + auto const comm_size = comm.get_size(); rmm::device_uvector d_tx_value_counts(comm_size, stream_view); @@ -836,11 +844,10 @@ auto shuffle_values(raft::comms::comms_t const& comm, std::vector rx_offsets{}; std::vector rx_src_ranks{}; std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = - detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, true, stream_view); - auto rx_value_buffer = - allocate_dataframe_buffer::value_type>( - rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); + auto rx_value_buffer = allocate_dataframe_buffer( + rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); // (if num_tx_dst_ranks == num_rx_src_ranks == comm_size). device_multicast_sendrecv(comm, @@ -866,6 +873,236 @@ auto shuffle_values(raft::comms::comms_t const& comm, return std::make_tuple(std::move(rx_value_buffer), rx_counts); } +// Add gaps in the receive buffer to enforce that the sent data offset and the received data offset +// have the same alignment for every rank. 
This is faster assuming that @p alignment ensures cache +// line alignment in both send & receive buffer (tested with NCCL 2.23.4) +template +auto shuffle_values( + raft::comms::comms_t const& comm, + TxValueIterator tx_value_first, + std::vector const& tx_value_counts, + size_t alignment, // # elements + std::optional::value_type> fill_value, + rmm::cuda_stream_view stream_view) +{ + using value_t = typename thrust::iterator_traits::value_type; + + auto const comm_size = comm.get_size(); + + std::vector tx_value_displacements(tx_value_counts.size()); + std::exclusive_scan( + tx_value_counts.begin(), tx_value_counts.end(), tx_value_displacements.begin(), size_t{0}); + + std::vector tx_unaligned_counts(comm_size); + std::vector tx_displacements(comm_size); + std::vector tx_aligned_counts(comm_size); + std::vector tx_aligned_displacements(comm_size); + std::vector rx_unaligned_counts(comm_size); + std::vector rx_displacements(comm_size); + std::vector rx_aligned_counts(comm_size); + std::vector rx_aligned_displacements(comm_size); + std::vector tx_ranks(comm_size); + std::iota(tx_ranks.begin(), tx_ranks.end(), int{0}); + auto rx_ranks = tx_ranks; + for (size_t i = 0; i < tx_value_counts.size(); ++i) { + tx_unaligned_counts[i] = 0; + if (tx_value_displacements[i] % alignment != 0) { + tx_unaligned_counts[i] = + std::min(alignment - (tx_value_displacements[i] % alignment), tx_value_counts[i]); + } + tx_displacements[i] = tx_value_displacements[i]; + tx_aligned_counts[i] = tx_value_counts[i] - tx_unaligned_counts[i]; + tx_aligned_displacements[i] = tx_value_displacements[i] + tx_unaligned_counts[i]; + } + + rmm::device_uvector d_tx_unaligned_counts(tx_unaligned_counts.size(), stream_view); + rmm::device_uvector d_tx_aligned_counts(tx_aligned_counts.size(), stream_view); + rmm::device_uvector d_rx_unaligned_counts(rx_unaligned_counts.size(), stream_view); + rmm::device_uvector d_rx_aligned_counts(rx_aligned_counts.size(), stream_view); + raft::update_device(d_tx_unaligned_counts.data(), + tx_unaligned_counts.data(), + tx_unaligned_counts.size(), + stream_view); + raft::update_device( + d_tx_aligned_counts.data(), tx_aligned_counts.data(), tx_aligned_counts.size(), stream_view); + std::vector tx_counts(comm_size, size_t{1}); + std::vector tx_offsets(comm_size); + std::iota(tx_offsets.begin(), tx_offsets.end(), size_t{0}); + auto rx_counts = tx_counts; + auto rx_offsets = tx_offsets; + cugraph::device_multicast_sendrecv(comm, + d_tx_unaligned_counts.data(), + tx_counts, + tx_offsets, + tx_ranks, + d_rx_unaligned_counts.data(), + rx_counts, + rx_offsets, + rx_ranks, + stream_view); + cugraph::device_multicast_sendrecv(comm, + d_tx_aligned_counts.data(), + tx_counts, + tx_offsets, + tx_ranks, + d_rx_aligned_counts.data(), + rx_counts, + rx_offsets, + rx_ranks, + stream_view); + raft::update_host(rx_unaligned_counts.data(), + d_rx_unaligned_counts.data(), + d_rx_unaligned_counts.size(), + stream_view); + raft::update_host( + rx_aligned_counts.data(), d_rx_aligned_counts.data(), d_rx_aligned_counts.size(), stream_view); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view)); + size_t offset{0}; + for (size_t i = 0; i < rx_counts.size(); ++i) { + auto target_alignment = (alignment - rx_unaligned_counts[i]) % alignment; + auto cur_alignment = offset % alignment; + if (target_alignment >= cur_alignment) { + offset += target_alignment - cur_alignment; + } else { + offset += (target_alignment + alignment) - cur_alignment; + } + rx_displacements[i] = offset; + rx_aligned_displacements[i] = rx_displacements[i] 
+ rx_unaligned_counts[i];
+    offset = rx_aligned_displacements[i] + rx_aligned_counts[i];
+  }
+
+  auto rx_values = allocate_dataframe_buffer(
+    rx_aligned_displacements.back() + rx_aligned_counts.back(), stream_view);
+  if (fill_value) {
+    thrust::fill(rmm::exec_policy_nosync(stream_view),
+                 get_dataframe_buffer_begin(rx_values),
+                 get_dataframe_buffer_end(rx_values),
+                 *fill_value);
+  }
+  cugraph::device_multicast_sendrecv(comm,
+                                     tx_value_first,
+                                     tx_unaligned_counts,
+                                     tx_displacements,
+                                     tx_ranks,
+                                     get_dataframe_buffer_begin(rx_values),
+                                     rx_unaligned_counts,
+                                     rx_displacements,
+                                     rx_ranks,
+                                     stream_view);
+  cugraph::device_multicast_sendrecv(comm,
+                                     tx_value_first,
+                                     tx_aligned_counts,
+                                     tx_aligned_displacements,
+                                     tx_ranks,
+                                     get_dataframe_buffer_begin(rx_values),
+                                     rx_aligned_counts,
+                                     rx_aligned_displacements,
+                                     rx_ranks,
+                                     stream_view);
+
+  return std::make_tuple(std::move(rx_values),
+                         tx_unaligned_counts,
+                         tx_aligned_counts,
+                         tx_displacements,
+                         rx_unaligned_counts,
+                         rx_aligned_counts,
+                         rx_displacements);
+}
+
+// this uses less memory than calling shuffle_values then sort & unique but requires comm.get_size()
+// - 1 communication steps
+template
+auto shuffle_and_unique_segment_sorted_values(
+  raft::comms::comms_t const& comm,
+  TxValueIterator
+    segment_sorted_tx_value_first,  // sorted within each segment (segment sizes:
+                                    // tx_value_counts[i], where i = [0, comm_size); and better be
+                                    // unique to reduce communication volume
+  std::vector const& tx_value_counts,
+  rmm::cuda_stream_view stream_view)
+{
+  using value_t = typename thrust::iterator_traits::value_type;
+
+  auto const comm_rank = comm.get_rank();
+  auto const comm_size = comm.get_size();
+
+  auto sorted_unique_values = allocate_dataframe_buffer(0, stream_view);
+  if (comm_size == 1) {
+    resize_dataframe_buffer(sorted_unique_values, tx_value_counts[comm_rank], stream_view);
+    thrust::copy(rmm::exec_policy_nosync(stream_view),
+                 segment_sorted_tx_value_first,
+                 segment_sorted_tx_value_first + tx_value_counts[comm_rank],
+                 get_dataframe_buffer_begin(sorted_unique_values));
+    resize_dataframe_buffer(
+      sorted_unique_values,
+      thrust::distance(get_dataframe_buffer_begin(sorted_unique_values),
+                       thrust::unique(rmm::exec_policy_nosync(stream_view),
+                                      get_dataframe_buffer_begin(sorted_unique_values),
+                                      get_dataframe_buffer_end(sorted_unique_values))),
+      stream_view);
+  } else {
+    rmm::device_uvector d_tx_value_counts(comm_size, stream_view);
+    raft::update_device(
+      d_tx_value_counts.data(), tx_value_counts.data(), comm_size, stream_view.value());
+
+    std::vector tx_counts{};
+    std::vector tx_offsets{};
+    std::vector rx_counts{};
+    std::vector rx_offsets{};
+    std::tie(tx_counts, tx_offsets, std::ignore, rx_counts, rx_offsets, std::ignore) =
+      detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, false, stream_view);
+
+    d_tx_value_counts.resize(0, stream_view);
+    d_tx_value_counts.shrink_to_fit(stream_view);
+
+    for (int i = 1; i < comm_size; ++i) {
+      auto dst = (comm_rank + i) % comm_size;
+      auto src =
+        static_cast((static_cast(comm_rank) + static_cast(comm_size - i)) %
+                    static_cast(comm_size));
+      auto rx_sorted_values = allocate_dataframe_buffer(rx_counts[src], stream_view);
+      device_sendrecv(comm,
+                      segment_sorted_tx_value_first + tx_offsets[dst],
+                      tx_counts[dst],
+                      dst,
+                      get_dataframe_buffer_begin(rx_sorted_values),
+                      rx_counts[src],
+                      src,
+                      stream_view);
+      auto merged_sorted_values = allocate_dataframe_buffer(
+        (i == 1 ?
tx_counts[comm_rank] : size_dataframe_buffer(sorted_unique_values)) + + rx_counts[src], + stream_view); + if (i == 1) { + thrust::merge( + rmm::exec_policy_nosync(stream_view), + segment_sorted_tx_value_first + tx_offsets[comm_rank], + segment_sorted_tx_value_first + (tx_offsets[comm_rank] + tx_counts[comm_rank]), + get_dataframe_buffer_begin(rx_sorted_values), + get_dataframe_buffer_end(rx_sorted_values), + get_dataframe_buffer_begin(merged_sorted_values)); + } else { + thrust::merge(rmm::exec_policy_nosync(stream_view), + get_dataframe_buffer_begin(sorted_unique_values), + get_dataframe_buffer_end(sorted_unique_values), + get_dataframe_buffer_begin(rx_sorted_values), + get_dataframe_buffer_end(rx_sorted_values), + get_dataframe_buffer_begin(merged_sorted_values)); + } + resize_dataframe_buffer( + merged_sorted_values, + thrust::distance(get_dataframe_buffer_begin(merged_sorted_values), + thrust::unique(rmm::exec_policy_nosync(stream_view), + get_dataframe_buffer_begin(merged_sorted_values), + get_dataframe_buffer_end(merged_sorted_values))), + stream_view); + sorted_unique_values = std::move(merged_sorted_values); + } + } + shrink_to_fit_dataframe_buffer(sorted_unique_values, stream_view); + return sorted_unique_values; +} + template auto groupby_gpu_id_and_shuffle_values(raft::comms::comms_t const& comm, ValueIterator tx_value_first /* [INOUT */, @@ -889,7 +1126,7 @@ auto groupby_gpu_id_and_shuffle_values(raft::comms::comms_t const& comm, std::vector rx_offsets{}; std::vector rx_src_ranks{}; std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = - detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, true, stream_view); auto rx_value_buffer = allocate_dataframe_buffer::value_type>( @@ -943,7 +1180,7 @@ auto groupby_gpu_id_and_shuffle_kv_pairs(raft::comms::comms_t const& comm, std::vector rx_offsets{}; std::vector rx_src_ranks{}; std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = - detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, true, stream_view); rmm::device_uvector::value_type> rx_keys( rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); diff --git a/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp b/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp index 2c36ed3335..29b9d132ef 100644 --- a/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp +++ b/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp @@ -64,6 +64,18 @@ size_t sum_thrust_tuple_element_sizes(std::index_sequence) return (... 
+ sizeof(typename thrust::tuple_element::type)); } +template +size_t min_thrust_tuple_element_sizes(std::index_sequence) +{ + return std::min(sizeof(typename thrust::tuple_element::type)...); +} + +template +size_t max_thrust_tuple_element_sizes(std::index_sequence) +{ + return std::max(sizeof(typename thrust::tuple_element::type)...); +} + template auto thrust_tuple_to_std_tuple(TupleType tup, std::index_sequence) { @@ -181,6 +193,20 @@ constexpr size_t sum_thrust_tuple_element_sizes() std::make_index_sequence::value>()); } +template +constexpr size_t min_thrust_tuple_element_sizes() +{ + return detail::min_thrust_tuple_element_sizes( + std::make_index_sequence::value>()); +} + +template +constexpr size_t max_thrust_tuple_element_sizes() +{ + return detail::max_thrust_tuple_element_sizes( + std::make_index_sequence::value>()); +} + template auto thrust_tuple_to_std_tuple(TupleType tup) { diff --git a/cpp/src/centrality/betweenness_centrality_impl.cuh b/cpp/src/centrality/betweenness_centrality_impl.cuh index 8ae49ed207..88ef3987a0 100644 --- a/cpp/src/centrality/betweenness_centrality_impl.cuh +++ b/cpp/src/centrality/betweenness_centrality_impl.cuh @@ -23,7 +23,7 @@ #include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh" #include "prims/transform_e.cuh" #include "prims/transform_reduce_v.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -133,15 +133,15 @@ std::tuple, rmm::device_uvector> brandes_b update_edge_src_property(handle, graph_view, sigmas.begin(), src_sigmas.mutable_view()); update_edge_dst_property(handle, graph_view, distances.begin(), dst_distances.mutable_view()); - auto [new_frontier, new_sigma] = - transform_reduce_v_frontier_outgoing_e_by_dst(handle, - graph_view, - vertex_frontier.bucket(bucket_idx_cur), - src_sigmas.view(), - dst_distances.view(), - cugraph::edge_dummy_property_t{}.view(), - brandes_e_op_t{}, - reduce_op::plus()); + auto [new_frontier, new_sigma] = cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + src_sigmas.view(), + dst_distances.view(), + cugraph::edge_dummy_property_t{}.view(), + brandes_e_op_t{}, + reduce_op::plus()); update_v_frontier(handle, graph_view, diff --git a/cpp/src/community/approx_weighted_matching_impl.cuh b/cpp/src/community/approx_weighted_matching_impl.cuh index a0ccfa52ff..869ed4e7ae 100644 --- a/cpp/src/community/approx_weighted_matching_impl.cuh +++ b/cpp/src/community/approx_weighted_matching_impl.cuh @@ -243,11 +243,12 @@ std::tuple, weight_t> approximate_weighted_matchin major_comm_size, minor_comm_size}; - candidates_of_candidates = cugraph::collect_values_for_keys(handle, + candidates_of_candidates = cugraph::collect_values_for_keys(comm, target_candidate_map.view(), candidates.begin(), candidates.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); } else { candidates_of_candidates.resize(candidates.size(), handle.get_stream()); diff --git a/cpp/src/community/detail/common_methods.cuh b/cpp/src/community/detail/common_methods.cuh index e17abdb370..18fb3fdb25 100644 --- a/cpp/src/community/detail/common_methods.cuh +++ b/cpp/src/community/detail/common_methods.cuh @@ -289,11 +289,12 @@ rmm::device_uvector update_clustering_by_delta_modularity( invalid_vertex_id::value, std::numeric_limits::max(), 
handle.get_stream()); - vertex_cluster_weights_v = cugraph::collect_values_for_keys(handle, + vertex_cluster_weights_v = cugraph::collect_values_for_keys(comm, cluster_key_weight_map.view(), next_clusters_v.begin(), next_clusters_v.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); src_cluster_weights = edge_src_property_t, weight_t>(handle, diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh index 62b66ed5f4..d69c1463ed 100644 --- a/cpp/src/community/detail/refine_impl.cuh +++ b/cpp/src/community/detail/refine_impl.cuh @@ -181,11 +181,12 @@ refine_clustering( comm_size, major_comm_size, minor_comm_size}; vertex_louvain_cluster_weights = - cugraph::collect_values_for_keys(handle, + cugraph::collect_values_for_keys(comm, cluster_key_weight_map.view(), louvain_assignment_of_vertices.begin(), louvain_assignment_of_vertices.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); } else { vertex_louvain_cluster_weights.resize(louvain_assignment_of_vertices.size(), @@ -473,11 +474,12 @@ refine_clustering( // comm_size, major_comm_size, minor_comm_size}; louvain_of_leiden_keys_used_in_edge_reduction = - cugraph::collect_values_for_keys(handle, + cugraph::collect_values_for_keys(comm, leiden_to_louvain_map.view(), leiden_keys_used_in_edge_reduction.begin(), leiden_keys_used_in_edge_reduction.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); } else { louvain_of_leiden_keys_used_in_edge_reduction.resize( leiden_keys_used_in_edge_reduction.size(), handle.get_stream()); @@ -864,11 +866,12 @@ refine_clustering( // comm_size, major_comm_size, minor_comm_size}; lovain_of_leiden_cluster_keys = - cugraph::collect_values_for_keys(handle, + cugraph::collect_values_for_keys(comm, leiden_to_louvain_map.view(), leiden_keys_to_read_louvain.begin(), leiden_keys_to_read_louvain.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); } else { lovain_of_leiden_cluster_keys.resize(leiden_keys_to_read_louvain.size(), handle.get_stream()); diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh index 468f4f7280..219bc3c4d1 100644 --- a/cpp/src/components/weakly_connected_components_impl.cuh +++ b/cpp/src/components/weakly_connected_components_impl.cuh @@ -16,7 +16,7 @@ #pragma once #include "prims/fill_edge_src_dst_property.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -550,24 +550,25 @@ void weakly_connected_components_impl(raft::handle_t const& handle, auto old_num_edge_inserts = num_edge_inserts.value(handle.get_stream()); resize_dataframe_buffer(edge_buffer, old_num_edge_inserts + max_pushes, handle.get_stream()); - auto new_frontier_tagged_vertex_buffer = transform_reduce_v_frontier_outgoing_e_by_dst( - handle, - level_graph_view, - vertex_frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - e_op_t{ - GraphViewType::is_multi_gpu - ? 
detail::edge_partition_endpoint_property_device_view_t( - edge_dst_components.mutable_view()) - : detail::edge_partition_endpoint_property_device_view_t( - detail::edge_minor_property_view_t(level_components, - vertex_t{0})), - level_graph_view.local_edge_partition_dst_range_first(), - get_dataframe_buffer_begin(edge_buffer), - num_edge_inserts.data()}, - reduce_op::null()); + auto new_frontier_tagged_vertex_buffer = + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + handle, + level_graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op_t{ + GraphViewType::is_multi_gpu + ? detail::edge_partition_endpoint_property_device_view_t( + edge_dst_components.mutable_view()) + : detail::edge_partition_endpoint_property_device_view_t( + detail::edge_minor_property_view_t(level_components, + vertex_t{0})), + level_graph_view.local_edge_partition_dst_range_first(), + get_dataframe_buffer_begin(edge_buffer), + num_edge_inserts.data()}, + reduce_op::null()); update_v_frontier(handle, level_graph_view, diff --git a/cpp/src/cores/core_number_impl.cuh b/cpp/src/cores/core_number_impl.cuh index d807ccac5a..a2b6f6430f 100644 --- a/cpp/src/cores/core_number_impl.cuh +++ b/cpp/src/cores/core_number_impl.cuh @@ -16,7 +16,7 @@ #pragma once #include "prims/reduce_v.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -222,14 +222,15 @@ void core_number(raft::handle_t const& handle, if (graph_view.is_symmetric() || ((degree_type == k_core_degree_type_t::IN) || (degree_type == k_core_degree_type_t::INOUT))) { auto [new_frontier_vertex_buffer, delta_buffer] = - transform_reduce_v_frontier_outgoing_e_by_dst(handle, - graph_view, - vertex_frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - dst_core_numbers.view(), - edge_dummy_property_t{}.view(), - e_op_t{k, delta}, - reduce_op::plus()); + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + dst_core_numbers.view(), + edge_dummy_property_t{}.view(), + e_op_t{k, delta}, + reduce_op::plus()); update_v_frontier( handle, diff --git a/cpp/src/lookup/lookup_src_dst_impl.cuh b/cpp/src/lookup/lookup_src_dst_impl.cuh index 1c8c39fd6d..45bbf870d8 100644 --- a/cpp/src/lookup/lookup_src_dst_impl.cuh +++ b/cpp/src/lookup/lookup_src_dst_impl.cuh @@ -115,12 +115,13 @@ struct lookup_container_t::lookup_con auto const minor_comm_size = minor_comm.get_size(); value_buffer = cugraph::collect_values_for_keys( - handle, + comm, kv_store_object->view(), edge_ids_to_lookup.begin(), edge_ids_to_lookup.end(), cugraph::detail::compute_gpu_id_from_ext_edge_id_t{ - comm_size, major_comm_size, minor_comm_size}); + comm_size, major_comm_size, minor_comm_size}, + handle.get_stream()); } else { cugraph::resize_dataframe_buffer( value_buffer, edge_ids_to_lookup.size(), handle.get_stream()); diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 177c79ace8..2b89d214fd 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -15,9 +15,11 @@ */ #pragma once +#include 
"prims/detail/multi_stream_utils.cuh" #include "prims/detail/optional_dataframe_buffer.hpp" #include "prims/detail/prim_functors.cuh" #include "prims/property_op_utils.cuh" +#include "prims/vertex_frontier.cuh" #include #include @@ -72,9 +74,9 @@ __device__ void push_buffer_element(BufferKeyOutputIterator buffer_key_output_fi e_op_result_t e_op_result) { using output_key_t = - typename optional_dataframe_buffer_value_type_t::value; + typename optional_dataframe_buffer_iterator_value_type_t::value; using output_value_t = - typename optional_dataframe_buffer_value_type_t::value; + typename optional_dataframe_buffer_iterator_value_type_t::value; assert(e_op_result.has_value()); @@ -118,7 +120,6 @@ __device__ void warp_push_buffer_elements( } template buffer_idx(*buffer_idx_ptr); - int32_t constexpr shared_array_size = max_one_e_per_frontier_key - ? int32_t{1} /* dummy */ - : extract_transform_v_frontier_e_kernel_block_size; - __shared__ std::conditional_t - warp_local_degree_inclusive_sums[shared_array_size]; - __shared__ std::conditional_t - warp_key_local_edge_offsets[shared_array_size]; + __shared__ edge_t + warp_local_degree_inclusive_sums[extract_transform_v_frontier_e_kernel_block_size]; + __shared__ edge_t warp_key_local_edge_offsets[extract_transform_v_frontier_e_kernel_block_size]; using WarpScan = cub::WarpScan; - __shared__ std:: - conditional_t - temp_storage; + __shared__ typename WarpScan::TempStorage temp_storage; auto indices = edge_partition.indices(); @@ -216,98 +211,74 @@ __global__ static void extract_transform_v_frontier_e_hypersparse_or_low_degree( } } - if constexpr (max_one_e_per_frontier_key) { - // each thread processes one frontier key, exits if any edge returns a valid output + auto min_key_idx = static_cast(idx - (idx % raft::warp_size())); // inclusive + auto max_key_idx = + static_cast(std::min(static_cast(min_key_idx) + raft::warp_size(), + static_cast(num_keys))); // exclusive - e_op_result_t e_op_result{thrust::nullopt}; - auto key = *(key_first + idx); + // update warp_local_degree_inclusive_sums & warp_key_local_edge_offsets - if (edge_partition_e_mask) { - for (edge_t i = 0; i < local_degree; ++i) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - e_op_result = call_e_op(key, edge_offset + i); - if (e_op_result) { break; } - } - } - } else { - for (edge_t i = 0; i < local_degree; ++i) { - e_op_result = call_e_op(key, edge_offset + i); - if (e_op_result) { break; } - } - } - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } else { - auto min_key_idx = static_cast(idx - (idx % raft::warp_size())); // inclusive - auto max_key_idx = - static_cast(std::min(static_cast(min_key_idx) + raft::warp_size(), - static_cast(num_keys))); // exclusive - - // update warp_local_degree_inclusive_sums & warp_key_local_edge_offsets - - warp_key_local_edge_offsets[threadIdx.x] = edge_offset; - WarpScan(temp_storage) - .InclusiveSum(local_degree, warp_local_degree_inclusive_sums[threadIdx.x]); - __syncwarp(); + warp_key_local_edge_offsets[threadIdx.x] = edge_offset; + WarpScan(temp_storage) + .InclusiveSum(local_degree, warp_local_degree_inclusive_sums[threadIdx.x]); + __syncwarp(); - // all the threads in a warp collectively process local edges for the keys in [key_first + - // min_key_idx, key_first + max_key_idx) + // all the threads in a warp collectively process local edges for the keys in [key_first + + // min_key_idx, key_first + max_key_idx) - auto num_edges_this_warp = 
warp_local_degree_inclusive_sums[warp_id * raft::warp_size() + - (max_key_idx - min_key_idx) - 1]; - auto rounded_up_num_edges_this_warp = - ((static_cast(num_edges_this_warp) + (raft::warp_size() - 1)) / raft::warp_size()) * - raft::warp_size(); + auto num_edges_this_warp = warp_local_degree_inclusive_sums[warp_id * raft::warp_size() + + (max_key_idx - min_key_idx) - 1]; + auto rounded_up_num_edges_this_warp = + ((static_cast(num_edges_this_warp) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); - auto this_warp_inclusive_sum_first = - warp_local_degree_inclusive_sums + warp_id * raft::warp_size(); - auto this_warp_inclusive_sum_last = - this_warp_inclusive_sum_first + (max_key_idx - min_key_idx); + auto this_warp_inclusive_sum_first = + warp_local_degree_inclusive_sums + warp_id * raft::warp_size(); + auto this_warp_inclusive_sum_last = this_warp_inclusive_sum_first + (max_key_idx - min_key_idx); - if (edge_partition_e_mask) { - for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { - e_op_result_t e_op_result{thrust::nullopt}; - - if (i < static_cast(num_edges_this_warp)) { - auto key_idx_this_warp = static_cast(thrust::distance( - this_warp_inclusive_sum_first, - thrust::upper_bound( - thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); - auto local_edge_offset = - warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + - static_cast(i - ((key_idx_this_warp == 0) ? edge_t{0} - : *(this_warp_inclusive_sum_first + - (key_idx_this_warp - 1)))); - if ((*edge_partition_e_mask).get(local_edge_offset)) { - auto key = *(key_first + (min_key_idx + key_idx_this_warp)); - e_op_result = call_e_op(key, local_edge_offset); - } - } + if (edge_partition_e_mask) { + for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { + e_op_result_t e_op_result{thrust::nullopt}; - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } - } else { - for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { - e_op_result_t e_op_result{thrust::nullopt}; - - if (i < static_cast(num_edges_this_warp)) { - auto key_idx_this_warp = static_cast(thrust::distance( - this_warp_inclusive_sum_first, - thrust::upper_bound( - thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); - auto local_edge_offset = - warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + - static_cast(i - ((key_idx_this_warp == 0) ? edge_t{0} - : *(this_warp_inclusive_sum_first + - (key_idx_this_warp - 1)))); + if (i < static_cast(num_edges_this_warp)) { + auto key_idx_this_warp = static_cast(thrust::distance( + this_warp_inclusive_sum_first, + thrust::upper_bound( + thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); + auto local_edge_offset = + warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + + static_cast(i - ((key_idx_this_warp == 0) ? 
edge_t{0} + : *(this_warp_inclusive_sum_first + + (key_idx_this_warp - 1)))); + if ((*edge_partition_e_mask).get(local_edge_offset)) { auto key = *(key_first + (min_key_idx + key_idx_this_warp)); e_op_result = call_e_op(key, local_edge_offset); } + } - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + } + } else { + for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { + e_op_result_t e_op_result{thrust::nullopt}; + + if (i < static_cast(num_edges_this_warp)) { + auto key_idx_this_warp = static_cast(thrust::distance( + this_warp_inclusive_sum_first, + thrust::upper_bound( + thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); + auto local_edge_offset = + warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + + static_cast(i - ((key_idx_this_warp == 0) ? edge_t{0} + : *(this_warp_inclusive_sum_first + + (key_idx_this_warp - 1)))); + auto key = *(key_first + (min_key_idx + key_idx_this_warp)); + e_op_result = call_e_op(key, local_edge_offset); } + + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } } @@ -315,8 +286,7 @@ __global__ static void extract_transform_v_frontier_e_hypersparse_or_low_degree( } } -template buffer_idx(*buffer_idx_ptr); - using WarpReduce = cub::WarpReduce; - __shared__ std::conditional_t - temp_storage[max_one_e_per_frontier_key - ? (extract_transform_v_frontier_e_kernel_block_size / raft::warp_size()) - : int32_t{1} /* dummy */]; - while (idx < static_cast(thrust::distance(key_first, key_last))) { auto key = *(key_first + idx); auto major = thrust_tuple_get_or_identity(key); auto major_offset = edge_partition.major_offset_from_major_nocheck(major); vertex_t const* indices{nullptr}; edge_t local_edge_offset{}; - edge_t local_out_degree{}; - thrust::tie(indices, local_edge_offset, local_out_degree) = + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = edge_partition.local_edges(major_offset); - auto rounded_up_local_out_degree = - ((static_cast(local_out_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * + auto rounded_up_local_degree = + ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * raft::warp_size(); auto call_e_op = call_e_op_t(local_out_degree)) && + if ((i < static_cast(local_degree)) && ((*edge_partition_e_mask).get(local_edge_offset + i))) { e_op_result = call_e_op(i); } - if constexpr (max_one_e_per_frontier_key) { - auto first_valid_lane_id = - WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) - .Reduce(e_op_result ? 
lane_id : raft::warp_size(), cub::Min()); - first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); - if (lane_id == first_valid_lane_id) { - auto push_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); - push_buffer_element( - buffer_key_output_first, buffer_value_output_first, push_idx, e_op_result); - } - if (first_valid_lane_id != raft::warp_size()) { break; } - } else { - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } } else { - for (size_t i = lane_id; i < rounded_up_local_out_degree; i += raft::warp_size()) { + for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { e_op_result_t e_op_result{thrust::nullopt}; - if (i < static_cast(local_out_degree)) { e_op_result = call_e_op(i); } - - if constexpr (max_one_e_per_frontier_key) { - auto first_valid_lane_id = - WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) - .Reduce(e_op_result ? lane_id : raft::warp_size(), cub::Min()); - first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); - if (lane_id == first_valid_lane_id) { - auto push_buffer_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); - push_buffer_element( - buffer_key_output_first, buffer_value_output_first, push_buffer_idx, e_op_result); - } - if (first_valid_lane_id != raft::warp_size()) { break; } - } else { - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } + if (i < static_cast(local_degree)) { e_op_result = call_e_op(i); } + + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } } @@ -446,8 +382,7 @@ __global__ static void extract_transform_v_frontier_e_mid_degree( } } -template edge_partition, KeyIterator key_first, - KeyIterator key_last, + raft::device_span key_local_degree_offsets, EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, @@ -482,123 +417,234 @@ __global__ static void extract_transform_v_frontier_e_high_degree( typename EdgePartitionEdgeValueInputWrapper::value_type, EdgeOp>::type; - auto const warp_id = threadIdx.x / raft::warp_size(); + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; auto const lane_id = threadIdx.x % raft::warp_size(); - auto idx = static_cast(blockIdx.x); - - cuda::atomic_ref buffer_idx(*buffer_idx_ptr); - - using BlockReduce = cub::BlockReduce; - __shared__ std::conditional_t - temp_storage; - __shared__ int32_t output_thread_id; - - while (idx < static_cast(thrust::distance(key_first, key_last))) { - auto key = *(key_first + idx); - auto major = thrust_tuple_get_or_identity(key); - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - vertex_t const* indices{nullptr}; - edge_t local_edge_offset{}; - edge_t local_out_degree{}; - thrust::tie(indices, local_edge_offset, local_out_degree) = - edge_partition.local_edges(major_offset); - auto rounded_up_local_out_degree = ((static_cast(local_out_degree) + - (extract_transform_v_frontier_e_kernel_block_size - 1)) / - extract_transform_v_frontier_e_kernel_block_size) * - extract_transform_v_frontier_e_kernel_block_size; - auto call_e_op = call_e_op_t{edge_partition, - 
edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - key, - major_offset, - indices, - local_edge_offset}; + auto idx = static_cast(tid); - if (edge_partition_e_mask) { - for (size_t i = threadIdx.x; i < rounded_up_local_out_degree; i += blockDim.x) { - e_op_result_t e_op_result{thrust::nullopt}; - if ((i < static_cast(local_out_degree)) && - ((*edge_partition_e_mask).get(local_edge_offset + i))) { - e_op_result = call_e_op(i); - } + cuda::atomic_ref buffer_idx(*buffer_idx_ptr); - if constexpr (max_one_e_per_frontier_key) { - auto first_valid_thread_id = - BlockReduce(temp_storage) - .Reduce(e_op_result ? threadIdx.x : extract_transform_v_frontier_e_kernel_block_size, - cub::Min()); - if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } - __syncthreads(); - if (threadIdx.x == output_thread_id) { - auto push_buffer_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); - push_buffer_element( - buffer_key_output_first, buffer_value_output_first, push_buffer_idx, e_op_result); - } - if (output_thread_id != extract_transform_v_frontier_e_kernel_block_size) { break; } - } else { - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } - } - } else { - for (size_t i = threadIdx.x; i < rounded_up_local_out_degree; i += blockDim.x) { - e_op_result_t e_op_result{thrust::nullopt}; - if (i < static_cast(local_out_degree)) { e_op_result = call_e_op(i); } - - if constexpr (max_one_e_per_frontier_key) { - auto first_valid_thread_id = - BlockReduce(temp_storage) - .Reduce(e_op_result ? threadIdx.x : extract_transform_v_frontier_e_kernel_block_size, - cub::Min()); - if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } - __syncthreads(); - if (threadIdx.x == output_thread_id) { - auto push_buffer_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); - push_buffer_element( - buffer_key_output_first, buffer_value_output_first, push_buffer_idx, e_op_result); - } - if (output_thread_id != extract_transform_v_frontier_e_kernel_block_size) { break; } - } else { - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + auto num_edges = *(key_local_degree_offsets.rbegin()); + size_t rounded_up_num_edges = + ((static_cast(num_edges) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); + while (idx < rounded_up_num_edges) { + e_op_result_t e_op_result{thrust::nullopt}; + if (idx < num_edges) { + auto key_idx = thrust::distance( + key_local_degree_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, key_local_degree_offsets.begin() + 1, key_local_degree_offsets.end(), idx)); + auto key = *(key_first + key_idx); + auto major = thrust_tuple_get_or_identity(key); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t local_edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = + edge_partition.local_edges(major_offset); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + local_edge_offset}; + + auto e_idx = static_cast(idx - key_local_degree_offsets[key_idx]); + if (edge_partition_e_mask) { + if ((*edge_partition_e_mask).get(local_edge_offset + e_idx)) { + e_op_result = call_e_op(e_idx); } + } else { + e_op_result = 
call_e_op(e_idx); } } + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - idx += gridDim.x; + idx += gridDim.x * blockDim.x; + } +} + +template +void extract_transform_v_frontier_e_edge_partition( + raft::handle_t const& handle, + edge_partition_device_view_t edge_partition, + InputKeyIterator edge_partition_frontier_key_first, + InputKeyIterator edge_partition_frontier_key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + OptionalOutputKeyIterator output_key_first, + OptionalOutputValueIterator output_value_first, + raft::device_span count /* size = 1 */, + EdgeOp e_op, + std::optional> high_segment_key_local_degree_offsets, + std::optional high_segment_edge_count, + std::optional> key_segment_offsets, + std::optional> const& edge_partition_stream_pool_indices) +{ + size_t stream_pool_size{0}; + if (edge_partition_stream_pool_indices) { + stream_pool_size = (*edge_partition_stream_pool_indices).size(); + } + if (key_segment_offsets) { + if (((*key_segment_offsets)[1] > 0) && ((*high_segment_edge_count) > 0)) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[0 % stream_pool_size]) + : handle.get_stream(); + + raft::grid_1d_thread_t update_grid((*high_segment_edge_count), + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_high_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first, + raft::device_span((*high_segment_key_local_degree_offsets).data(), + (*high_segment_key_local_degree_offsets).size()), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } + if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[1 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_warp_t update_grid((*key_segment_offsets)[2] - (*key_segment_offsets)[1], + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_mid_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first + (*key_segment_offsets)[1], + edge_partition_frontier_key_first + (*key_segment_offsets)[2], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } + if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2] > 0) { + auto exec_stream = edge_partition_stream_pool_indices + ? 
handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[2 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_thread_t update_grid((*key_segment_offsets)[3] - (*key_segment_offsets)[2], + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_hypersparse_or_low_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first + (*key_segment_offsets)[2], + edge_partition_frontier_key_first + (*key_segment_offsets)[3], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } + if (edge_partition.dcs_nzd_vertex_count() && + ((*key_segment_offsets)[4] - (*key_segment_offsets)[3] > 0)) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[3 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_thread_t update_grid((*key_segment_offsets)[4] - (*key_segment_offsets)[3], + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_hypersparse_or_low_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first + (*key_segment_offsets)[3], + edge_partition_frontier_key_first + (*key_segment_offsets)[4], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } + } else { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[0 % stream_pool_size]) + : handle.get_stream(); + + auto frontier_size = static_cast( + thrust::distance(edge_partition_frontier_key_first, edge_partition_frontier_key_last)); + if (frontier_size > 0) { + raft::grid_1d_thread_t update_grid(frontier_size, + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + + extract_transform_v_frontier_e_hypersparse_or_low_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first, + edge_partition_frontier_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } } } template -std::tuple< - decltype(allocate_optional_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{})), - decltype(allocate_optional_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> +std::tuple, + optional_dataframe_buffer_type_t> extract_transform_v_frontier_e(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& frontier, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, @@ -607,7 +653,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, { using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; - using key_t = typename VertexFrontierBucketType::key_type; + using key_t = typename KeyBucketType::key_type; using output_key_t = OutputKeyT; using output_value_t = OutputValueT; @@ -653,6 +699,9 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, thrust::optional, thrust::optional>>>); + constexpr bool try_bitmap 
= GraphViewType::is_multi_gpu && std::is_same_v && + KeyBucketType::is_sorted_unique; + if (do_expensive_check) { auto frontier_vertex_first = thrust_tuple_get_or_identity(frontier.begin()); @@ -673,10 +722,15 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, "Invalid input argument: frontier includes out-of-range keys."); } + [[maybe_unused]] constexpr auto max_segments = + detail::num_sparse_segments_per_vertex_partition + size_t{1}; + + // 1. pre-process frontier data + auto frontier_key_first = frontier.begin(); auto frontier_key_last = frontier.end(); auto frontier_keys = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - if constexpr (!VertexFrontierBucketType::is_sorted_unique) { + if constexpr (!KeyBucketType::is_sorted_unique) { resize_dataframe_buffer(frontier_keys, frontier.size(), handle.get_stream()); thrust::copy(handle.get_thrust_policy(), frontier_key_first, @@ -689,209 +743,708 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, frontier_key_last = get_dataframe_buffer_end(frontier_keys); } - // 1. fill the buffers + std::optional> key_segment_offsets{std::nullopt}; + { // drop zero degree vertices & compute key_segment_offsets + size_t partition_idx{0}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + partition_idx = static_cast(minor_comm.get_rank()); + } + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + if (segment_offsets) { + if (thrust::distance(frontier_key_first, frontier_key_last) > 0) { + key_segment_offsets = compute_key_segment_offsets( + frontier_key_first, + frontier_key_last, + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); + (*key_segment_offsets).back() = *((*key_segment_offsets).rbegin() + 1); + frontier_key_last = frontier_key_first + (*key_segment_offsets).back(); + } else { + key_segment_offsets = std::vector((*segment_offsets).size(), 0); + } + } + } + + // 2. compute local max_pushes - auto key_buffer = - allocate_optional_dataframe_buffer(size_t{0}, handle.get_stream()); - auto value_buffer = - allocate_optional_dataframe_buffer(size_t{0}, handle.get_stream()); - rmm::device_scalar buffer_idx(size_t{0}, handle.get_stream()); + size_t local_max_pushes{}; + { + size_t partition_idx{}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + partition_idx = static_cast(minor_comm_rank); + } + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto frontier_major_first = + thrust_tuple_get_or_identity(frontier_key_first); + auto frontier_major_last = + thrust_tuple_get_or_identity(frontier_key_last); + // for an edge-masked graph, we can pass edge mask to compute tighter bound (at the expense of + // additional computing) + local_max_pushes = edge_partition.compute_number_of_edges( + frontier_major_first, frontier_major_last, handle.get_stream()); + } + + // 3. 
communication over minor_comm std::vector local_frontier_sizes{}; + std::conditional_t, std::byte /* dummy */> + max_tmp_buffer_sizes{}; + std::conditional_t, std::byte /* dummy */> + tmp_buffer_size_per_loop_approximations{}; + std::conditional_t, std::byte /* dummy */> + local_frontier_range_firsts{}; + std::conditional_t, std::byte /* dummy */> + local_frontier_range_lasts{}; + std::optional>> key_segment_offset_vectors{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - local_frontier_sizes = host_scalar_allgather( - minor_comm, - static_cast(thrust::distance(frontier_key_first, frontier_key_last)), - handle.get_stream()); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + auto max_tmp_buffer_size = + static_cast(static_cast(handle.get_device_properties().totalGlobalMem) * 0.2); + size_t approx_tmp_buffer_size_per_loop{}; + { + size_t key_size{0}; + if constexpr (std::is_arithmetic_v) { + key_size = sizeof(key_t); + } else { + key_size = cugraph::sum_thrust_tuple_element_sizes(); + } + size_t output_key_size{0}; + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + output_key_size = sizeof(output_key_t); + } else { + output_key_size = cugraph::sum_thrust_tuple_element_sizes(); + } + } + size_t output_value_size{0}; + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + output_value_size = sizeof(output_value_t); + } else { + output_value_size = cugraph::sum_thrust_tuple_element_sizes(); + } + } + approx_tmp_buffer_size_per_loop = + static_cast(thrust::distance(frontier_key_first, frontier_key_last)) * key_size + + local_max_pushes * (output_key_size + output_value_size); + } + + size_t num_scalars = + 3; // local_frontier_size, max_tmp_buffer_size, approx_tmp_buffer_size_per_loop + if constexpr (try_bitmap) { + num_scalars += 2; // local_frontier_range_first, local_frontier_range_last + } + if (key_segment_offsets) { num_scalars += (*key_segment_offsets).size(); } + rmm::device_uvector d_aggregate_tmps(minor_comm_size * num_scalars, + handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + d_aggregate_tmps.begin() + num_scalars * minor_comm_rank, + d_aggregate_tmps.begin() + (num_scalars * minor_comm_rank + (try_bitmap ? 
5 : 3)), + [frontier_key_first, + max_tmp_buffer_size, + approx_tmp_buffer_size_per_loop, + v_list_size = static_cast(thrust::distance(frontier_key_first, frontier_key_last)), + vertex_partition_range_first = + graph_view.local_vertex_partition_range_first()] __device__(size_t i) { + if (i == 0) { + return v_list_size; + } else if (i == 1) { + return max_tmp_buffer_size; + } else if (i == 2) { + return approx_tmp_buffer_size_per_loop; + } + if constexpr (try_bitmap) { + if (i == 3) { + vertex_t first{}; + if (v_list_size > 0) { + first = *frontier_key_first; + } else { + first = vertex_partition_range_first; + } + assert(static_cast(static_cast(first)) == first); + return static_cast(first); + } else if (i == 4) { + assert(i == 4); + vertex_t last{}; + if (v_list_size > 0) { + last = *(frontier_key_first + (v_list_size - 1)) + 1; + } else { + last = vertex_partition_range_first; + } + assert(static_cast(static_cast(last)) == last); + return static_cast(last); + } + } + assert(false); + return size_t{0}; + }); + if (key_segment_offsets) { + raft::update_device( + d_aggregate_tmps.data() + (minor_comm_rank * num_scalars + (try_bitmap ? 5 : 3)), + (*key_segment_offsets).data(), + (*key_segment_offsets).size(), + handle.get_stream()); + } + + if (minor_comm_size > 1) { + device_allgather(minor_comm, + d_aggregate_tmps.data() + minor_comm_rank * num_scalars, + d_aggregate_tmps.data(), + num_scalars, + handle.get_stream()); + } + + std::vector h_aggregate_tmps(d_aggregate_tmps.size()); + raft::update_host(h_aggregate_tmps.data(), + d_aggregate_tmps.data(), + d_aggregate_tmps.size(), + handle.get_stream()); + handle.sync_stream(); + local_frontier_sizes = std::vector(minor_comm_size); + max_tmp_buffer_sizes = std::vector(minor_comm_size); + tmp_buffer_size_per_loop_approximations = std::vector(minor_comm_size); + if constexpr (try_bitmap) { + local_frontier_range_firsts = std::vector(minor_comm_size); + local_frontier_range_lasts = std::vector(minor_comm_size); + } + if (key_segment_offsets) { + key_segment_offset_vectors = std::vector>{}; + (*key_segment_offset_vectors).reserve(minor_comm_size); + } + for (int i = 0; i < minor_comm_size; ++i) { + local_frontier_sizes[i] = h_aggregate_tmps[i * num_scalars]; + max_tmp_buffer_sizes[i] = h_aggregate_tmps[i * num_scalars + 1]; + tmp_buffer_size_per_loop_approximations[i] = h_aggregate_tmps[i * num_scalars + 2]; + if constexpr (try_bitmap) { + local_frontier_range_firsts[i] = + static_cast(h_aggregate_tmps[i * num_scalars + 3]); + local_frontier_range_lasts[i] = + static_cast(h_aggregate_tmps[i * num_scalars + 4]); + } + if (key_segment_offsets) { + (*key_segment_offset_vectors) + .emplace_back(h_aggregate_tmps.begin() + (i * num_scalars + (try_bitmap ? 5 : 3)), + h_aggregate_tmps.begin() + + (i * num_scalars + (try_bitmap ? 
5 : 3) + (*key_segment_offsets).size())); + } + } } else { local_frontier_sizes = std::vector{static_cast( static_cast(thrust::distance(frontier_key_first, frontier_key_last)))}; + if (key_segment_offsets) { + key_segment_offset_vectors = std::vector>(1); + (*key_segment_offset_vectors)[0] = *key_segment_offsets; + } + } + + // update frontier bitmap (used to reduce broadcast bandwidth size) + + bool v_compressible{false}; + std:: + conditional_t>, std::byte /* dummy */> + frontier_bitmap{}; + std:: + conditional_t>, std::byte /* dummy */> + compressed_frontier{}; + if constexpr (try_bitmap) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + auto const minor_comm_rank = minor_comm.get_rank(); + + if constexpr (sizeof(vertex_t) == 8) { + vertex_t local_frontier_max_range_size{0}; + for (int i = 0; i < minor_comm_size; ++i) { + auto range_size = local_frontier_range_lasts[i] - local_frontier_range_firsts[i]; + local_frontier_max_range_size = std::max(range_size, local_frontier_max_range_size); + } + if (local_frontier_max_range_size <= + std::numeric_limits::max()) { // broadcast 32 bit offset values instead of 64 + // bit vertex IDs + v_compressible = true; + } + } + + double avg_fill_ratio{0.0}; + for (int i = 0; i < minor_comm_size; ++i) { + auto num_keys = static_cast(local_frontier_sizes[i]); + auto range_size = local_frontier_range_lasts[i] - local_frontier_range_firsts[i]; + avg_fill_ratio += + (range_size > 0) ? (num_keys / static_cast(range_size)) : double{0.0}; + } + avg_fill_ratio /= static_cast(minor_comm_size); + constexpr double threshold_ratio = + 8.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); + auto avg_frontier_size = + std::reduce(local_frontier_sizes.begin(), local_frontier_sizes.end()) / + static_cast(minor_comm_size); + + if ((avg_fill_ratio > threshold_ratio) && + (static_cast(avg_frontier_size) > + packed_bools_per_word() * + 32 /* tuning parameter, to consider additional kernel launch overhead */)) { + frontier_bitmap = + compute_vertex_list_bitmap_info(frontier_key_first, + frontier_key_last, + local_frontier_range_firsts[minor_comm_rank], + local_frontier_range_lasts[minor_comm_rank], + handle.get_stream()); + } else if (v_compressible) { + rmm::device_uvector tmps(local_frontier_sizes[minor_comm_rank], + handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + frontier_key_first, + frontier_key_last, + tmps.begin(), + cuda::proclaim_return_type( + [range_first = local_frontier_range_firsts[minor_comm_rank]] __device__( + auto v) { return static_cast(v - range_first); })); + compressed_frontier = std::move(tmps); + } + } + } + + // set-up stream ppol + + std::optional> stream_pool_indices{std::nullopt}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + auto max_tmp_buffer_size = + std::reduce(max_tmp_buffer_sizes.begin(), max_tmp_buffer_sizes.end()) / + static_cast(minor_comm_size); + auto approx_tmp_buffer_size_per_loop = + std::reduce(tmp_buffer_size_per_loop_approximations.begin(), + tmp_buffer_size_per_loop_approximations.end()) / + static_cast(minor_comm_size); + size_t num_streams_per_loop{1}; + if (graph_view.local_vertex_partition_segment_offsets() && + (handle.get_stream_pool_size() >= max_segments)) { + num_streams_per_loop = std::max( + std::min(size_t{8} / 
graph_view.number_of_local_edge_partitions(), max_segments),
+        size_t{
+          1});  // Note that "CUDA_DEVICE_MAX_CONNECTIONS (default: 8, can be set to [1, 32])" sets
+                // the number of queues, if the total number of streams exceeds this number, jobs on
+                // different streams can be sent to one queue leading to false dependency. Setting
+                // num_concurrent_loops above the number of queues has some benefits in NCCL
+                // communications but creating too many streams just for compute may not help.
+    }
+    stream_pool_indices = init_stream_pool_indices(max_tmp_buffer_size,
+                                                   approx_tmp_buffer_size_per_loop,
+                                                   graph_view.number_of_local_edge_partitions(),
+                                                   num_streams_per_loop,
+                                                   handle.get_stream_pool_size());
+    if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; }
+  }
+
+  size_t num_concurrent_loops{1};
+  std::optional> loop_stream_pool_indices{
+    std::nullopt};  // first num_concurrent_loops streams from stream_pool_indices
+  if (stream_pool_indices) {
+    num_concurrent_loops =
+      std::min(graph_view.number_of_local_edge_partitions(), (*stream_pool_indices).size());
+    loop_stream_pool_indices = std::vector(num_concurrent_loops);
+    std::iota((*loop_stream_pool_indices).begin(), (*loop_stream_pool_indices).end(), size_t{0});
+  }
+  rmm::device_uvector counters(num_concurrent_loops, handle.get_stream());
+
+  if constexpr (!GraphViewType::is_multi_gpu) {
+    if (loop_stream_pool_indices) { handle.sync_stream(); }
+  }
+
+  // 4. fill the buffers
+
+  std::vector> key_buffers{};
+  std::vector> value_buffers{};
+  key_buffers.reserve(graph_view.number_of_local_edge_partitions());
+  value_buffers.reserve(graph_view.number_of_local_edge_partitions());
+  auto edge_mask_view = graph_view.edge_mask_view();
-  for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
-    auto edge_partition =
-      edge_partition_device_view_t(
-        graph_view.local_edge_partition_view(i));
-    auto edge_partition_e_mask =
-      edge_mask_view
-        ?
thrust::make_optional< - detail::edge_partition_edge_property_device_view_t>( - *edge_mask_view, i) - : thrust::nullopt; - - auto edge_partition_frontier_key_buffer = - allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - vertex_t edge_partition_frontier_size = static_cast(local_frontier_sizes[i]); - auto edge_partition_frontier_key_first = frontier_key_first; - auto edge_partition_frontier_key_last = frontier_key_last; + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i += num_concurrent_loops) { + auto loop_count = + std::min(num_concurrent_loops, graph_view.number_of_local_edge_partitions() - i); + + std::conditional_t< + GraphViewType::is_multi_gpu, + std::conditional_t< + try_bitmap, + std::vector, rmm::device_uvector>>, + std::vector>>, + std::byte /* dummy */> + edge_partition_key_buffers{}; if constexpr (GraphViewType::is_multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + edge_partition_key_buffers.reserve(loop_count); + + std::conditional_t>>, + std::byte /* dummy */> + edge_partition_bitmap_buffers{}; + if constexpr (try_bitmap) { + if (frontier_bitmap) { + edge_partition_bitmap_buffers = std::vector>{}; + (*edge_partition_bitmap_buffers).reserve(loop_count); + } + } - resize_dataframe_buffer( - edge_partition_frontier_key_buffer, edge_partition_frontier_size, handle.get_stream()); - - device_bcast(minor_comm, - frontier_key_first, - get_dataframe_buffer_begin(edge_partition_frontier_key_buffer), - edge_partition_frontier_size, - static_cast(i), - handle.get_stream()); - - edge_partition_frontier_key_first = - get_dataframe_buffer_begin(edge_partition_frontier_key_buffer); - edge_partition_frontier_key_last = - get_dataframe_buffer_end(edge_partition_frontier_key_buffer); - } - - auto edge_partition_frontier_major_first = - thrust_tuple_get_or_identity( - edge_partition_frontier_key_first); - auto edge_partition_frontier_major_last = - thrust_tuple_get_or_identity( - edge_partition_frontier_key_last); - - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - auto max_pushes = max_one_e_per_frontier_key ? 
edge_partition_frontier_size - : edge_partition.compute_number_of_edges( - edge_partition_frontier_major_first, - edge_partition_frontier_major_last, - handle.get_stream()); - - auto new_buffer_size = buffer_idx.value(handle.get_stream()) + max_pushes; - resize_optional_dataframe_buffer( - key_buffer, new_buffer_size, handle.get_stream()); - resize_optional_dataframe_buffer( - value_buffer, new_buffer_size, handle.get_stream()); - - edge_partition_src_input_device_view_t edge_partition_src_value_input{}; - edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; - if constexpr (GraphViewType::is_storage_transposed) { - edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input); - edge_partition_dst_value_input = - edge_partition_dst_input_device_view_t(edge_dst_value_input, i); - } else { - edge_partition_src_value_input = - edge_partition_src_input_device_view_t(edge_src_value_input, i); - edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + bool use_bitmap_buffer = false; + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { + (*edge_partition_bitmap_buffers) + .emplace_back(packed_bool_size(local_frontier_range_lasts[partition_idx] - + local_frontier_range_firsts[partition_idx]), + handle.get_stream()); + use_bitmap_buffer = true; + } + } + if (!use_bitmap_buffer) { + bool allocated{false}; + if constexpr (try_bitmap) { + if (v_compressible) { + edge_partition_key_buffers.push_back(rmm::device_uvector( + local_frontier_sizes[partition_idx], handle.get_stream())); + allocated = true; + } + } + if (!allocated) { + edge_partition_key_buffers.push_back(allocate_dataframe_buffer( + local_frontier_sizes[partition_idx], handle.get_stream())); + } + } + } + + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + if constexpr (try_bitmap) { + if (frontier_bitmap) { + device_bcast(minor_comm, + (*frontier_bitmap).data(), + get_dataframe_buffer_begin((*edge_partition_bitmap_buffers)[j]), + size_dataframe_buffer((*edge_partition_bitmap_buffers)[j]), + static_cast(partition_idx), + handle.get_stream()); + } else if (compressed_frontier) { + device_bcast(minor_comm, + (*compressed_frontier).data(), + get_dataframe_buffer_begin(std::get<0>(edge_partition_key_buffers[j])), + local_frontier_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } else { + device_bcast(minor_comm, + frontier_key_first, + get_dataframe_buffer_begin(std::get<1>(edge_partition_key_buffers[j])), + local_frontier_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } + } else { + device_bcast(minor_comm, + frontier_key_first, + get_dataframe_buffer_begin(edge_partition_key_buffers[j]), + local_frontier_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } + } + device_group_end(minor_comm); + if (loop_stream_pool_indices) { handle.sync_stream(); } + + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + std::variant, rmm::device_uvector> keys = + rmm::device_uvector(0, loop_stream); + if (v_compressible) { + std::get<0>(keys).resize(local_frontier_sizes[partition_idx], loop_stream); + } else { + keys = + rmm::device_uvector(local_frontier_sizes[partition_idx], loop_stream); + } + + auto& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + + auto range_first = local_frontier_range_firsts[partition_idx]; + auto range_last = local_frontier_range_lasts[partition_idx]; + if (keys.index() == 0) { + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + get_dataframe_buffer_begin(std::get<0>(keys)), + raft::device_span(counters.data() + j, + size_t{1}), // dummy, we already know the counts + uint32_t{0}, + static_cast(range_last - range_first), + loop_stream); + } else { + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + get_dataframe_buffer_begin(std::get<1>(keys)), + raft::device_span(counters.data() + j, + size_t{1}), // dummy, we already know the counts + range_first, + range_last, + loop_stream); + } + + edge_partition_key_buffers.push_back(std::move(keys)); + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + (*edge_partition_bitmap_buffers).clear(); + } + } } - auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, i); - if (segment_offsets) { - static_assert(num_sparse_segments_per_vertex_partition == 3); - std::vector h_thresholds(num_sparse_segments_per_vertex_partition + - (graph_view.use_dcs() ? 1 : 0) - 1); - h_thresholds[0] = edge_partition.major_range_first() + (*segment_offsets)[1]; - h_thresholds[1] = edge_partition.major_range_first() + (*segment_offsets)[2]; - if (graph_view.use_dcs()) { - h_thresholds[2] = edge_partition.major_range_first() + (*segment_offsets)[3]; + std::vector> output_key_buffers{}; + output_key_buffers.reserve(loop_count); + std::vector> output_value_buffers{}; + output_value_buffers.reserve(loop_count); + std::vector edge_partition_max_push_counts(loop_count); + + std::optional>> + high_segment_key_local_degree_offset_vectors{std::nullopt}; + std::optional> high_segment_edge_counts{std::nullopt}; + if (key_segment_offset_vectors) { + high_segment_key_local_degree_offset_vectors = std::vector>{}; + (*high_segment_key_local_degree_offset_vectors).reserve(loop_count); + high_segment_edge_counts = std::vector(loop_count); + } + + edge_partition_max_push_counts[0] = local_max_pushes; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + if (static_cast(partition_idx) != minor_comm_rank) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + + auto const& keys = edge_partition_key_buffers[j]; + + bool computed{false}; + if constexpr (try_bitmap) { + if (keys.index() == 0) { + auto major_first = thrust::make_transform_iterator( + std::get<0>(keys).begin(), + cuda::proclaim_return_type( + [range_first = + local_frontier_range_firsts[partition_idx]] __device__(uint32_t v_offset) { + return range_first + static_cast(v_offset); + })); + edge_partition.compute_number_of_edges_async( + major_first, + major_first + std::get<0>(keys).size(), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + computed = true; + } + } + if (!computed) { + dataframe_buffer_const_iterator_type_t key_first{}; + size_t num_keys{}; + if constexpr (try_bitmap) { + assert(keys.index() == 1); + key_first = get_dataframe_buffer_begin(std::get<1>(keys)); + num_keys = std::get<1>(keys).size(); + } else { + key_first = get_dataframe_buffer_begin(keys); + num_keys = size_dataframe_buffer(keys); + } + auto major_first = thrust_tuple_get_or_identity(key_first); + edge_partition.compute_number_of_edges_async( + major_first, + major_first + num_keys, + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + raft::update_host( + edge_partition_max_push_counts.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); + if (static_cast(minor_comm_rank / num_concurrent_loops) == + (i / num_concurrent_loops)) { + edge_partition_max_push_counts[minor_comm_rank % num_concurrent_loops] = local_max_pushes; + } } - rmm::device_uvector d_thresholds(h_thresholds.size(), handle.get_stream()); - raft::update_device( - d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), handle.get_stream()); - rmm::device_uvector d_offsets(d_thresholds.size(), handle.get_stream()); - thrust::lower_bound(handle.get_thrust_policy(), - edge_partition_frontier_major_first, - edge_partition_frontier_major_last, - d_thresholds.begin(), - d_thresholds.end(), - d_offsets.begin()); - std::vector h_offsets(d_offsets.size()); - raft::update_host(h_offsets.data(), d_offsets.data(), d_offsets.size(), handle.get_stream()); - RAFT_CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - h_offsets.push_back(edge_partition_frontier_size); - // FIXME: we may further improve performance by 1) concurrently running kernels on different - // segments; 2) individually tuning block sizes for different segments; and 3) adding one - // more segment for very high degree vertices and running segmented reduction - if (h_offsets[0] > 0) { - raft::grid_1d_block_t update_grid(h_offsets[0], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_high_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first, - edge_partition_frontier_key_first + h_offsets[0], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), - e_op); + } + + if (key_segment_offset_vectors) { + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + 
j; + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + rmm::device_uvector high_segment_key_local_degree_offsets( + key_segment_offsets[1] + 1, loop_stream); + high_segment_key_local_degree_offsets.set_element_to_zero_async(0, loop_stream); + bool computed{false}; + if constexpr (try_bitmap) { + auto const& keys = edge_partition_key_buffers[j]; + if (keys.index() == 0) { + auto key_local_degree_first = thrust::make_transform_iterator( + std::get<0>(keys).begin(), + cuda::proclaim_return_type( + [edge_partition, + range_first = + local_frontier_range_firsts[partition_idx]] __device__(uint32_t v_offset) { + auto major = range_first + static_cast(v_offset); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + return static_cast(edge_partition.local_degree(major_offset)); + })); + thrust::inclusive_scan(rmm::exec_policy_nosync(loop_stream), + key_local_degree_first, + key_local_degree_first + key_segment_offsets[1], + high_segment_key_local_degree_offsets.begin() + 1); + computed = true; + } + } + if (!computed) { + auto key_first = frontier_key_first; + if constexpr (GraphViewType::is_multi_gpu) { + auto const& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + assert(keys.index() == 1); + key_first = get_dataframe_buffer_begin(std::get<1>(keys)); + } else { + key_first = get_dataframe_buffer_begin(keys); + } + } + auto key_local_degree_first = thrust::make_transform_iterator( + key_first, cuda::proclaim_return_type([edge_partition] __device__(auto key) { + auto major = thrust_tuple_get_or_identity(key); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + return static_cast(edge_partition.local_degree(major_offset)); + })); + thrust::inclusive_scan(rmm::exec_policy_nosync(loop_stream), + key_local_degree_first, + key_local_degree_first + key_segment_offsets[1], + high_segment_key_local_degree_offsets.begin() + 1); + } + raft::update_host((*high_segment_edge_counts).data() + j, + high_segment_key_local_degree_offsets.data() + key_segment_offsets[1], + 1, + loop_stream); + (*high_segment_key_local_degree_offset_vectors) + .push_back(std::move(high_segment_key_local_degree_offsets)); } - if (h_offsets[1] - h_offsets[0] > 0) { - raft::grid_1d_warp_t update_grid(h_offsets[1] - h_offsets[0], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_mid_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first + h_offsets[0], - edge_partition_frontier_key_first + h_offsets[1], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), - e_op); + + // to ensure that *high_segment_edge_counts[] is valid + if (loop_stream_pool_indices) { + handle.sync_stream_pool(*loop_stream_pool_indices); + } else { + handle.sync_stream(); } - if (h_offsets[2] - h_offsets[1] > 0) { - raft::grid_1d_thread_t update_grid(h_offsets[2] - h_offsets[1], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - 
extract_transform_v_frontier_e_hypersparse_or_low_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first + h_offsets[1], - edge_partition_frontier_key_first + h_offsets[2], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), - e_op); + } + + for (size_t j = 0; j < loop_count; ++j) { + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + output_key_buffers.push_back(allocate_optional_dataframe_buffer( + edge_partition_max_push_counts[j], loop_stream)); + output_value_buffers.push_back(allocate_optional_dataframe_buffer( + edge_partition_max_push_counts[j], loop_stream)); + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + + thrust::fill( + handle.get_thrust_policy(), counters.begin(), counters.begin() + loop_count, size_t{0}); + if (loop_stream_pool_indices) { handle.sync_stream(); } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, partition_idx) + : thrust::nullopt; + size_t num_streams_per_loop{1}; + if (stream_pool_indices) { + assert((*stream_pool_indices).size() >= num_concurrent_loops); + num_streams_per_loop = (*stream_pool_indices).size() / num_concurrent_loops; } - if (edge_partition.dcs_nzd_vertex_count() && (h_offsets[3] - h_offsets[2] > 0)) { - raft::grid_1d_thread_t update_grid(h_offsets[3] - h_offsets[2], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_hypersparse_or_low_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first + h_offsets[2], - edge_partition_frontier_key_first + h_offsets[3], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), - e_op); + auto edge_partition_stream_pool_indices = + stream_pool_indices + ? 
std::make_optional>( + (*stream_pool_indices).data() + j * num_streams_per_loop, num_streams_per_loop) + : std::nullopt; + + edge_partition_src_input_device_view_t edge_partition_src_value_input{}; + edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; + if constexpr (GraphViewType::is_storage_transposed) { + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input, partition_idx); + } else { + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input, partition_idx); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input); } - } else { - if (edge_partition_frontier_size > 0) { - raft::grid_1d_thread_t update_grid(edge_partition_frontier_size, - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - - extract_transform_v_frontier_e_hypersparse_or_low_degree - <<>>( + auto edge_partition_e_value_input = + edge_partition_e_input_device_view_t(edge_value_input, partition_idx); + + bool computed{false}; + if constexpr (try_bitmap) { + auto const& keys = edge_partition_key_buffers[j]; + if (keys.index() == 0) { + auto edge_partition_frontier_key_first = thrust::make_transform_iterator( + std::get<0>(keys).begin(), + cuda::proclaim_return_type( + [range_first = local_frontier_range_firsts[partition_idx]] __device__( + uint32_t v_offset) { return range_first + static_cast(v_offset); })); + auto edge_partition_frontier_key_last = + edge_partition_frontier_key_first + std::get<0>(keys).size(); + extract_transform_v_frontier_e_edge_partition( + handle, edge_partition, edge_partition_frontier_key_first, edge_partition_frontier_key_last, @@ -899,24 +1452,150 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, edge_partition_dst_value_input, edge_partition_e_value_input, edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), - e_op); + get_optional_dataframe_buffer_begin(output_key_buffers[j]), + get_optional_dataframe_buffer_begin(output_value_buffers[j]), + raft::device_span(counters.data() + j, size_t{1}), + e_op, + high_segment_key_local_degree_offset_vectors + ? std::make_optional>( + (*high_segment_key_local_degree_offset_vectors)[j].data(), + (*high_segment_key_local_degree_offset_vectors)[j].size()) + : std::nullopt, + high_segment_edge_counts ? std::make_optional((*high_segment_edge_counts)[j]) + : std::nullopt, + key_segment_offset_vectors ? 
std::make_optional>( + (*key_segment_offset_vectors)[partition_idx].data(), + (*key_segment_offset_vectors)[partition_idx].size()) + : std::nullopt, + edge_partition_stream_pool_indices); + computed = true; + } + } + if (!computed) { + auto edge_partition_frontier_key_first = frontier_key_first; + auto edge_partition_frontier_key_last = frontier_key_last; + if constexpr (GraphViewType::is_multi_gpu) { + auto const& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + assert(keys.index() == 1); + edge_partition_frontier_key_first = std::get<1>(keys).begin(); + edge_partition_frontier_key_last = std::get<1>(keys).end(); + } else { + edge_partition_frontier_key_first = get_dataframe_buffer_begin(keys); + edge_partition_frontier_key_last = get_dataframe_buffer_end(keys); + } + } + + extract_transform_v_frontier_e_edge_partition( + handle, + edge_partition, + edge_partition_frontier_key_first, + edge_partition_frontier_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + get_optional_dataframe_buffer_begin(output_key_buffers[j]), + get_optional_dataframe_buffer_begin(output_value_buffers[j]), + raft::device_span(counters.data() + j, size_t{1}), + e_op, + high_segment_key_local_degree_offset_vectors + ? std::make_optional>( + (*high_segment_key_local_degree_offset_vectors)[j].data(), + (*high_segment_key_local_degree_offset_vectors)[j].size()) + : std::nullopt, + high_segment_edge_counts ? std::make_optional((*high_segment_edge_counts)[j]) + : std::nullopt, + key_segment_offset_vectors ? std::make_optional>( + (*key_segment_offset_vectors)[partition_idx].data(), + (*key_segment_offset_vectors)[partition_idx].size()) + : std::nullopt, + edge_partition_stream_pool_indices); } } - } - // 2. resize and return the buffers + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } - auto new_buffer_size = buffer_idx.value(handle.get_stream()); + std::vector h_counts(loop_count); + raft::update_host(h_counts.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); - resize_optional_dataframe_buffer(key_buffer, new_buffer_size, handle.get_stream()); - shrink_to_fit_optional_dataframe_buffer(key_buffer, handle.get_stream()); + for (size_t j = 0; j < loop_count; ++j) { + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); - resize_optional_dataframe_buffer( - value_buffer, new_buffer_size, handle.get_stream()); - shrink_to_fit_optional_dataframe_buffer(value_buffer, handle.get_stream()); + auto tmp_buffer_size = h_counts[j]; + if (tmp_buffer_size > 0) { + auto& tmp_key_buffer = output_key_buffers[j]; + auto& tmp_value_buffer = output_value_buffers[j]; + + resize_optional_dataframe_buffer( + tmp_key_buffer, tmp_buffer_size, loop_stream); + // skip shrink_to_fit before return to cut execution time + + resize_optional_dataframe_buffer( + tmp_value_buffer, tmp_buffer_size, loop_stream); + // skip shrink_to_fit before return to cut execution time + + key_buffers.push_back(std::move(tmp_key_buffer)); + value_buffers.push_back(std::move(tmp_value_buffer)); + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + } + + // 3. 
concatenate and return the buffers + + auto key_buffer = allocate_optional_dataframe_buffer(0, handle.get_stream()); + auto value_buffer = allocate_optional_dataframe_buffer(0, handle.get_stream()); + if (key_buffers.size() == 0) { + /* nothing to do */ + } else if (key_buffers.size() == 1) { + key_buffer = std::move(key_buffers[0]); + value_buffer = std::move(value_buffers[0]); + shrink_to_fit_optional_dataframe_buffer(key_buffer, handle.get_stream()); + shrink_to_fit_optional_dataframe_buffer(value_buffer, handle.get_stream()); + } else { + std::vector buffer_sizes(key_buffers.size()); + static_assert(!std::is_same_v || !std::is_same_v); + for (size_t i = 0; i < key_buffers.size(); ++i) { + if constexpr (!std::is_same_v) { + buffer_sizes[i] = size_optional_dataframe_buffer(key_buffers[i]); + } else { + buffer_sizes[i] = size_optional_dataframe_buffer(value_buffers[i]); + } + } + auto buffer_size = std::reduce(buffer_sizes.begin(), buffer_sizes.end()); + resize_optional_dataframe_buffer(key_buffer, buffer_size, handle.get_stream()); + resize_optional_dataframe_buffer( + value_buffer, buffer_size, handle.get_stream()); + std::vector buffer_displacements(buffer_sizes.size()); + std::exclusive_scan( + buffer_sizes.begin(), buffer_sizes.end(), buffer_displacements.begin(), size_t{0}); + handle.sync_stream(); + for (size_t i = 0; i < key_buffers.size(); ++i) { + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[i]) + : handle.get_stream(); + if constexpr (!std::is_same_v) { + thrust::copy( + rmm::exec_policy_nosync(loop_stream), + get_optional_dataframe_buffer_cbegin(key_buffers[i]), + get_optional_dataframe_buffer_cend(key_buffers[i]), + get_optional_dataframe_buffer_begin(key_buffer) + buffer_displacements[i]); + } + + if constexpr (!std::is_same_v) { + thrust::copy(rmm::exec_policy_nosync(loop_stream), + get_optional_dataframe_buffer_cbegin(value_buffers[i]), + get_optional_dataframe_buffer_cend(value_buffers[i]), + get_optional_dataframe_buffer_begin(value_buffer) + + buffer_displacements[i]); + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + } return std::make_tuple(std::move(key_buffer), std::move(value_buffer)); } diff --git a/cpp/src/prims/detail/multi_stream_utils.cuh b/cpp/src/prims/detail/multi_stream_utils.cuh new file mode 100644 index 0000000000..76ef3fb0de --- /dev/null +++ b/cpp/src/prims/detail/multi_stream_utils.cuh @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +#include +#include + +#include +#include +#include + +#include + +#include +#include + +namespace cugraph { + +namespace detail { + +inline std::vector init_stream_pool_indices(size_t max_tmp_buffer_size, + size_t approx_tmp_buffer_size_per_loop, + size_t loop_count, + size_t num_streams_per_loop, + size_t max_streams) +{ + size_t num_streams = std::min(loop_count * num_streams_per_loop, + raft::round_down_safe(max_streams, num_streams_per_loop)); + + auto num_concurrent_loops = + (approx_tmp_buffer_size_per_loop > 0) + ? std::max(max_tmp_buffer_size / approx_tmp_buffer_size_per_loop, size_t{1}) + : loop_count; + num_streams = std::min(num_concurrent_loops * num_streams_per_loop, num_streams); + + std::vector stream_pool_indices(num_streams); + std::iota(stream_pool_indices.begin(), stream_pool_indices.end(), size_t{0}); + + return stream_pool_indices; +} + +// this assumes that the caller already knows how many items will be copied. +template +void copy_if_nosync(InputIterator input_first, + InputIterator input_last, + FlagIterator flag_first, + OutputIterator output_first, + raft::device_span count /* size = 1 */, + rmm::cuda_stream_view stream_view) +{ + CUGRAPH_EXPECTS( + static_cast(thrust::distance(input_first, input_last)) <= + static_cast(std::numeric_limits::max()), + "cugraph::detail::copy_if_nosync relies on cub::DeviceSelect::Flagged which uses int for input " + "size, but thrust::distance(input_first, input_last) exceeds std::numeric_limits::max()."); + + size_t tmp_storage_bytes{0}; + size_t input_size = static_cast(thrust::distance(input_first, input_last)); + + cub::DeviceSelect::Flagged(static_cast(nullptr), + tmp_storage_bytes, + input_first, + flag_first, + output_first, + count.data(), + input_size, + stream_view); + + auto d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, stream_view); + + cub::DeviceSelect::Flagged(d_tmp_storage.data(), + tmp_storage_bytes, + input_first, + flag_first, + output_first, + count.data(), + input_size, + stream_view); +} + +template +void count_nosync(InputIterator input_first, + InputIterator input_last, + raft::device_span count /* size = 1 */, + typename thrust::iterator_traits::value_type value, + rmm::cuda_stream_view stream_view) +{ + CUGRAPH_EXPECTS( + static_cast(thrust::distance(input_first, input_last)) <= + static_cast(std::numeric_limits::max()), + "cugraph::detail::count_nosync relies on cub::DeviceReduce::Sum which uses int for input size, " + "but thrust::distance(input_first, input_last) exceeds std::numeric_limits::max()."); + + size_t tmp_storage_bytes{0}; + size_t input_size = static_cast(thrust::distance(input_first, input_last)); + + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + input_first, + count.data(), + input_size, + stream_view); + + auto d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, stream_view); + + cub::DeviceReduce::Sum( + d_tmp_storage.data(), tmp_storage_bytes, input_first, count.data(), input_size, stream_view); +} + +template +void sum_nosync( + InputIterator input_first, + InputIterator input_last, + raft::device_span::value_type> sum /* size = 1 */, + rmm::cuda_stream_view stream_view) +{ + CUGRAPH_EXPECTS( + static_cast(thrust::distance(input_first, input_last)) <= + static_cast(std::numeric_limits::max()), + "cugraph::detail::count_nosync relies on cub::DeviceReduce::Sum which uses int for input size, " + "but thrust::distance(input_first, input_last) exceeds std::numeric_limits::max()."); + + size_t tmp_storage_bytes{0}; + size_t 
input_size = static_cast(thrust::distance(input_first, input_last)); + + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + input_first, + sum.data(), + input_size, + stream_view); + + auto d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, stream_view); + + cub::DeviceReduce::Sum( + d_tmp_storage.data(), tmp_storage_bytes, input_first, sum.data(), input_size, stream_view); +} + +} // namespace detail + +} // namespace cugraph diff --git a/cpp/src/prims/detail/optional_dataframe_buffer.hpp b/cpp/src/prims/detail/optional_dataframe_buffer.hpp index 87c095f8e8..6657b91f13 100644 --- a/cpp/src/prims/detail/optional_dataframe_buffer.hpp +++ b/cpp/src/prims/detail/optional_dataframe_buffer.hpp @@ -26,152 +26,130 @@ namespace detail { // we cannot use thrust::iterator_traits::value_type if Iterator is void* (reference to // void is not allowed) template -struct optional_dataframe_buffer_value_type_t; +struct optional_dataframe_buffer_iterator_value_type_t; template -struct optional_dataframe_buffer_value_type_t>> { +struct optional_dataframe_buffer_iterator_value_type_t< + Iterator, + std::enable_if_t>> { using value = typename thrust::iterator_traits::value_type; }; template -struct optional_dataframe_buffer_value_type_t>> { +struct optional_dataframe_buffer_iterator_value_type_t< + Iterator, + std::enable_if_t>> { using value = void; }; -template >* = nullptr> -std::byte allocate_optional_dataframe_buffer(size_t size, rmm::cuda_stream_view stream) -{ - return std::byte{0}; // dummy -} - -template >* = nullptr> +template auto allocate_optional_dataframe_buffer(size_t size, rmm::cuda_stream_view stream) { - return allocate_dataframe_buffer(size, stream); + if constexpr (std::is_same_v) { + return std::byte{0}; // dummy + } else { + return allocate_dataframe_buffer(size, stream); + } } -template >* = nullptr> -void* get_optional_dataframe_buffer_begin(std::byte& optional_dataframe_buffer) -{ - return static_cast(nullptr); -} +template +struct optional_dataframe_buffer_type { + using type = decltype(allocate_optional_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{})); +}; -template >* = nullptr> -auto get_optional_dataframe_buffer_begin( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer) -{ - return get_dataframe_buffer_begin(optional_dataframe_buffer); -} +template +using optional_dataframe_buffer_type_t = typename optional_dataframe_buffer_type::type; -template >* = nullptr> -void* get_optional_dataframe_buffer_end(std::byte& optional_dataframe_buffer) +template +auto get_optional_dataframe_buffer_begin( + optional_dataframe_buffer_type_t& optional_dataframe_buffer) { - return static_cast(nullptr); + if constexpr (std::is_same_v) { + return static_cast(nullptr); + } else { + return get_dataframe_buffer_begin(optional_dataframe_buffer); + } } -template >* = nullptr> +template auto get_optional_dataframe_buffer_end( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer) + optional_dataframe_buffer_type_t& optional_dataframe_buffer) { - return get_dataframe_buffer_end(optional_dataframe_buffer); + if constexpr (std::is_same_v) { + return static_cast(nullptr); + } else { + return get_dataframe_buffer_end(optional_dataframe_buffer); + } } -template >* = nullptr> -void const* get_optional_dataframe_buffer_cbegin(std::byte const& optional_dataframe_buffer) -{ - return static_cast(nullptr); -} - -template >* = nullptr> +template auto get_optional_dataframe_buffer_cbegin( - std::decay_t(size_t{0}, 
rmm::cuda_stream_view{}))> const& - optional_dataframe_buffer) + optional_dataframe_buffer_type_t const& optional_dataframe_buffer) { - return get_dataframe_buffer_cbegin(optional_dataframe_buffer); + if constexpr (std::is_same_v) { + return static_cast(nullptr); + } else { + return get_dataframe_buffer_cbegin(optional_dataframe_buffer); + } } -template >* = nullptr> -void const* get_optional_dataframe_buffer_cend(std::byte const& optional_dataframe_buffer) -{ - return static_cast(nullptr); -} - -template >* = nullptr> +template auto get_optional_dataframe_buffer_cend( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))> const& - optional_dataframe_buffer) -{ - return get_dataframe_buffer_cend(optional_dataframe_buffer); -} - -template >* = nullptr> -void reserve_optional_dataframe_buffer(std::byte& optional_dataframe_buffer, - size_t new_buffer_capacity, - rmm::cuda_stream_view stream_view) + optional_dataframe_buffer_type_t const& optional_dataframe_buffer) { - return; + if constexpr (std::is_same_v) { + return static_cast(nullptr); + } else { + return get_dataframe_buffer_cend(optional_dataframe_buffer); + } } -template >* = nullptr> +template void reserve_optional_dataframe_buffer( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer, + optional_dataframe_buffer_type_t& optional_dataframe_buffer, size_t new_buffer_capacity, rmm::cuda_stream_view stream_view) { - return reserve_dataframe_buffer(optional_dataframe_buffer, new_buffer_capacity, stream_view); -} - -template >* = nullptr> -void resize_optional_dataframe_buffer(std::byte& optional_dataframe_buffer, - size_t new_buffer_size, - rmm::cuda_stream_view stream_view) -{ - return; + if constexpr (std::is_same_v) { + return; + } else { + return reserve_dataframe_buffer(optional_dataframe_buffer, new_buffer_capacity, stream_view); + } } -template >* = nullptr> +template void resize_optional_dataframe_buffer( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer, + optional_dataframe_buffer_type_t& optional_dataframe_buffer, size_t new_buffer_size, rmm::cuda_stream_view stream_view) { - return resize_dataframe_buffer(optional_dataframe_buffer, new_buffer_size, stream_view); + if constexpr (std::is_same_v) { + return; + } else { + return resize_dataframe_buffer(optional_dataframe_buffer, new_buffer_size, stream_view); + } } -template >* = nullptr> -void shrink_to_fit_optional_dataframe_buffer(std::byte& optional_dataframe_buffer, - rmm::cuda_stream_view stream_view) -{ - return; -} - -template >* = nullptr> +template void shrink_to_fit_optional_dataframe_buffer( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer, - rmm::cuda_stream_view stream_view) -{ - return shrink_to_fit_dataframe_buffer(optional_dataframe_buffer, stream_view); -} - -template >* = nullptr> -size_t size_optional_dataframe_buffer(std::byte const& optional_dataframe_buffer) + optional_dataframe_buffer_type_t& optional_dataframe_buffer, rmm::cuda_stream_view stream_view) { - return size_t{0}; + if constexpr (std::is_same_v) { + return; + } else { + return shrink_to_fit_dataframe_buffer(optional_dataframe_buffer, stream_view); + } } -template >* = nullptr> +template size_t size_optional_dataframe_buffer( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))> const& - optional_dataframe_buffer) + optional_dataframe_buffer_type_t& optional_dataframe_buffer) { - return size_dataframe_buffer(optional_dataframe_buffer); + if constexpr (std::is_same_v) { + return size_t{0}; + } else { + return 
size_dataframe_buffer(optional_dataframe_buffer); + } } } // namespace detail diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh new file mode 100644 index 0000000000..311b16e71e --- /dev/null +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -0,0 +1,4374 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "detail/graph_partition_utils.cuh" +#include "prims/detail/multi_stream_utils.cuh" +#include "prims/detail/optional_dataframe_buffer.hpp" +#include "prims/detail/prim_functors.cuh" +#include "prims/detail/prim_utils.cuh" +#include "prims/fill_edge_src_dst_property.cuh" +#include "prims/property_op_utils.cuh" +#include "prims/reduce_op.cuh" +#include "prims/vertex_frontier.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cugraph { + +namespace detail { + +int32_t constexpr per_v_transform_reduce_e_kernel_block_size = 512; +int32_t constexpr per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size = 128; + +template +struct iterator_value_type_or_default_t; + +template +struct iterator_value_type_or_default_t>> { + using value_type = default_t; // if Iterator is invalid (void*), value_type = default_t +}; + +template +struct iterator_value_type_or_default_t>> { + using value_type = typename thrust::iterator_traits< + Iterator>::value_type; // if iterator is valid, value_type = typename + // thrust::iterator_traits::value_type +}; + +template +__device__ auto init_pred_op( + edge_partition_device_view_t const& edge_partition, + EdgePartitionSrcValueInputWrapper const& edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper const& edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper const& edge_partition_e_value_input, + PredOp const& pred_op, + key_t key, + typename GraphViewType::vertex_type major_offset, + typename GraphViewType::vertex_type const* indices, + typename GraphViewType::edge_type edge_offset) +{ + if constexpr (std::is_same_v< + PredOp, + const_true_e_op_t>) { + return call_const_true_e_op_t{}; + } else { + return call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset}; + } +} + +template +struct transform_and_atomic_reduce_t { + edge_partition_device_view_t const& edge_partition{}; + vertex_t const* indices{nullptr}; + TransformOp const& transform_op{}; + PredOp const& pred_op{}; + ResultValueOutputIteratorOrWrapper& result_value_output{}; + + __device__ void operator()(edge_t i) const + { + if (pred_op(i)) { + auto 
e_op_result = transform_op(i); + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + if constexpr (multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } +}; + +template +__device__ void update_result_value_output( + edge_partition_device_view_t const& edge_partition, + vertex_t const* indices, + edge_t local_degree, + TransformOp const& transform_op, + result_t init, + ReduceOp const& reduce_op, + PredOp const& pred_op, + size_t output_idx /* relevent only when update_major === true */, + ResultValueOutputIteratorOrWrapper& result_value_output) +{ + if constexpr (update_major) { + result_t val{}; + if constexpr (std::is_same_v>) { + if constexpr (std::is_same_v>) { // init is selected only when no + // edges return a valid value + val = init; + for (edge_t i = 0; i < local_degree; ++i) { + auto tmp = transform_op(i); + val = tmp; + break; + } + } else { + val = thrust::transform_reduce(thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + transform_op, + init, + reduce_op); + } + } else { + val = init; + for (edge_t i = 0; i < local_degree; ++i) { + if (pred_op(i)) { + auto tmp = transform_op(i); + if constexpr (std::is_same_v>) { // init is selected only when + // no edges return a valid + // value + val = tmp; + break; + } else { + val = reduce_op(val, tmp); + } + } + } + } + *(result_value_output + output_idx) = val; + } else { + thrust::for_each(thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + transform_and_atomic_reduce_t{ + edge_partition, indices, transform_op, pred_op, result_value_output}); + } +} + +template +__global__ static void per_v_transform_reduce_e_hypersparse( + edge_partition_device_view_t edge_partition, + OptionalKeyIterator key_first, + OptionalKeyIterator key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper result_value_output, + EdgeOp e_op, + T init /* relevant only if update_major == true */, + ReduceOp reduce_op, + PredOp pred_op) +{ + constexpr bool use_input_key = !std::is_same_v; + static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< + ReduceOp>); // atomic_reduce is defined only when + // has_compatible_raft_comms_op_t is true + static_assert(update_major || !use_input_key); + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = + typename iterator_value_type_or_default_t::value_type; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto idx = static_cast(tid); + + size_t key_count{}; + if constexpr (use_input_key) { + key_count = static_cast(thrust::distance(key_first, key_last)); + } else { + key_count = *(edge_partition.dcs_nzd_vertex_count()); + } + + while (idx < key_count) { + key_t key{}; + vertex_t major{}; + thrust::optional major_idx{}; + if constexpr (use_input_key) { + key = *(key_first + idx); + major = thrust_tuple_get_or_identity(key); + major_idx = edge_partition.major_idx_from_major_nocheck(major); + } else { + key = *(edge_partition.major_from_major_hypersparse_idx_nocheck(static_cast(idx))); + major = key; + 
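        // Note: when no input key list is given, idx enumerates the hypersparse (DCS) rows,
        // which hold only the vertices with at least one local edge
        // (edge_partition.dcs_nzd_vertex_count() of them); hence the compressed row index
        // (major_idx, computed below) generally differs from the plain major_offset
        // (= major - major_range_first) used for the sparse segments.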
auto major_start_offset = static_cast(*(edge_partition.major_hypersparse_first()) - + edge_partition.major_range_first()); + major_idx = major_start_offset + idx; // major_offset != major_idx in the hypersparse region + } + + size_t output_idx = use_input_key ? idx : (major - *(edge_partition).major_hypersparse_first()); + if (major_idx) { + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, edge_offset, local_degree) = + edge_partition.local_edges(static_cast(*major_idx)); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + edge_offset}; + + auto call_pred_op = init_pred_op(edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset); + + if (edge_partition_e_mask) { + update_result_value_output( + edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + [&edge_partition_e_mask, &call_pred_op, edge_offset] __device__(edge_t i) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + return call_pred_op(i); + } else { + return false; + } + }, + output_idx, + result_value_output); + } else { + update_result_value_output(edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + call_pred_op, + output_idx, + result_value_output); + } + } else { + if constexpr (update_major) { *(result_value_output + output_idx) = init; } + } + idx += gridDim.x * blockDim.x; + } +} + +template +__global__ static void per_v_transform_reduce_e_low_degree( + edge_partition_device_view_t edge_partition, + KeyIterator key_first, + KeyIterator key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper result_value_output, + EdgeOp e_op, + T init /* relevant only if update_major == true */, + ReduceOp reduce_op, + PredOp pred_op) +{ + static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< + ReduceOp>); // atomic_reduce is defined only when + // has_compatible_raft_comms_op_t is true + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = typename thrust::iterator_traits::value_type; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto idx = static_cast(tid); + + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + auto major = thrust_tuple_get_or_identity(key); + + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, edge_offset, local_degree) = + edge_partition.local_edges(static_cast(major_offset)); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + edge_offset}; + + auto call_pred_op = init_pred_op(edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, 
+ indices, + edge_offset); + + if (edge_partition_e_mask) { + update_result_value_output( + edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + [&edge_partition_e_mask, &call_pred_op, edge_offset] __device__(edge_t i) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + return call_pred_op(i); + } else { + return false; + } + }, + idx, + result_value_output); + } else { + update_result_value_output(edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + call_pred_op, + idx, + result_value_output); + } + idx += gridDim.x * blockDim.x; + } +} + +template +__global__ static void per_v_transform_reduce_e_mid_degree( + edge_partition_device_view_t edge_partition, + KeyIterator key_first, + KeyIterator key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper result_value_output, + EdgeOp e_op, + T init /* relevant only if update_major == true */, + T identity_element /* relevant only if update_major == true */, + ReduceOp reduce_op, + PredOp pred_op) +{ + static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< + ReduceOp>); // atomic_reduce is defined only when + // has_compatible_raft_comms_op_t is true + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using e_op_result_t = T; + using key_t = typename thrust::iterator_traits::value_type; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + static_assert(per_v_transform_reduce_e_kernel_block_size % raft::warp_size() == 0); + auto const lane_id = tid % raft::warp_size(); + auto idx = static_cast(tid / raft::warp_size()); + + using WarpReduce = cub::WarpReduce< + std::conditional_t>, int32_t, e_op_result_t>>; + [[maybe_unused]] __shared__ + std::conditional_t + temp_storage[update_major ? (per_v_transform_reduce_e_kernel_block_size / raft::warp_size()) + : int32_t{1} /* dummy */]; + + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + auto major = thrust_tuple_get_or_identity(key); + + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + edge_offset}; + + auto call_pred_op = init_pred_op(edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset); + + [[maybe_unused]] std::conditional_t + reduced_e_op_result{}; + [[maybe_unused]] std::conditional_t>, + int32_t, + std::byte /* dummy */> + first_valid_lane_id{}; + if constexpr (update_major) { + reduced_e_op_result = + (lane_id == 0) ? 
init : identity_element; // init == identity_element for reduce_op::any + if constexpr (std::is_same_v>) { + first_valid_lane_id = raft::warp_size(); + } + } + + if (edge_partition_e_mask) { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); + for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { + thrust::optional e_op_result{thrust::nullopt}; + if ((i < static_cast(local_degree)) && + (*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(i)) { + e_op_result = call_e_op(i); + } + first_valid_lane_id = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) + .Reduce(e_op_result ? lane_id : raft::warp_size(), cub::Min()); + first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); + if (lane_id == first_valid_lane_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_lane_id != raft::warp_size()) { break; } + } + } else { + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { + if ((*edge_partition_e_mask).get(edge_offset + i) & call_pred_op(i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } else { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); + for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { + thrust::optional e_op_result{thrust::nullopt}; + if (i < static_cast(local_degree) && call_pred_op(i)) { + e_op_result = call_e_op(i); + } + first_valid_lane_id = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) + .Reduce(e_op_result ? lane_id : raft::warp_size(), cub::Min()); + first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); + if (lane_id == first_valid_lane_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_lane_id != raft::warp_size()) { break; } + } + } else { + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { + if (call_pred_op(i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } + + if constexpr (update_major) { + if constexpr (std::is_same_v>) { + if (lane_id == ((first_valid_lane_id == raft::warp_size()) ? 
0 : first_valid_lane_id)) { + *(result_value_output + idx) = reduced_e_op_result; + } + } else { + reduced_e_op_result = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) + .Reduce(reduced_e_op_result, reduce_op); + if (lane_id == 0) { *(result_value_output + idx) = reduced_e_op_result; } + } + } + + idx += gridDim.x * (blockDim.x / raft::warp_size()); + } +} + +template +__global__ static void per_v_transform_reduce_e_high_degree( + edge_partition_device_view_t edge_partition, + KeyIterator key_first, + KeyIterator key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper result_value_output, + EdgeOp e_op, + T init /* relevant only if update_major == true */, + T identity_element /* relevant only if update_major == true */, + ReduceOp reduce_op, + PredOp pred_op) +{ + static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< + ReduceOp>); // atomic_reduce is defined only when + // has_compatible_raft_comms_op_t is true + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using e_op_result_t = T; + using key_t = typename thrust::iterator_traits::value_type; + + auto idx = static_cast(blockIdx.x); + + using BlockReduce = cub::BlockReduce< + std::conditional_t>, int32_t, e_op_result_t>, + std::is_same_v> + ? per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size + : per_v_transform_reduce_e_kernel_block_size>; + [[maybe_unused]] __shared__ + std::conditional_t + temp_storage; + [[maybe_unused]] __shared__ + std::conditional_t>, + int32_t, + std::byte /* dummy */> + output_thread_id; + + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + auto major = thrust_tuple_get_or_identity(key); + + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + edge_offset}; + + auto call_pred_op = init_pred_op(edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset); + + [[maybe_unused]] std::conditional_t + reduced_e_op_result{}; + [[maybe_unused]] std::conditional_t>, + int32_t, + std::byte /* dummy */> + first_valid_thread_id{}; + if constexpr (update_major) { + reduced_e_op_result = threadIdx.x == 0 + ? 
init + : identity_element; // init == identity_element for reduce_op::any + if constexpr (std::is_same_v>) { + first_valid_thread_id = per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size; + } + } + + if (edge_partition_e_mask) { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + + (per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size - 1)) / + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) * + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size; + for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { + thrust::optional e_op_result{thrust::nullopt}; + if ((i < static_cast(local_degree)) && + (*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(i)) { + e_op_result = call_e_op(i); + } + first_valid_thread_id = + BlockReduce(temp_storage) + .Reduce(e_op_result + ? threadIdx.x + : per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size, + cub::Min()); + if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } + __syncthreads(); + first_valid_thread_id = output_thread_id; + if (threadIdx.x == first_valid_thread_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_thread_id != + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) { + break; + } + } + } else { + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + if ((*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } else { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + + (per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size - 1)) / + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) * + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size; + for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { + thrust::optional e_op_result{thrust::nullopt}; + if ((i < static_cast(local_degree)) && call_pred_op(i)) { + e_op_result = call_e_op(i); + } + first_valid_thread_id = + BlockReduce(temp_storage) + .Reduce(e_op_result + ? 
threadIdx.x + : per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size, + cub::Min()); + if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } + __syncthreads(); + first_valid_thread_id = output_thread_id; + if (threadIdx.x == first_valid_thread_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_thread_id != + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) { + break; + } + } + } else { + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + if (call_pred_op(i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } + + if constexpr (update_major) { + if constexpr (std::is_same_v>) { + if (threadIdx.x == ((first_valid_thread_id == + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) + ? 0 + : first_valid_thread_id)) { + *(result_value_output + idx) = reduced_e_op_result; + } + } else { + reduced_e_op_result = BlockReduce(temp_storage).Reduce(reduced_e_op_result, reduce_op); + if (threadIdx.x == 0) { *(result_value_output + idx) = reduced_e_op_result; } + } + } + + idx += gridDim.x; + } +} + +template +void compute_priorities( + raft::comms::comms_t const& comm, + ValueIterator value_first, + raft::device_span priorities, + std::optional, raft::device_span>> + hypersparse_key_offsets, // we may not have values for the entire "range_size" if + // hypersparse_key_offsets.has_value() is true + size_t contiguous_size, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + typename thrust::iterator_traits::value_type init, + bool ignore_local_values, + rmm::cuda_stream_view stream_view) +{ + auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); + + // For each vertex, select a comm_rank among the GPUs with a value other than init (if there are + // more than one, the GPU with (comm_rank == root) has the highest priority, GPUs in the same DGX + // node should be the next) + + if (ignore_local_values) { + thrust::fill(rmm::exec_policy_nosync(stream_view), + priorities.begin(), + priorities.end(), + std::numeric_limits::max()); + } else { + thrust::tabulate( + rmm::exec_policy_nosync(stream_view), + priorities.begin(), + priorities.begin() + contiguous_size, + [value_first, root, subgroup_size, init, comm_rank, comm_size] __device__(auto offset) { + auto val = *(value_first + offset); + return (val != init) + ? 
rank_to_priority( + comm_rank, root, subgroup_size, comm_size, static_cast(offset)) + : std::numeric_limits::max(); // lowest priority + }); + if (hypersparse_key_offsets) { + thrust::fill(rmm::exec_policy_nosync(stream_view), + priorities.begin() + contiguous_size, + priorities.end(), + std::numeric_limits::max()); + if ((*hypersparse_key_offsets).index() == 0) { + auto priority_first = thrust::make_transform_iterator( + std::get<0>(*hypersparse_key_offsets).begin(), + cuda::proclaim_return_type( + [root, subgroup_size, comm_rank, comm_size] __device__(uint32_t offset) { + return rank_to_priority( + comm_rank, root, subgroup_size, comm_size, static_cast(offset)); + })); + thrust::scatter_if( + rmm::exec_policy_nosync(stream_view), + priority_first, + priority_first + std::get<0>(*hypersparse_key_offsets).size(), + std::get<0>(*hypersparse_key_offsets).begin(), + value_first + contiguous_size, + priorities.begin(), + is_not_equal_t::value_type>{init}); + } else { + auto priority_first = thrust::make_transform_iterator( + std::get<1>(*hypersparse_key_offsets).begin(), + cuda::proclaim_return_type( + [root, subgroup_size, comm_rank, comm_size] __device__(size_t offset) { + return rank_to_priority( + comm_rank, root, subgroup_size, comm_size, static_cast(offset)); + })); + thrust::scatter_if( + rmm::exec_policy_nosync(stream_view), + priority_first, + priority_first + std::get<1>(*hypersparse_key_offsets).size(), + std::get<1>(*hypersparse_key_offsets).begin(), + value_first + contiguous_size, + priorities.begin(), + is_not_equal_t::value_type>{init}); + } + } + } +} + +// return selected ranks if root. +// otherwise, it is sufficient to just return bool flags indiciating whether this rank's values are +// selected or not. +template +std::variant, + int, + priority_t>> /* root, store selected ranks */, + std::optional> /* store bitmap */> +compute_selected_ranks_from_priorities( + raft::comms::comms_t const& comm, + raft::device_span priorities, + std::optional, raft::device_span>> + hypersparse_key_offsets, // we may not have values for the entire "range_size" if + // hypersparse_key_offsets.has_value() is true + size_t contiguous_size, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + bool ignore_local_values, + rmm::cuda_stream_view stream_view) +{ + auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); + + using rank_t = std::conditional_t, int, priority_t>; + + if (comm_rank == root) { + rmm::device_uvector selected_ranks(priorities.size(), stream_view); + auto offset_priority_pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); + thrust::transform(rmm::exec_policy_nosync(stream_view), + offset_priority_pair_first, + offset_priority_pair_first + priorities.size(), + selected_ranks.begin(), + [root, subgroup_size, comm_rank, comm_size] __device__(auto pair) { + auto offset = thrust::get<0>(pair); + auto priority = thrust::get<1>(pair); + auto rank = (priority == std::numeric_limits::max()) + ? comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, offset); + return static_cast(rank); + }); + return selected_ranks; + } else { + std::optional> keep_flags{std::nullopt}; + if (!ignore_local_values) { + keep_flags = rmm::device_uvector( + packed_bool_size(hypersparse_key_offsets + ? (contiguous_size + ((*hypersparse_key_offsets).index() == 0 + ? 
std::get<0>(*hypersparse_key_offsets).size() + : std::get<1>(*hypersparse_key_offsets).size())) + : contiguous_size), + stream_view); + thrust::fill(rmm::exec_policy_nosync(stream_view), + (*keep_flags).begin(), + (*keep_flags).end(), + packed_bool_empty_mask()); + auto offset_priority_pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); + thrust::for_each( + rmm::exec_policy_nosync(stream_view), + offset_priority_pair_first, + offset_priority_pair_first + contiguous_size, + [keep_flags = raft::device_span((*keep_flags).data(), (*keep_flags).size()), + root, + subgroup_size, + comm_rank, + comm_size] __device__(auto pair) { + auto offset = thrust::get<0>(pair); + auto priority = thrust::get<1>(pair); + auto rank = (priority == std::numeric_limits::max()) + ? comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, offset); + if (rank == comm_rank) { + cuda::atomic_ref word( + keep_flags[packed_bool_offset(offset)]); + word.fetch_or(packed_bool_mask(offset), cuda::std::memory_order_relaxed); + } + }); + if (hypersparse_key_offsets) { + if ((*hypersparse_key_offsets).index() == 0) { + auto pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(size_t{contiguous_size}), + std::get<0>(*hypersparse_key_offsets).begin()); + thrust::for_each( + rmm::exec_policy_nosync(stream_view), + pair_first, + pair_first + std::get<0>(*hypersparse_key_offsets).size(), + [priorities = raft::device_span(priorities.data(), priorities.size()), + keep_flags = raft::device_span((*keep_flags).data(), (*keep_flags).size()), + root, + subgroup_size, + comm_rank, + comm_size] __device__(auto pair) { + auto offset = thrust::get<1>(pair); + auto priority = priorities[offset]; + auto rank = + (priority == std::numeric_limits::max()) + ? comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, static_cast(offset)); + if (rank == comm_rank) { + cuda::atomic_ref word( + keep_flags[packed_bool_offset(thrust::get<0>(pair))]); + word.fetch_or(packed_bool_mask(thrust::get<0>(pair)), + cuda::std::memory_order_relaxed); + } + }); + } else { + auto pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(size_t{contiguous_size}), + std::get<1>(*hypersparse_key_offsets).begin()); + thrust::for_each( + rmm::exec_policy_nosync(stream_view), + pair_first, + pair_first + std::get<1>(*hypersparse_key_offsets).size(), + [priorities = raft::device_span(priorities.data(), priorities.size()), + keep_flags = raft::device_span((*keep_flags).data(), (*keep_flags).size()), + root, + subgroup_size, + comm_rank, + comm_size] __device__(auto pair) { + auto offset = thrust::get<1>(pair); + auto priority = priorities[offset]; + auto rank = + (priority == std::numeric_limits::max()) + ? 
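
// The keep_flags buffer manipulated above is a packed-bool bitmap: 32 flags per
// uint32_t word, so marking "this rank's value was selected for key i" is a
// fetch_or of a single bit (cuda::atomic_ref on the device, a plain |= in this
// serial sketch). The helpers below are simplified stand-ins for cugraph's
// packed_bool_* utilities.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

constexpr size_t packed_bool_size(size_t n) { return (n + 31) / 32; }
constexpr size_t packed_bool_offset(size_t i) { return i / 32; }
constexpr uint32_t packed_bool_mask(size_t i) { return uint32_t{1} << (i % 32); }

int main()
{
  size_t const num_keys = 70;
  std::vector<uint32_t> keep_flags(packed_bool_size(num_keys), 0);  // all bits cleared

  // Mark keys 3, 40 and 69 as selected on this rank.
  for (size_t i : {size_t{3}, size_t{40}, size_t{69}}) {
    keep_flags[packed_bool_offset(i)] |= packed_bool_mask(i);
  }

  for (size_t i = 0; i < num_keys; ++i) {
    if (keep_flags[packed_bool_offset(i)] & packed_bool_mask(i)) {
      std::cout << "key " << i << " is kept by this rank\n";
    }
  }
  return 0;
}
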
comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, static_cast(offset)); + if (rank == comm_rank) { + cuda::atomic_ref word( + keep_flags[packed_bool_offset(thrust::get<0>(pair))]); + word.fetch_or(packed_bool_mask(thrust::get<0>(pair)), + cuda::std::memory_order_relaxed); + } + }); + } + } + } + return keep_flags; + } +} + +template +void per_v_transform_reduce_e_edge_partition( + raft::handle_t const& handle, + edge_partition_device_view_t edge_partition, + OptionalKeyIterator edge_partition_key_first, + OptionalKeyIterator edge_partition_key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper output_buffer, + EdgeOp e_op, + T major_init, + T major_identity_element, + ReduceOp reduce_op, + PredOp pred_op, + std::optional> key_segment_offsets, + std::optional> const& edge_partition_stream_pool_indices) +{ + constexpr bool use_input_key = !std::is_same_v; + + using vertex_t = typename GraphViewType::vertex_type; + using segment_key_iterator_t = + std::conditional_t; + + size_t stream_pool_size{0}; + if (edge_partition_stream_pool_indices) { + stream_pool_size = (*edge_partition_stream_pool_indices).size(); + } + if (key_segment_offsets) { + static_assert(detail::num_sparse_segments_per_vertex_partition == 3); + + if (edge_partition.dcs_nzd_vertex_count()) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[0 % stream_pool_size]) + : handle.get_stream(); + + if constexpr (update_major && !use_input_key) { // this is necessary as we don't visit + // every vertex in the hypersparse segment + thrust::fill(rmm::exec_policy_nosync(exec_stream), + output_buffer + (*key_segment_offsets)[3], + output_buffer + (*key_segment_offsets)[4], + major_init); + } + + auto segment_size = use_input_key + ? ((*key_segment_offsets)[4] - (*key_segment_offsets)[3]) + : static_cast(*(edge_partition.dcs_nzd_vertex_count())); + if (segment_size > 0) { + raft::grid_1d_thread_t update_grid(segment_size, + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[3]; } + auto segment_key_first = edge_partition_key_first; + auto segment_key_last = edge_partition_key_last; + if constexpr (use_input_key) { + segment_key_first += (*key_segment_offsets)[3]; + segment_key_last = + segment_key_first + ((*key_segment_offsets)[4] - (*key_segment_offsets)[3]); + } else { + assert(segment_key_first == nullptr); + assert(segment_key_last == nullptr); + } + detail::per_v_transform_reduce_e_hypersparse + <<>>( + edge_partition, + segment_key_first, + segment_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + segment_output_buffer, + e_op, + major_init, + reduce_op, + pred_op); + } + } + if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2]) { + auto exec_stream = edge_partition_stream_pool_indices + ? 
handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[1 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_thread_t update_grid((*key_segment_offsets)[3] - (*key_segment_offsets)[2], + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[2]; } + std::optional + segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor + // is a deleted function, segment_key_first should always have a value + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + *segment_key_first += (*key_segment_offsets)[2]; + detail::per_v_transform_reduce_e_low_degree + <<>>( + edge_partition, + *segment_key_first, + *segment_key_first + ((*key_segment_offsets)[3] - (*key_segment_offsets)[2]), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + segment_output_buffer, + e_op, + major_init, + reduce_op, + pred_op); + } + if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[2 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_warp_t update_grid((*key_segment_offsets)[2] - (*key_segment_offsets)[1], + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[1]; } + std::optional + segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor + // is a deleted function, segment_key_first should always have a value + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + *segment_key_first += (*key_segment_offsets)[1]; + detail::per_v_transform_reduce_e_mid_degree + <<>>( + edge_partition, + *segment_key_first, + *segment_key_first + ((*key_segment_offsets)[2] - (*key_segment_offsets)[1]), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + segment_output_buffer, + e_op, + major_init, + major_identity_element, + reduce_op, + pred_op); + } + if ((*key_segment_offsets)[1] > 0) { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[3 % stream_pool_size]) + : handle.get_stream(); + raft::grid_1d_block_t update_grid( + (*key_segment_offsets)[1], + std::is_same_v> + ? 
detail::per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size + : detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + std::optional + segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor + // is a deleted function, segment_key_first should always have a value + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + detail::per_v_transform_reduce_e_high_degree + <<>>( + edge_partition, + *segment_key_first, + *segment_key_first + (*key_segment_offsets)[1], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_buffer, + e_op, + major_init, + major_identity_element, + reduce_op, + pred_op); + } + } else { + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[0 % stream_pool_size]) + : handle.get_stream(); + + size_t num_keys{}; + if constexpr (use_input_key) { + num_keys = + static_cast(thrust::distance(edge_partition_key_first, edge_partition_key_last)); + } else { + num_keys = static_cast(edge_partition.major_range_size()); + } + + if (num_keys > size_t{0}) { + raft::grid_1d_thread_t update_grid(num_keys, + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + std::optional + segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor + // is a deleted function, segment_key_first should always have a value + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + detail::per_v_transform_reduce_e_low_degree + <<>>( + edge_partition, + *segment_key_first, + *segment_key_first + num_keys, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_buffer, + e_op, + major_init, + reduce_op, + pred_op); + } + } +} + +template +void per_v_transform_reduce_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + OptionalKeyIterator sorted_unique_key_first, + OptionalKeyIterator sorted_unique_key_last, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first) +{ + constexpr bool update_major = (incoming == GraphViewType::is_storage_transposed); + constexpr bool use_input_key = !std::is_same_v; + static_assert(update_major || !use_input_key); + constexpr bool filter_input_key = + GraphViewType::is_multi_gpu && use_input_key && + std::is_same_v>; // if GraphViewType::is_multi_gpu && update_major && + // std::is_same_v>, for any + // vertex in the frontier, we need to visit only local edges + // if we find any valid local edge (FIXME: this is + // applicable even when use_input_key is false). 
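
// The per-edge-partition helper above launches a differently shaped kernel for
// each degree segment: one block per key for the high-degree segment, one warp
// per key for mid-degree, one thread per key for the low-degree and hypersparse
// keys, each on its own stream from the pool. The sketch below only shows that
// dispatch structure; the segment sizes are hypothetical, and key_segment_offsets
// follows the convention used above (offsets[1] ends the high-degree segment,
// offsets[4] ends the hypersparse one).
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

int main()
{
  std::vector<size_t> const key_segment_offsets{0, 2, 10, 100, 130};  // hypothetical
  std::vector<std::string> const kernels{
    "high_degree (one block per key)",
    "mid_degree (one warp per key)",
    "low_degree (one thread per key)",
    "hypersparse (one thread per key, zero-degree keys may be skipped)"};

  for (size_t s = 0; s + 1 < key_segment_offsets.size(); ++s) {
    auto count = key_segment_offsets[s + 1] - key_segment_offsets[s];
    if (count > 0) {
      std::cout << "launch " << kernels[s] << " for keys [" << key_segment_offsets[s] << ", "
                << key_segment_offsets[s + 1] << ") on its own stream\n";
    }
  }
  return 0;
}
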
+ + static_assert( + ReduceOp::pure_function && + ((reduce_op::has_compatible_raft_comms_op_v && + reduce_op::has_identity_element_v) || + (update_major && + std::is_same_v>))); // current restriction, to support general + // reduction, we may need to take a less + // efficient code path + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = + typename iterator_value_type_or_default_t::value_type; + + using edge_partition_src_input_device_view_t = std::conditional_t< + std::is_same_v, + detail::edge_partition_endpoint_dummy_property_device_view_t, + detail::edge_partition_endpoint_property_device_view_t< + vertex_t, + typename EdgeSrcValueInputWrapper::value_iterator, + typename EdgeSrcValueInputWrapper::value_type>>; + using edge_partition_dst_input_device_view_t = std::conditional_t< + std::is_same_v, + detail::edge_partition_endpoint_dummy_property_device_view_t, + detail::edge_partition_endpoint_property_device_view_t< + vertex_t, + typename EdgeDstValueInputWrapper::value_iterator, + typename EdgeDstValueInputWrapper::value_type>>; + using edge_partition_e_input_device_view_t = std::conditional_t< + std::is_same_v, + detail::edge_partition_edge_dummy_property_device_view_t, + detail::edge_partition_edge_property_device_view_t< + edge_t, + typename EdgeValueInputWrapper::value_iterator, + typename EdgeValueInputWrapper::value_type>>; + + static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); + + constexpr bool try_bitmap = + GraphViewType::is_multi_gpu && use_input_key && std::is_same_v; + + [[maybe_unused]] constexpr auto max_segments = + detail::num_sparse_segments_per_vertex_partition + size_t{1}; + + // 1. drop zero degree keys & compute key_segment_offsets + + auto local_vertex_partition_segment_offsets = graph_view.local_vertex_partition_segment_offsets(); + + std::conditional_t>, std::byte /* dummy */> + key_segment_offsets{}; + auto sorted_unique_nzd_key_last = sorted_unique_key_last; + if constexpr (use_input_key) { + if (local_vertex_partition_segment_offsets) { + key_segment_offsets = compute_key_segment_offsets( + sorted_unique_key_first, + sorted_unique_nzd_key_last, + raft::host_span((*local_vertex_partition_segment_offsets).data(), + (*local_vertex_partition_segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); + (*key_segment_offsets).back() = *((*key_segment_offsets).rbegin() + 1); + sorted_unique_nzd_key_last = sorted_unique_key_first + (*key_segment_offsets).back(); + } + } + + // 2. 
initialize vertex value output buffer + + if constexpr (update_major) { // no vertices in the zero degree segment are visited (otherwise, + // no need to initialize) + if constexpr (use_input_key) { + thrust::fill(handle.get_thrust_policy(), + vertex_value_output_first + + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last), + vertex_value_output_first + + thrust::distance(sorted_unique_key_first, sorted_unique_key_last), + init); + } else { + if (local_vertex_partition_segment_offsets) { + thrust::fill( + handle.get_thrust_policy(), + vertex_value_output_first + *((*local_vertex_partition_segment_offsets).rbegin() + 1), + vertex_value_output_first + *((*local_vertex_partition_segment_offsets).rbegin()), + init); + } + } + } else { + if constexpr (GraphViewType::is_multi_gpu) { + /* no need to initialize (we use minor_tmp_buffer) */ + } else { + thrust::fill(handle.get_thrust_policy(), + vertex_value_output_first, + vertex_value_output_first + graph_view.local_vertex_partition_range_size(), + init); + } + } + + // 3. filter input keys & update key_segment_offsets + + auto edge_mask_view = graph_view.edge_mask_view(); + + auto tmp_key_buffer = + allocate_optional_dataframe_buffer>( + 0, handle.get_stream()); + auto tmp_output_indices = + allocate_optional_dataframe_buffer>( + 0, handle.get_stream()); + std::conditional_t, + VertexValueOutputIterator> + tmp_vertex_value_output_first{}; + if constexpr (filter_input_key) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(static_cast(minor_comm_rank))); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, static_cast(minor_comm_rank)) + : thrust::nullopt; + + std::optional> edge_partition_stream_pool_indices{std::nullopt}; + if (local_vertex_partition_segment_offsets && (handle.get_stream_pool_size() >= max_segments)) { + edge_partition_stream_pool_indices = std::vector(max_segments); + std::iota((*edge_partition_stream_pool_indices).begin(), + (*edge_partition_stream_pool_indices).end(), + size_t{0}); + } + + if (edge_partition_stream_pool_indices) { handle.sync_stream(); } + + edge_partition_src_input_device_view_t edge_partition_src_value_input{}; + edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; + if constexpr (GraphViewType::is_storage_transposed) { + edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input); + edge_partition_dst_value_input = edge_partition_dst_input_device_view_t( + edge_dst_value_input, static_cast(minor_comm_rank)); + } else { + edge_partition_src_value_input = edge_partition_src_input_device_view_t( + edge_src_value_input, static_cast(minor_comm_rank)); + edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input); + } + auto edge_partition_e_value_input = + edge_partition_e_input_device_view_t(edge_value_input, static_cast(minor_comm_rank)); + + per_v_transform_reduce_e_edge_partition( + handle, + edge_partition, + sorted_unique_key_first, + sorted_unique_nzd_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + vertex_value_output_first, + e_op, + init, + init, + reduce_op, + pred_op, + key_segment_offsets ? 
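
// The filter_input_key branch continuing below first processes only the local
// edge partition, then keeps just the frontier keys whose output still equals
// init (false positives are allowed), remembering each surviving key's original
// output position so later results can be written through a permutation
// iterator. A serial stand-in with made-up keys and partial results:
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
  int const init = -1;  // "no valid local edge found yet"
  std::vector<int> const keys{5, 9, 12, 17, 23};
  std::vector<int> const outputs{init, 42, init, init, 7};

  std::vector<int> tmp_keys;               // keys that still need remote contributions
  std::vector<size_t> tmp_output_indices;  // where their final values must be written
  for (size_t i = 0; i < keys.size(); ++i) {
    if (outputs[i] == init) {
      tmp_keys.push_back(keys[i]);
      tmp_output_indices.push_back(i);
    }
  }

  // Later, the reduced value for the j-th surviving key is scattered back as
  // outputs[tmp_output_indices[j]] = value.
  for (size_t j = 0; j < tmp_keys.size(); ++j) {
    std::cout << "key " << tmp_keys[j] << " -> output slot " << tmp_output_indices[j] << "\n";
  }
  return 0;
}
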
std::make_optional>( + (*key_segment_offsets).data(), (*key_segment_offsets).size()) + : std::nullopt, + edge_partition_stream_pool_indices ? std::make_optional>( + (*edge_partition_stream_pool_indices).data(), + (*edge_partition_stream_pool_indices).size()) + : std::nullopt); + + if (edge_partition_stream_pool_indices) { + handle.sync_stream_pool(*edge_partition_stream_pool_indices); + } + + auto num_tmp_keys = thrust::count( + handle.get_thrust_policy(), + vertex_value_output_first, + vertex_value_output_first + + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last), + init); // we allow false positives (some edge operations may actually return init) + + resize_optional_dataframe_buffer(tmp_key_buffer, num_tmp_keys, handle.get_stream()); + resize_optional_dataframe_buffer(tmp_output_indices, num_tmp_keys, handle.get_stream()); + + auto input_first = + thrust::make_zip_iterator(sorted_unique_key_first, thrust::make_counting_iterator(size_t{0})); + thrust::copy_if( + handle.get_thrust_policy(), + input_first, + input_first + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last), + vertex_value_output_first, + thrust::make_zip_iterator(get_optional_dataframe_buffer_begin(tmp_key_buffer), + get_optional_dataframe_buffer_begin(tmp_output_indices)), + is_equal_t{init}); + + sorted_unique_key_first = get_optional_dataframe_buffer_begin(tmp_key_buffer); + sorted_unique_nzd_key_last = get_optional_dataframe_buffer_end(tmp_key_buffer); + tmp_vertex_value_output_first = thrust::make_permutation_iterator( + vertex_value_output_first, get_optional_dataframe_buffer_begin(tmp_output_indices)); + + if (key_segment_offsets) { + key_segment_offsets = compute_key_segment_offsets( + sorted_unique_key_first, + sorted_unique_nzd_key_last, + raft::host_span((*local_vertex_partition_segment_offsets).data(), + (*local_vertex_partition_segment_offsets).size()), + edge_partition.major_range_first(), + handle.get_stream()); + assert((*key_segment_offsets).back() == *((*key_segment_offsets).rbegin() + 1)); + assert(sorted_uniue_nzd_key_last == sorted_unique_key_first + (*key_segment_offsets).back()); + } + } else { + tmp_vertex_value_output_first = vertex_value_output_first; + } + + /* 4. compute subgroup_size (used to compute priority in device_gatherv) */ + + [[maybe_unused]] std::conditional_t>, + int, + std::byte /* dummy */> + subgroup_size{}; + if constexpr (GraphViewType::is_multi_gpu && update_major && + std::is_same_v>) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + int num_gpus_per_node{}; + RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); + if (comm_size <= num_gpus_per_node) { + subgroup_size = minor_comm_size; + } else { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + subgroup_size = partition_manager::map_major_comm_to_gpu_row_comm + ? std::max(num_gpus_per_node / major_comm_size, int{1}) + : std::min(minor_comm_size, num_gpus_per_node); + } + } + + // 5. 
collect max_tmp_buffer_size, approx_tmp_buffer_size_per_loop, local_key_list_sizes, + // local_v_list_range_firsts, local_v_list_range_lasts, local_key_list_deg1_sizes, + // key_segment_offset_vectors + + std::conditional_t, std::byte /* dummy */> + max_tmp_buffer_sizes{}; + std::conditional_t, std::byte /* dummy */> + tmp_buffer_size_per_loop_approximations{}; + std::conditional_t, std::byte /* dummy */> + local_key_list_sizes{}; + std::conditional_t, std::byte /* dummy */> + local_v_list_range_firsts{}; + std::conditional_t, std::byte /* dummy */> + local_v_list_range_lasts{}; + std::conditional_t>, std::byte /* dummy */> + local_key_list_deg1_sizes{}; // if global degree is 1, any valid local value should be selected + std::conditional_t>>, + std::byte /* dummy */> + key_segment_offset_vectors{}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + auto max_tmp_buffer_size = + static_cast(static_cast(handle.get_device_properties().totalGlobalMem) * 0.2); + size_t approx_tmp_buffer_size_per_loop{0}; + if constexpr (update_major) { + size_t key_size{0}; + if constexpr (use_input_key) { + if constexpr (std::is_arithmetic_v) { + key_size = sizeof(key_t); + } else { + key_size = sum_thrust_tuple_element_sizes(); + } + } + size_t value_size{0}; + if constexpr (std::is_arithmetic_v) { + value_size = sizeof(T); + } else { + value_size = sum_thrust_tuple_element_sizes(); + } + + size_t major_range_size{}; + if constexpr (use_input_key) { + major_range_size = static_cast( + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)); + ; + } else { + major_range_size = graph_view.local_vertex_partition_range_size(); + } + size_t size_per_key{}; + if constexpr (filter_input_key) { + size_per_key = + key_size + + value_size / 2; // to reflect that many keys will be filtered out, note that this is a + // simple approximation, memory requirement in this case is much more + // complex as we store additional temporary variables + + } else { + size_per_key = key_size + value_size; + } + approx_tmp_buffer_size_per_loop = major_range_size * size_per_key; + } + + size_t num_scalars = 2; // max_tmp_buffer_size, approx_tmp_buffer_size_per_loop + size_t num_scalars_less_key_segment_offsets = num_scalars; + if constexpr (use_input_key) { + num_scalars += 1; // local_key_list_size + if constexpr (try_bitmap) { + num_scalars += 2; // local_key_list_range_first, local_key_list_range_last + } + if (filter_input_key && graph_view.use_dcs()) { + num_scalars += 1; // local_key_list_degree_1_size + } + num_scalars_less_key_segment_offsets = num_scalars; + if (key_segment_offsets) { num_scalars += (*key_segment_offsets).size(); } + } + + rmm::device_uvector d_aggregate_tmps(minor_comm_size * num_scalars, + handle.get_stream()); + auto hypersparse_degree_offsets = + graph_view.local_vertex_partition_hypersparse_degree_offsets(); + thrust::tabulate( + handle.get_thrust_policy(), + d_aggregate_tmps.begin() + num_scalars * minor_comm_rank, + d_aggregate_tmps.begin() + num_scalars * minor_comm_rank + + num_scalars_less_key_segment_offsets, + [max_tmp_buffer_size, + approx_tmp_buffer_size_per_loop, + sorted_unique_key_first, + sorted_unique_nzd_key_last, + deg1_v_first = (filter_input_key && graph_view.use_dcs()) + ? 
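
// The tabulate/allgather sequence around this point packs every per-rank scalar
// (temporary-buffer sizes, key list size, range bounds, optional segment
// offsets) into one buffer so that a single device_allgather replaces several
// small host-scalar exchanges. A host-only simulation with made-up values
// (the collective itself is not performed here):
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
  constexpr int comm_size      = 3;
  constexpr size_t num_scalars = 4;  // e.g. max_tmp_buffer_size, size_per_loop, key_list_size, range bound

  std::vector<size_t> aggregate(comm_size * num_scalars);
  for (int r = 0; r < comm_size; ++r) {
    // In the real code each rank fills only its own slot and the allgather
    // replicates all slots everywhere; here we just fill everything locally.
    aggregate[r * num_scalars + 0] = 1000 + r;  // max_tmp_buffer_size
    aggregate[r * num_scalars + 1] = 100 + r;   // approx_tmp_buffer_size_per_loop
    aggregate[r * num_scalars + 2] = 10 + r;    // local key list size
    aggregate[r * num_scalars + 3] = 1 + r;     // e.g. a range bound
  }

  for (int r = 0; r < comm_size; ++r) {
    std::cout << "rank " << r << ": key list size = " << aggregate[r * num_scalars + 2] << "\n";
  }
  return 0;
}
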
thrust::make_optional(graph_view.local_vertex_partition_range_first() + + (*local_vertex_partition_segment_offsets)[3] + + *((*hypersparse_degree_offsets).rbegin() + 1)) + : thrust::nullopt, + vertex_partition_range_first = + graph_view.local_vertex_partition_range_first()] __device__(size_t i) { + if (i == 0) { + return max_tmp_buffer_size; + } else if (i == 1) { + return approx_tmp_buffer_size_per_loop; + } + if constexpr (use_input_key) { + auto v_list_size = static_cast( + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)); + if (i == 2) { return v_list_size; } + if constexpr (try_bitmap) { + if (i == 3) { + vertex_t first{}; + if (v_list_size > 0) { + first = *sorted_unique_key_first; + } else { + first = vertex_partition_range_first; + } + assert(static_cast(static_cast(first)) == first); + return static_cast(first); + } else if (i == 4) { + vertex_t last{}; + if (v_list_size > 0) { + last = *(sorted_unique_key_first + (v_list_size - 1)) + 1; + } else { + last = vertex_partition_range_first; + } + assert(static_cast(static_cast(last)) == last); + return static_cast(last); + } else if (i == 5) { + if (deg1_v_first) { + auto sorted_unique_v_first = thrust::make_transform_iterator( + sorted_unique_key_first, + cuda::proclaim_return_type([] __device__(auto key) { + return thrust_tuple_get_or_identity(key); + })); + return v_list_size - static_cast(thrust::distance( + sorted_unique_v_first, + thrust::lower_bound(thrust::seq, + sorted_unique_v_first, + sorted_unique_v_first + v_list_size, + deg1_v_first))); + } + } + } else { + if (i == 3) { + if (deg1_v_first) { + auto sorted_unique_v_first = thrust::make_transform_iterator( + sorted_unique_key_first, + cuda::proclaim_return_type([] __device__(auto key) { + return thrust_tuple_get_or_identity(key); + })); + return v_list_size - static_cast(thrust::distance( + sorted_unique_v_first, + thrust::lower_bound(thrust::seq, + sorted_unique_v_first, + sorted_unique_v_first + v_list_size, + deg1_v_first))); + } + } + } + } + assert(false); + return size_t{0}; + }); + if constexpr (use_input_key) { + if (key_segment_offsets) { + raft::update_device(d_aggregate_tmps.data() + (num_scalars * minor_comm_rank + + num_scalars_less_key_segment_offsets), + (*key_segment_offsets).data(), + (*key_segment_offsets).size(), + handle.get_stream()); + } + } + + if (minor_comm_size > 1) { + device_allgather(minor_comm, + d_aggregate_tmps.data() + minor_comm_rank * num_scalars, + d_aggregate_tmps.data(), + num_scalars, + handle.get_stream()); + } + + std::vector h_aggregate_tmps(d_aggregate_tmps.size()); + raft::update_host(h_aggregate_tmps.data(), + d_aggregate_tmps.data(), + d_aggregate_tmps.size(), + handle.get_stream()); + handle.sync_stream(); + max_tmp_buffer_sizes = std::vector(minor_comm_size); + tmp_buffer_size_per_loop_approximations = std::vector(minor_comm_size); + if constexpr (use_input_key) { + local_key_list_sizes = std::vector(minor_comm_size); + if constexpr (try_bitmap) { + local_v_list_range_firsts = std::vector(minor_comm_size); + local_v_list_range_lasts = std::vector(minor_comm_size); + } + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { + local_key_list_deg1_sizes = std::vector(minor_comm_size); + } + } + if (key_segment_offsets) { + key_segment_offset_vectors = std::vector>{}; + (*key_segment_offset_vectors).reserve(minor_comm_size); + } + } + for (int i = 0; i < minor_comm_size; ++i) { + max_tmp_buffer_sizes[i] = h_aggregate_tmps[i * num_scalars]; + tmp_buffer_size_per_loop_approximations[i] = 
h_aggregate_tmps[i * num_scalars + 1]; + if constexpr (use_input_key) { + local_key_list_sizes[i] = h_aggregate_tmps[i * num_scalars + 2]; + if constexpr (try_bitmap) { + local_v_list_range_firsts[i] = + static_cast(h_aggregate_tmps[i * num_scalars + 3]); + local_v_list_range_lasts[i] = + static_cast(h_aggregate_tmps[i * num_scalars + 4]); + } + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { + (*local_key_list_deg1_sizes)[i] = + static_cast(h_aggregate_tmps[i * num_scalars + (try_bitmap ? 5 : 3)]); + } + } + if (key_segment_offsets) { + (*key_segment_offset_vectors) + .emplace_back( + h_aggregate_tmps.begin() + i * num_scalars + num_scalars_less_key_segment_offsets, + h_aggregate_tmps.begin() + i * num_scalars + num_scalars_less_key_segment_offsets + + (*key_segment_offsets).size()); + } + } + } + } else { + if constexpr (use_input_key) { + local_key_list_sizes = std::vector{ + static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last))}; + if (key_segment_offsets) { + key_segment_offset_vectors = std::vector>(1); + (*key_segment_offset_vectors)[0] = *key_segment_offsets; + } + } + } + + // 6. compute optional bitmap info & compressed vertex list + + bool v_compressible{false}; + std:: + conditional_t>, std::byte /* dummy */> + v_list_bitmap{}; + std:: + conditional_t>, std::byte /* dummy */> + compressed_v_list{}; + if constexpr (try_bitmap) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + auto const minor_comm_rank = minor_comm.get_rank(); + + if constexpr (sizeof(vertex_t) == 8) { + vertex_t local_v_list_max_range_size{0}; + for (int i = 0; i < minor_comm_size; ++i) { + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + local_v_list_max_range_size = std::max(range_size, local_v_list_max_range_size); + } + if (local_v_list_max_range_size <= + std::numeric_limits::max()) { // broadcast 32bit offset values instead of 64 + // bit vertex IDs + v_compressible = true; + } + } + + double avg_fill_ratio{0.0}; + for (int i = 0; i < minor_comm_size; ++i) { + auto num_keys = static_cast(local_key_list_sizes[i]); + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + avg_fill_ratio += + (range_size > 0) ? (num_keys / static_cast(range_size)) : double{0.0}; + } + avg_fill_ratio /= static_cast(minor_comm_size); + double threshold_ratio = + 2.0 /* tuning parameter (consider that we need to reprodce vertex list from bitmap)*/ / + static_cast((v_compressible ? 
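
// The fill-ratio test being computed here decides how each rank broadcasts its
// frontier: a bitmap over the vertex range once the frontier is dense enough
// that range_size bits beat sending the IDs (or 32-bit offsets) themselves,
// otherwise compressed 32-bit offsets when the range fits, and raw vertex IDs
// as the fallback. The numbers below are made up, int64_t stands in for
// vertex_t, and the extra "average frontier large enough to amortize kernel
// launches" gate used above is omitted.
#include <cstdint>
#include <iostream>

int main()
{
  double const num_keys     = 3.0e6;   // frontier size on this rank
  double const range_size   = 20.0e6;  // local vertex partition range size
  bool const v_compressible = true;    // range fits in 32 bits

  double const fill_ratio      = num_keys / range_size;
  double const bits_per_key    = (v_compressible ? sizeof(uint32_t) : sizeof(int64_t)) * 8.0;
  double const threshold_ratio = 2.0 / bits_per_key;  // 2.0 accounts for re-expanding the bitmap

  if (fill_ratio > threshold_ratio) {
    std::cout << "broadcast a bitmap over the vertex range\n";
  } else if (v_compressible) {
    std::cout << "broadcast 32-bit offsets relative to the range start\n";
  } else {
    std::cout << "broadcast the vertex IDs as-is\n";
  }
  return 0;
}
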
sizeof(uint32_t) : sizeof(vertex_t)) * 8); + auto avg_key_list_size = + std::reduce(local_key_list_sizes.begin(), local_key_list_sizes.end()) / + static_cast(minor_comm_size); + + if ((avg_fill_ratio > threshold_ratio) && + (static_cast(avg_key_list_size) > + packed_bools_per_word() * + 32 /* tuning parameter, to considerr additional kernel launch overhead */)) { + v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_key_first, + sorted_unique_nzd_key_last, + local_v_list_range_firsts[minor_comm_rank], + local_v_list_range_lasts[minor_comm_rank], + handle.get_stream()); + } else if (v_compressible) { + rmm::device_uvector tmps(local_key_list_sizes[minor_comm_rank], + handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + sorted_unique_key_first, + sorted_unique_nzd_key_last, + tmps.begin(), + cuda::proclaim_return_type( + [range_first = local_v_list_range_firsts[minor_comm_rank]] __device__( + auto v) { return static_cast(v - range_first); })); + compressed_v_list = std::move(tmps); + } + } + } + + bool uint32_key_output_offset = false; + if constexpr (GraphViewType::is_multi_gpu && update_major && + std::is_same_v>) { + size_t max_key_offset_size = std::numeric_limits::max(); + if constexpr (filter_input_key) { + max_key_offset_size = std::reduce( + local_key_list_sizes.begin(), local_key_list_sizes.end(), size_t{0}, [](auto l, auto r) { + return std::max(l, r); + }); + } else { + static_assert(!use_input_key); + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(i)); + auto const& segment_offsets = graph_view.local_edge_partition_segment_offsets(i); + + auto output_range_size = + segment_offsets ? *((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ + : edge_partition.major_range_size(); + + max_key_offset_size = std::max(static_cast(output_range_size), max_key_offset_size); + } + } + uint32_key_output_offset = + (max_key_offset_size <= static_cast(std::numeric_limits::max())); + } + + // 7. set-up stream pool & events + + std::optional> stream_pool_indices{std::nullopt}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + auto max_tmp_buffer_size = + std::reduce(max_tmp_buffer_sizes.begin(), max_tmp_buffer_sizes.end()) / + static_cast(minor_comm_size); + auto approx_tmp_buffer_size_per_loop = + std::reduce(tmp_buffer_size_per_loop_approximations.begin(), + tmp_buffer_size_per_loop_approximations.end()) / + static_cast(minor_comm_size); + size_t num_streams_per_loop{1}; + if (local_vertex_partition_segment_offsets && (handle.get_stream_pool_size() >= max_segments)) { + num_streams_per_loop = std::max( + std::min(size_t{8} / graph_view.number_of_local_edge_partitions(), max_segments), + size_t{ + 1}); // Note that "CUDA_DEVICE_MAX_CONNECTIONS (default: 8, can be set to [1, 32])" sets + // the number of queues, if the total number of streams exceeds this number, jobs on + // different streams can be sent to one queue leading to false dependency. Setting + // num_concurrent_loops above the number of queues has some benefits in NCCL + // communications but creating too many streams just for compute may not help. 
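
// init_stream_pool_indices (called just below) decides how many edge partitions
// are processed concurrently. Assumed behaviour, sketched here (the actual
// implementation lives elsewhere in cugraph and may differ): concurrency is
// capped by how many per-loop temporary buffers fit in the memory budget and by
// how many streams the pool can supply.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

std::vector<size_t> init_stream_pool_indices_sketch(size_t max_tmp_buffer_size,
                                                    size_t approx_tmp_buffer_size_per_loop,
                                                    size_t num_edge_partitions,
                                                    size_t num_streams_per_loop,
                                                    size_t stream_pool_size)
{
  size_t num_loops =
    std::min({approx_tmp_buffer_size_per_loop > 0
                ? max_tmp_buffer_size / approx_tmp_buffer_size_per_loop
                : num_edge_partitions,
              num_edge_partitions,
              stream_pool_size / num_streams_per_loop});
  num_loops = std::max(num_loops, size_t{1});
  std::vector<size_t> indices(num_loops * num_streams_per_loop);
  std::iota(indices.begin(), indices.end(), size_t{0});
  return indices;
}

int main()
{
  auto indices = init_stream_pool_indices_sketch(size_t{1} << 30 /* ~1 GiB budget */,
                                                 size_t{96} << 20 /* ~96 MiB per loop */,
                                                 8 /* edge partitions */,
                                                 4 /* streams per loop */,
                                                 16 /* stream pool size */);
  std::cout << indices.size() << " stream indices -> " << indices.size() / 4
            << " concurrent loops\n";
  return 0;
}
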
+ } + stream_pool_indices = init_stream_pool_indices(max_tmp_buffer_size, + approx_tmp_buffer_size_per_loop, + graph_view.number_of_local_edge_partitions(), + num_streams_per_loop, + handle.get_stream_pool_size()); + if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; } + } + + size_t num_concurrent_loops{1}; + std::optional> loop_stream_pool_indices{ + std::nullopt}; // first num_concurrent_loops streams from stream_pool_indices + if (stream_pool_indices) { + num_concurrent_loops = + std::min(graph_view.number_of_local_edge_partitions(), (*stream_pool_indices).size()); + loop_stream_pool_indices = std::vector(num_concurrent_loops); + std::iota((*loop_stream_pool_indices).begin(), (*loop_stream_pool_indices).end(), size_t{0}); + } + + // 8. set-up temporary buffers + + using minor_tmp_buffer_type = std::conditional_t, + edge_dst_property_t>; + [[maybe_unused]] std::unique_ptr minor_tmp_buffer{}; + if constexpr (GraphViewType::is_multi_gpu && !update_major) { + minor_tmp_buffer = std::make_unique(handle, graph_view); + auto minor_init = init; + auto view = minor_tmp_buffer->view(); + if (view.keys()) { // defer applying the initial value to the end as minor_tmp_buffer ma not + // store values for the entire minor rangey + minor_init = ReduceOp::identity_element; + } else { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + minor_init = (major_comm_rank == 0) ? init : ReduceOp::identity_element; + } + fill_edge_minor_property(handle, graph_view, minor_tmp_buffer->mutable_view(), minor_init); + } + + using edge_partition_minor_output_device_view_t = + std::conditional_tmutable_view().value_first())>, + void /* dummy */>; + + auto counters = allocate_optional_dataframe_buffer< + std::conditional_t>( + num_concurrent_loops, handle.get_stream()); + + if constexpr (!GraphViewType::is_multi_gpu || !use_input_key) { + if (loop_stream_pool_indices) { handle.sync_stream(); } + } + + // 9. 
process local edge partitions + + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i += num_concurrent_loops) { + auto loop_count = + std::min(num_concurrent_loops, graph_view.number_of_local_edge_partitions() - i); + + std::conditional_t< + GraphViewType::is_multi_gpu && use_input_key, + std::conditional_t< + try_bitmap, + std::vector, rmm::device_uvector>>, + std::vector>>, + std::byte /* dummy */> + edge_partition_key_buffers{}; + std::conditional_t, rmm::device_uvector>>>, + std::byte /* dummy */> + edge_partition_hypersparse_key_offset_vectors{}; // drop zero local degree keys in th + // hypersparse regione + std::conditional_t>, std::byte /* dummy */> + edge_partition_deg1_hypersparse_key_offset_counts{}; + std::vector process_local_edges(loop_count, true); + + if constexpr (GraphViewType::is_multi_gpu && use_input_key) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + auto const minor_comm_rank = minor_comm.get_rank(); + + edge_partition_key_buffers.reserve(loop_count); + + std::conditional_t>>, + std::byte /* dummy */> + edge_partition_bitmap_buffers{std::nullopt}; + if constexpr (try_bitmap) { + if (v_list_bitmap) { + edge_partition_bitmap_buffers = std::vector>{}; + (*edge_partition_bitmap_buffers).reserve(loop_count); + } + } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + bool use_bitmap_buffer = false; + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { + (*edge_partition_bitmap_buffers) + .emplace_back(packed_bool_size(local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]), + handle.get_stream()); + use_bitmap_buffer = true; + } + } + if (!use_bitmap_buffer) { + bool allocated{false}; + if constexpr (try_bitmap) { + if (v_compressible) { + edge_partition_key_buffers.push_back(rmm::device_uvector( + local_key_list_sizes[partition_idx], handle.get_stream())); + allocated = true; + } + } + if (!allocated) { + edge_partition_key_buffers.push_back(allocate_dataframe_buffer( + local_key_list_sizes[partition_idx], handle.get_stream())); + } + } + + if constexpr (filter_input_key) { + if (static_cast(partition_idx) == minor_comm_rank) { + process_local_edges[j] = false; + } + } + } + + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + if constexpr (try_bitmap) { + if (v_list_bitmap) { + device_bcast(minor_comm, + (*v_list_bitmap).data(), + get_dataframe_buffer_begin((*edge_partition_bitmap_buffers)[j]), + size_dataframe_buffer((*edge_partition_bitmap_buffers)[j]), + static_cast(partition_idx), + handle.get_stream()); + } else if (compressed_v_list) { + device_bcast(minor_comm, + (*compressed_v_list).data(), + get_dataframe_buffer_begin(std::get<0>(edge_partition_key_buffers[j])), + local_key_list_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } else { + device_bcast(minor_comm, + sorted_unique_key_first, + get_dataframe_buffer_begin(std::get<1>(edge_partition_key_buffers[j])), + local_key_list_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } + } else { + device_bcast(minor_comm, + sorted_unique_key_first, + get_dataframe_buffer_begin(edge_partition_key_buffers[j]), + local_key_list_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } + } + device_group_end(minor_comm); + if (loop_stream_pool_indices) { handle.sync_stream(); } + + if constexpr 
(try_bitmap) { + if (edge_partition_bitmap_buffers) { + // copy keys from temporary bitmap buffers to key buffers (copy only the sparse segments + // if filter_input_key is true) + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + std::variant, rmm::device_uvector> keys = + rmm::device_uvector(0, loop_stream); + if (v_compressible) { + std::get<0>(keys).resize( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + } else { + keys = rmm::device_uvector( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + } + + auto& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + if (process_local_edges[j]) { + auto range_first = local_v_list_range_firsts[partition_idx]; + auto range_last = local_v_list_range_lasts[partition_idx]; + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { // skip copying the hypersparse segment + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + range_last = std::min(range_last, *(edge_partition.major_hypersparse_first())); + } + } + if (range_first < range_last) { + if (keys.index() == 0) { + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + get_dataframe_buffer_begin(std::get<0>(keys)), + raft::device_span( + counters.data() + j, + size_t{1}), // dummy, we already know the counts (i.e. + // (*key_segment_offset_vectors)[partition_idx][3]) + uint32_t{0}, + static_cast(range_last - range_first), + loop_stream); + } else { + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + get_dataframe_buffer_begin(std::get<1>(keys)), + raft::device_span( + counters.data() + j, + size_t{1}), // dummy, we already know the counts (i.e. + // (*key_segment_offset_vectors)[partition_idx][3]) + range_first, + range_last, + loop_stream); + } + } + } else { + rx_bitmap.resize(0, loop_stream); + rx_bitmap.shrink_to_fit(loop_stream); + } + edge_partition_key_buffers.push_back(std::move(keys)); + } + } + } + + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { + edge_partition_hypersparse_key_offset_vectors = + std::vector, rmm::device_uvector>>{}; + (*edge_partition_hypersparse_key_offset_vectors).reserve(loop_count); + edge_partition_deg1_hypersparse_key_offset_counts = std::vector(loop_count, 0); + + std::conditional_t, + rmm::device_uvector>>, + std::vector>>>, + std::byte /* dummy */> + edge_partition_new_key_buffers{}; + bool allocate_new_key_buffer{true}; + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { allocate_new_key_buffer = false; } + } + if (allocate_new_key_buffer) { // allocate new key buffers and copy the sparse segment + // keys to the new key buffers + if constexpr (try_bitmap) { + edge_partition_new_key_buffers = std::vector< + std::variant, rmm::device_uvector>>{}; + } else { + edge_partition_new_key_buffers = std::vector>{}; + } + (*edge_partition_new_key_buffers).reserve(loop_count); + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
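
// retrieve_vertex_list_from_bitmap (used above) turns a broadcast bitmap back
// into explicit frontier entries: bit b of word w being set means vertex
// range_first + w * 32 + b is present. The device version also reports the
// count through a small counter buffer; this serial sketch just rebuilds the
// list with made-up inputs.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
  using vertex_t = int64_t;
  vertex_t const range_first = 1000;
  std::vector<uint32_t> const bitmap{0x00000005u, 0x80000000u};  // global bits 0, 2 and 63 set

  std::vector<vertex_t> vertices;
  for (size_t w = 0; w < bitmap.size(); ++w) {
    for (int b = 0; b < 32; ++b) {
      if (bitmap[w] & (uint32_t{1} << b)) {
        vertices.push_back(range_first + static_cast<vertex_t>(w) * 32 + b);
      }
    }
  }

  // Prints 1000, 1002 and 1063.
  for (auto v : vertices) { std::cout << v << "\n"; }
  return 0;
}
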
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + if constexpr (try_bitmap) { + if (v_compressible) { + auto new_key_buffer = rmm::device_uvector( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + if (process_local_edges[j]) { + thrust::copy( + rmm::exec_policy_nosync(loop_stream), + get_dataframe_buffer_begin(std::get<0>(edge_partition_key_buffers[j])), + get_dataframe_buffer_begin(std::get<0>(edge_partition_key_buffers[j])) + + key_segment_offsets[3], + get_dataframe_buffer_begin(new_key_buffer)); + } else { + std::get<0>(edge_partition_key_buffers[j]).resize(0, loop_stream); + std::get<0>(edge_partition_key_buffers[j]).shrink_to_fit(loop_stream); + } + (*edge_partition_new_key_buffers).push_back(std::move(new_key_buffer)); + } else { + auto new_key_buffer = rmm::device_uvector( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + if (process_local_edges[j]) { + thrust::copy( + rmm::exec_policy_nosync(loop_stream), + get_dataframe_buffer_begin(std::get<1>(edge_partition_key_buffers[j])), + get_dataframe_buffer_begin(std::get<1>(edge_partition_key_buffers[j])) + + key_segment_offsets[3], + get_dataframe_buffer_begin(new_key_buffer)); + } else { + std::get<1>(edge_partition_key_buffers[j]).resize(0, loop_stream); + std::get<1>(edge_partition_key_buffers[j]).shrink_to_fit(loop_stream); + } + (*edge_partition_new_key_buffers).push_back(std::move(new_key_buffer)); + } + } else { + auto new_key_buffer = allocate_dataframe_buffer( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + if (process_local_edges[j]) { + thrust::copy(rmm::exec_policy_nosync(loop_stream), + get_dataframe_buffer_begin(edge_partition_key_buffers[j]), + get_dataframe_buffer_begin(edge_partition_key_buffers[j]) + + key_segment_offsets[3], + get_dataframe_buffer_begin(new_key_buffer)); + } else { + edge_partition_key_buffers[j].resize(0, loop_stream); + edge_partition_key_buffers[j].shrink_to_fit(loop_stream); + } + (*edge_partition_new_key_buffers).push_back(std::move(new_key_buffer)); + } + } + } + + if constexpr (try_bitmap) { // if we are using a bitmap buffer + if (v_list_bitmap) { + std::vector> input_count_offset_vectors{}; + input_count_offset_vectors.reserve(loop_count); + + std::vector> filtered_bitmap_vectors{}; + std::vector> output_count_offset_vectors{}; + filtered_bitmap_vectors.reserve(loop_count); + output_count_offset_vectors.reserve(loop_count); + + std::vector range_offset_firsts(loop_count, 0); + std::vector range_offset_lasts(loop_count, 0); + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + rmm::device_uvector input_count_offsets(0, loop_stream); + if (process_local_edges[j]) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); + + auto range_offset_first = + std::min((edge_partition.major_range_first() + (*segment_offsets)[3] > + local_v_list_range_firsts[partition_idx]) + ? 
((edge_partition.major_range_first() + (*segment_offsets)[3]) - + local_v_list_range_firsts[partition_idx]) + : vertex_t{0}, + local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]); + auto range_offset_last = + std::min(((edge_partition.major_range_first() + (*segment_offsets)[4]) > + local_v_list_range_firsts[partition_idx]) + ? ((edge_partition.major_range_first() + (*segment_offsets)[4]) - + local_v_list_range_firsts[partition_idx]) + : vertex_t{0}, + local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]); + if (range_offset_first < range_offset_last) { + auto const& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + auto input_count_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(packed_bool_offset(range_offset_first)), + cuda::proclaim_return_type( + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + range_offset_first] __device__(size_t i) { + auto word = range_bitmap[i]; + if (i == packed_bool_offset(range_offset_first)) { + word &= ~packed_bool_partial_mask( + range_offset_first % + packed_bools_per_word()); // clear the bits in the sparse region + } + return static_cast(__popc(word)); + })); + input_count_offsets.resize( + (rx_bitmap.size() - packed_bool_offset(range_offset_first)) + 1, loop_stream); + input_count_offsets.set_element_to_zero_async(0, loop_stream); + thrust::inclusive_scan( + rmm::exec_policy_nosync(loop_stream), + input_count_first, + input_count_first + + (rx_bitmap.size() - packed_bool_offset(range_offset_first)), + input_count_offsets.begin() + 1); + } + range_offset_firsts[j] = range_offset_first; + range_offset_lasts[j] = range_offset_last; + } + input_count_offset_vectors.push_back(std::move(input_count_offsets)); + } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + rmm::device_uvector filtered_bitmap(0, loop_stream); + rmm::device_uvector output_count_offsets(0, loop_stream); + if (process_local_edges[j]) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + + auto segment_bitmap = *(edge_partition.dcs_nzd_range_bitmap()); + + auto range_offset_first = range_offset_firsts[j]; + auto range_offset_last = range_offset_lasts[j]; + if (range_offset_first < range_offset_last) { + auto const& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + filtered_bitmap.resize( + rx_bitmap.size() - packed_bool_offset(range_offset_first), loop_stream); + thrust::tabulate( + rmm::exec_policy_nosync(loop_stream), + filtered_bitmap.begin(), + filtered_bitmap.end(), + cuda::proclaim_return_type( + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + segment_bitmap = raft::device_span(segment_bitmap.data(), + segment_bitmap.size()), + range_first = local_v_list_range_firsts[partition_idx], + range_offset_first, + range_offset_last, + major_hypersparse_first = + *(edge_partition.major_hypersparse_first())] __device__(size_t i) { + auto this_word_range_offset_first = cuda::std::max( + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()), + range_offset_first); + auto this_word_range_offset_last = + cuda::std::min(static_cast( + (packed_bool_offset(range_offset_first) + (i + 1)) * + packed_bools_per_word()), + range_offset_last); + auto range_lead_bits = static_cast(this_word_range_offset_first % + packed_bools_per_word()); + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask(range_offset_first % + packed_bools_per_word()); + } + auto this_word_hypersparse_offset_first = + (range_first + this_word_range_offset_first) - major_hypersparse_first; + auto num_bits = static_cast(this_word_range_offset_last - + this_word_range_offset_first); + auto hypersparse_lead_bits = + static_cast(this_word_hypersparse_offset_first) % + packed_bools_per_word(); + auto segment_bitmap_word = ((segment_bitmap[packed_bool_offset( + this_word_hypersparse_offset_first)] >> + hypersparse_lead_bits)) + << range_lead_bits; + auto remaining_bits = + (num_bits > (packed_bools_per_word() - hypersparse_lead_bits)) + ? 
(num_bits - (packed_bools_per_word() - hypersparse_lead_bits)) + : size_t{0}; + if (remaining_bits > 0) { + segment_bitmap_word |= + ((segment_bitmap + [packed_bool_offset(this_word_hypersparse_offset_first) + 1] & + packed_bool_partial_mask(remaining_bits)) + << ((packed_bools_per_word() - hypersparse_lead_bits) + + range_lead_bits)); + } + return range_bitmap_word & segment_bitmap_word; + })); + auto output_count_first = thrust::make_transform_iterator( + filtered_bitmap.begin(), + cuda::proclaim_return_type([] __device__(uint32_t word) { + return static_cast(__popc(word)); + })); + output_count_offsets.resize(filtered_bitmap.size() + 1, loop_stream); + output_count_offsets.set_element_to_zero_async(0, loop_stream); + thrust::inclusive_scan(rmm::exec_policy_nosync(loop_stream), + output_count_first, + output_count_first + filtered_bitmap.size(), + output_count_offsets.begin() + 1); + } + } + filtered_bitmap_vectors.push_back(std::move(filtered_bitmap)); + output_count_offset_vectors.push_back(std::move(output_count_offsets)); + } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + auto& keys = edge_partition_key_buffers[j]; + std::variant, rmm::device_uvector> offsets = + rmm::device_uvector(0, loop_stream); + if (uint32_key_output_offset) { + std::get<0>(offsets).resize(process_local_edges[j] + ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } else { + offsets = rmm::device_uvector( + process_local_edges[j] ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } + + if (process_local_edges[j]) { + auto range_offset_first = range_offset_firsts[j]; + auto range_offset_last = range_offset_lasts[j]; + if (range_offset_first < range_offset_last) { + auto const& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + auto const& input_count_offsets = input_count_offset_vectors[j]; + auto const& filtered_bitmap = filtered_bitmap_vectors[j]; + auto const& output_count_offsets = output_count_offset_vectors[j]; + + if (keys.index() == 0) { + if (offsets.index() == 0) { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(filtered_bitmap.size()), + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + filtered_bitmap = raft::device_span( + filtered_bitmap.data(), filtered_bitmap.size()), + input_count_offsets = raft::device_span( + input_count_offsets.data(), input_count_offsets.size()), + output_count_offsets = raft::device_span( + output_count_offsets.data(), output_count_offsets.size()), + output_key_first = + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + output_offset_first = std::get<0>(offsets).begin(), + range_offset_first, + start_key_offset = key_segment_offsets[3]] __device__(size_t i) { + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask( + range_offset_first % packed_bools_per_word()); + } + auto filtered_bitmap_word = filtered_bitmap[i]; + auto lead_bits = (i == 0) + ? 
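
// The expansion step below writes, for every set bit of a filtered bitmap word,
// both where the surviving key goes in the output (a prefix sum of per-word
// popcounts) and its offset into the originally broadcast key list (a popcount
// over the range bitmap's bits below that position; __popc over a partial mask
// plus __fns on the device). A serial sketch with made-up single-word bitmaps;
// the vertex-offset output written by the real kernel is omitted, and
// std::popcount needs C++20.
#include <bit>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
  std::vector<uint32_t> const range_bitmap{0x000000F2u};     // broadcast keys at bits 1,4,5,6,7
  std::vector<uint32_t> const filtered_bitmap{0x00000052u};  // of those, bits 1,4,6 survive
  size_t const start_key_offset = 100;  // where this range starts in the broadcast key list

  // Where each word's surviving keys start in the output.
  std::vector<size_t> output_start(filtered_bitmap.size() + 1, 0);
  for (size_t w = 0; w < filtered_bitmap.size(); ++w) {
    output_start[w + 1] = output_start[w] + std::popcount(filtered_bitmap[w]);
  }

  for (size_t w = 0; w < filtered_bitmap.size(); ++w) {
    int j = 0;
    for (int b = 0; b < 32; ++b) {
      if (filtered_bitmap[w] & (uint32_t{1} << b)) {
        uint32_t below    = (b == 0) ? 0u : (range_bitmap[w] & ((uint32_t{1} << b) - 1u));
        size_t key_offset = start_key_offset + std::popcount(below);
        std::cout << "output slot " << output_start[w] + j << " <- key offset " << key_offset
                  << "\n";
        ++j;
      }
    }
  }
  return 0;
}
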
static_cast(range_offset_first % + packed_bools_per_word()) + : static_cast(0); + auto this_word_start_v_offset = + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()); + auto this_word_start_key_offset = + static_cast(start_key_offset + input_count_offsets[i]); + auto this_word_output_start_offset = output_count_offsets[i]; + for (int j = 0; j < __popc(filtered_bitmap_word); ++j) { + auto jth_set_bit_pos = static_cast( + __fns(filtered_bitmap_word, lead_bits, j + 1)); + *(output_key_first + (this_word_output_start_offset + j)) = + this_word_start_v_offset + jth_set_bit_pos; + *(output_offset_first + (this_word_output_start_offset + j)) = + this_word_start_key_offset + + static_cast(__popc( + range_bitmap_word & packed_bool_partial_mask(jth_set_bit_pos))); + } + }); + } else { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(filtered_bitmap.size()), + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + filtered_bitmap = raft::device_span( + filtered_bitmap.data(), filtered_bitmap.size()), + input_count_offsets = raft::device_span( + input_count_offsets.data(), input_count_offsets.size()), + output_count_offsets = raft::device_span( + output_count_offsets.data(), output_count_offsets.size()), + output_key_first = + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + output_offset_first = std::get<1>(offsets).begin(), + range_offset_first, + start_key_offset = key_segment_offsets[3]] __device__(size_t i) { + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask( + range_offset_first % packed_bools_per_word()); + } + auto filtered_bitmap_word = filtered_bitmap[i]; + auto lead_bits = (i == 0) + ? 
static_cast(range_offset_first % + packed_bools_per_word()) + : static_cast(0); + auto this_word_start_v_offset = + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()); + auto this_word_start_key_offset = + static_cast(start_key_offset + input_count_offsets[i]); + auto this_word_output_start_offset = output_count_offsets[i]; + for (int j = 0; j < __popc(filtered_bitmap_word); ++j) { + auto jth_set_bit_pos = static_cast( + __fns(filtered_bitmap_word, lead_bits, j + 1)); + *(output_key_first + (this_word_output_start_offset + j)) = + this_word_start_v_offset + jth_set_bit_pos; + *(output_offset_first + (this_word_output_start_offset + j)) = + this_word_start_key_offset + + static_cast(__popc( + range_bitmap_word & packed_bool_partial_mask(jth_set_bit_pos))); + } + }); + } + } else { + if (offsets.index() == 0) { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(filtered_bitmap.size()), + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + filtered_bitmap = raft::device_span( + filtered_bitmap.data(), filtered_bitmap.size()), + input_count_offsets = raft::device_span( + input_count_offsets.data(), input_count_offsets.size()), + output_count_offsets = raft::device_span( + output_count_offsets.data(), output_count_offsets.size()), + output_key_first = + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + output_offset_first = std::get<0>(offsets).begin(), + range_first = local_v_list_range_firsts[partition_idx], + range_offset_first, + start_key_offset = key_segment_offsets[3]] __device__(size_t i) { + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask( + range_offset_first % packed_bools_per_word()); + } + auto filtered_bitmap_word = filtered_bitmap[i]; + auto lead_bits = (i == 0) + ? 
static_cast(range_offset_first % + packed_bools_per_word()) + : static_cast(0); + auto this_word_start_v = + range_first + + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()); + auto this_word_start_key_offset = + static_cast(start_key_offset + input_count_offsets[i]); + auto this_word_output_start_offset = output_count_offsets[i]; + for (int j = 0; j < __popc(filtered_bitmap_word); ++j) { + auto jth_set_bit_pos = static_cast( + __fns(filtered_bitmap_word, lead_bits, j + 1)); + *(output_key_first + (this_word_output_start_offset + j)) = + this_word_start_v + jth_set_bit_pos; + *(output_offset_first + (this_word_output_start_offset + j)) = + this_word_start_key_offset + + static_cast(__popc( + range_bitmap_word & packed_bool_partial_mask(jth_set_bit_pos))); + } + }); + } else { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(filtered_bitmap.size()), + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + filtered_bitmap = raft::device_span( + filtered_bitmap.data(), filtered_bitmap.size()), + input_count_offsets = raft::device_span( + input_count_offsets.data(), input_count_offsets.size()), + output_count_offsets = raft::device_span( + output_count_offsets.data(), output_count_offsets.size()), + output_key_first = + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + output_offset_first = std::get<1>(offsets).begin(), + range_first = local_v_list_range_firsts[partition_idx], + range_offset_first, + start_key_offset = key_segment_offsets[3]] __device__(size_t i) { + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask( + range_offset_first % packed_bools_per_word()); + } + auto filtered_bitmap_word = filtered_bitmap[i]; + auto lead_bits = (i == 0) + ? static_cast(range_offset_first % + packed_bools_per_word()) + : static_cast(0); + auto this_word_start_v = + range_first + + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()); + auto this_word_start_key_offset = + static_cast(start_key_offset + input_count_offsets[i]); + auto this_word_output_start_offset = output_count_offsets[i]; + for (int j = 0; j < __popc(filtered_bitmap_word); ++j) { + auto jth_set_bit_pos = static_cast( + __fns(filtered_bitmap_word, lead_bits, j + 1)); + *(output_key_first + (this_word_output_start_offset + j)) = + this_word_start_v + jth_set_bit_pos; + *(output_offset_first + (this_word_output_start_offset + j)) = + this_word_start_key_offset + + static_cast(__popc( + range_bitmap_word & packed_bool_partial_mask(jth_set_bit_pos))); + } + }); + } + } + thrust::transform( + rmm::exec_policy_nosync(loop_stream), + output_count_offsets.begin() + (output_count_offsets.size() - 1), + output_count_offsets.end(), + counters.data() + j, + typecast_t{}); + } else { + thrust::fill(rmm::exec_policy_nosync(loop_stream), + counters.data() + j, + counters.data() + (j + 1), + size_t{0}); + } + } + + (*edge_partition_hypersparse_key_offset_vectors).push_back(std::move(offsets)); + } + } + } + if (edge_partition_new_key_buffers) { // if there is no bitmap buffer + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + auto& keys = edge_partition_key_buffers[j]; + std::variant, rmm::device_uvector> offsets = + rmm::device_uvector(0, loop_stream); + if (uint32_key_output_offset) { + std::get<0>(offsets).resize(process_local_edges[j] + ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } else { + offsets = rmm::device_uvector( + process_local_edges[j] ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } + + if (process_local_edges[j]) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); + + auto segment_bitmap = *(edge_partition.dcs_nzd_range_bitmap()); + + auto& new_keys = (*edge_partition_new_key_buffers)[j]; + if constexpr (try_bitmap) { + assert(!v_list_bitmap); + if (keys.index() == 0) { + auto flag_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + cuda::proclaim_return_type( + [segment_bitmap = raft::device_span(segment_bitmap.data(), + segment_bitmap.size()), + range_first = local_v_list_range_firsts[partition_idx], + major_hypersparse_first = + *(edge_partition + .major_hypersparse_first())] __device__(uint32_t v_offset) { + auto v = range_first + static_cast(v_offset); + auto segment_offset = v - major_hypersparse_first; + return ((segment_bitmap[packed_bool_offset(segment_offset)] & + packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); + })); + if (offsets.index() == 0) { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(std::get<0>(keys)), + thrust::make_counting_iterator(uint32_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<0>(new_keys)) + + key_segment_offsets[3], + std::get<0>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(std::get<0>(keys)), + thrust::make_counting_iterator(size_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<0>(new_keys)) + + key_segment_offsets[3], + std::get<1>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } else { + auto flag_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(std::get<1>(keys)) + key_segment_offsets[3], + cuda::proclaim_return_type( + [segment_bitmap = raft::device_span(segment_bitmap.data(), + segment_bitmap.size()), + major_hypersparse_first = + *(edge_partition.major_hypersparse_first())] __device__(vertex_t v) { + auto segment_offset = v - major_hypersparse_first; + return ((segment_bitmap[packed_bool_offset(segment_offset)] & + packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); + })); + if (offsets.index() == 0) { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(std::get<1>(keys)), + thrust::make_counting_iterator(uint32_t{0})) + + 
key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<1>(new_keys)) + + key_segment_offsets[3], + std::get<0>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(std::get<1>(keys)), + thrust::make_counting_iterator(size_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<1>(new_keys)) + + key_segment_offsets[3], + std::get<1>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } + } else { + auto flag_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(keys) + key_segment_offsets[3], + cuda::proclaim_return_type( + [segment_bitmap = raft::device_span(segment_bitmap.data(), + segment_bitmap.size()), + major_hypersparse_first = + *(edge_partition.major_hypersparse_first())] __device__(auto key) { + auto segment_offset = + thrust_tuple_get_or_identity(key) - major_hypersparse_first; + return ((segment_bitmap[packed_bool_offset(segment_offset)] & + packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); + })); + if (offsets.index() == 0) { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(keys), + thrust::make_counting_iterator(uint32_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(new_keys) + key_segment_offsets[3], + std::get<0>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(keys), + thrust::make_counting_iterator(size_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(new_keys) + key_segment_offsets[3], + std::get<1>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } + } + + (*edge_partition_hypersparse_key_offset_vectors).push_back(std::move(offsets)); + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + if (edge_partition_new_key_buffers) { + for (size_t j = 0; j < loop_count; ++j) { + edge_partition_key_buffers[j] = std::move((*edge_partition_new_key_buffers)[j]); + } + } + if (edge_partition_bitmap_buffers) { (*edge_partition_bitmap_buffers).clear(); } + + std::vector h_counts(loop_count); + raft::update_host(h_counts.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + if (process_local_edges[j]) { + auto& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + auto& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + if (keys.index() == 0) { + resize_dataframe_buffer( + std::get<0>(keys), key_segment_offsets[3] + h_counts[j], loop_stream); + } else { + resize_dataframe_buffer( + std::get<1>(keys), key_segment_offsets[3] + h_counts[j], loop_stream); + } + } else { + resize_dataframe_buffer(keys, key_segment_offsets[3] + h_counts[j], loop_stream); + } + // skip shrink_to_fit to cut execution time + + auto& offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; + if (offsets.index() == 0) { + std::get<0>(offsets).resize(h_counts[j], loop_stream); + } else { + std::get<1>(offsets).resize(h_counts[j], loop_stream); + } + // skip shrink_to_fit to cut execution time + } + } + + { // update edge_partition_deg1_hypersparse_key_offset_counts + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + + std::vector h_ptrs( + loop_count); // pointers to hypersparse key offset vectors + std::vector h_scalars( + loop_count * 2); // (key offset vector sizes, start degree 1 key offset) + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + if (process_local_edges[j]) { + auto const& offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; + if (offsets.index() == 0) { + h_ptrs[j] = static_cast(std::get<0>(offsets).data()); + h_scalars[j * 2] = std::get<0>(offsets).size(); + } else { + h_ptrs[j] = static_cast(std::get<1>(offsets).data()); + h_scalars[j * 2] = std::get<1>(offsets).size(); + } + h_scalars[j * 2 + 1] = + local_key_list_sizes[partition_idx] - (*local_key_list_deg1_sizes)[partition_idx]; + } else { + h_ptrs[j] = static_cast(nullptr); + h_scalars[j * 2] = size_t{0}; + h_scalars[j * 2 + 1] = size_t{0}; + } + } + rmm::device_uvector d_ptrs(h_ptrs.size(), handle.get_stream()); + rmm::device_uvector d_scalars(h_scalars.size(), handle.get_stream()); + raft::update_device(d_ptrs.data(), h_ptrs.data(), h_ptrs.size(), handle.get_stream()); + raft::update_device( + d_scalars.data(), h_scalars.data(), h_scalars.size(), handle.get_stream()); + rmm::device_uvector d_counts(loop_count, handle.get_stream()); + thrust::transform( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(loop_count), + d_counts.begin(), + cuda::proclaim_return_type( + [d_ptrs = raft::device_span(d_ptrs.data(), d_ptrs.size()), + d_scalars = raft::device_span(d_scalars.data(), d_scalars.size()), + uint32_key_output_offset] __device__(auto i) { + auto first = d_ptrs[i]; + if (first != static_cast(nullptr)) { + auto size = d_scalars[i * 2]; + auto start_offset = d_scalars[i * 2 + 1]; + if (uint32_key_output_offset) { + auto casted_first = static_cast(first); + return size - static_cast(thrust::distance( + casted_first, + thrust::lower_bound(thrust::seq, + casted_first, + casted_first + size, + static_cast(start_offset)))); + } else { + auto casted_first = static_cast(first); + return size - + static_cast(thrust::distance( + casted_first, + thrust::lower_bound( + thrust::seq, casted_first, casted_first + size, start_offset))); + } + } else { + return size_t{0}; + } + })); + raft::update_host((*edge_partition_deg1_hypersparse_key_offset_counts).data(), + d_counts.data(), + d_counts.size(), + handle.get_stream()); + handle.sync_stream(); + } + } + } + } + + 
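+      // The block below allocates one output buffer per edge partition for the update_major
+      // path; when use_input_key is true each buffer is sized from the (possibly filtered)
+      // key list, otherwise from the local vertex segment offsets (excluding the zero degree
+      // segment).
+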
std::conditional_t>, + std::byte /* dummy */> + edge_partition_major_output_buffers{}; + if constexpr (GraphViewType::is_multi_gpu && update_major) { + edge_partition_major_output_buffers.reserve(loop_count); + } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + if constexpr (GraphViewType::is_multi_gpu && update_major) { + size_t buffer_size{0}; + if (process_local_edges[j]) { + if constexpr (use_input_key) { + auto const& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + if (keys.index() == 0) { + buffer_size = size_dataframe_buffer(std::get<0>(keys)); + } else { + buffer_size = size_dataframe_buffer(std::get<1>(keys)); + } + } else { + buffer_size = size_dataframe_buffer(keys); + } + } else { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); + + buffer_size = + segment_offsets + ? *((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ + : edge_partition.major_range_size(); + } + } + edge_partition_major_output_buffers.push_back( + allocate_dataframe_buffer(buffer_size, loop_stream)); + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + + for (size_t j = 0; j < loop_count; ++j) { + if (process_local_edges[j]) { + auto partition_idx = i + j; + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, partition_idx) + : thrust::nullopt; + size_t num_streams_per_loop{1}; + if (stream_pool_indices) { + assert((*stream_pool_indices).size() >= num_concurrent_loops); + num_streams_per_loop = (*stream_pool_indices).size() / num_concurrent_loops; + } + auto edge_partition_stream_pool_indices = + stream_pool_indices + ? std::make_optional>( + (*stream_pool_indices).data() + j * num_streams_per_loop, num_streams_per_loop) + : std::nullopt; + + T major_init{}; + T major_identity_element{}; + if constexpr (update_major) { + if constexpr (std::is_same_v>) { // if any edge has a non-init value, + // one of the non-init values will + // be selected. + major_init = init; + major_identity_element = init; + } else { + major_init = ReduceOp::identity_element; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + major_init = (static_cast(partition_idx) == minor_comm_rank) + ? 
init + : ReduceOp::identity_element; + } else { + major_init = init; + } + major_identity_element = ReduceOp::identity_element; + } + } + + std::optional> key_segment_offsets{std::nullopt}; + if constexpr (use_input_key) { + if (key_segment_offset_vectors) { + key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + if constexpr (filter_input_key) { + if (edge_partition_hypersparse_key_offset_vectors) { + (*key_segment_offsets).back() = + size_dataframe_buffer(edge_partition_major_output_buffers[j]); + *((*key_segment_offsets).rbegin() + 1) = (*key_segment_offsets).back(); + } + } + } + } else { + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); + if (segment_offsets) { + key_segment_offsets = std::vector((*segment_offsets).size()); + std::transform((*segment_offsets).begin(), + (*segment_offsets).end(), + (*key_segment_offsets).begin(), + [](vertex_t offset) { return static_cast(offset); }); + } + } + + edge_partition_src_input_device_view_t edge_partition_src_value_input{}; + edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; + if constexpr (GraphViewType::is_storage_transposed) { + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input, partition_idx); + } else { + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input, partition_idx); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input); + } + auto edge_partition_e_value_input = + edge_partition_e_input_device_view_t(edge_value_input, partition_idx); + + std::conditional_t, + edge_partition_minor_output_device_view_t>, + VertexValueOutputIterator> + output_buffer{}; + if constexpr (GraphViewType::is_multi_gpu) { + if constexpr (update_major) { + output_buffer = get_dataframe_buffer_begin(edge_partition_major_output_buffers[j]); + } else { + output_buffer = + edge_partition_minor_output_device_view_t(minor_tmp_buffer->mutable_view()); + } + } else { + output_buffer = tmp_vertex_value_output_first; + } + + bool processed{false}; + if constexpr (try_bitmap) { + auto const& keys = edge_partition_key_buffers[j]; + if (keys.index() == 0) { + auto edge_partition_key_first = thrust::make_transform_iterator( + std::get<0>(keys).begin(), + cuda::proclaim_return_type( + [range_first = local_v_list_range_firsts[partition_idx]] __device__( + uint32_t v_offset) { return range_first + static_cast(v_offset); })); + per_v_transform_reduce_e_edge_partition( + handle, + edge_partition, + edge_partition_key_first, + edge_partition_key_first + std::get<0>(keys).size(), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_buffer, + e_op, + major_init, + major_identity_element, + reduce_op, + pred_op, + key_segment_offsets ? 
std::make_optional<raft::host_span<size_t const>>(
+                                      (*key_segment_offsets).data(), (*key_segment_offsets).size())
+                                  : std::nullopt,
+            edge_partition_stream_pool_indices);
+          processed = true;
+        }
+      }
+      if (!processed) {
+        auto edge_partition_key_first = sorted_unique_key_first;
+        auto edge_partition_key_last  = sorted_unique_nzd_key_last;
+        if constexpr (GraphViewType::is_multi_gpu && use_input_key) {
+          auto const& keys = edge_partition_key_buffers[j];
+          if constexpr (try_bitmap) {
+            edge_partition_key_first = get_dataframe_buffer_begin(std::get<1>(keys));
+            edge_partition_key_last  = get_dataframe_buffer_end(std::get<1>(keys));
+          } else {
+            edge_partition_key_first = get_dataframe_buffer_begin(keys);
+            edge_partition_key_last  = get_dataframe_buffer_end(keys);
+          }
+        }
+
+        per_v_transform_reduce_e_edge_partition(
+          handle,
+          edge_partition,
+          edge_partition_key_first,
+          edge_partition_key_last,
+          edge_partition_src_value_input,
+          edge_partition_dst_value_input,
+          edge_partition_e_value_input,
+          edge_partition_e_mask,
+          output_buffer,
+          e_op,
+          major_init,
+          major_identity_element,
+          reduce_op,
+          pred_op,
+          key_segment_offsets ? std::make_optional<raft::host_span<size_t const>>(
+                                  (*key_segment_offsets).data(), (*key_segment_offsets).size())
+                              : std::nullopt,
+          edge_partition_stream_pool_indices);
+      }
+    }
+  }
+  if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); }
+
+  if constexpr (GraphViewType::is_multi_gpu && update_major) {
+    auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+    auto const minor_comm_rank = minor_comm.get_rank();
+    auto const minor_comm_size = minor_comm.get_size();
+
+    if constexpr (use_input_key) {
+      edge_partition_key_buffers.clear();
+      edge_partition_key_buffers.shrink_to_fit();
+    }
+
+    if constexpr (std::is_same_v<ReduceOp, reduce_op::any<T>>) {
+      std::conditional_t<
+        filter_input_key,
+        std::optional<std::vector<
+          std::variant<raft::device_span<uint32_t const>, raft::device_span<size_t const>>>>,
+        std::byte /* dummy */>
+        edge_partition_hypersparse_non_deg1_key_offset_spans{};
+      if constexpr (filter_input_key) {
+        if (edge_partition_hypersparse_key_offset_vectors) {
+          edge_partition_hypersparse_non_deg1_key_offset_spans = std::vector<
+            std::variant<raft::device_span<uint32_t const>, raft::device_span<size_t const>>>(
+            loop_count);
+        }
+      }
+
+      std::vector<size_t> edge_partition_allreduce_sizes(loop_count);
+      std::vector<size_t> edge_partition_allreduce_displacements(loop_count);
+      std::vector<size_t> edge_partition_contiguous_sizes(loop_count);
+
+      for (size_t j = 0; j < loop_count; ++j) {
+        auto partition_idx        = i + j;
+        auto const& output_buffer = edge_partition_major_output_buffers[j];
+
+        size_t allreduce_size{};
+        size_t contiguous_size{};
+        if constexpr (filter_input_key) {
+          allreduce_size = local_key_list_sizes[partition_idx];
+          if (local_key_list_deg1_sizes) {
+            allreduce_size -= (*local_key_list_deg1_sizes)[partition_idx];
+          }
+          if (key_segment_offset_vectors) {
+            auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx];
+            contiguous_size                 = key_segment_offsets[3];
+          } else {
+            contiguous_size = local_key_list_sizes[partition_idx];
+          }
+        } else {
+          static_assert(!use_input_key);
+          auto hypersparse_degree_offsets =
+            graph_view.local_edge_partition_hypersparse_degree_offsets(partition_idx);
+          allreduce_size = size_dataframe_buffer(output_buffer);
+          if (hypersparse_degree_offsets) {
+            allreduce_size -= *((*hypersparse_degree_offsets).rbegin()) -
+                              *((*hypersparse_degree_offsets).rbegin() + 1);
+          }
+          contiguous_size = size_dataframe_buffer(output_buffer);
+        }
+        edge_partition_allreduce_sizes[j]  = allreduce_size;
+        edge_partition_contiguous_sizes[j] = contiguous_size;
+      }
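+      // Lay the per-partition priority values out in a single aggregate buffer (offsets come
+      // from an exclusive scan of the per-partition allreduce sizes) so that one allreduce can
+      // cover every edge partition handled in this round; priorities are stored as uint8_t when
+      // minor_comm_size fits, and as uint32_t otherwise.
+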
std::exclusive_scan(edge_partition_allreduce_sizes.begin(), + edge_partition_allreduce_sizes.end(), + edge_partition_allreduce_displacements.begin(), + size_t{0}); + std::variant, rmm::device_uvector> + aggregate_priorities = rmm::device_uvector(0, handle.get_stream()); + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + std::get<0>(aggregate_priorities) + .resize( + edge_partition_allreduce_displacements.back() + edge_partition_allreduce_sizes.back(), + handle.get_stream()); + } else { // priority == uint32_t + aggregate_priorities = rmm::device_uvector( + edge_partition_allreduce_displacements.back() + edge_partition_allreduce_sizes.back(), + handle.get_stream()); + } + if (loop_stream_pool_indices) { handle.sync_stream(); } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + std::optional< + std::variant, raft::device_span>> + hypersparse_non_deg1_key_offsets{std::nullopt}; + if constexpr (filter_input_key) { + if (edge_partition_hypersparse_key_offset_vectors) { + auto const& offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; + + if (offsets.index() == 0) { + hypersparse_non_deg1_key_offsets = raft::device_span( + std::get<0>(offsets).data(), + std::get<0>(offsets).size() - + (edge_partition_deg1_hypersparse_key_offset_counts + ? (*edge_partition_deg1_hypersparse_key_offset_counts)[j] + : size_t{0})); + } else { + hypersparse_non_deg1_key_offsets = raft::device_span( + std::get<1>(offsets).data(), + std::get<1>(offsets).size() - + (edge_partition_deg1_hypersparse_key_offset_counts + ? (*edge_partition_deg1_hypersparse_key_offset_counts)[j] + : size_t{0})); + } + (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j] = + *hypersparse_non_deg1_key_offsets; + } + } + + auto const& output_buffer = edge_partition_major_output_buffers[j]; + + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + compute_priorities( + minor_comm, + get_dataframe_buffer_begin(output_buffer), + raft::device_span(std::get<0>(aggregate_priorities).data() + + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]), + hypersparse_non_deg1_key_offsets, + edge_partition_contiguous_sizes[j], + static_cast(partition_idx), + subgroup_size, + init, + process_local_edges[j] ? false : true /* ignore_local_values */, + loop_stream); + } else { // priority == uint32_t + compute_priorities( + minor_comm, + get_dataframe_buffer_begin(output_buffer), + raft::device_span(std::get<1>(aggregate_priorities).data() + + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]), + hypersparse_non_deg1_key_offsets, + edge_partition_contiguous_sizes[j], + static_cast(partition_idx), + subgroup_size, + init, + process_local_edges[j] ? 
false : true /* ignore_local_values */, + loop_stream); + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + device_allreduce(minor_comm, + std::get<0>(aggregate_priorities).data(), + std::get<0>(aggregate_priorities).data(), + std::get<0>(aggregate_priorities).size(), + raft::comms::op_t::MIN, + handle.get_stream()); + } else { // priority == uint32_t + device_allreduce(minor_comm, + std::get<1>(aggregate_priorities).data(), + std::get<1>(aggregate_priorities).data(), + std::get<1>(aggregate_priorities).size(), + raft::comms::op_t::MIN, + handle.get_stream()); + } + if (loop_stream_pool_indices) { handle.sync_stream(); } + + std::vector< + std::variant, rmm::device_uvector>, + std::optional>>> + edge_partition_selected_ranks_or_flags{}; + edge_partition_selected_ranks_or_flags.reserve(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto const& output_buffer = edge_partition_major_output_buffers[j]; + std::optional< + std::variant, raft::device_span>> + hypersparse_non_deg1_key_offsets{std::nullopt}; + if constexpr (filter_input_key) { + if (edge_partition_hypersparse_key_offset_vectors) { + hypersparse_non_deg1_key_offsets = + (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j]; + } + } + + auto contiguous_size = edge_partition_contiguous_sizes[j]; + + std::variant, rmm::device_uvector>, + std::optional>> + selected_ranks_or_flags = + std::variant, rmm::device_uvector>( + rmm::device_uvector(0, loop_stream)); + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + auto priorities = raft::device_span( + std::get<0>(aggregate_priorities).data() + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]); + auto tmp = compute_selected_ranks_from_priorities( + minor_comm, + priorities, + hypersparse_non_deg1_key_offsets, + contiguous_size, + static_cast(partition_idx), + subgroup_size, + process_local_edges[j] ? false : true /* ignore_local_values */, + loop_stream); + if (tmp.index() == 0) { + selected_ranks_or_flags = + std::variant, rmm::device_uvector>( + std::move(std::get<0>(tmp))); + } else { + selected_ranks_or_flags = std::move(std::get<1>(tmp)); + } + } else { // priority_t == uint32_t + auto priorities = raft::device_span( + std::get<1>(aggregate_priorities).data() + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]); + auto tmp = compute_selected_ranks_from_priorities( + minor_comm, + priorities, + hypersparse_non_deg1_key_offsets, + contiguous_size, + static_cast(partition_idx), + subgroup_size, + process_local_edges[j] ? 
false : true /* ignore_local_values */, + loop_stream); + if (tmp.index() == 0) { + selected_ranks_or_flags = + std::variant, rmm::device_uvector>( + std::move(std::get<0>(tmp))); + } else { + selected_ranks_or_flags = std::move(std::get<1>(tmp)); + } + } + edge_partition_selected_ranks_or_flags.push_back(std::move(selected_ranks_or_flags)); + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + std::get<0>(aggregate_priorities).resize(0, handle.get_stream()); + std::get<0>(aggregate_priorities).shrink_to_fit(handle.get_stream()); + } else { + std::get<1>(aggregate_priorities).resize(0, handle.get_stream()); + std::get<1>(aggregate_priorities).shrink_to_fit(handle.get_stream()); + } + if (loop_stream_pool_indices) { handle.sync_stream(); } + + std::vector> edge_partition_values{}; + edge_partition_values.reserve(loop_count); + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto& output_buffer = edge_partition_major_output_buffers[j]; + + auto values = allocate_dataframe_buffer( + process_local_edges[j] ? size_dataframe_buffer(output_buffer) : size_t{0}, loop_stream); + if (process_local_edges[j]) { + if (minor_comm_rank == static_cast(partition_idx)) { + assert(!use_input_key); + assert(edge_partition_selected_ranks_or_flags[j].index() == 0); + auto const& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); + if (selected_ranks.index() == 0) { + copy_if_nosync( + get_dataframe_buffer_begin(output_buffer), + get_dataframe_buffer_begin(output_buffer) + edge_partition_allreduce_sizes[j], + thrust::make_transform_iterator( + std::get<0>(selected_ranks).begin(), + cuda::proclaim_return_type([minor_comm_rank] __device__(auto rank) { + return static_cast(rank) == minor_comm_rank; + })), + get_dataframe_buffer_begin(values), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + copy_if_nosync( + get_dataframe_buffer_begin(output_buffer), + get_dataframe_buffer_begin(output_buffer) + edge_partition_allreduce_sizes[j], + thrust::make_transform_iterator( + std::get<1>(selected_ranks).begin(), + cuda::proclaim_return_type( + [minor_comm_rank] __device__(auto rank) { return rank == minor_comm_rank; })), + get_dataframe_buffer_begin(values), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } else { + assert(edge_partition_selected_ranks_or_flags[j].index() == 1); + auto& keep_flags = std::get<1>(edge_partition_selected_ranks_or_flags[j]); + size_t input_end_offset{}; + if constexpr (filter_input_key) { + input_end_offset = edge_partition_contiguous_sizes[j]; + if (edge_partition_hypersparse_non_deg1_key_offset_spans) { + auto const& span = (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j]; + if (span.index() == 0) { + input_end_offset += std::get<0>(span).size(); + } else { + input_end_offset += std::get<1>(span).size(); + } + } + } else { + input_end_offset = edge_partition_allreduce_sizes[j]; + } + copy_if_nosync( + get_dataframe_buffer_begin(output_buffer), + get_dataframe_buffer_begin(output_buffer) + input_end_offset, + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [keep_flags = raft::device_span( + (*keep_flags).data(), (*keep_flags).size())] __device__(size_t offset) 
{ + auto word = keep_flags[packed_bool_offset(offset)]; + return ((word & packed_bool_mask(offset)) != packed_bool_empty_mask()); + })), + get_dataframe_buffer_begin(values), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + (*keep_flags).resize(0, loop_stream); + (*keep_flags).shrink_to_fit(loop_stream); + } + } + + edge_partition_values.push_back(std::move(values)); + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + + std::vector copy_sizes(loop_count); + raft::update_host(copy_sizes.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); + + std::optional< + std::vector, rmm::device_uvector>>> + edge_partition_deg1_hypersparse_output_offset_vectors{}; + + if (graph_view.use_dcs()) { + edge_partition_deg1_hypersparse_output_offset_vectors = + std::vector, rmm::device_uvector>>{}; + (*edge_partition_deg1_hypersparse_output_offset_vectors).reserve(loop_count); + + for (size_t j = 0; j < loop_count; ++j) { + auto loop_stream = + loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto& output_buffer = edge_partition_major_output_buffers[j]; + std::variant, rmm::device_uvector> + output_offsets = rmm::device_uvector(0, loop_stream); + if (!uint32_key_output_offset) { + output_offsets = rmm::device_uvector(0, loop_stream); + } + + if (process_local_edges[j]) { + auto& values = edge_partition_values[j]; + + size_t output_offset_buf_size{0}; + if constexpr (filter_input_key) { + output_offset_buf_size = (*edge_partition_deg1_hypersparse_key_offset_counts)[j]; + } else { + assert(!use_input_key); + output_offset_buf_size = + size_dataframe_buffer(output_buffer) - edge_partition_allreduce_sizes[j]; + } + + if (output_offsets.index() == 0) { + std::get<0>(output_offsets).resize(output_offset_buf_size, loop_stream); + } else { + output_offsets = rmm::device_uvector(output_offset_buf_size, loop_stream); + } + + size_t input_start_offset{}; + if constexpr (filter_input_key) { + auto span = (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j]; + input_start_offset = + edge_partition_contiguous_sizes[j] + + (span.index() == 0 ? 
std::get<0>(span).size() : std::get<1>(span).size()); + } else { + static_assert(!use_input_key); + input_start_offset = edge_partition_allreduce_sizes[j]; + } + auto flag_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(output_buffer) + input_start_offset, + cuda::proclaim_return_type( + [init] __device__(auto val) { return val != init; })); + + if constexpr (filter_input_key) { + auto& hypersparse_key_offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; + auto span = (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j]; + if (hypersparse_key_offsets.index() == 0) { + assert(output_offsets.index() == 0); + auto input_pair_first = thrust::make_zip_iterator( + get_dataframe_buffer_begin(output_buffer) + input_start_offset, + std::get<0>(hypersparse_key_offsets).begin() + std::get<0>(span).size()); + copy_if_nosync( + input_pair_first, + input_pair_first + (*edge_partition_deg1_hypersparse_key_offset_counts)[j], + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<0>(output_offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + std::get<0>(hypersparse_key_offsets).resize(0, loop_stream); + std::get<0>(hypersparse_key_offsets).shrink_to_fit(loop_stream); + } else { + assert(output_offsets.index() == 1); + auto input_pair_first = thrust::make_zip_iterator( + get_dataframe_buffer_begin(output_buffer) + input_start_offset, + std::get<1>(hypersparse_key_offsets).begin() + std::get<1>(span).size()); + copy_if_nosync( + input_pair_first, + input_pair_first + (*edge_partition_deg1_hypersparse_key_offset_counts)[j], + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<1>(output_offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + std::get<1>(hypersparse_key_offsets).resize(0, loop_stream); + std::get<1>(hypersparse_key_offsets).shrink_to_fit(loop_stream); + } + } else { + static_assert(!use_input_key); + assert(process_local_edges[j]); + if (output_offsets.index() == 0) { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(output_buffer), + thrust::make_counting_iterator(uint32_t{0})); + copy_if_nosync( + input_pair_first + input_start_offset, + input_pair_first + size_dataframe_buffer(output_buffer), + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<0>(output_offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(output_buffer), + thrust::make_counting_iterator(size_t{0})); + copy_if_nosync( + input_pair_first + input_start_offset, + input_pair_first + size_dataframe_buffer(output_buffer), + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<1>(output_offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } + } + + (*edge_partition_deg1_hypersparse_output_offset_vectors) + .push_back(std::move(output_offsets)); + + resize_dataframe_buffer(output_buffer, 0, loop_stream); + shrink_to_fit_dataframe_buffer(output_buffer, loop_stream); + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + + std::vector deg1_copy_sizes(loop_count); + raft::update_host( + deg1_copy_sizes.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); + + for 
(size_t j = 0; j < loop_count; ++j) { + if (process_local_edges[j]) { + copy_sizes[j] += deg1_copy_sizes[j]; + auto& offsets = (*edge_partition_deg1_hypersparse_output_offset_vectors)[j]; + if (offsets.index() == 0) { + std::get<0>(offsets).resize(deg1_copy_sizes[j], handle.get_stream()); + } else { + assert(offsets.index() == 1); + std::get<1>(offsets).resize(deg1_copy_sizes[j], handle.get_stream()); + } + // skip shrink_to_fit() to cut execution time + } + } + } + + for (size_t j = 0; j < loop_count; ++j) { + if (process_local_edges[j]) { + resize_dataframe_buffer(edge_partition_values[j], copy_sizes[j], handle.get_stream()); + // skip shrink_to_fit() to cut execution time + } + } + + size_t min_element_size{cache_line_size}; + if constexpr (std::is_arithmetic_v) { + min_element_size = std::min(sizeof(T), min_element_size); + } else { + static_assert(is_thrust_tuple_of_arithmetic::value); + min_element_size = + std::min(cugraph::min_thrust_tuple_element_sizes(), min_element_size); + } + assert((cache_line_size % min_element_size) == 0); + size_t value_alignment = cache_line_size / min_element_size; + + size_t offset_alignment = 1; + if (graph_view.use_dcs()) { + static_assert(((cache_line_size % sizeof(uint32_t)) == 0) && + ((cache_line_size % sizeof(size_t)) == 0)); + offset_alignment = + cache_line_size / (uint32_key_output_offset ? sizeof(uint32_t) : sizeof(size_t)); + } + + std::optional> rx_value_sizes{}; + std::optional> rx_value_displs{}; + std::optional> rx_values{}; + + std::optional> rx_offset_sizes{}; + std::optional> rx_offset_displs{}; + std::optional, rmm::device_uvector>> + rx_offsets{}; + { + auto size_per_rank = + loop_count * (graph_view.use_dcs() ? 2 /* value buffer size, offset buffer size */ + : 1 /* value buffer size */); + rmm::device_uvector d_aggregate_buffer_sizes(minor_comm_size * size_per_rank, + handle.get_stream()); + std::vector h_buffer_sizes(size_per_rank); + for (size_t j = 0; j < loop_count; ++j) { + h_buffer_sizes[j] = size_dataframe_buffer(edge_partition_values[j]); + if (graph_view.use_dcs()) { + auto const& offsets = (*edge_partition_deg1_hypersparse_output_offset_vectors)[j]; + if (offsets.index() == 0) { + h_buffer_sizes[loop_count + j] = std::get<0>(offsets).size(); + } else { + assert(offsets.index() == 1); + h_buffer_sizes[loop_count + j] = std::get<1>(offsets).size(); + } + } + } + raft::update_device(d_aggregate_buffer_sizes.data() + minor_comm_rank * size_per_rank, + h_buffer_sizes.data(), + h_buffer_sizes.size(), + handle.get_stream()); + device_allgather(minor_comm, + d_aggregate_buffer_sizes.data() + minor_comm_rank * size_per_rank, + d_aggregate_buffer_sizes.data(), + size_per_rank, + handle.get_stream()); + if (static_cast(minor_comm_rank / num_concurrent_loops) == + (i / num_concurrent_loops)) { + std::vector h_aggregate_buffer_sizes(d_aggregate_buffer_sizes.size()); + raft::update_host(h_aggregate_buffer_sizes.data(), + d_aggregate_buffer_sizes.data(), + d_aggregate_buffer_sizes.size(), + handle.get_stream()); + handle.sync_stream(); + auto j = static_cast(minor_comm_rank % num_concurrent_loops); + rx_value_sizes = std::vector(minor_comm_size); + rx_value_displs = std::vector(minor_comm_size); + if (graph_view.use_dcs()) { + rx_offset_sizes = std::vector(minor_comm_size); + rx_offset_displs = std::vector(minor_comm_size); + } + for (int k = 0; k < minor_comm_size; ++k) { + (*rx_value_sizes)[k] = h_aggregate_buffer_sizes[k * size_per_rank + j]; + if (graph_view.use_dcs()) { + (*rx_offset_sizes)[k] = + h_aggregate_buffer_sizes[k * 
size_per_rank + loop_count + j]; + } + } + + std::vector aligned_sizes(minor_comm_size); + for (int k = 0; k < minor_comm_size; ++k) { + if (k == (minor_comm_size - 1)) { + aligned_sizes[k] = (*rx_value_sizes)[k]; + } else { + aligned_sizes[k] = raft::round_up_safe((*rx_value_sizes)[k], value_alignment); + } + } + std::exclusive_scan( + aligned_sizes.begin(), aligned_sizes.end(), (*rx_value_displs).begin(), size_t{0}); + + if (graph_view.use_dcs()) { + for (int k = 0; k < minor_comm_size; ++k) { + if (k == (minor_comm_size - 1)) { + aligned_sizes[k] = (*rx_offset_sizes)[k]; + } else { + aligned_sizes[k] = raft::round_up_safe((*rx_offset_sizes)[k], offset_alignment); + } + } + std::exclusive_scan( + aligned_sizes.begin(), aligned_sizes.end(), (*rx_offset_displs).begin(), size_t{0}); + } + + rx_values = allocate_dataframe_buffer( + (*rx_value_displs).back() + (*rx_value_sizes).back(), handle.get_stream()); + if (graph_view.use_dcs()) { + if (uint32_key_output_offset) { + rx_offsets = rmm::device_uvector( + (*rx_offset_displs).back() + (*rx_offset_sizes).back(), handle.get_stream()); + } else { + rx_offsets = rmm::device_uvector( + (*rx_offset_displs).back() + (*rx_offset_sizes).back(), handle.get_stream()); + } + } + } + } + + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto& values = edge_partition_values[j]; + + if (minor_comm_rank == static_cast(partition_idx)) { + device_gatherv(minor_comm, + get_dataframe_buffer_begin(values), + get_dataframe_buffer_begin(*rx_values), + values.size(), + *rx_value_sizes, + *rx_value_displs, + static_cast(partition_idx), + handle.get_stream()); + } else { + device_gatherv(minor_comm, + get_dataframe_buffer_begin(values), + dataframe_buffer_iterator_type_t{}, + values.size(), + std::vector{}, + std::vector{}, + static_cast(partition_idx), + handle.get_stream()); + } + } + device_group_end(minor_comm); + if (graph_view.use_dcs()) { + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto& values = edge_partition_values[j]; + + auto const& offsets = (*edge_partition_deg1_hypersparse_output_offset_vectors)[j]; + if (offsets.index() == 0) { + if (minor_comm_rank == static_cast(partition_idx)) { + device_gatherv(minor_comm, + std::get<0>(offsets).data(), + std::get<0>(*rx_offsets).data(), + std::get<0>(offsets).size(), + *rx_offset_sizes, + *rx_offset_displs, + static_cast(partition_idx), + handle.get_stream()); + } else { + device_gatherv(minor_comm, + std::get<0>(offsets).data(), + static_cast(nullptr), + std::get<0>(offsets).size(), + std::vector{}, + std::vector{}, + static_cast(partition_idx), + handle.get_stream()); + } + } else { + assert(offsets.index() == 1); + if (minor_comm_rank == static_cast(partition_idx)) { + device_gatherv(minor_comm, + std::get<1>(offsets).data(), + std::get<1>(*rx_offsets).data(), + std::get<1>(offsets).size(), + *rx_offset_sizes, + *rx_offset_displs, + static_cast(partition_idx), + handle.get_stream()); + } else { + device_gatherv(minor_comm, + std::get<1>(offsets).data(), + static_cast(nullptr), + std::get<1>(offsets).size(), + std::vector{}, + std::vector{}, + static_cast(partition_idx), + handle.get_stream()); + } + } + } + device_group_end(minor_comm); + } + handle.sync_stream(); // this is required before edge_partition_values.clear(); + edge_partition_values.clear(); + if (loop_stream_pool_indices) { + handle.sync_stream_pool(*loop_stream_pool_indices); + } // to ensure that memory is freed + + if 
(rx_values && (size_dataframe_buffer(*rx_values) > 0)) { + auto j = static_cast(minor_comm_rank % num_concurrent_loops); + auto partition_idx = i + j; + + { // remove gaps introduced to enforce alignment + rmm::device_uvector bitmap( + packed_bool_size(size_dataframe_buffer(*rx_values)), handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); + rmm::device_uvector d_displs((*rx_value_displs).size(), handle.get_stream()); + rmm::device_uvector d_sizes((*rx_value_sizes).size(), handle.get_stream()); + raft::update_device(d_displs.data(), + (*rx_value_displs).data(), + (*rx_value_displs).size(), + handle.get_stream()); + raft::update_device(d_sizes.data(), + (*rx_value_sizes).data(), + (*rx_value_sizes).size(), + handle.get_stream()); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(static_cast(minor_comm_size - 1) * + value_alignment), + [bitmap = raft::device_span(bitmap.data(), bitmap.size()), + displs = raft::device_span(d_displs.data(), d_displs.size()), + sizes = raft::device_span(d_sizes.data(), d_sizes.size()), + alignment = value_alignment] __device__(size_t i) { + auto rank = static_cast(i / alignment); + auto first = displs[rank] + sizes[rank]; + auto last = displs[rank + 1]; + if ((i % alignment) < (last - first)) { + auto offset = first + (i % alignment); + cuda::atomic_ref word( + bitmap[packed_bool_offset(offset)]); + word.fetch_or(packed_bool_mask(offset), cuda::std::memory_order_relaxed); + } + }); + resize_dataframe_buffer( + *rx_values, + thrust::distance( + get_dataframe_buffer_begin(*rx_values), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(*rx_values), + get_dataframe_buffer_end(*rx_values), + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [bitmap = raft::device_span( + bitmap.data(), bitmap.size())] __device__(size_t i) { + return (bitmap[packed_bool_offset(i)] & + packed_bool_mask(i)) == packed_bool_mask(i); + })), + thrust::identity{})), + handle.get_stream()); + // skip shrink_to_fit() to cut execution time + std::exclusive_scan((*rx_value_sizes).begin(), + (*rx_value_sizes).end(), + (*rx_value_displs).begin(), + size_t{0}); // now gaps are removed + + if (rx_offsets) { + size_t num_offsets = ((*rx_offsets).index() == 0) + ? 
size_dataframe_buffer(std::get<0>(*rx_offsets)) + : size_dataframe_buffer(std::get<1>(*rx_offsets)); + bitmap.resize(packed_bool_size(num_offsets), handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); + d_displs.resize((*rx_offset_displs).size(), handle.get_stream()); + d_sizes.resize((*rx_offset_sizes).size(), handle.get_stream()); + raft::update_device(d_displs.data(), + (*rx_offset_displs).data(), + (*rx_offset_displs).size(), + handle.get_stream()); + raft::update_device(d_sizes.data(), + (*rx_offset_sizes).data(), + (*rx_offset_sizes).size(), + handle.get_stream()); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(static_cast(minor_comm_size - 1) * + offset_alignment), + [bitmap = raft::device_span(bitmap.data(), bitmap.size()), + displs = raft::device_span(d_displs.data(), d_displs.size()), + sizes = raft::device_span(d_sizes.data(), d_sizes.size()), + alignment = offset_alignment] __device__(size_t i) { + auto rank = static_cast(i / alignment); + auto first = displs[rank] + sizes[rank]; + auto last = displs[rank + 1]; + if ((i % alignment) < (last - first)) { + auto offset = first + (i % alignment); + cuda::atomic_ref word( + bitmap[packed_bool_offset(offset)]); + word.fetch_or(packed_bool_mask(offset), cuda::std::memory_order_relaxed); + } + }); + if ((*rx_offsets).index() == 0) { + resize_dataframe_buffer( + std::get<0>(*rx_offsets), + thrust::distance( + get_dataframe_buffer_begin(std::get<0>(*rx_offsets)), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(std::get<0>(*rx_offsets)), + get_dataframe_buffer_end(std::get<0>(*rx_offsets)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [bitmap = raft::device_span( + bitmap.data(), bitmap.size())] __device__(size_t i) { + return (bitmap[packed_bool_offset(i)] & + packed_bool_mask(i)) == packed_bool_mask(i); + })), + thrust::identity{})), + handle.get_stream()); + // skip shrink_to_fit() to cut execution time + } else { + resize_dataframe_buffer( + std::get<1>(*rx_offsets), + thrust::distance( + get_dataframe_buffer_begin(std::get<1>(*rx_offsets)), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(std::get<1>(*rx_offsets)), + get_dataframe_buffer_end(std::get<1>(*rx_offsets)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [bitmap = raft::device_span( + bitmap.data(), bitmap.size())] __device__(size_t i) { + return (bitmap[packed_bool_offset(i)] & + packed_bool_mask(i)) == packed_bool_mask(i); + })), + thrust::identity{})), + handle.get_stream()); + // skip shrink_to_fit() to cut execution time + } + std::exclusive_scan((*rx_offset_sizes).begin(), + (*rx_offset_sizes).end(), + (*rx_offset_displs).begin(), + size_t{0}); // now gaps are removed + } + } + + size_t output_range_size{}; + if constexpr (filter_input_key) { + output_range_size = local_key_list_sizes[partition_idx]; + } else { + auto const& segment_offsets = graph_view.local_vertex_partition_segment_offsets(); + output_range_size = + segment_offsets + ? 
*((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ + : graph_view.local_vertex_partition_range_size(); + } + auto& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); + if (selected_ranks.index() == 0) { + auto old_size = std::get<0>(selected_ranks).size(); + std::get<0>(selected_ranks).resize(output_range_size, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + std::get<0>(selected_ranks).begin() + old_size, + std::get<0>(selected_ranks).end(), + static_cast(minor_comm_size)); + } else { + assert(selected_ranks.index() == 1); + auto old_size = std::get<1>(selected_ranks).size(); + std::get<1>(selected_ranks).resize(output_range_size, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + std::get<1>(selected_ranks).begin() + old_size, + std::get<1>(selected_ranks).end(), + minor_comm_size); + } + if (rx_offsets) { + rmm::device_uvector lasts((*rx_offset_displs).size(), handle.get_stream()); + raft::update_device(lasts.data(), + (*rx_offset_displs).data() + 1, + (*rx_offset_displs).size() - 1, + handle.get_stream()); + auto num_elements = (*rx_offset_displs).back() + (*rx_offset_sizes).back(); + lasts.set_element_async(lasts.size() - 1, num_elements, handle.get_stream()); + handle.sync_stream(); // this is necessary before num_elements becomes out-of-scope + + if ((*rx_offsets).index() == 0) { + auto& offsets = std::get<0>(*rx_offsets); + if (selected_ranks.index() == 0) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + std::get<0>(selected_ranks).data(), + std::get<0>(selected_ranks).size())] __device__(auto i) { + auto minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + } else { + assert(selected_ranks.index() == 1); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + std::get<1>(selected_ranks).data(), + std::get<1>(selected_ranks).size())] __device__(auto i) { + auto minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + } + offsets.resize(0, handle.get_stream()); + offsets.shrink_to_fit(handle.get_stream()); + } else { + assert((*rx_offsets).index() == 1); + auto& offsets = std::get<1>(*rx_offsets); + if (selected_ranks.index() == 0) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + std::get<0>(selected_ranks).data(), + std::get<0>(selected_ranks).size())] __device__(auto i) { + auto minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + } else { + 
assert(selected_ranks.index() == 1); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + std::get<1>(selected_ranks).data(), + std::get<1>(selected_ranks).size())] __device__(auto i) { + auto minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + } + offsets.resize(0, handle.get_stream()); + offsets.shrink_to_fit(handle.get_stream()); + } + } + + size_t num_positions = (selected_ranks.index() == 0) ? std::get<0>(selected_ranks).size() + : std::get<1>(selected_ranks).size(); + if (num_positions <= static_cast(std::numeric_limits::max())) { + rmm::device_uvector rx_positions(num_positions, handle.get_stream()); + thrust::sequence( + handle.get_thrust_policy(), rx_positions.begin(), rx_positions.end(), uint32_t{0}); + if (selected_ranks.index() == 0) { + thrust::stable_sort_by_key(handle.get_thrust_policy(), + std::get<0>(selected_ranks).begin(), + std::get<0>(selected_ranks).end(), + rx_positions.begin()); + } else { + assert(selected_ranks.index() == 1); + thrust::stable_sort_by_key(handle.get_thrust_policy(), + std::get<1>(selected_ranks).begin(), + std::get<1>(selected_ranks).end(), + rx_positions.begin()); + } + // selected_ranks[] == minor_comm_size if no GPU in minor_comm has a non-init value + rx_positions.resize((*rx_value_displs).back() + (*rx_value_sizes).back(), + handle.get_stream()); + thrust::scatter(handle.get_thrust_policy(), + get_dataframe_buffer_begin(*rx_values), + get_dataframe_buffer_end(*rx_values), + rx_positions.begin(), + tmp_vertex_value_output_first); + } else { + rmm::device_uvector rx_positions(num_positions, handle.get_stream()); + thrust::sequence( + handle.get_thrust_policy(), rx_positions.begin(), rx_positions.end(), size_t{0}); + if (selected_ranks.index() == 0) { + thrust::stable_sort_by_key(handle.get_thrust_policy(), + std::get<0>(selected_ranks).begin(), + std::get<0>(selected_ranks).end(), + rx_positions.begin()); + } else { + assert(selected_ranks.index() == 1); + thrust::stable_sort_by_key(handle.get_thrust_policy(), + std::get<1>(selected_ranks).begin(), + std::get<1>(selected_ranks).end(), + rx_positions.begin()); + } + // selected_ranks[] == minor_comm_size if no GPU in minor_comm has a non-init value + rx_positions.resize((*rx_value_displs).back() + (*rx_value_sizes).back(), + handle.get_stream()); + thrust::scatter(handle.get_thrust_policy(), + get_dataframe_buffer_begin(*rx_values), + get_dataframe_buffer_end(*rx_values), + rx_positions.begin(), + tmp_vertex_value_output_first); + } + } + handle.sync_stream(); + } else { + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + device_reduce(minor_comm, + get_dataframe_buffer_begin(edge_partition_major_output_buffers[j]), + tmp_vertex_value_output_first, + size_dataframe_buffer(edge_partition_major_output_buffers[j]), + ReduceOp::compatible_raft_comms_op, + static_cast(partition_idx), + handle.get_stream()); + } + device_group_end(minor_comm); + if (loop_stream_pool_indices) { handle.sync_stream(); } + } + } + } + + // 10. 
communication + + if constexpr (GraphViewType::is_multi_gpu && !update_major) { + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + auto view = minor_tmp_buffer->view(); + if (view.keys()) { // applying the initial value is deferred to here + vertex_t max_vertex_partition_size{0}; + for (int i = 0; i < major_comm_size; ++i) { + auto this_segment_vertex_partition_id = + compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + max_vertex_partition_size = + std::max(max_vertex_partition_size, + graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)); + } + auto tx_buffer = allocate_dataframe_buffer(max_vertex_partition_size, handle.get_stream()); + auto tx_buffer_first = get_dataframe_buffer_begin(tx_buffer); + std::optional> minor_key_offsets{}; + if constexpr (GraphViewType::is_storage_transposed) { + minor_key_offsets = graph_view.local_sorted_unique_edge_src_vertex_partition_offsets(); + } else { + minor_key_offsets = graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets(); + } + for (int i = 0; i < major_comm_size; ++i) { + auto minor_init = (major_comm_rank == i) ? init : ReduceOp::identity_element; + auto this_segment_vertex_partition_id = + compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + thrust::fill(handle.get_thrust_policy(), + tx_buffer_first, + tx_buffer_first + + graph_view.vertex_partition_range_size(this_segment_vertex_partition_id), + minor_init); + auto value_first = thrust::make_transform_iterator( + view.value_first(), + cuda::proclaim_return_type( + [reduce_op, minor_init] __device__(auto val) { return reduce_op(val, minor_init); })); + thrust::scatter(handle.get_thrust_policy(), + value_first + (*minor_key_offsets)[i], + value_first + (*minor_key_offsets)[i + 1], + thrust::make_transform_iterator( + (*(view.keys())).begin() + (*minor_key_offsets)[i], + cuda::proclaim_return_type( + [key_first = graph_view.vertex_partition_range_first( + this_segment_vertex_partition_id)] __device__(auto key) { + return key - key_first; + })), + tx_buffer_first); + device_reduce(major_comm, + tx_buffer_first, + tmp_vertex_value_output_first, + static_cast( + graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)), + ReduceOp::compatible_raft_comms_op, + i, + handle.get_stream()); + } + } else { + auto first_segment_vertex_partition_id = + compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(0); + vertex_t minor_range_first = + graph_view.vertex_partition_range_first(first_segment_vertex_partition_id); + for (int i = 0; i < major_comm_size; ++i) { + auto this_segment_vertex_partition_id = + compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + auto offset = graph_view.vertex_partition_range_first(this_segment_vertex_partition_id) - + minor_range_first; + device_reduce(major_comm, + 
view.value_first() + offset, + tmp_vertex_value_output_first, + static_cast( + graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)), + ReduceOp::compatible_raft_comms_op, + i, + handle.get_stream()); + } + } + } +} + +} // namespace detail + +} // namespace cugraph diff --git a/cpp/src/prims/detail/prim_functors.cuh b/cpp/src/prims/detail/prim_functors.cuh index f426cd993e..a166f37906 100644 --- a/cpp/src/prims/detail/prim_functors.cuh +++ b/cpp/src/prims/detail/prim_functors.cuh @@ -21,6 +21,23 @@ namespace cugraph { namespace detail { +template +struct const_true_e_op_t { + __device__ auto operator()(std::conditional_t key_or_src, + std::conditional_t key_or_dst, + src_value_t, + dst_value_t, + e_value_t) const + { + return true; + } +}; + template +struct call_const_true_e_op_t { + __device__ auto operator()(edge_t i) const { return true; } +}; + template +#include + +namespace cugraph { + +namespace detail { + +template +__host__ __device__ priority_t +rank_to_priority(int rank, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + int comm_size, + vertex_t offset /* to evenly distribute traffic */) +{ + static_assert(sizeof(priority_t) == 1 || sizeof(priority_t) == 2 || sizeof(priority_t) == 4); + using cast_t = std::conditional_t< + sizeof(priority_t) == 1, + int16_t, + std::conditional_t>; // to prevent overflow + + if (rank == root) { + return priority_t{0}; + } else if (rank / subgroup_size == + root / subgroup_size) { // intra-subgroup communication is sufficient (priorities in + // [1, subgroup_size) + auto rank_dist = + static_cast(((static_cast(rank) + subgroup_size) - root) % subgroup_size); + int modulo = subgroup_size - 1; + return static_cast(1 + (static_cast(rank_dist - 1) + (offset % modulo)) % + modulo); + } else { // inter-subgroup communication is necessary (priorities in [subgroup_size, comm_size) + auto subgroup_dist = + static_cast(((static_cast(rank / subgroup_size) + (comm_size / subgroup_size)) - + (root / subgroup_size)) % + (comm_size / subgroup_size)); + auto intra_subgroup_rank_dist = static_cast( + ((static_cast(rank % subgroup_size) + subgroup_size) - (root % subgroup_size)) % + subgroup_size); + auto rank_dist = subgroup_dist * subgroup_size + intra_subgroup_rank_dist; + int modulo = comm_size - subgroup_size; + return static_cast( + subgroup_size + + (static_cast(rank_dist - subgroup_size) + (offset % modulo)) % modulo); + } +} + +template +__host__ __device__ int priority_to_rank( + priority_t priority, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + int comm_size, + vertex_t offset /* to evenly distribute traffict */) +{ + static_assert(sizeof(priority_t) == 1 || sizeof(priority_t) == 2 || sizeof(priority_t) == 4); + using cast_t = std::conditional_t< + sizeof(priority_t) == 1, + int16_t, + std::conditional_t>; // to prevent overflow + + if (priority == priority_t{0}) { + return root; + } else if (priority < static_cast(subgroup_size)) { + int modulo = subgroup_size - 1; + auto rank_dist = static_cast( + 1 + ((static_cast(priority - 1) + modulo) - (offset % modulo)) % modulo); + return static_cast((root - (root % subgroup_size)) + + ((static_cast(root) + rank_dist) % subgroup_size)); + } else { + int modulo = comm_size - subgroup_size; + auto rank_dist = static_cast( + subgroup_size + + ((static_cast(priority) - subgroup_size) + (modulo - (offset % modulo))) % modulo); + auto subgroup_dist = rank_dist / subgroup_size; + auto intra_subgroup_rank_dist = rank_dist % 
subgroup_size; + return static_cast( + ((static_cast((root / subgroup_size) * subgroup_size) + + subgroup_dist * subgroup_size) + + (static_cast(root) + intra_subgroup_rank_dist) % subgroup_size) % + comm_size); + } +} + +} // namespace detail + +} // namespace cugraph diff --git a/cpp/src/prims/extract_transform_e.cuh b/cpp/src/prims/extract_transform_e.cuh index d51e03628e..5741c98d90 100644 --- a/cpp/src/prims/extract_transform_e.cuh +++ b/cpp/src/prims/extract_transform_e.cuh @@ -116,8 +116,8 @@ extract_transform_e(raft::handle_t const& handle, thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last())); auto value_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(std::ignore, value_buffer) = detail:: - extract_transform_v_frontier_e( + std::tie(std::ignore, value_buffer) = + detail::extract_transform_v_frontier_e( handle, graph_view, frontier, diff --git a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh index 7ad033b93c..ba227b263b 100644 --- a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh +++ b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh @@ -64,13 +64,13 @@ namespace cugraph { * @return Dataframe buffer object storing extracted and accumulated valid @p e_op return values. */ template decltype(allocate_dataframe_buffer< - typename detail::edge_op_result_type(size_t{0}, handle.get_stream()); std::tie(std::ignore, value_buffer) = - detail::extract_transform_v_frontier_e(handle, - graph_view, - frontier, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - do_expensive_check); + detail::extract_transform_v_frontier_e(handle, + graph_view, + frontier, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + do_expensive_check); return value_buffer; } diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 58dbf7e74a..a36cf332eb 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -15,6 +15,8 @@ */ #pragma once +#include "prims/vertex_frontier.cuh" + #include #include #include @@ -129,8 +131,8 @@ template void fill_edge_major_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, EdgeMajorPropertyOutputWrapper edge_major_property_output, T input) { @@ -153,12 +155,12 @@ void fill_edge_major_property(raft::handle_t const& handle, auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); - auto rx_counts = - host_scalar_allgather(minor_comm, - static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); - auto max_rx_size = - std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { + auto local_v_list_sizes = host_scalar_allgather( + minor_comm, + static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)), + handle.get_stream()); + auto max_rx_size = std::reduce( + local_v_list_sizes.begin(), local_v_list_sizes.end(), size_t{0}, [](auto lhs, auto rhs) { return std::max(lhs, rhs); }); rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); @@ -169,14 +171,18 @@ void fill_edge_major_property(raft::handle_t const& handle, edge_partition_device_view_t( graph_view.local_edge_partition_view(i)); - 
device_bcast( - minor_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(minor_comm, + sorted_unique_vertex_first, + rx_vertices.begin(), + local_v_list_sizes[i], + i, + handle.get_stream()); if (edge_partition_keys) { thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_counts[i]), + thrust::make_counting_iterator(local_v_list_sizes[i]), [rx_vertex_first = rx_vertices.begin(), input, edge_partition_key_first = ((*edge_partition_keys)[i]).begin(), @@ -199,7 +205,7 @@ void fill_edge_major_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(rx_counts[i])), + thrust::make_counting_iterator(static_cast(local_v_list_sizes[i])), [edge_partition, rx_vertex_first = rx_vertices.begin(), input, @@ -219,7 +225,7 @@ void fill_edge_major_property(raft::handle_t const& handle, // directly scatters from the internal buffer) thrust::scatter(handle.get_thrust_policy(), val_first, - val_first + rx_counts[i], + val_first + local_v_list_sizes[i], map_first, edge_partition_value_firsts[i]); } @@ -232,17 +238,18 @@ void fill_edge_major_property(raft::handle_t const& handle, assert(edge_partition_value_firsts.size() == size_t{1}); if constexpr (contains_packed_bool_element) { thrust::for_each(handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [input, output_value_first = edge_partition_value_firsts[0]] __device__( auto v) { packed_bool_atomic_set(output_value_first, v, input); }); } else { auto val_first = thrust::make_constant_iterator(input); - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + thrust::distance(vertex_first, vertex_last), - vertex_first, - edge_partition_value_firsts[0]); + thrust::scatter( + handle.get_thrust_policy(), + val_first, + val_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + sorted_unique_vertex_first, + edge_partition_value_firsts[0]); } } } @@ -286,8 +293,8 @@ template void fill_edge_minor_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, EdgeMinorPropertyOutputWrapper edge_minor_property_output, T input) { @@ -300,22 +307,269 @@ void fill_edge_minor_property(raft::handle_t const& handle, using edge_t = typename GraphViewType::edge_type; auto edge_partition_value_first = edge_minor_property_output.value_first(); + vertex_t minor_range_first{}; + if constexpr (GraphViewType::is_storage_transposed) { + minor_range_first = graph_view.local_edge_partition_src_range_first(); + } else { + minor_range_first = graph_view.local_edge_partition_dst_range_first(); + } + if constexpr (GraphViewType::is_multi_gpu) { auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); auto const major_comm_rank = major_comm.get_rank(); auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); - auto rx_counts = - host_scalar_allgather(major_comm, - 
static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); - auto max_rx_size = - std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { - return std::max(lhs, rhs); - }); - rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); + constexpr size_t packed_bool_word_bcast_alignment = + 128 / + sizeof( + uint32_t); // 128B cache line alignment (unaligned ncclBroadcast operations are slower) + + std::vector max_tmp_buffer_sizes{}; + std::vector local_v_list_sizes{}; + std::vector local_v_list_range_firsts{}; + std::vector local_v_list_range_lasts{}; + { + auto v_list_size = static_cast( + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)); + rmm::device_uvector d_aggregate_tmps(major_comm_size * size_t{4}, + handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + d_aggregate_tmps.begin() + major_comm_rank * size_t{4}, + d_aggregate_tmps.begin() + (major_comm_rank + 1) * size_t{4}, + [max_tmp_buffer_size = static_cast( + static_cast(handle.get_device_properties().totalGlobalMem) * 0.05), + sorted_unique_vertex_first, + v_list_size, + vertex_partition_range_first = + graph_view.local_vertex_partition_range_first()] __device__(size_t i) { + if (i == 0) { + return max_tmp_buffer_size; + } else if (i == 1) { + return static_cast(v_list_size); + } else if (i == 2) { + vertex_t first{}; + if (v_list_size > 0) { + first = *sorted_unique_vertex_first; + } else { + first = vertex_partition_range_first; + } + assert(static_cast(static_cast(first)) == first); + return static_cast(first); + } else { + vertex_t last{}; + if (v_list_size > 0) { + last = *(sorted_unique_vertex_first + (v_list_size - 1)) + 1; + } else { + last = vertex_partition_range_first; + } + assert(static_cast(static_cast(last)) == last); + return static_cast(last); + } + }); + + if (major_comm_size > 1) { // allgather max_tmp_buffer_size, v_list_size, v_list_range_first + // (inclusive), v_list_range_last (exclusive) + device_allgather(major_comm, + d_aggregate_tmps.data() + major_comm_rank * size_t{4}, + d_aggregate_tmps.data(), + size_t{4}, + handle.get_stream()); + } + + std::vector h_aggregate_tmps(d_aggregate_tmps.size()); + raft::update_host(h_aggregate_tmps.data(), + d_aggregate_tmps.data(), + d_aggregate_tmps.size(), + handle.get_stream()); + handle.sync_stream(); + max_tmp_buffer_sizes = std::vector(major_comm_size); + local_v_list_sizes = std::vector(major_comm_size); + local_v_list_range_firsts = std::vector(major_comm_size); + local_v_list_range_lasts = std::vector(major_comm_size); + for (int i = 0; i < major_comm_size; ++i) { + max_tmp_buffer_sizes[i] = h_aggregate_tmps[i * size_t{4}]; + local_v_list_sizes[i] = static_cast(h_aggregate_tmps[i * size_t{4} + 1]); + local_v_list_range_firsts[i] = static_cast(h_aggregate_tmps[i * size_t{4} + 2]); + local_v_list_range_lasts[i] = static_cast(h_aggregate_tmps[i * size_t{4} + 3]); + } + } + + auto edge_partition_keys = edge_minor_property_output.keys(); + + std::optional> v_list_bitmap{std::nullopt}; + std::optional> compressed_v_list{std::nullopt}; + if (major_comm_size > 1) { + bool v_compressible{false}; + if constexpr (sizeof(vertex_t) > sizeof(uint32_t)) { + vertex_t local_v_list_max_range_size{0}; + for (int i = 0; i < major_comm_size; ++i) { + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + local_v_list_max_range_size = std::max(range_size, local_v_list_max_range_size); + } + if (local_v_list_max_range_size <= + std::numeric_limits::max()) { 
// broadcast 32bit offset values instead of 64 + // bit vertex IDs + v_compressible = true; + } + } + + double avg_fill_ratio{0.0}; + for (int i = 0; i < major_comm_size; ++i) { + auto num_keys = static_cast(local_v_list_sizes[i]); + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + avg_fill_ratio += (range_size > 0) + ? (static_cast(num_keys) / static_cast(range_size)) + : double{0.0}; + } + avg_fill_ratio /= static_cast(major_comm_size); + double threshold_ratio = + 1.0 / static_cast((v_compressible ? sizeof(uint32_t) : sizeof(vertex_t)) * 8); + auto avg_v_list_size = std::reduce(local_v_list_sizes.begin(), local_v_list_sizes.end()) / + static_cast(major_comm_size); + + if ((avg_fill_ratio > threshold_ratio) && + (static_cast(avg_v_list_size) > packed_bool_word_bcast_alignment)) { + if (is_packed_bool() && + !edge_partition_keys) { // directly update edge_minor_property_output (with special + // care for unaligned boundaries) + rmm::device_uvector boundary_words( + packed_bool_word_bcast_alignment, + handle.get_stream()); // for unaligned boundaries + auto leading_boundary_words = + (packed_bool_word_bcast_alignment - + packed_bool_offset(local_v_list_range_firsts[major_comm_rank] - minor_range_first) % + packed_bool_word_bcast_alignment) % + packed_bool_word_bcast_alignment; + if ((leading_boundary_words == 0) && + (packed_bool_offset(local_v_list_range_firsts[major_comm_rank] - minor_range_first) == + packed_bool_offset(graph_view.local_vertex_partition_range_first() - + minor_range_first)) && + (((local_v_list_range_firsts[major_comm_rank] - minor_range_first) % + packed_bools_per_word()) != + 0)) { // there are unaligned bits (fewer than packed_bools_per_word()) in the vertex + // partition boundary + leading_boundary_words = packed_bool_word_bcast_alignment; + } + thrust::fill(handle.get_thrust_policy(), + boundary_words.begin(), + boundary_words.begin() + leading_boundary_words, + packed_bool_empty_mask()); + if (local_v_list_range_firsts[major_comm_rank] < + local_v_list_range_lasts[major_comm_rank]) { + auto word_offset_first = + packed_bool_offset(local_v_list_range_firsts[major_comm_rank] - minor_range_first); + auto word_offset_last = + packed_bool_offset((local_v_list_range_lasts[major_comm_rank] - 1) - + minor_range_first) + + 1; + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(word_offset_first), + thrust::make_counting_iterator(word_offset_last), + [sorted_unique_vertex_first, + sorted_unique_vertex_last, + input, + minor_range_first, + leading_boundary_words, + word_offset_first, + vertex_partition_range_last = graph_view.local_vertex_partition_range_last(), + output_value_first = edge_partition_value_first, + boundary_words = raft::device_span( + boundary_words.data(), boundary_words.size())] __device__(auto i) { + auto& word = ((i - word_offset_first) < leading_boundary_words) + ? boundary_words[i - word_offset_first] + : *(output_value_first + i); + auto word_v_first = + minor_range_first + static_cast(i * packed_bools_per_word()); + auto word_v_last = + ((vertex_partition_range_last - word_v_first) <= packed_bools_per_word()) + ? 
vertex_partition_range_last + : (word_v_first + static_cast(packed_bools_per_word())); + auto it = thrust::lower_bound( + thrust::seq, sorted_unique_vertex_first, sorted_unique_vertex_last, word_v_first); + while ((it != sorted_unique_vertex_last) && (*it < word_v_last)) { + auto v_offset = *it - minor_range_first; + if (input) { + word |= packed_bool_mask(v_offset); + } else { + word &= ~packed_bool_mask(v_offset); + } + ++it; + } + }); + } + rmm::device_uvector aggregate_boundary_words( + major_comm_size * packed_bool_word_bcast_alignment, handle.get_stream()); + device_allgather(major_comm, + boundary_words.data(), + aggregate_boundary_words.data(), + packed_bool_word_bcast_alignment, + handle.get_stream()); + v_list_bitmap = std::move(aggregate_boundary_words); + } else { + v_list_bitmap = + compute_vertex_list_bitmap_info(sorted_unique_vertex_first, + sorted_unique_vertex_last, + local_v_list_range_firsts[major_comm_rank], + local_v_list_range_lasts[major_comm_rank], + handle.get_stream()); + } + } else if (v_compressible) { + rmm::device_uvector tmps(local_v_list_sizes[major_comm_rank], + handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + sorted_unique_vertex_first, + sorted_unique_vertex_last, + tmps.begin(), + cuda::proclaim_return_type( + [range_first = local_v_list_range_firsts[major_comm_rank]] __device__( + auto v) { return static_cast(v - range_first); })); + compressed_v_list = std::move(tmps); + } + } + + std::optional> stream_pool_indices{std::nullopt}; + size_t num_concurrent_bcasts{1}; + { + size_t tmp_buffer_size_per_loop{}; + for (int i = 0; i < major_comm_size; ++i) { + if (is_packed_bool() && + !edge_partition_keys && v_list_bitmap) { + tmp_buffer_size_per_loop += 0; + } else if (v_list_bitmap) { + tmp_buffer_size_per_loop += + packed_bool_size(local_v_list_range_lasts[i] - local_v_list_range_firsts[i]) * + sizeof(uint32_t) + + static_cast(local_v_list_sizes[i]) * sizeof(vertex_t); + } else { + tmp_buffer_size_per_loop += static_cast(local_v_list_sizes[i]) * sizeof(vertex_t); + } + } + tmp_buffer_size_per_loop /= major_comm_size; + size_t max_streams = + static_cast(major_comm_size); // to allow setting num_concurrent_bcasts above + // hnadle.get_stream_pool_size() + stream_pool_indices = init_stream_pool_indices( + std::reduce(max_tmp_buffer_sizes.begin(), max_tmp_buffer_sizes.end()) / + static_cast(major_comm_size), + tmp_buffer_size_per_loop, + major_comm_size, + 1, + max_streams); + num_concurrent_bcasts = (*stream_pool_indices).size(); + if ((*stream_pool_indices).size() > handle.get_stream_pool_size()) { + (*stream_pool_indices).resize(handle.get_stream_pool_size()); + } + if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; } + } std::optional> key_offsets{}; if constexpr (GraphViewType::is_storage_transposed) { @@ -324,88 +578,417 @@ void fill_edge_minor_property(raft::handle_t const& handle, key_offsets = graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets(); } - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(size_t{0})); - auto edge_partition_keys = edge_minor_property_output.keys(); - for (int i = 0; i < major_comm_size; ++i) { - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - device_bcast( - major_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + for (size_t i = 0; i < static_cast(major_comm_size); i += num_concurrent_bcasts) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); 
+ auto sub0 = std::chrono::steady_clock::now(); + auto loop_count = std::min(num_concurrent_bcasts, static_cast(major_comm_size) - i); + + if (is_packed_bool() && + !edge_partition_keys && v_list_bitmap) { + std::vector leading_boundary_word_counts(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto leading_boundary_words = + (packed_bool_word_bcast_alignment - + packed_bool_offset(local_v_list_range_firsts[partition_idx] - minor_range_first) % + packed_bool_word_bcast_alignment) % + packed_bool_word_bcast_alignment; + auto vertex_partition_id = + partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks( + major_comm_size, minor_comm_size, partition_idx, minor_comm_rank); + if ((leading_boundary_words == 0) && + (packed_bool_offset(local_v_list_range_firsts[partition_idx] - minor_range_first) == + packed_bool_offset(graph_view.vertex_partition_range_first(vertex_partition_id) - + minor_range_first)) && + (((local_v_list_range_firsts[partition_idx] - minor_range_first) % + packed_bools_per_word()) != 0)) { + leading_boundary_words = packed_bool_word_bcast_alignment; + } + leading_boundary_word_counts[j] = leading_boundary_words; + } + device_group_start(major_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + size_t bcast_size{0}; + vertex_t packed_bool_offset_first{0}; + if (local_v_list_range_firsts[partition_idx] < local_v_list_range_lasts[partition_idx]) { + auto leading_boundary_words = leading_boundary_word_counts[j]; + packed_bool_offset_first = + packed_bool_offset(local_v_list_range_firsts[partition_idx] - minor_range_first) + + static_cast(leading_boundary_words); + auto packed_bool_offset_last = + packed_bool_offset(local_v_list_range_lasts[partition_idx] - 1 - minor_range_first); + if (packed_bool_offset_first <= packed_bool_offset_last) { + bcast_size = (packed_bool_offset_last - packed_bool_offset_first) + 1; + } + } + + device_bcast(major_comm, + edge_partition_value_first + packed_bool_offset_first, + edge_partition_value_first + packed_bool_offset_first, + bcast_size, + static_cast(partition_idx), + handle.get_stream()); + } + device_group_end(major_comm); + + rmm::device_uvector d_leading_boundary_word_counts( + leading_boundary_word_counts.size(), handle.get_stream()); + raft::update_device(d_leading_boundary_word_counts.data(), + leading_boundary_word_counts.data(), + leading_boundary_word_counts.size(), + handle.get_stream()); + + rmm::device_uvector d_local_v_list_range_firsts(loop_count, handle.get_stream()); + raft::update_device(d_local_v_list_range_firsts.data(), + local_v_list_range_firsts.data() + i, + loop_count, + handle.get_stream()); - if (edge_partition_keys) { thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_counts[i]), - [rx_vertex_first = rx_vertices.begin(), - input, - subrange_key_first = (*edge_partition_keys).begin() + (*key_offsets)[i], - subrange_key_last = (*edge_partition_keys).begin() + (*key_offsets)[i + 1], - edge_partition_value_first = edge_partition_value_first, - subrange_start_offset = (*key_offsets)[i]] __device__(auto i) { - auto minor = *(rx_vertex_first + i); - auto it = - thrust::lower_bound(thrust::seq, subrange_key_first, subrange_key_last, minor); - if ((it != subrange_key_last) && (*it == minor)) { - auto subrange_offset = thrust::distance(subrange_key_first, it); - if constexpr (contains_packed_bool_element) { - fill_scalar_or_thrust_tuple( - 
edge_partition_value_first, subrange_start_offset + subrange_offset, input); - } else { - *(edge_partition_value_first + subrange_start_offset + subrange_offset) = input; + thrust::make_counting_iterator(loop_count * packed_bool_word_bcast_alignment), + [input, + minor_range_first, + leading_boundary_word_counts = raft::device_span( + d_leading_boundary_word_counts.data(), d_leading_boundary_word_counts.size()), + local_v_list_range_firsts = raft::device_span( + d_local_v_list_range_firsts.data(), d_local_v_list_range_firsts.size()), + aggregate_boundary_words = raft::device_span( + (*v_list_bitmap).data() + i * packed_bool_word_bcast_alignment, + loop_count * packed_bool_word_bcast_alignment), + output_value_first = edge_partition_value_first] __device__(size_t i) { + auto j = i / packed_bool_word_bcast_alignment; + auto leading_boundary_words = leading_boundary_word_counts[j]; + if ((i % packed_bool_word_bcast_alignment) < leading_boundary_words) { + auto boundary_word = aggregate_boundary_words[i]; + if (boundary_word != packed_bool_empty_mask()) { + auto word_offset = + packed_bool_offset(local_v_list_range_firsts[j] - minor_range_first) + + (i % packed_bool_word_bcast_alignment); + cuda::atomic_ref word( + *(output_value_first + word_offset)); + if (input) { + word.fetch_or(aggregate_boundary_words[i], cuda::std::memory_order_relaxed); + } else { + word.fetch_and(~aggregate_boundary_words[i], cuda::std::memory_order_relaxed); + } } } }); } else { - if constexpr (contains_packed_bool_element) { - thrust::for_each(handle.get_thrust_policy(), - thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(rx_counts[i])), - [edge_partition, - rx_vertex_first = rx_vertices.begin(), - input, - output_value_first = edge_partition_value_first] __device__(auto i) { - auto rx_vertex = *(rx_vertex_first + i); - auto minor_offset = - edge_partition.minor_offset_from_minor_nocheck(rx_vertex); - fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); - }); - } else { - auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), - cuda::proclaim_return_type([edge_partition] __device__(auto v) { - return edge_partition.minor_offset_from_minor_nocheck(v); - })); - auto val_first = thrust::make_constant_iterator(input); - // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and - // directly scatters from the internal buffer) - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + rx_counts[i], - map_first, - edge_partition_value_first); + std::vector, rmm::device_uvector>> + edge_partition_v_buffers{}; + edge_partition_v_buffers.reserve(loop_count); + rmm::device_uvector dummy_counters(loop_count, handle.get_stream()); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + std::variant, rmm::device_uvector> v_buffer = + rmm::device_uvector(0, handle.get_stream()); + if (v_list_bitmap) { + v_buffer = rmm::device_uvector( + packed_bool_size(local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]), + handle.get_stream()); + } else if (compressed_v_list) { + v_buffer = + rmm::device_uvector(local_v_list_sizes[partition_idx], handle.get_stream()); + } else { + std::get<0>(v_buffer).resize(local_v_list_sizes[partition_idx], handle.get_stream()); + } + edge_partition_v_buffers.push_back(std::move(v_buffer)); + } + + device_group_start(major_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + auto& v_buffer = 
edge_partition_v_buffers[j]; + if (v_list_bitmap) { + device_bcast(major_comm, + (*v_list_bitmap).data(), + std::get<1>(v_buffer).data(), + std::get<1>(v_buffer).size(), + static_cast(partition_idx), + handle.get_stream()); + } else if (compressed_v_list) { + device_bcast(major_comm, + (*compressed_v_list).data(), + std::get<1>(v_buffer).data(), + std::get<1>(v_buffer).size(), + static_cast(partition_idx), + handle.get_stream()); + } else { + device_bcast(major_comm, + (static_cast(partition_idx) == major_comm_rank) + ? sorted_unique_vertex_first + : static_cast(nullptr), + std::get<0>(v_buffer).data(), + std::get<0>(v_buffer).size(), + static_cast(partition_idx), + handle.get_stream()); + } + } + device_group_end(major_comm); + bool kernel_fusion = + !edge_partition_keys && !v_list_bitmap && (loop_count > 1) && + (static_cast(std::reduce(local_v_list_sizes.begin() + i, + local_v_list_sizes.begin() + (i + loop_count))) < + size_t{256 * 1024} /* tuning parameter (binary search vs kernel launch overhead) */ * + loop_count); // FIXME: kernle fusion can be useful even when + // edge_partition_keys.has_value() is true + + if (!kernel_fusion) { + if (stream_pool_indices) { handle.sync_stream(); } + } + + if (!kernel_fusion) { + size_t stream_pool_size{0}; + if (stream_pool_indices) { stream_pool_size = (*stream_pool_indices).size(); } + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + stream_pool_indices + ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j % stream_pool_size]) + : handle.get_stream(); + + if (v_list_bitmap) { + auto const& rx_bitmap = std::get<1>(edge_partition_v_buffers[j]); + rmm::device_uvector rx_vertices(local_v_list_sizes[partition_idx], + loop_stream); + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + rx_vertices.begin(), + raft::device_span(dummy_counters.data() + j, size_t{1}), + local_v_list_range_firsts[partition_idx], + local_v_list_range_lasts[partition_idx], + loop_stream); + edge_partition_v_buffers[j] = std::move(rx_vertices); + } + + if (edge_partition_keys) { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(local_v_list_sizes[partition_idx]), + [rx_vertex_first = compressed_v_list + ? static_cast(nullptr) + : std::get<0>(edge_partition_v_buffers[j]).data(), + rx_compressed_vertex_first = compressed_v_list + ? 
std::get<1>(edge_partition_v_buffers[j]).data() + : static_cast(nullptr), + range_first = local_v_list_range_firsts[partition_idx], + input, + subrange_key_first = + (*edge_partition_keys).begin() + (*key_offsets)[partition_idx], + subrange_key_last = + (*edge_partition_keys).begin() + (*key_offsets)[partition_idx + 1], + edge_partition_value_first = edge_partition_value_first, + subrange_start_offset = (*key_offsets)[partition_idx]] __device__(auto i) { + vertex_t minor{}; + if (rx_vertex_first != nullptr) { + minor = *(rx_vertex_first + i); + } else { + minor = range_first + *(rx_compressed_vertex_first + i); + } + auto it = + thrust::lower_bound(thrust::seq, subrange_key_first, subrange_key_last, minor); + if ((it != subrange_key_last) && (*it == minor)) { + auto subrange_offset = thrust::distance(subrange_key_first, it); + if constexpr (contains_packed_bool_element) { + fill_scalar_or_thrust_tuple( + edge_partition_value_first, subrange_start_offset + subrange_offset, input); + } else { + *(edge_partition_value_first + subrange_start_offset + subrange_offset) = + input; + } + } + }); + } else { + if constexpr (contains_packed_bool_element) { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(local_v_list_sizes[partition_idx]), + [minor_range_first, + rx_vertex_first = compressed_v_list + ? static_cast(nullptr) + : std::get<0>(edge_partition_v_buffers[j]).data(), + rx_compressed_vertex_first = compressed_v_list + ? std::get<1>(edge_partition_v_buffers[j]).data() + : static_cast(nullptr), + range_first = local_v_list_range_firsts[partition_idx], + input, + output_value_first = edge_partition_value_first] __device__(auto i) { + vertex_t minor{}; + if (rx_vertex_first != nullptr) { + minor = *(rx_vertex_first + i); + } else { + minor = range_first + *(rx_compressed_vertex_first + i); + } + auto minor_offset = minor - minor_range_first; + fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); + }); + } else { + if (compressed_v_list) { + auto map_first = thrust::make_transform_iterator( + std::get<1>(edge_partition_v_buffers[j]).begin(), + cuda::proclaim_return_type( + [minor_range_first, + range_first = + local_v_list_range_firsts[partition_idx]] __device__(auto v_offset) { + return static_cast(v_offset + (range_first - minor_range_first)); + })); + auto val_first = thrust::make_constant_iterator(input); + thrust::scatter(rmm::exec_policy_nosync(loop_stream), + val_first, + val_first + local_v_list_sizes[partition_idx], + map_first, + edge_partition_value_first); + } else { + auto map_first = thrust::make_transform_iterator( + std::get<0>(edge_partition_v_buffers[j]).begin(), + cuda::proclaim_return_type( + [minor_range_first] __device__(auto v) { return v - minor_range_first; })); + auto val_first = thrust::make_constant_iterator(input); + thrust::scatter(rmm::exec_policy_nosync(loop_stream), + val_first, + val_first + local_v_list_sizes[partition_idx], + map_first, + edge_partition_value_first); + } + } + } + } + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + } else { // kernel fusion + std::vector h_vertex_vars(loop_count /* range_first values */ + + (loop_count + 1) /* loop offsets */); + std::copy(local_v_list_range_firsts.begin() + i, + local_v_list_range_firsts.begin() + (i + loop_count), + h_vertex_vars.begin()); + h_vertex_vars[loop_count] = 0; + std::inclusive_scan(local_v_list_sizes.begin() + i, + local_v_list_sizes.begin() + (i + loop_count), + 
h_vertex_vars.begin() + (loop_count + 1)); + std::vector h_ptrs(loop_count); + if (compressed_v_list) { + for (size_t j = 0; j < loop_count; ++j) { + h_ptrs[j] = static_cast(std::get<1>(edge_partition_v_buffers[j]).data()); + } + } else { + for (size_t j = 0; j < loop_count; ++j) { + h_ptrs[j] = static_cast(std::get<0>(edge_partition_v_buffers[j]).data()); + } + } + rmm::device_uvector d_vertex_vars(h_vertex_vars.size(), handle.get_stream()); + rmm::device_uvector d_ptrs(h_ptrs.size(), handle.get_stream()); + raft::update_device( + d_vertex_vars.data(), h_vertex_vars.data(), h_vertex_vars.size(), handle.get_stream()); + raft::update_device(d_ptrs.data(), h_ptrs.data(), h_ptrs.size(), handle.get_stream()); + + raft::device_span range_firsts(d_vertex_vars.data(), loop_count); + raft::device_span loop_offsets(d_vertex_vars.data() + loop_count, + loop_count + 1); + if constexpr (contains_packed_bool_element) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(h_vertex_vars.back()), + [range_firsts, + loop_offsets, + minor_range_first, + input, + rx_firsts = raft::device_span(d_ptrs.data(), d_ptrs.size()), + output_value_first = edge_partition_value_first, + compressed = compressed_v_list.has_value()] __device__(auto i) { + auto loop_idx = + thrust::distance(loop_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, loop_offsets.begin() + 1, loop_offsets.end(), i)); + auto rx_first = rx_firsts[loop_idx]; + vertex_t minor{}; + if (compressed) { + minor = range_firsts[loop_idx] + + *(static_cast(rx_first) + (i - loop_offsets[loop_idx])); + } else { + minor = *(static_cast(rx_first) + (i - loop_offsets[loop_idx])); + } + auto minor_offset = minor - minor_range_first; + fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); + }); + } else { + auto val_first = thrust::make_constant_iterator(input); + if (compressed_v_list) { + auto map_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type( + [range_firsts, + loop_offsets, + rx_firsts = raft::device_span(d_ptrs.data(), d_ptrs.size()), + minor_range_first] __device__(auto i) { + auto loop_idx = thrust::distance( + loop_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, loop_offsets.begin() + 1, loop_offsets.end(), i)); + auto minor = + range_firsts[loop_idx] + *(static_cast(rx_firsts[loop_idx]) + + (i - loop_offsets[loop_idx])); + return minor - minor_range_first; + })); + thrust::scatter(handle.get_thrust_policy(), + val_first, + val_first + h_vertex_vars.back(), + map_first, + edge_partition_value_first); + } else { + auto map_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type( + [loop_offsets, + rx_firsts = raft::device_span(d_ptrs.data(), d_ptrs.size()), + minor_range_first] __device__(auto i) { + auto loop_idx = thrust::distance( + loop_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, loop_offsets.begin() + 1, loop_offsets.end(), i)); + auto minor = *(static_cast(rx_firsts[loop_idx]) + + (i - loop_offsets[loop_idx])); + return minor - minor_range_first; + })); + thrust::scatter(handle.get_thrust_policy(), + val_first, + val_first + h_vertex_vars.back(), + map_first, + edge_partition_value_first); + } + } } } } } else { assert(graph_view.local_vertex_partition_range_size() == - graph_view.local_edge_partition_src_range_size()); + (GraphViewType::is_storage_transposed + ? 
graph_view.local_edge_partition_src_range_size()
+              : graph_view.local_edge_partition_dst_range_size()));
     if constexpr (contains_packed_bool_element) {
       thrust::for_each(handle.get_thrust_policy(),
-                       vertex_first,
-                       vertex_last,
+                       sorted_unique_vertex_first,
+                       sorted_unique_vertex_last,
                        [input, output_value_first = edge_partition_value_first] __device__(auto v) {
                          fill_scalar_or_thrust_tuple(output_value_first, v, input);
                        });
     } else {
       auto val_first = thrust::make_constant_iterator(input);
-      thrust::scatter(handle.get_thrust_policy(),
-                      val_first,
-                      val_first + thrust::distance(vertex_first, vertex_last),
-                      vertex_first,
-                      edge_partition_value_first);
+      thrust::scatter(
+        handle.get_thrust_policy(),
+        val_first,
+        val_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last),
+        sorted_unique_vertex_first,
+        edge_partition_value_first);
     }
   }
 }
@@ -451,8 +1034,8 @@ void fill_edge_src_property(raft::handle_t const& handle,
 /**
  * @brief Fill graph edge source property values to the input value.
  *
- * This version fills only a subset of graph edge source property values. [@p vertex_first,
- * @p vertex_last) specifies the vertices to be filled.
+ * This version fills only a subset of graph edge source property values. [@p
+ * sorted_unique_vertex_first, @p sorted_unique_vertex_last) specifies the vertices to be filled.
 *
 * @tparam GraphViewType Type of the passed non-owning graph object.
 * @tparam VertexIterator Type of the iterator for vertex identifiers.
@@ -461,10 +1044,12 @@ void fill_edge_src_property(raft::handle_t const& handle,
 * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
 * handles to various CUDA libraries) to run graph algorithms.
 * @param graph_view Non-owning graph object.
- * @param vertex_first Iterator pointing to the first (inclusive) vertex with a value to be filled.
- * v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex partition
- * assigned to this process in multi-GPU), otherwise undefined behavior.
- * @param vertex_last Iterator pointing to the last (exclusive) vertex with a value to be filled.
+ * @param sorted_unique_vertex_first Iterator pointing to the first (inclusive) vertex with a value
+ * to be filled. v in [sorted_unique_vertex_first, sorted_unique_vertex_last) should be sorted &
+ * distinct (and should belong to the vertex partition assigned to this process in multi-GPU),
+ * otherwise undefined behavior.
+ * @param sorted_unique_vertex_last Iterator pointing to the last (exclusive) vertex with a value to
+ * be filled.
 * @param edge_src_property_output edge_src_property_view_t class object to store source property
 * values (for the edge source assigned to this process in multi-GPU).
 * @param input Edge source property values will be set to @p input. 
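As a reading aid for the doc comment above, the following is a minimal usage sketch and not part of the patch itself. It assumes the internal include path "prims/fill_edge_src_dst_property.cuh", an edge_src_property_t<GraphViewType, bool> object created by the caller, and a hypothetical helper name mark_first_k_local_srcs; the vertex range passed in must be sorted, unique, and local to this GPU, as the renamed sorted_unique_vertex_first/last parameters require.

#include "prims/fill_edge_src_dst_property.cuh"  // internal prim header (assumed include path)

#include <cugraph/edge_property.hpp>
#include <raft/core/handle.hpp>
#include <rmm/device_uvector.hpp>
#include <thrust/sequence.h>

// Hypothetical helper: mark the first k vertices of this GPU's local vertex partition
// as "visited" on the edge-source side of every local edge partition.
template <typename GraphViewType>
void mark_first_k_local_srcs(raft::handle_t const& handle,
                             GraphViewType const& graph_view,
                             cugraph::edge_src_property_t<GraphViewType, bool>& edge_src_visited,
                             typename GraphViewType::vertex_type k)
{
  using vertex_t = typename GraphViewType::vertex_type;

  // consecutive vertex IDs starting at the local partition's first vertex:
  // sorted & unique by construction, and guaranteed to be local to this GPU.
  rmm::device_uvector<vertex_t> vertices(static_cast<size_t>(k), handle.get_stream());
  thrust::sequence(handle.get_thrust_policy(),
                   vertices.begin(),
                   vertices.end(),
                   graph_view.local_vertex_partition_range_first());

  // subset overload documented above: fills only the listed vertices to the input value (true).
  cugraph::fill_edge_src_property(handle,
                                  graph_view,
                                  vertices.begin(),
                                  vertices.end(),
                                  edge_src_visited.mutable_view(),
                                  true /* input */);
}

A consecutive range starting at the local partition's first vertex trivially satisfies the sorted & unique precondition, which is why thrust::sequence is sufficient here; an arbitrary vertex list would need to be sorted and de-duplicated first.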
@@ -476,8 +1061,8 @@ template void fill_edge_src_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, EdgeSrcValueOutputWrapper edge_src_property_output, T input, bool do_expensive_check = false) @@ -486,8 +1071,8 @@ void fill_edge_src_property(raft::handle_t const& handle, if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), local_vertex_partition_range_last = graph_view.local_vertex_partition_range_last()] __device__(auto v) { @@ -498,17 +1083,25 @@ void fill_edge_src_property(raft::handle_t const& handle, num_invalids = host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); } - CUGRAPH_EXPECTS( - num_invalids == 0, - "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in " + "[sorted_unique_vertex_first, sorted_unique_vertex_last)."); } if constexpr (GraphViewType::is_storage_transposed) { - detail::fill_edge_minor_property( - handle, graph_view, vertex_first, vertex_last, edge_src_property_output, input); + detail::fill_edge_minor_property(handle, + graph_view, + sorted_unique_vertex_first, + sorted_unique_vertex_last, + edge_src_property_output, + input); } else { - detail::fill_edge_major_property( - handle, graph_view, vertex_first, vertex_last, edge_src_property_output, input); + detail::fill_edge_major_property(handle, + graph_view, + sorted_unique_vertex_first, + sorted_unique_vertex_last, + edge_src_property_output, + input); } } @@ -552,8 +1145,8 @@ void fill_edge_dst_property(raft::handle_t const& handle, /** * @brief Fill graph edge destination property values to the input value. * - * This version fills only a subset of graph edge destination property values. [@p vertex_first, - * @p vertex_last) specifies the vertices to be filled. + * This version fills only a subset of graph edge destination property values. [@p + * sorted_unique_vertex_first, @p sorted_unique_vertex_last) specifies the vertices to be filled. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. @@ -563,10 +1156,12 @@ void fill_edge_dst_property(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param vertex_first Iterator pointing to the first (inclusive) vertex with a value to be filled. - * v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex partition - * assigned to this process in multi-GPU), otherwise undefined behavior. - * @param vertex_last Iterator pointing to the last (exclusive) vertex with a value to be filled. + * @param sorted_unique_vertex_first Iterator pointing to the first (inclusive) vertex with a value + * to be filled. v in [sorted_unique_vertex_first, sorted_unique_vertex_last) should be sorted & + * distinct (and should belong to the vertex partition assigned to this process in multi-GPU), + * otherwise undefined behavior. 
+ * @param sorted_unique_vertex_last Iterator pointing to the last (exclusive) vertex with a value to + * be filled. * @param edge_dst_property_output edge_dst_property_view_t class object to store destination * property values (for the edge destinations assigned to this process in multi-GPU). * @param input Edge destination property values will be set to @p input. @@ -578,8 +1173,8 @@ template void fill_edge_dst_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, EdgeDstValueOutputWrapper edge_dst_property_output, T input, bool do_expensive_check = false) @@ -588,8 +1183,8 @@ void fill_edge_dst_property(raft::handle_t const& handle, if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), local_vertex_partition_range_last = graph_view.local_vertex_partition_range_last()] __device__(auto v) { @@ -600,17 +1195,25 @@ void fill_edge_dst_property(raft::handle_t const& handle, num_invalids = host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); } - CUGRAPH_EXPECTS( - num_invalids == 0, - "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in " + "[sorted_unique_vertex_first, sorted_unique_vertex_last)."); } if constexpr (GraphViewType::is_storage_transposed) { - detail::fill_edge_major_property( - handle, graph_view, vertex_first, vertex_last, edge_dst_property_output, input); + detail::fill_edge_major_property(handle, + graph_view, + sorted_unique_vertex_first, + sorted_unique_vertex_last, + edge_dst_property_output, + input); } else { - detail::fill_edge_minor_property( - handle, graph_view, vertex_first, vertex_last, edge_dst_property_output, input); + detail::fill_edge_minor_property(handle, + graph_view, + sorted_unique_vertex_first, + sorted_unique_vertex_last, + edge_dst_property_output, + input); } } diff --git a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh index ce5e5d3e8c..f03e8f54fb 100644 --- a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh +++ b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh @@ -250,11 +250,14 @@ void per_v_pair_transform_dst_nbr_intersection( } auto num_input_pairs = static_cast(thrust::distance(vertex_pair_first, vertex_pair_last)); - std::optional> unique_vertices{std::nullopt}; + std::optional> sorted_unique_vertices{std::nullopt}; std::optional(size_t{0}, rmm::cuda_stream_view{}))> - property_buffer_for_unique_vertices{std::nullopt}; + property_buffer_for_sorted_unique_vertices{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { - unique_vertices = rmm::device_uvector(num_input_pairs * 2, handle.get_stream()); + auto& comm = handle.get_comms(); + + sorted_unique_vertices = + rmm::device_uvector(num_input_pairs * 2, handle.get_stream()); auto elem0_first = thrust::make_transform_iterator( vertex_pair_first, cugraph::thrust_tuple_get::value_type, @@ -262,7 +265,7 @@ void per_v_pair_transform_dst_nbr_intersection( thrust::copy(handle.get_thrust_policy(), elem0_first, elem0_first + num_input_pairs, - 
(*unique_vertices).begin()); + (*sorted_unique_vertices).begin()); auto elem1_first = thrust::make_transform_iterator( vertex_pair_first, cugraph::thrust_tuple_get::value_type, @@ -270,25 +273,25 @@ void per_v_pair_transform_dst_nbr_intersection( thrust::copy(handle.get_thrust_policy(), elem1_first, elem1_first + num_input_pairs, - (*unique_vertices).begin() + num_input_pairs); - thrust::sort(handle.get_thrust_policy(), (*unique_vertices).begin(), (*unique_vertices).end()); - (*unique_vertices) - .resize(thrust::distance((*unique_vertices).begin(), + (*sorted_unique_vertices).begin() + num_input_pairs); + thrust::sort(handle.get_thrust_policy(), + (*sorted_unique_vertices).begin(), + (*sorted_unique_vertices).end()); + (*sorted_unique_vertices) + .resize(thrust::distance((*sorted_unique_vertices).begin(), thrust::unique(handle.get_thrust_policy(), - (*unique_vertices).begin(), - (*unique_vertices).end())), + (*sorted_unique_vertices).begin(), + (*sorted_unique_vertices).end())), handle.get_stream()); - std::tie(unique_vertices, property_buffer_for_unique_vertices) = - collect_values_for_unique_int_vertices(handle, - std::move(*unique_vertices), - vertex_value_input_first, - graph_view.vertex_partition_range_lasts()); - thrust::sort_by_key( - handle.get_thrust_policy(), - (*unique_vertices).begin(), - (*unique_vertices).end(), - (*property_buffer_for_unique_vertices).begin()); // necessary for binary search + property_buffer_for_sorted_unique_vertices = collect_values_for_sorted_unique_int_vertices( + comm, + raft::device_span((*sorted_unique_vertices).data(), + (*sorted_unique_vertices).size()), + vertex_value_input_first, + graph_view.vertex_partition_range_lasts(), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); } rmm::device_uvector vertex_pair_indices(num_input_pairs, handle.get_stream()); @@ -412,32 +415,32 @@ void per_v_pair_transform_dst_nbr_intersection( do_expensive_check); } - if (unique_vertices) { - auto vertex_value_input_for_unique_vertices_first = - get_dataframe_buffer_begin(*property_buffer_for_unique_vertices); - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(this_chunk_size), - detail::call_intersection_op_t< - GraphViewType, - decltype(vertex_value_input_for_unique_vertices_first), - typename decltype(r_nbr_intersection_property_values0)::const_pointer, - IntersectionOp, - decltype(chunk_vertex_pair_index_first), - VertexPairIterator, - VertexPairValueOutputIterator>{edge_partition, - thrust::make_optional>( - (*unique_vertices).data(), (*unique_vertices).size()), - vertex_value_input_for_unique_vertices_first, - intersection_op, - intersection_offsets.data(), - intersection_indices.data(), - r_nbr_intersection_property_values0.data(), - r_nbr_intersection_property_values1.data(), - chunk_vertex_pair_index_first, - vertex_pair_first, - vertex_pair_value_output_first}); + if (sorted_unique_vertices) { + auto vertex_value_input_for_sorted_unique_vertices_first = + get_dataframe_buffer_begin(*property_buffer_for_sorted_unique_vertices); + thrust::for_each(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(this_chunk_size), + detail::call_intersection_op_t< + GraphViewType, + decltype(vertex_value_input_for_sorted_unique_vertices_first), + typename decltype(r_nbr_intersection_property_values0)::const_pointer, + IntersectionOp, + decltype(chunk_vertex_pair_index_first), + VertexPairIterator, + 
VertexPairValueOutputIterator>{ + edge_partition, + thrust::make_optional>( + (*sorted_unique_vertices).data(), (*sorted_unique_vertices).size()), + vertex_value_input_for_sorted_unique_vertices_first, + intersection_op, + intersection_offsets.data(), + intersection_indices.data(), + r_nbr_intersection_property_values0.data(), + r_nbr_intersection_property_values1.data(), + chunk_vertex_pair_index_first, + vertex_pair_first, + vertex_pair_value_output_first}); } else { thrust::for_each(handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index dd34d4b06a..30706632ad 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -206,7 +206,7 @@ struct return_value_compute_offset_t { template >, decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> per_v_random_select_transform_e(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& key_list, EdgeBiasSrcValueInputWrapper edge_bias_src_value_input, EdgeBiasDstValueInputWrapper edge_bias_dst_value_input, EdgeBiasValueInputWrapper edge_bias_value_input, @@ -237,7 +237,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, { using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; - using key_t = typename VertexFrontierBucketType::key_type; + using key_t = typename KeyBucketType::key_type; using key_buffer_t = dataframe_buffer_type_t; using edge_partition_src_input_device_view_t = std::conditional_t< @@ -286,15 +286,15 @@ per_v_random_select_transform_e(raft::handle_t const& handle, if (do_expensive_check) { // FIXME: better re-factor this check function? 
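For reference, sorted_unique_vertices in the hunk above is built with the usual sort-then-unique idiom, and the new collect_values_for_sorted_unique_int_vertices returns property values already aligned with that sorted span, which is why the follow-up thrust::sort_by_key (previously kept so the values could be located by binary search) is dropped. A minimal standalone sketch of the idiom, with illustrative names only:

  #include <rmm/cuda_stream_view.hpp>
  #include <rmm/device_uvector.hpp>
  #include <rmm/exec_policy.hpp>
  #include <thrust/distance.h>
  #include <thrust/sort.h>
  #include <thrust/unique.h>

  // Sort a device vector of vertex ids and drop duplicates in place.
  template <typename vertex_t>
  void sort_and_uniquify(rmm::device_uvector<vertex_t>& vertices, rmm::cuda_stream_view stream)
  {
    thrust::sort(rmm::exec_policy(stream), vertices.begin(), vertices.end());
    auto unique_last = thrust::unique(rmm::exec_policy(stream), vertices.begin(), vertices.end());
    vertices.resize(thrust::distance(vertices.begin(), unique_last), stream);
    vertices.shrink_to_fit(stream);
  }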
- auto frontier_vertex_first = - thrust_tuple_get_or_identity(frontier.begin()); - auto frontier_vertex_last = - thrust_tuple_get_or_identity(frontier.end()); + auto key_list_vertex_first = + thrust_tuple_get_or_identity(key_list.begin()); + auto key_list_vertex_last = + thrust_tuple_get_or_identity(key_list.end()); auto num_invalid_keys = - frontier.size() - + key_list.size() - thrust::count_if(handle.get_thrust_policy(), - frontier_vertex_first, - frontier_vertex_last, + key_list_vertex_first, + key_list_vertex_last, check_in_range_t{graph_view.local_vertex_partition_range_first(), graph_view.local_vertex_partition_range_last()}); if constexpr (GraphViewType::is_multi_gpu) { @@ -302,35 +302,35 @@ per_v_random_select_transform_e(raft::handle_t const& handle, handle.get_comms(), num_invalid_keys, raft::comms::op_t::SUM, handle.get_stream()); } CUGRAPH_EXPECTS(num_invalid_keys == size_t{0}, - "Invalid input argument: frontier includes out-of-range keys."); + "Invalid input argument: key_list includes out-of-range keys."); } - std::vector local_frontier_sizes{}; + std::vector local_key_list_sizes{}; if (minor_comm_size > 1) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - local_frontier_sizes = host_scalar_allgather(minor_comm, frontier.size(), handle.get_stream()); + local_key_list_sizes = host_scalar_allgather(minor_comm, key_list.size(), handle.get_stream()); } else { - local_frontier_sizes = std::vector{frontier.size()}; + local_key_list_sizes = std::vector{key_list.size()}; } - std::vector local_frontier_displacements(local_frontier_sizes.size()); - std::exclusive_scan(local_frontier_sizes.begin(), - local_frontier_sizes.end(), - local_frontier_displacements.begin(), + std::vector local_key_list_displacements(local_key_list_sizes.size()); + std::exclusive_scan(local_key_list_sizes.begin(), + local_key_list_sizes.end(), + local_key_list_displacements.begin(), size_t{0}); - // 1. aggregate frontier + // 1. aggregate key_list - std::optional aggregate_local_frontier{std::nullopt}; + std::optional aggregate_local_key_list{std::nullopt}; if (minor_comm_size > 1) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - aggregate_local_frontier = allocate_dataframe_buffer( - local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream()); + aggregate_local_key_list = allocate_dataframe_buffer( + local_key_list_displacements.back() + local_key_list_sizes.back(), handle.get_stream()); device_allgatherv(minor_comm, - frontier.begin(), - get_dataframe_buffer_begin(*aggregate_local_frontier), - local_frontier_sizes, - local_frontier_displacements, + key_list.begin(), + get_dataframe_buffer_begin(*aggregate_local_key_list), + local_key_list_sizes, + local_key_list_displacements, handle.get_stream()); } @@ -339,66 +339,66 @@ per_v_random_select_transform_e(raft::handle_t const& handle, rmm::device_uvector sample_local_nbr_indices(0, handle.get_stream()); std::optional> sample_key_indices{std::nullopt}; - std::vector local_frontier_sample_offsets{}; + std::vector local_key_list_sample_offsets{}; if constexpr (std::is_same_v>) { - std::tie(sample_local_nbr_indices, sample_key_indices, local_frontier_sample_offsets) = + std::tie(sample_local_nbr_indices, sample_key_indices, local_key_list_sample_offsets) = uniform_sample_and_compute_local_nbr_indices( handle, graph_view, - (minor_comm_size > 1) ? 
get_dataframe_buffer_cbegin(*aggregate_local_frontier) - : frontier.begin(), - local_frontier_displacements, - local_frontier_sizes, + (minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_key_list) + : key_list.begin(), + local_key_list_displacements, + local_key_list_sizes, rng_state, K, with_replacement); } else { - std::tie(sample_local_nbr_indices, sample_key_indices, local_frontier_sample_offsets) = + std::tie(sample_local_nbr_indices, sample_key_indices, local_key_list_sample_offsets) = biased_sample_and_compute_local_nbr_indices( handle, graph_view, - (minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_frontier) - : frontier.begin(), + (minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_key_list) + : key_list.begin(), edge_bias_src_value_input, edge_bias_dst_value_input, edge_bias_value_input, e_bias_op, - local_frontier_displacements, - local_frontier_sizes, + local_key_list_displacements, + local_key_list_sizes, rng_state, K, with_replacement, do_expensive_check); } - std::vector local_frontier_sample_counts(minor_comm_size); - std::adjacent_difference(local_frontier_sample_offsets.begin() + 1, - local_frontier_sample_offsets.end(), - local_frontier_sample_counts.begin()); + std::vector local_key_list_sample_counts(minor_comm_size); + std::adjacent_difference(local_key_list_sample_offsets.begin() + 1, + local_key_list_sample_offsets.end(), + local_key_list_sample_counts.begin()); // 3. transform auto sample_e_op_results = - allocate_dataframe_buffer(local_frontier_sample_offsets.back(), handle.get_stream()); + allocate_dataframe_buffer(local_key_list_sample_offsets.back(), handle.get_stream()); for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { auto edge_partition = edge_partition_device_view_t( graph_view.local_edge_partition_view(i)); - auto edge_partition_frontier_key_first = - ((minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_frontier) - : frontier.begin()) + - local_frontier_displacements[i]; + auto edge_partition_key_list_first = + ((minor_comm_size > 1) ? 
get_dataframe_buffer_cbegin(*aggregate_local_key_list) + : key_list.begin()) + + local_key_list_displacements[i]; auto edge_partition_sample_local_nbr_index_first = - sample_local_nbr_indices.begin() + local_frontier_sample_offsets[i]; + sample_local_nbr_indices.begin() + local_key_list_sample_offsets[i]; auto edge_partition_sample_e_op_result_first = - get_dataframe_buffer_begin(sample_e_op_results) + local_frontier_sample_offsets[i]; + get_dataframe_buffer_begin(sample_e_op_results) + local_key_list_sample_offsets[i]; edge_partition_src_input_device_view_t edge_partition_src_value_input{}; edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; @@ -415,14 +415,14 @@ per_v_random_select_transform_e(raft::handle_t const& handle, if (sample_key_indices) { auto edge_partition_sample_key_index_first = - (*sample_key_indices).begin() + local_frontier_sample_offsets[i]; + (*sample_key_indices).begin() + local_key_list_sample_offsets[i]; thrust::transform( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(local_frontier_sample_counts[i]), + thrust::make_counting_iterator(local_key_list_sample_counts[i]), edge_partition_sample_e_op_result_first, transform_local_nbr_indices_t{ edge_partition, thrust::make_optional(edge_partition_sample_key_index_first), - edge_partition_frontier_key_first, + edge_partition_key_list_first, edge_partition_sample_local_nbr_index_first, edge_partition_src_value_input, edge_partition_dst_value_input, @@ -444,10 +444,10 @@ per_v_random_select_transform_e(raft::handle_t const& handle, thrust::transform( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(frontier.size() * K), + thrust::make_counting_iterator(key_list.size() * K), edge_partition_sample_e_op_result_first, transform_local_nbr_indices_t{edge_partition, thrust::nullopt, - edge_partition_frontier_key_first, + edge_partition_key_list_first, edge_partition_sample_local_nbr_index_first, edge_partition_src_value_input, edge_partition_dst_value_input, @@ -466,13 +466,13 @@ per_v_random_select_transform_e(raft::handle_t const& handle, K}); } } - aggregate_local_frontier = std::nullopt; + aggregate_local_key_list = std::nullopt; // 4. shuffle randomly selected & transformed results and update sample_offsets auto sample_offsets = invalid_value ? 
std::nullopt : std::make_optional>( - frontier.size() + 1, handle.get_stream()); + key_list.size() + 1, handle.get_stream()); assert(K <= std::numeric_limits::max()); if (minor_comm_size > 1) { sample_local_nbr_indices.resize(0, handle.get_stream()); @@ -483,12 +483,12 @@ per_v_random_select_transform_e(raft::handle_t const& handle, std::tie(sample_e_op_results, std::ignore) = shuffle_values(minor_comm, get_dataframe_buffer_begin(sample_e_op_results), - local_frontier_sample_counts, + local_key_list_sample_counts, handle.get_stream()); std::tie(sample_key_indices, std::ignore) = shuffle_values( - minor_comm, (*sample_key_indices).begin(), local_frontier_sample_counts, handle.get_stream()); + minor_comm, (*sample_key_indices).begin(), local_key_list_sample_counts, handle.get_stream()); - rmm::device_uvector sample_counts(frontier.size(), handle.get_stream()); + rmm::device_uvector sample_counts(key_list.size(), handle.get_stream()); thrust::fill( handle.get_thrust_policy(), sample_counts.begin(), sample_counts.end(), int32_t{0}); auto sample_intra_partition_displacements = @@ -504,7 +504,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, sample_counts.resize(0, handle.get_stream()); sample_counts.shrink_to_fit(handle.get_stream()); - resize_dataframe_buffer(tmp_sample_e_op_results, frontier.size() * K, handle.get_stream()); + resize_dataframe_buffer(tmp_sample_e_op_results, key_list.size() * K, handle.get_stream()); thrust::fill(handle.get_thrust_policy(), get_dataframe_buffer_begin(tmp_sample_e_op_results), get_dataframe_buffer_end(tmp_sample_e_op_results), @@ -553,7 +553,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, sample_e_op_results = std::move(tmp_sample_e_op_results); } else { if (!invalid_value) { - rmm::device_uvector sample_counts(frontier.size(), handle.get_stream()); + rmm::device_uvector sample_counts(key_list.size(), handle.get_stream()); thrust::tabulate( handle.get_thrust_policy(), sample_counts.begin(), @@ -597,8 +597,8 @@ per_v_random_select_transform_e(raft::handle_t const& handle, * @brief Randomly select and transform the input (tagged-)vertices' outgoing edges with biases. * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam VertexFrontierBucketType Type of the vertex frontier bucket class which abstracts the - * current (tagged-)vertex frontier. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. @@ -609,8 +609,8 @@ per_v_random_select_transform_e(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param frontier VertexFrontierBucketType class object to store the (tagged-)vertex list to sample - * outgoing edges. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to sample outgoing + * edges. * @param edge_src_value_input Wrapper used to access source input property values (for the edge * sources assigned to this process in multi-GPU). 
Use either cugraph::edge_src_property_t::view() * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() @@ -647,11 +647,11 @@ per_v_random_select_transform_e(raft::handle_t const& handle, * @return std::tuple Tuple of an optional offset vector of type * std::optional> and a dataframe buffer storing the output values of * type @p T from the selected edges. If @p invalid_value is std::nullopt, the offset vector is - * valid and has the size of @p frontier.size() + 1. If @p invalid_value.has_value() is true, - * std::nullopt is returned (the dataframe buffer will store @p frontier.size() * @p K elements). + * valid and has the size of @p key_list.size() + 1. If @p invalid_value.has_value() is true, + * std::nullopt is returned (the dataframe buffer will store @p key_list.size() * @p K elements). */ template >, decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& key_list, EdgeBiasSrcValueInputWrapper edge_bias_src_value_input, EdgeBiasDstValueInputWrapper edge_bias_dst_value_input, EdgeBiasValueInputWrapper edge_bias_value_input, @@ -682,7 +682,7 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, { return detail::per_v_random_select_transform_e(handle, graph_view, - frontier, + key_list, edge_bias_src_value_input, edge_bias_dst_value_input, edge_bias_value_input, @@ -705,8 +705,8 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, * (uniform neighbor sampling). * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam VertexFrontierBucketType Type of the vertex frontier bucket class which abstracts the - * current (tagged-)vertex frontier. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. @@ -715,8 +715,8 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param frontier VertexFrontierBucketType class object to store the (tagged-)vertex list to sample - * outgoing edges. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to sample outgoing + * edges. * @param edge_src_value_input Wrapper used to access source input property values (for the edge * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() @@ -747,11 +747,11 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, * @return std::tuple Tuple of an optional offset vector of type * std::optional> and a dataframe buffer storing the output values of * type @p T from the selected edges. If @p invalid_value is std::nullopt, the offset vector is - * valid and has the size of @p frontier.size() + 1. 
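The @return contract above admits two layouts for the sampled values: a CSR-style offset vector of length key_list.size() + 1 when @p invalid_value is std::nullopt, or exactly K padded slots per key when @p invalid_value is set. A minimal host-side sketch, assuming the offset vector and the dataframe buffer have already been copied back into std::vector objects (names here are illustrative, not cugraph API):

  #include <cstddef>
  #include <optional>
  #include <vector>

  // Count the valid samples each key received, under either documented return shape.
  template <typename value_t>
  std::vector<std::size_t> per_key_sample_counts(
    std::optional<std::vector<std::size_t>> const& offsets,  // size num_keys + 1 when present
    std::vector<value_t> const& results,
    std::size_t num_keys,
    std::size_t K,
    std::optional<value_t> invalid_value)
  {
    std::vector<std::size_t> counts(num_keys, 0);
    if (offsets) {  // invalid_value was std::nullopt: key i owns results[(*offsets)[i], (*offsets)[i + 1])
      for (std::size_t i = 0; i < num_keys; ++i) { counts[i] = (*offsets)[i + 1] - (*offsets)[i]; }
    } else {  // invalid_value was set: key i owns exactly K slots, padded with *invalid_value
      for (std::size_t i = 0; i < num_keys; ++i) {
        for (std::size_t j = i * K; j < (i + 1) * K; ++j) {
          if (results[j] != *invalid_value) { ++counts[i]; }
        }
      }
    }
    return counts;
  }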
If @p invalid_value.has_value() is true, - * std::nullopt is returned (the dataframe buffer will store @p frontier.size() * @p K elements). + * valid and has the size of @p key_list.size() + 1. If @p invalid_value.has_value() is true, + * std::nullopt is returned (the dataframe buffer will store @p key_list.size() * @p K elements). */ template >, decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& key_list, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, @@ -775,7 +775,7 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, return detail::per_v_random_select_transform_e( handle, graph_view, - frontier, + key_list, edge_src_dummy_property_t{}.view(), edge_dst_dummy_property_t{}.view(), edge_dummy_property_t{}.view(), @@ -783,7 +783,7 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, detail::edge_endpoint_dummy_property_view_t, detail::edge_endpoint_dummy_property_view_t, edge_dummy_property_view_t, - typename VertexFrontierBucketType::key_type>{}, + typename KeyBucketType::key_type>{}, edge_src_value_input, edge_dst_value_input, edge_value_input, diff --git a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh index 5a5e933209..c13816242b 100644 --- a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh @@ -924,11 +924,12 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( auto values_for_unique_keys = allocate_dataframe_buffer(0, handle.get_stream()); std::tie(unique_minor_keys, values_for_unique_keys) = - collect_values_for_unique_keys(handle, + collect_values_for_unique_keys(comm, kv_store_view, std::move(unique_minor_keys), cugraph::detail::compute_gpu_id_from_ext_vertex_t{ - comm_size, major_comm_size, minor_comm_size}); + comm_size, major_comm_size, minor_comm_size}, + handle.get_stream()); if constexpr (KVStoreViewType::binary_search) { multi_gpu_minor_key_value_map_ptr = diff --git a/cpp/src/prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh new file mode 100644 index 0000000000..1e0d366429 --- /dev/null +++ b/cpp/src/prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "prims/detail/per_v_transform_reduce_e.cuh" +#include "prims/vertex_frontier.cuh" + +#include +#include +#include + +#include + +#include +#include + +namespace cugraph { + +/** + * @brief Iterate over every vertex's incoming edges to update vertex properties. 
+ * + * This function is inspired by thrust::transform_reduce. In addition, this function excludes the + * edges that return false when the predicate @p pred_op is applied. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam PredOp Type of the quinary predicate operator. + * @tparam T Type of the initial value for per-vertex reduction. + * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to + * fill the wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. + * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator takes edge source, edge destination, property values for the source, + * destination, and edge and returns a value to be reduced. + * @param init Initial value to be added to the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 outgoing edges. + * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. + * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @param pred_op Quinary operator takes edge source, edge destination, property values for the + * source, destination, and edge and returns whether this edge should be included (if true is + * returned) or excluded. 
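A minimal sketch of calling the function documented above, counting for each local vertex the incoming edges whose weight exceeds a threshold. It assumes an existing handle, graph_view, and edge weight view, an int32_t/float edge count/weight type, and the pre-defined plus reducer from reduce_op.cuh; only the argument order and the quinary e_op/pred_op shapes are taken from this hunk:

  using edge_t   = int32_t;  // assumed; use the graph's edge_type in real code
  using weight_t = float;    // assumed edge weight type
  weight_t constexpr threshold = 0.5;

  rmm::device_uvector<edge_t> heavy_in_degrees(graph_view.local_vertex_partition_range_size(),
                                               handle.get_stream());
  cugraph::per_v_transform_reduce_if_incoming_e(
    handle,
    graph_view,
    cugraph::edge_src_dummy_property_t{}.view(),  // source properties are not used
    cugraph::edge_dst_dummy_property_t{}.view(),  // destination properties are not used
    edge_weight_view,                             // assumed edge_property_t<...>::view() holding weights
    [] __device__(auto, auto, auto, auto, auto) { return edge_t{1}; },                 // e_op
    edge_t{0},                                                                         // init
    cugraph::reduce_op::plus<edge_t>{},           // assumed pre-defined reducer
    [threshold] __device__(auto, auto, auto, auto, auto w) { return w > threshold; },  // pred_op
    heavy_in_degrees.begin());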
+ * @param vertex_value_output_first Iterator pointing to the vertex property variables for the first + * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last` + * (exclusive) is deduced as @p vertex_value_output_first + @p + * graph_view.local_vertex_partition_range_size(). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void per_v_transform_reduce_if_incoming_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = true; + + detail::per_v_transform_reduce_e(handle, + graph_view, + static_cast(nullptr), + static_cast(nullptr), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + pred_op, + vertex_value_output_first); +} + +/** + * @brief For each (tagged-)vertex in the input (tagged-)vertex list, iterate over the incoming + * edges to update (tagged-)vertex properties. + * + * This function is inspired by thrust::transform_reduce(). In addition, this function excludes the + * edges that return false when the predicate @p pred_op is applied. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam PredOp Type of the quinary predicate operator. + * @tparam T Type of the initial value for per-vertex reduction. + * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to update + * (tagged-)vertex properties. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. 
+ * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator takes edge source, edge destination, property values for the source, + * destination, and edge and returns a value to be reduced. + * @param init Initial value to be reduced with the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 incoming edges. + * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. + * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @param pred_op Quinary operator takes edge source, edge destination, property values for the + * source, destination, and edge and returns whether this edge should be included (if true is + * returned) or excluded. + * @param vertex_value_output_first Iterator pointing to the (tagged-)vertex property variables for + * the first (inclusive) (tagged-)vertex in @p key_list. `vertex_value_output_last` (exclusive) is + * deduced as @p vertex_value_output_first + @p key_list.size(). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void per_v_transform_reduce_if_incoming_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& key_list, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + static_assert(GraphViewType::is_storage_transposed); + + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = true; + + detail::per_v_transform_reduce_e(handle, + graph_view, + key_list.begin(), + key_list.end(), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + pred_op, + vertex_value_output_first); +} + +/** + * @brief Iterate over every vertex's outgoing edges to update vertex properties. + * + * This function is inspired by thrust::transform_reduce(). In addition, this function excludes the + * edges that return false when the predicate @p pred_op is applied. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam PredOp Type of the quinary predicate operator. 
+ * @tparam T Type of the initial value for per-vertex reduction.
+ * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Non-owning graph object.
+ * @param edge_src_value_input Wrapper used to access source input property values (for the edge
+ * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view()
+ * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view()
+ * (if @p e_op does not access source property values). Use update_edge_src_property to fill the
+ * wrapper.
+ * @param edge_dst_value_input Wrapper used to access destination input property values (for the
+ * edge destinations assigned to this process in multi-GPU). Use either
+ * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or
+ * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property
+ * values). Use update_edge_dst_property to fill the wrapper.
+ * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned
+ * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to
+ * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not
+ * access edge property values).
+ * @param e_op Quinary operator takes edge source, edge destination, property values for the source,
+ * destination, and edge and returns a value to be reduced.
+ * @param init Initial value to be added to the reduced @p e_op return values for each vertex.
+ * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the
+ * (tagged-)vertices with 0 outgoing edges.
+ * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one.
+ * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is
+ * recommended to use the pre-defined reduction operators whenever possible as the current (and
+ * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has
+ * known member variables) to take a more optimized code path. See the documentation in the
+ * reduce_op.cuh file for instructions on writing custom reduction operators.
+ * @param pred_op Quinary operator takes edge source, edge destination, property values for the
+ * source, destination, and edge and returns whether this edge should be included (if true is
+ * returned) or excluded.
+ * @param vertex_value_output_first Iterator pointing to the vertex property variables for the
+ * first (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last`
+ * (exclusive) is deduced as @p vertex_value_output_first + @p
+ * graph_view.local_vertex_partition_range_size().
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ */ +template +void per_v_transform_reduce_if_outgoing_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = false; + + detail::per_v_transform_reduce_e(handle, + graph_view, + static_cast(nullptr), + static_cast(nullptr), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + pred_op, + vertex_value_output_first); +} + +/** + * @brief For each (tagged-)vertex in the input (tagged-)vertex list, iterate over the outgoing + * edges to update (tagged-)vertex properties. + * + * This function is inspired by thrust::transform_reduce(). In addition, this function excludes the + * edges that return false when the predicate @p pred_op is applied. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam PredOp Type of the quinary predicate operator. + * @tparam T Type of the initial value for per-vertex reduction. + * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to update + * (tagged-)vertex properties. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. + * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator takes edge source, edge destination, property values for the source, + * destination, and edge and returns a value to be reduced. 
+ * @param init Initial value to be reduced with the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 outgoing edges. + * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. + * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @param pred_op Quinary operator takes edge source, edge destination, property values for the + * source, destination, and edge and returns whether this edge should be included (if true is + * returned) or excluded. + * @param vertex_value_output_first Iterator pointing to the (tagged-)vertex property variables for + * the first (inclusive) (tagged-)vertex in @p key_list. `vertex_value_output_last` (exclusive) is + * deduced as @p vertex_value_output_first + @p key_list.size(). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void per_v_transform_reduce_if_outgoing_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& key_list, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + static_assert(!GraphViewType::is_storage_transposed); + static_assert(KeyBucketType::is_sorted_unique); + + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = false; + + detail::per_v_transform_reduce_e(handle, + graph_view, + key_list.begin(), + key_list.end(), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + pred_op, + vertex_value_output_first); +} + +} // namespace cugraph diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index 027ef1f662..5ba7edec89 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -15,558 +15,165 @@ */ #pragma once -#include "detail/graph_partition_utils.cuh" -#include "prims/detail/prim_functors.cuh" -#include "prims/fill_edge_src_dst_property.cuh" -#include "prims/property_op_utils.cuh" -#include "prims/reduce_op.cuh" +#include "prims/detail/per_v_transform_reduce_e.cuh" +#include "prims/vertex_frontier.cuh" -#include -#include -#include #include #include -#include -#include -#include #include -#include -#include #include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include #include namespace cugraph { -namespace detail { - -int32_t constexpr per_v_transform_reduce_e_kernel_block_size = 512; - -template -struct transform_and_atomic_reduce_t { - edge_partition_device_view_t const& edge_partition{}; - result_t identity_element{}; - vertex_t const* indices{nullptr}; - 
TransformOp const& transform_op{}; - ResultValueOutputIteratorOrWrapper& result_value_output{}; - - __device__ void operator()(edge_t i) const - { - auto e_op_result = transform_op(i); - if (e_op_result != identity_element) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - if constexpr (multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } -}; - -template -__device__ void update_result_value_output( - edge_partition_device_view_t const& edge_partition, - vertex_t const* indices, - edge_t local_degree, - TransformOp const& transform_op, - result_t init, - ReduceOp const& reduce_op, - size_t output_idx /* relevent only when update_major === true */, - result_t identity_element, - ResultValueOutputIteratorOrWrapper& result_value_output) -{ - if constexpr (update_major) { - *(result_value_output + output_idx) = - thrust::transform_reduce(thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - transform_op, - init, - reduce_op); - } else { - thrust::for_each( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - transform_and_atomic_reduce_t{ - edge_partition, identity_element, indices, transform_op, result_value_output}); - } -} - -template -__global__ static void per_v_transform_reduce_e_hypersparse( - edge_partition_device_view_t edge_partition, - EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, - EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, - ResultValueOutputIteratorOrWrapper result_value_output, - EdgeOp e_op, - T init /* relevant only if update_major == true */, - T identity_element /* relevant only if update_major == true */, - ReduceOp reduce_op) -{ - static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< - ReduceOp>); // atomic_reduce is defined only when - // has_compatible_raft_comms_op_t is true - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - auto major_start_offset = static_cast(*(edge_partition.major_hypersparse_first()) - - edge_partition.major_range_first()); - auto idx = static_cast(tid); - - auto dcs_nzd_vertex_count = *(edge_partition.dcs_nzd_vertex_count()); - - while (idx < static_cast(dcs_nzd_vertex_count)) { - auto major = - *(edge_partition.major_from_major_hypersparse_idx_nocheck(static_cast(idx))); - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - auto major_idx = - major_start_offset + idx; // major_offset != major_idx in the hypersparse region - vertex_t const* indices{nullptr}; - edge_t edge_offset{}; - edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = - edge_partition.local_edges(static_cast(major_idx)); - - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - major, - major_offset, - indices, - edge_offset}; - - if (edge_partition_e_mask) { - auto transform_op = - [&edge_partition_e_mask, &call_e_op, identity_element, edge_offset] __device__(auto i) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - return call_e_op(i); - } 
else { - return identity_element; - } - }; - - update_result_value_output(edge_partition, - indices, - local_degree, - transform_op, - init, - reduce_op, - major - *(edge_partition).major_hypersparse_first(), - identity_element, - result_value_output); - } else { - update_result_value_output(edge_partition, - indices, - local_degree, - call_e_op, - init, - reduce_op, - major - *(edge_partition).major_hypersparse_first(), - identity_element, - result_value_output); - } - idx += gridDim.x * blockDim.x; - } -} - -template -__global__ static void per_v_transform_reduce_e_low_degree( - edge_partition_device_view_t edge_partition, - typename GraphViewType::vertex_type major_range_first, - typename GraphViewType::vertex_type major_range_last, - EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, - EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, - ResultValueOutputIteratorOrWrapper result_value_output, - EdgeOp e_op, - T init /* relevant only if update_major == true */, - T identity_element /* relevant only if update_major == true */, - ReduceOp reduce_op) -{ - static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< - ReduceOp>); // atomic_reduce is defined only when - // has_compatible_raft_comms_op_t is true - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - auto major_start_offset = - static_cast(major_range_first - edge_partition.major_range_first()); - auto idx = static_cast(tid); - - while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = static_cast(major_start_offset + idx); - auto major = edge_partition.major_from_major_offset_nocheck(major_offset); - vertex_t const* indices{nullptr}; - edge_t edge_offset{}; - edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = - edge_partition.local_edges(static_cast(major_offset)); - - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - major, - major_offset, - indices, - edge_offset}; - - if (edge_partition_e_mask) { - auto transform_op = - [&edge_partition_e_mask, &call_e_op, identity_element, edge_offset] __device__(auto i) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - return call_e_op(i); - } else { - return identity_element; - } - }; - - update_result_value_output(edge_partition, - indices, - local_degree, - transform_op, - init, - reduce_op, - idx, - identity_element, - result_value_output); - } else { - update_result_value_output(edge_partition, - indices, - local_degree, - call_e_op, - init, - reduce_op, - idx, - identity_element, - result_value_output); - } - idx += gridDim.x * blockDim.x; - } -} - -template -__global__ static void per_v_transform_reduce_e_mid_degree( - edge_partition_device_view_t edge_partition, - typename GraphViewType::vertex_type major_range_first, - typename GraphViewType::vertex_type major_range_last, - EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, - EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, - ResultValueOutputIteratorOrWrapper result_value_output, - EdgeOp e_op, - T init /* relevant only if update_major == true */, - T 
identity_element /* relevant only if update_major == true */, - ReduceOp reduce_op) + typename T, + typename VertexValueOutputIterator> +void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) { - static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< - ReduceOp>); // atomic_reduce is defined only when - // has_compatible_raft_comms_op_t is true - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using e_op_result_t = T; - - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - static_assert(per_v_transform_reduce_e_kernel_block_size % raft::warp_size() == 0); - auto const lane_id = tid % raft::warp_size(); - auto major_start_offset = - static_cast(major_range_first - edge_partition.major_range_first()); - auto idx = static_cast(tid / raft::warp_size()); - - using WarpReduce = cub::WarpReduce; - [[maybe_unused]] __shared__ typename WarpReduce::TempStorage - temp_storage[per_v_transform_reduce_e_kernel_block_size / - raft::warp_size()]; // relevant only if update_major == true - - while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = static_cast(major_start_offset + idx); - auto major = edge_partition.major_from_major_offset_nocheck(major_offset); - vertex_t const* indices{nullptr}; - edge_t edge_offset{}; - edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); - - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - major, - major_offset, - indices, - edge_offset}; - - [[maybe_unused]] auto reduced_e_op_result = - lane_id == 0 ? 
init : identity_element; // relevant only if update_major == true - if (edge_partition_e_mask) { - for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } - } - } else { - for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } - } - - if constexpr (update_major) { - reduced_e_op_result = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) - .Reduce(reduced_e_op_result, reduce_op); - if (lane_id == 0) { *(result_value_output + idx) = reduced_e_op_result; } - } - - idx += gridDim.x * (blockDim.x / raft::warp_size()); + if (do_expensive_check) { + // currently, nothing to do } -} - -template -__global__ static void per_v_transform_reduce_e_high_degree( - edge_partition_device_view_t edge_partition, - typename GraphViewType::vertex_type major_range_first, - typename GraphViewType::vertex_type major_range_last, - EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, - EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, - ResultValueOutputIteratorOrWrapper result_value_output, - EdgeOp e_op, - T init /* relevant only if update_major == true */, - T identity_element /* relevant only if update_major == true */, - ReduceOp reduce_op) -{ - static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< - ReduceOp>); // atomic_reduce is defined only when - // has_compatible_raft_comms_op_t is true - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using e_op_result_t = T; - auto major_start_offset = - static_cast(major_range_first - edge_partition.major_range_first()); - auto idx = static_cast(blockIdx.x); - - using BlockReduce = cub::BlockReduce; - [[maybe_unused]] __shared__ - typename BlockReduce::TempStorage temp_storage; // relevant only if update_major == true - - while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = static_cast(major_start_offset + idx); - auto major = edge_partition.major_from_major_offset_nocheck(major_offset); - vertex_t const* indices{nullptr}; - edge_t edge_offset{}; - edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); - - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - major, - major_offset, - indices, - edge_offset}; - - [[maybe_unused]] auto reduced_e_op_result = - threadIdx.x == 0 ? 
init : identity_element; // relevant only if update_major == true - if (edge_partition_e_mask) { - for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } - } - } else { - for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } - } - - if constexpr (update_major) { - reduced_e_op_result = BlockReduce(temp_storage).Reduce(reduced_e_op_result, reduce_op); - if (threadIdx.x == 0) { *(result_value_output + idx) = reduced_e_op_result; } - } - - idx += gridDim.x; - } + constexpr bool incoming = true; + + detail::per_v_transform_reduce_e( + handle, + graph_view, + static_cast(nullptr), + static_cast(nullptr), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + detail::const_true_e_op_t{}, + vertex_value_output_first); } -template -void per_v_transform_reduce_e(raft::handle_t const& handle, - GraphViewType const& graph_view, - EdgeSrcValueInputWrapper edge_src_value_input, - EdgeDstValueInputWrapper edge_dst_value_input, - EdgeValueInputWrapper edge_value_input, - EdgeOp e_op, - T init, - ReduceOp reduce_op, - VertexValueOutputIterator vertex_value_output_first) +void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& key_list, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) { - static_assert(ReduceOp::pure_function && reduce_op::has_compatible_raft_comms_op_v && - reduce_op::has_identity_element_v); // current restriction, to support - // general reduction, we may need to - // take a less efficient code path - - constexpr auto update_major = (incoming == GraphViewType::is_storage_transposed); - [[maybe_unused]] constexpr auto max_segments = - detail::num_sparse_segments_per_vertex_partition + size_t{1}; - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; + static_assert(GraphViewType::is_storage_transposed); - using edge_partition_src_input_device_view_t = std::conditional_t< - std::is_same_v, - detail::edge_partition_endpoint_dummy_property_device_view_t, - detail::edge_partition_endpoint_property_device_view_t< - vertex_t, - typename EdgeSrcValueInputWrapper::value_iterator, - typename EdgeSrcValueInputWrapper::value_type>>; - using edge_partition_dst_input_device_view_t = std::conditional_t< - std::is_same_v, - 
detail::edge_partition_endpoint_dummy_property_device_view_t, - detail::edge_partition_endpoint_property_device_view_t< - vertex_t, - typename EdgeDstValueInputWrapper::value_iterator, - typename EdgeDstValueInputWrapper::value_type>>; - using edge_partition_e_input_device_view_t = std::conditional_t< - std::is_same_v, - detail::edge_partition_edge_dummy_property_device_view_t, - detail::edge_partition_edge_property_device_view_t< - edge_t, - typename EdgeValueInputWrapper::value_iterator, - typename EdgeValueInputWrapper::value_type>>; - - static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - - using minor_tmp_buffer_type = std::conditional_t, - edge_dst_property_t>; - [[maybe_unused]] std::unique_ptr minor_tmp_buffer{}; - if constexpr (GraphViewType::is_multi_gpu && !update_major) { - minor_tmp_buffer = std::make_unique(handle, graph_view); - } - - using edge_partition_minor_output_device_view_t = - std::conditional_tmutable_view().value_first())>, - void /* dummy */>; - - if constexpr (update_major) { - size_t partition_idx = 0; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - partition_idx = static_cast(minor_comm_rank); - } - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); - if (segment_offsets) { // no vertices in the zero degree segment are visited - thrust::fill(handle.get_thrust_policy(), - vertex_value_output_first + *((*segment_offsets).rbegin() + 1), - vertex_value_output_first + *((*segment_offsets).rbegin()), - init); - } - } else { - if constexpr (GraphViewType::is_multi_gpu) { - auto minor_init = init; - auto view = minor_tmp_buffer->view(); - if (view.keys()) { // defer applying the initial value to the end as minor_tmp_buffer may not - // store values for the entire minor range - minor_init = ReduceOp::identity_element; - } else { - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_rank = major_comm.get_rank(); - minor_init = (major_comm_rank == 0) ? init : ReduceOp::identity_element; - } - fill_edge_minor_property(handle, graph_view, minor_tmp_buffer->mutable_view(), minor_init); - } else { - thrust::fill(handle.get_thrust_policy(), - vertex_value_output_first, - vertex_value_output_first + graph_view.local_vertex_partition_range_size(), - init); - } - } - - std::optional> stream_pool_indices{std::nullopt}; - if constexpr (GraphViewType::is_multi_gpu) { - if ((graph_view.local_edge_partition_segment_offsets(0)) && - (handle.get_stream_pool_size() >= max_segments)) { - for (size_t i = 1; i < graph_view.number_of_local_edge_partitions(); ++i) { - assert(graph_view.local_edge_partition_segment_offsets(i)); - } - - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - - // memory footprint vs parallelism trade-off - // peak memory requirement per loop is - // update_major ? 
V / comm_size * sizeof(T) : 0 - // and limit memory requirement to (E / comm_size) * sizeof(vertex_t) - - size_t num_streams = - std::min(static_cast(minor_comm_size) * max_segments, - raft::round_down_safe(handle.get_stream_pool_size(), max_segments)); - if constexpr (update_major) { - size_t value_size{0}; - if constexpr (is_thrust_tuple_of_arithmetic::value) { - auto elem_sizes = compute_thrust_tuple_element_sizes{}(); - value_size = std::reduce(elem_sizes.begin(), elem_sizes.end()); - } else { - value_size = sizeof(T); - } - - auto avg_vertex_degree = - graph_view.number_of_vertices() > 0 - ? (static_cast(graph_view.compute_number_of_edges(handle)) / - static_cast(graph_view.number_of_vertices())) - : double{0.0}; - - num_streams = - std::min(static_cast(avg_vertex_degree * (static_cast(sizeof(vertex_t)) / - static_cast(value_size))) * - max_segments, - num_streams); - } - - if (num_streams >= max_segments) { - stream_pool_indices = std::vector(num_streams); - std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); - handle.sync_stream(); - } - } - } - - std::vector(0, rmm::cuda_stream_view{}))> - major_tmp_buffers{}; - if constexpr (GraphViewType::is_multi_gpu && update_major) { - std::vector major_tmp_buffer_sizes(graph_view.number_of_local_edge_partitions(), - size_t{0}); - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - if (segment_offsets) { - major_tmp_buffer_sizes[i] = - *((*segment_offsets).rbegin() + 1); // exclude the zero degree segment - } else { - if constexpr (GraphViewType::is_storage_transposed) { - major_tmp_buffer_sizes[i] = graph_view.local_edge_partition_dst_range_size(i); - } else { - major_tmp_buffer_sizes[i] = graph_view.local_edge_partition_src_range_size(i); - } - } - } - if (stream_pool_indices) { - auto num_concurrent_loops = (*stream_pool_indices).size() / max_segments; - major_tmp_buffers.reserve(num_concurrent_loops); - for (size_t i = 0; i < num_concurrent_loops; ++i) { - size_t max_size{0}; - for (size_t j = i; j < graph_view.number_of_local_edge_partitions(); - j += num_concurrent_loops) { - max_size = std::max(major_tmp_buffer_sizes[j], max_size); - } - major_tmp_buffers.push_back(allocate_dataframe_buffer(max_size, handle.get_stream())); - } - } else { - major_tmp_buffers.reserve(1); - major_tmp_buffers.push_back(allocate_dataframe_buffer( - *std::max_element(major_tmp_buffer_sizes.begin(), major_tmp_buffer_sizes.end()), - handle.get_stream())); - } - } else { // dummy - major_tmp_buffers.reserve(1); - major_tmp_buffers.push_back(allocate_dataframe_buffer(size_t{0}, handle.get_stream())); - } - - if (stream_pool_indices) { handle.sync_stream(); } - - auto edge_mask_view = graph_view.edge_mask_view(); - - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(i)); - auto edge_partition_e_mask = - edge_mask_view - ? thrust::make_optional< - detail::edge_partition_edge_property_device_view_t>( - *edge_mask_view, i) - : thrust::nullopt; - - auto major_init = ReduceOp::identity_element; - if constexpr (update_major) { - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - major_init = (static_cast(i) == minor_comm_rank) ? 
init : ReduceOp::identity_element; - } else { - major_init = init; - } - } - - edge_partition_src_input_device_view_t edge_partition_src_value_input{}; - edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; - if constexpr (GraphViewType::is_storage_transposed) { - edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input); - edge_partition_dst_value_input = - edge_partition_dst_input_device_view_t(edge_dst_value_input, i); - } else { - edge_partition_src_value_input = - edge_partition_src_input_device_view_t(edge_src_value_input, i); - edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input); - } - auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, i); - - auto major_buffer_first = - get_dataframe_buffer_begin(major_tmp_buffers[i % major_tmp_buffers.size()]); - - std::conditional_t, - VertexValueOutputIterator> - output_buffer{}; - if constexpr (GraphViewType::is_multi_gpu) { - if constexpr (update_major) { - output_buffer = major_buffer_first; - } else { - output_buffer = edge_partition_minor_output_device_view_t(minor_tmp_buffer->mutable_view()); - } - } else { - output_buffer = vertex_value_output_first; - } - - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - if (segment_offsets) { - static_assert(detail::num_sparse_segments_per_vertex_partition == 3); - - // FIXME: we may further improve performance by 1) individually tuning block sizes for - // different segments; and 2) adding one more segment for very high degree vertices and - // running segmented reduction - if (edge_partition.dcs_nzd_vertex_count()) { - auto exec_stream = - stream_pool_indices - ? handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) - : handle.get_stream(); - - if constexpr (update_major) { // this is necessary as we don't visit every vertex in the - // hypersparse segment - thrust::fill(rmm::exec_policy(exec_stream), - output_buffer + (*segment_offsets)[3], - output_buffer + (*segment_offsets)[4], - major_init); - } - - if (*(edge_partition.dcs_nzd_vertex_count()) > 0) { - raft::grid_1d_thread_t update_grid(*(edge_partition.dcs_nzd_vertex_count()), - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[3]; } - detail::per_v_transform_reduce_e_hypersparse - <<>>( - edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - segment_output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op); - } - } - if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { - auto exec_stream = stream_pool_indices - ? 
handle.get_stream_from_stream_pool((i * max_segments + 1) % - (*stream_pool_indices).size()) - : handle.get_stream(); - raft::grid_1d_thread_t update_grid((*segment_offsets)[3] - (*segment_offsets)[2], - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[2]; } - detail::per_v_transform_reduce_e_low_degree - <<>>( - edge_partition, - edge_partition.major_range_first() + (*segment_offsets)[2], - edge_partition.major_range_first() + (*segment_offsets)[3], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - segment_output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op); - } - if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { - auto exec_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((i * max_segments + 2) % - (*stream_pool_indices).size()) - : handle.get_stream(); - raft::grid_1d_warp_t update_grid((*segment_offsets)[2] - (*segment_offsets)[1], - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[1]; } - detail::per_v_transform_reduce_e_mid_degree - <<>>( - edge_partition, - edge_partition.major_range_first() + (*segment_offsets)[1], - edge_partition.major_range_first() + (*segment_offsets)[2], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - segment_output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op); - } - if ((*segment_offsets)[1] > 0) { - auto exec_stream = stream_pool_indices - ? 
handle.get_stream_from_stream_pool((i * max_segments + 3) % - (*stream_pool_indices).size()) - : handle.get_stream(); - raft::grid_1d_block_t update_grid((*segment_offsets)[1], - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - detail::per_v_transform_reduce_e_high_degree - <<>>( - edge_partition, - edge_partition.major_range_first(), - edge_partition.major_range_first() + (*segment_offsets)[1], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op); - } - } else { - if (edge_partition.major_range_size() > 0) { - raft::grid_1d_thread_t update_grid(edge_partition.major_range_size(), - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - detail::per_v_transform_reduce_e_low_degree - <<>>( - edge_partition, - edge_partition.major_range_first(), - edge_partition.major_range_last(), - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op); - } - } - - if constexpr (GraphViewType::is_multi_gpu && update_major) { - auto& comm = handle.get_comms(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); - - if (segment_offsets && stream_pool_indices) { - if (edge_partition.dcs_nzd_vertex_count()) { - device_reduce( - minor_comm, - major_buffer_first + (*segment_offsets)[3], - vertex_value_output_first + (*segment_offsets)[3], - (*segment_offsets)[4] - (*segment_offsets)[3], - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size())); - } - if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { - device_reduce(minor_comm, - major_buffer_first + (*segment_offsets)[2], - vertex_value_output_first + (*segment_offsets)[2], - (*segment_offsets)[3] - (*segment_offsets)[2], - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream_from_stream_pool((i * max_segments + 1) % - (*stream_pool_indices).size())); - } - if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { - device_reduce(minor_comm, - major_buffer_first + (*segment_offsets)[1], - vertex_value_output_first + (*segment_offsets)[1], - (*segment_offsets)[2] - (*segment_offsets)[1], - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream_from_stream_pool((i * max_segments + 2) % - (*stream_pool_indices).size())); - } - if ((*segment_offsets)[1] > 0) { - device_reduce(minor_comm, - major_buffer_first, - vertex_value_output_first, - (*segment_offsets)[1], - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream_from_stream_pool((i * max_segments + 3) % - (*stream_pool_indices).size())); - } - } else { - size_t reduction_size = static_cast( - segment_offsets ? 
*((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ - : edge_partition.major_range_size()); - device_reduce(minor_comm, - major_buffer_first, - vertex_value_output_first, - reduction_size, - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream()); - } - } - - if (stream_pool_indices && ((i + 1) % major_tmp_buffers.size() == 0)) { - handle.sync_stream_pool( - *stream_pool_indices); // to prevent buffer over-write (this can happen as *segment_offsets - // do not necessarily coincide in different edge partitions). - } + if (do_expensive_check) { + // currently, nothing to do } - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } - - if constexpr (GraphViewType::is_multi_gpu && !update_major) { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_rank = major_comm.get_rank(); - auto const major_comm_size = major_comm.get_size(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); - - auto view = minor_tmp_buffer->view(); - if (view.keys()) { // applying the initial value is deferred to here - vertex_t max_vertex_partition_size{0}; - for (int i = 0; i < major_comm_size; ++i) { - auto this_segment_vertex_partition_id = - compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - max_vertex_partition_size = - std::max(max_vertex_partition_size, - graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)); - } - auto tx_buffer = allocate_dataframe_buffer(max_vertex_partition_size, handle.get_stream()); - auto tx_buffer_first = get_dataframe_buffer_begin(tx_buffer); - std::optional> minor_key_offsets{}; - if constexpr (GraphViewType::is_storage_transposed) { - minor_key_offsets = graph_view.local_sorted_unique_edge_src_vertex_partition_offsets(); - } else { - minor_key_offsets = graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets(); - } - for (int i = 0; i < major_comm_size; ++i) { - auto minor_init = (major_comm_rank == i) ? 
init : ReduceOp::identity_element; - auto this_segment_vertex_partition_id = - compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - thrust::fill(handle.get_thrust_policy(), - tx_buffer_first, - tx_buffer_first + - graph_view.vertex_partition_range_size(this_segment_vertex_partition_id), - minor_init); - auto value_first = thrust::make_transform_iterator( - view.value_first(), - cuda::proclaim_return_type( - [reduce_op, minor_init] __device__(auto val) { return reduce_op(val, minor_init); })); - thrust::scatter(handle.get_thrust_policy(), - value_first + (*minor_key_offsets)[i], - value_first + (*minor_key_offsets)[i + 1], - thrust::make_transform_iterator( - (*(view.keys())).begin() + (*minor_key_offsets)[i], - cuda::proclaim_return_type( - [key_first = graph_view.vertex_partition_range_first( - this_segment_vertex_partition_id)] __device__(auto key) { - return key - key_first; - })), - tx_buffer_first); - device_reduce(major_comm, - tx_buffer_first, - vertex_value_output_first, - static_cast( - graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)), - ReduceOp::compatible_raft_comms_op, - i, - handle.get_stream()); - } - } else { - auto first_segment_vertex_partition_id = - compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(0); - vertex_t minor_range_first = - graph_view.vertex_partition_range_first(first_segment_vertex_partition_id); - for (int i = 0; i < major_comm_size; ++i) { - auto this_segment_vertex_partition_id = - compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - auto offset = graph_view.vertex_partition_range_first(this_segment_vertex_partition_id) - - minor_range_first; - device_reduce(major_comm, - view.value_first() + offset, - vertex_value_output_first, - static_cast( - graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)), - ReduceOp::compatible_raft_comms_op, - i, - handle.get_stream()); - } - } - } + constexpr bool incoming = true; + + detail::per_v_transform_reduce_e( + handle, + graph_view, + key_list.begin(), + key_list.end(), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + detail::const_true_e_op_t{}, + vertex_value_output_first); } -} // namespace detail - /** - * @brief Iterate over every vertex's incoming edges to update vertex properties. + * @brief Iterate over every vertex's outgoing edges to update vertex properties. * - * This function is inspired by thrust::transform_reduce. + * This function is inspired by thrust::transform_reduce(). * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. @@ -1131,8 +240,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, * @param edge_src_value_input Wrapper used to access source input property values (for the edge * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() - * (if @p e_op does not access source property values). Use update_edge_src_property to - * fill the wrapper. + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. 
* @param edge_dst_value_input Wrapper used to access destination input property values (for the * edge destinations assigned to this process in multi-GPU). Use either * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or @@ -1145,14 +254,16 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, * @param e_op Quinary operator takes edge source, edge destination, property values for the source, * destination, and edge and returns a value to be reduced. * @param init Initial value to be added to the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 outgoing edges. * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is * recommended to use the pre-defined reduction operators whenever possible as the current (and * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has * known member variables) to take a more optimized code path. See the documentation in the * reduce_op.cuh file for instructions on writing custom reduction operators. - * @param vertex_value_output_first Iterator pointing to the vertex property variables for the first - * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last` + * @param vertex_value_output_first Iterator pointing to the vertex property variables for the + * first (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last` * (exclusive) is deduced as @p vertex_value_output_first + @p * graph_view.local_vertex_partition_range_size(). * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). @@ -1165,7 +276,7 @@ template -void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, +void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle, GraphViewType const& graph_view, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, @@ -1180,23 +291,37 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, // currently, nothing to do } - detail::per_v_transform_reduce_e(handle, - graph_view, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - init, - reduce_op, - vertex_value_output_first); + constexpr bool incoming = false; + + detail::per_v_transform_reduce_e( + handle, + graph_view, + static_cast(nullptr), + static_cast(nullptr), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + detail::const_true_e_op_t{}, + vertex_value_output_first); } /** - * @brief Iterate over every vertex's outgoing edges to update vertex properties. + * @brief For each (tagged-)vertex in the input (tagged-)vertex list, iterate over the outgoing + * edges to update (tagged-)vertex properties. * * This function is inspired by thrust::transform_reduce(). * * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. 
@@ -1207,6 +332,8 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to update + * (tagged-)vertex properties. * @param edge_src_value_input Wrapper used to access source input property values (for the edge * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() @@ -1223,20 +350,22 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, * access edge property values). * @param e_op Quinary operator takes edge source, edge destination, property values for the source, * destination, and edge and returns a value to be reduced. - * @param init Initial value to be added to the reduced @p e_op return values for each vertex. + * @param init Initial value to be reduced with the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 outgoing edges. * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is * recommended to use the pre-defined reduction operators whenever possible as the current (and * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has * known member variables) to take a more optimized code path. See the documentation in the * reduce_op.cuh file for instructions on writing custom reduction operators. - * @param vertex_value_output_first Iterator pointing to the vertex property variables for the - * first (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last` - * (exclusive) is deduced as @p vertex_value_output_first + @p - * graph_view.local_vertex_partition_range_size(). + * @param vertex_value_output_first Iterator pointing to the (tagged-)vertex property variables for + * the first (inclusive) (tagged-)vertex in @p key_list. `vertex_value_output_last` (exclusive) is + * deduced as @p vertex_value_output_first + @p key_list.size(). * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). 
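 *
 * As an informal illustration only (not part of this header): the sketch below computes, for
 * each (tagged-)vertex in a frontier bucket, the sum of its outgoing edge weights.
 * `frontier_bucket`, `edge_weight_view`, `weight_t`, and `out_weight_sums` are hypothetical
 * placeholder names used only for this sketch.
 *
 *   per_v_transform_reduce_outgoing_e(
 *     handle,
 *     graph_view,
 *     frontier_bucket,  // KeyBucketType object holding the (tagged-)vertex list
 *     cugraph::edge_src_dummy_property_t{}.view(),
 *     cugraph::edge_dst_dummy_property_t{}.view(),
 *     edge_weight_view,
 *     [] __device__(auto src, auto dst, auto, auto, weight_t w) { return w; },
 *     weight_t{0},  // init; returned as-is for vertices with no outgoing edges
 *     cugraph::reduce_op::plus<weight_t>{},
 *     out_weight_sums.begin());  // one output value per element of frontier_bucket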
*/ template void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle, GraphViewType const& graph_view, + KeyBucketType const& key_list, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, @@ -1255,19 +385,33 @@ void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle, VertexValueOutputIterator vertex_value_output_first, bool do_expensive_check = false) { + static_assert(!GraphViewType::is_storage_transposed); + static_assert(KeyBucketType::is_sorted_unique); + if (do_expensive_check) { // currently, nothing to do } - detail::per_v_transform_reduce_e(handle, - graph_view, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - init, - reduce_op, - vertex_value_output_first); + constexpr bool incoming = false; + + detail::per_v_transform_reduce_e( + handle, + graph_view, + key_list.begin(), + key_list.end(), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + detail::const_true_e_op_t{}, + vertex_value_output_first); } } // namespace cugraph diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh new file mode 100644 index 0000000000..87f590f571 --- /dev/null +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh @@ -0,0 +1,1196 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "detail/graph_partition_utils.cuh" +#include "prims/detail/extract_transform_v_frontier_e.cuh" +#include "prims/detail/prim_utils.cuh" +#include "prims/property_op_utils.cuh" +#include "prims/reduce_op.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace cugraph { + +namespace detail { + +int32_t constexpr update_v_frontier_from_outgoing_e_kernel_block_size = 512; + +template +struct transform_reduce_v_frontier_call_e_op_t { + EdgeOp e_op{}; + + __device__ thrust::optional< + std::conditional_t && !std::is_same_v, + thrust::tuple, + std::conditional_t, key_t, payload_t>>> + operator()(key_t key, vertex_t dst, src_value_t sv, dst_value_t dv, e_value_t ev) const + { + auto e_op_result = e_op(key, dst, sv, dv, ev); + if (e_op_result.has_value()) { + auto reduce_by = dst; + if constexpr (std::is_same_v && std::is_same_v) { + return reduce_by; + } else if constexpr (std::is_same_v && !std::is_same_v) { + return thrust::make_tuple(reduce_by, *e_op_result); + } else if constexpr (!std::is_same_v && std::is_same_v) { + return thrust::make_tuple(reduce_by, *e_op_result); + } else { + return thrust::make_tuple(thrust::make_tuple(reduce_by, thrust::get<0>(*e_op_result)), + thrust::get<1>(*e_op_result)); + } + } else { + return thrust::nullopt; + } + } +}; + +template +struct update_keep_flag_t { + using input_key_t = + typename thrust::iterator_traits::value_type; // uint32_t (compressed) or + // key_t (i.e. 
vertex_t) + + raft::device_span bitmap{}; + raft::device_span keep_flags{}; + key_t v_range_first{}; + InputKeyIterator input_key_first{}; + thrust::optional invalid_input_key{}; + + __device__ void operator()(size_t i) const + { + auto v = *(input_key_first + i); + if (invalid_input_key && (v == *invalid_input_key)) { + return; // just discard + } + input_key_t v_offset{}; + if constexpr ((sizeof(key_t) == 8) && std::is_same_v) { + v_offset = v; + } else { + v_offset = v - v_range_first; + } + cuda::atomic_ref bitmap_word( + bitmap[packed_bool_offset(v_offset)]); + auto old = bitmap_word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + if ((old & packed_bool_mask(v_offset)) == packed_bool_empty_mask()) { + cuda::atomic_ref keep_flag_word( + keep_flags[packed_bool_offset(i)]); + keep_flag_word.fetch_or(packed_bool_mask(i), cuda::std::memory_order_relaxed); + } + } +}; + +template +std::tuple, optional_dataframe_buffer_type_t> +filter_buffer_elements( + raft::handle_t const& handle, + rmm::device_uvector&& + unique_v_buffer, // assumes that buffer elements are locally reduced first and unique + optional_dataframe_buffer_type_t&& payload_buffer, + raft::device_span vertex_partition_range_offsets, // size = major_comm_size + 1 + vertex_t allreduce_count_per_rank, + int subgroup_size) +{ + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + auto const major_comm_size = major_comm.get_size(); + + rmm::device_uvector priorities(allreduce_count_per_rank * major_comm_size, + handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + priorities.begin(), + priorities.end(), + std::numeric_limits::max()); + thrust::for_each( + handle.get_thrust_policy(), + unique_v_buffer.begin(), + unique_v_buffer.end(), + [offsets = vertex_partition_range_offsets, + priorities = raft::device_span(priorities.data(), priorities.size()), + allreduce_count_per_rank, + subgroup_size, + major_comm_rank, + major_comm_size] __device__(auto v) { + auto root = + thrust::distance(offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), v)); + auto v_offset = v - offsets[root]; + if (v_offset < allreduce_count_per_rank) { + priorities[allreduce_count_per_rank * root + v_offset] = + rank_to_priority( + major_comm_rank, root, subgroup_size, major_comm_size, v_offset); + } + }); + device_allreduce(major_comm, + priorities.data(), + priorities.data(), + priorities.size(), + raft::comms::op_t::MIN, + handle.get_stream()); + if constexpr (std::is_same_v) { + unique_v_buffer.resize( + thrust::distance( + unique_v_buffer.begin(), + thrust::remove_if( + handle.get_thrust_policy(), + unique_v_buffer.begin(), + unique_v_buffer.end(), + unique_v_buffer.begin(), + [offsets = vertex_partition_range_offsets, + priorities = raft::device_span(priorities.data(), priorities.size()), + allreduce_count_per_rank, + subgroup_size, + major_comm_rank, + major_comm_size] __device__(auto v) { + auto root = thrust::distance( + offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), v)); + auto v_offset = v - offsets[root]; + if (v_offset < allreduce_count_per_rank) { + auto selected_rank = priority_to_rank( + priorities[allreduce_count_per_rank * root + v_offset], + root, + subgroup_size, + major_comm_size, + v_offset); + return major_comm_rank != selected_rank; + } else { + return false; + } + })), + handle.get_stream()); + } else { + auto kv_pair_first = 
thrust::make_zip_iterator(unique_v_buffer.begin(), + get_dataframe_buffer_begin(payload_buffer)); + unique_v_buffer.resize( + thrust::distance( + kv_pair_first, + thrust::remove_if( + handle.get_thrust_policy(), + kv_pair_first, + kv_pair_first + unique_v_buffer.size(), + unique_v_buffer.begin(), + [offsets = vertex_partition_range_offsets, + priorities = raft::device_span(priorities.data(), priorities.size()), + allreduce_count_per_rank, + subgroup_size, + major_comm_rank, + major_comm_size] __device__(auto v) { + auto root = thrust::distance( + offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), v)); + auto v_offset = v - offsets[root]; + if (v_offset < allreduce_count_per_rank) { + auto selected_rank = priority_to_rank( + priorities[allreduce_count_per_rank * root + v_offset], + root, + subgroup_size, + major_comm_size, + v_offset); + return major_comm_rank != selected_rank; + } else { + return false; + } + })), + handle.get_stream()); + resize_dataframe_buffer(payload_buffer, unique_v_buffer.size(), handle.get_stream()); + } + + return std::make_tuple(std::move(unique_v_buffer), std::move(payload_buffer)); +} + +template +std::tuple, optional_dataframe_buffer_type_t> +sort_and_reduce_buffer_elements( + raft::handle_t const& handle, + dataframe_buffer_type_t&& key_buffer, + optional_dataframe_buffer_type_t&& payload_buffer, + ReduceOp reduce_op, + std::conditional_t, std::tuple, std::byte /* dummy */> + vertex_range, + std::optional invalid_key /* drop (key, (payload)) pairs with invalid key */) +{ + constexpr bool compressed = + std::is_integral_v && (sizeof(key_t) == 8) && + std::is_same_v; // we currently compress only when key_t is an integral + // type (i.e. vertex_t) + static_assert(compressed || std::is_same_v); + + if constexpr (std::is_integral_v && + (std::is_same_v || + std::is_same_v>)) { // try to use + // bitmap for + // filtering + key_t range_size = std::get<1>(vertex_range) - std::get<0>(vertex_range); + if (static_cast(size_dataframe_buffer(key_buffer)) >= + static_cast(range_size) * + 0.125 /* tuning parameter */) { // use bitmap for filtering + rmm::device_uvector bitmap(packed_bool_size(range_size), handle.get_stream()); + rmm::device_uvector keep_flags(packed_bool_size(size_dataframe_buffer(key_buffer)), + handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); + thrust::fill( + handle.get_thrust_policy(), keep_flags.begin(), keep_flags.end(), packed_bool_empty_mask()); + thrust::for_each(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(size_dataframe_buffer(key_buffer)), + update_keep_flag_t{ + raft::device_span(bitmap.data(), bitmap.size()), + raft::device_span(keep_flags.data(), keep_flags.size()), + std::get<0>(vertex_range), + get_dataframe_buffer_begin(key_buffer), + to_thrust_optional(invalid_key)}); + auto stencil_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [keep_flags = raft::device_span(keep_flags.data(), + keep_flags.size())] __device__(size_t i) { + return (keep_flags[packed_bool_offset(i)] & packed_bool_mask(i)) != + packed_bool_empty_mask(); + })); + if constexpr (std::is_same_v) { + resize_dataframe_buffer( + key_buffer, + thrust::distance(get_dataframe_buffer_begin(key_buffer), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + 
stencil_first, + is_not_equal_t{true})), + handle.get_stream()); + shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); + thrust::sort(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer)); + } else { + static_assert(std::is_same_v>); + auto pair_first = thrust::make_zip_iterator(get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_begin(payload_buffer)); + resize_dataframe_buffer( + key_buffer, + thrust::distance(pair_first, + thrust::remove_if(handle.get_thrust_policy(), + pair_first, + pair_first + size_dataframe_buffer(key_buffer), + stencil_first, + is_not_equal_t{true})), + handle.get_stream()); + resize_dataframe_buffer( + payload_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); + shrink_to_fit_dataframe_buffer(payload_buffer, handle.get_stream()); + thrust::sort_by_key(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + get_dataframe_buffer_begin(payload_buffer)); + } + + if constexpr (compressed) { + rmm::device_uvector output_key_buffer(key_buffer.size(), handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + key_buffer.begin(), + key_buffer.end(), + output_key_buffer.begin(), + cuda::proclaim_return_type( + [v_first = std::get<0>(vertex_range)] __device__(uint32_t v_offset) { + return static_cast(v_first + v_offset); + })); + return std::make_tuple(std::move(output_key_buffer), std::move(payload_buffer)); + } else { + return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); + } + } + } + + if constexpr (std::is_same_v) { + thrust::sort(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer)); + } else { + thrust::sort_by_key(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + get_optional_dataframe_buffer_begin(payload_buffer)); + } + + auto output_key_buffer = allocate_dataframe_buffer(0, handle.get_stream()); + if constexpr (std::is_same_v) { + if constexpr (compressed) { + resize_dataframe_buffer( + output_key_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + auto input_key_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(key_buffer), + cuda::proclaim_return_type( + [v_first = std::get<0>(vertex_range)] __device__(auto v_offset) { + return static_cast(v_first + v_offset); + })); + resize_dataframe_buffer( + output_key_buffer, + thrust::distance( + get_dataframe_buffer_begin(output_key_buffer), + thrust::copy_if(handle.get_thrust_policy(), + input_key_first, + input_key_first + size_dataframe_buffer(key_buffer), + thrust::make_counting_iterator(size_t{0}), + get_dataframe_buffer_begin(output_key_buffer), + cuda::proclaim_return_type( + [key_first = get_dataframe_buffer_begin(key_buffer), + invalid_key = to_thrust_optional(invalid_key)] __device__(size_t i) { + auto key = *(key_first + i); + if (invalid_key && (key == *invalid_key)) { + return false; + } else if ((i != 0) && (key == *(key_first + (i - 1)))) { + return false; + } else { + return true; + } + }))), + handle.get_stream()); + } else { + resize_dataframe_buffer( + key_buffer, + thrust::distance( + get_dataframe_buffer_begin(key_buffer), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + thrust::make_counting_iterator(size_t{0}), + 
cuda::proclaim_return_type( + [key_first = get_dataframe_buffer_begin(key_buffer), + invalid_key = to_thrust_optional(invalid_key)] __device__(size_t i) { + auto key = *(key_first + i); + if (invalid_key && (key == *invalid_key)) { + return true; + } else if ((i != 0) && (key == *(key_first + (i - 1)))) { + return true; + } else { + return false; + } + }))), + handle.get_stream()); + output_key_buffer = std::move(key_buffer); + } + shrink_to_fit_dataframe_buffer(output_key_buffer, handle.get_stream()); + } else if constexpr (std::is_same_v>) { + if constexpr (compressed) { + resize_dataframe_buffer( + output_key_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + auto input_key_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(key_buffer), + cuda::proclaim_return_type( + [v_first = std::get<0>(vertex_range)] __device__(auto v_offset) { + return static_cast(v_first + v_offset); + })); + auto tmp_payload_buffer = allocate_dataframe_buffer( + size_dataframe_buffer(payload_buffer), handle.get_stream()); + auto input_pair_first = + thrust::make_zip_iterator(input_key_first, get_dataframe_buffer_begin(payload_buffer)); + auto output_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(output_key_buffer), + get_dataframe_buffer_begin(tmp_payload_buffer)); + resize_dataframe_buffer( + output_key_buffer, + thrust::distance( + output_pair_first, + thrust::copy_if(handle.get_thrust_policy(), + input_pair_first, + input_pair_first + size_dataframe_buffer(key_buffer), + thrust::make_counting_iterator(size_t{0}), + output_pair_first, + cuda::proclaim_return_type( + [key_first = get_dataframe_buffer_begin(key_buffer), + invalid_key = to_thrust_optional(invalid_key)] __device__(size_t i) { + auto key = *(key_first + i); + if (invalid_key && (key == *invalid_key)) { + return false; + } else if ((i != 0) && (key == *(key_first + (i - 1)))) { + return false; + } else { + return true; + } + }))), + handle.get_stream()); + resize_dataframe_buffer( + tmp_payload_buffer, size_dataframe_buffer(output_key_buffer), handle.get_stream()); + payload_buffer = std::move(tmp_payload_buffer); + } else { + auto pair_first = thrust::make_zip_iterator(get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_begin(payload_buffer)); + resize_dataframe_buffer( + key_buffer, + thrust::distance( + pair_first, + thrust::remove_if(handle.get_thrust_policy(), + pair_first, + pair_first + size_dataframe_buffer(key_buffer), + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [key_first = get_dataframe_buffer_begin(key_buffer), + invalid_key = to_thrust_optional(invalid_key)] __device__(size_t i) { + auto key = *(key_first + i); + if (invalid_key && (key == *invalid_key)) { + return true; + } else if ((i != 0) && (key == *(key_first + (i - 1)))) { + return true; + } else { + return false; + } + }))), + handle.get_stream()); + resize_dataframe_buffer( + payload_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + output_key_buffer = std::move(key_buffer); + } + shrink_to_fit_dataframe_buffer(output_key_buffer, handle.get_stream()); + shrink_to_fit_dataframe_buffer(payload_buffer, handle.get_stream()); + } else { + if (invalid_key) { + auto pair_first = thrust::make_zip_iterator(get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_begin(payload_buffer)); + resize_dataframe_buffer( + key_buffer, + thrust::distance(pair_first, + thrust::remove_if(handle.get_thrust_policy(), + pair_first, + pair_first + size_dataframe_buffer(key_buffer), 
+ cuda::proclaim_return_type( + [invalid_key = *invalid_key] __device__(auto kv) { + auto key = thrust::get<0>(kv); + return key == invalid_key; + }))), + handle.get_stream()); + resize_dataframe_buffer( + payload_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + } + auto num_uniques = + thrust::count_if(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(size_dataframe_buffer(key_buffer)), + is_first_in_run_t{ + get_dataframe_buffer_begin(key_buffer)}); + + auto new_key_buffer = allocate_dataframe_buffer(num_uniques, handle.get_stream()); + auto new_payload_buffer = + allocate_dataframe_buffer(num_uniques, handle.get_stream()); + + if constexpr (compressed) { + auto input_key_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(key_buffer), + cuda::proclaim_return_type( + [v_first = std::get<0>(vertex_range)] __device__(auto v_offset) { + return static_cast(v_first + v_offset); + })); + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + size_dataframe_buffer(key_buffer), + get_optional_dataframe_buffer_begin(payload_buffer), + get_dataframe_buffer_begin(new_key_buffer), + get_dataframe_buffer_begin(new_payload_buffer), + thrust::equal_to(), + reduce_op); + } else { + thrust::reduce_by_key(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + get_optional_dataframe_buffer_begin(payload_buffer), + get_dataframe_buffer_begin(new_key_buffer), + get_dataframe_buffer_begin(new_payload_buffer), + thrust::equal_to(), + reduce_op); + } + + output_key_buffer = std::move(new_key_buffer); + payload_buffer = std::move(new_payload_buffer); + } + + return std::make_tuple(std::move(output_key_buffer), std::move(payload_buffer)); +} + +template +std::conditional_t< + !std::is_same_v, + std::tuple( + 0, rmm::cuda_stream_view{})), + decltype(detail::allocate_optional_dataframe_buffer( + 0, rmm::cuda_stream_view{}))>, + decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{}))> +transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& frontier, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + ReduceOp reduce_op, + bool do_expensive_check = false) +{ + static_assert(!GraphViewType::is_storage_transposed, + "GraphViewType should support the push model."); + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = typename KeyBucketType::key_type; + using payload_t = typename ReduceOp::value_type; + + if (do_expensive_check) { + // currently, nothing to do + } + + // 1. fill the buffer + + detail::transform_reduce_v_frontier_call_e_op_t + e_op_wrapper{e_op}; + + auto [key_buffer, payload_buffer] = + detail::extract_transform_v_frontier_e(handle, + graph_view, + frontier, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op_wrapper, + do_expensive_check); + // 2. 
reduce the buffer + + std::vector vertex_partition_range_offsets{}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + vertex_partition_range_offsets = std::vector(major_comm_size + 1); + for (int i = 0; i < major_comm_size; ++i) { + auto vertex_partition_id = + detail::compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + vertex_partition_range_offsets[i] = + graph_view.vertex_partition_range_first(vertex_partition_id); + } + vertex_partition_range_offsets.back() = graph_view.local_edge_partition_dst_range_last(); + } else { + vertex_partition_range_offsets = + std::vector{graph_view.local_edge_partition_dst_range_first(), + graph_view.local_edge_partition_dst_range_last()}; + } + std::conditional_t, std::tuple, std::byte /* dummy */> + vertex_range{}; + if constexpr (std::is_integral_v) { + vertex_range = std::make_tuple(vertex_partition_range_offsets.front(), + vertex_partition_range_offsets.back()); + } + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + reduce_op, + vertex_range, + std::nullopt); + if constexpr (GraphViewType::is_multi_gpu) { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + if (major_comm_size > 1) { + size_t local_key_buffer_size = size_dataframe_buffer(key_buffer); + auto avg_key_buffer_size = + host_scalar_allreduce( + major_comm, local_key_buffer_size, raft::comms::op_t::SUM, handle.get_stream()) / + major_comm_size; + + rmm::device_uvector d_vertex_partition_range_offsets( + vertex_partition_range_offsets.size(), handle.get_stream()); + raft::update_device(d_vertex_partition_range_offsets.data(), + vertex_partition_range_offsets.data(), + vertex_partition_range_offsets.size(), + handle.get_stream()); + + constexpr bool try_compression = (sizeof(vertex_t) == 8) && std::is_same_v; + std::conditional_t + max_vertex_partition_size{}; + if constexpr (try_compression) { + for (int i = 0; i < major_comm_size; ++i) { + max_vertex_partition_size = + std::max(vertex_partition_range_offsets[i + 1] - vertex_partition_range_offsets[i], + max_vertex_partition_size); + } + } + + if constexpr (std::is_same_v && + std::is_same_v>) { + vertex_t min_vertex_partition_size = std::numeric_limits::max(); + for (int i = 0; i < major_comm_size; ++i) { + min_vertex_partition_size = + std::min(vertex_partition_range_offsets[i + 1] - vertex_partition_range_offsets[i], + min_vertex_partition_size); + } + + auto segment_offsets = graph_view.local_vertex_partition_segment_offsets(); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + if (segment_offsets && + (static_cast(avg_key_buffer_size) > + static_cast(graph_view.number_of_vertices() / comm_size) * + double{0.2})) { // duplicates expected for high in-degree vertices (and we assume + // correlation between in-degrees & out-degrees) // FIXME: we need + // a better criterion + size_t key_size{0}; + size_t payload_size{0}; + if constexpr (try_compression) { + 
if (max_vertex_partition_size <= std::numeric_limits::max()) { + key_size = sizeof(uint32_t); + } else { + key_size = sizeof(key_t); + } + } else { + if constexpr (std::is_arithmetic_v) { + key_size = sizeof(key_t); + } else { + key_size = sum_thrust_tuple_element_sizes(); + } + } + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + payload_size = sizeof(payload_t); + } else { + payload_size = sum_thrust_tuple_element_sizes(); + } + } + + int subgroup_size{}; + int num_gpus_per_node{}; + RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); + if (comm_size <= num_gpus_per_node) { + subgroup_size = major_comm_size; + } else { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + subgroup_size = partition_manager::map_major_comm_to_gpu_row_comm + ? std::min(major_comm_size, num_gpus_per_node) + : std::max(num_gpus_per_node / minor_comm_size, int{1}); + } + + auto p2p_size_per_rank = avg_key_buffer_size * (key_size + payload_size); + auto p2p_size_per_node = p2p_size_per_rank * std::min(num_gpus_per_node, comm_size); + auto allreduce_size_per_node = p2p_size_per_node / 16 /* tuning parameter */; + auto allreduce_size_per_rank = + allreduce_size_per_node / (major_comm_size * (num_gpus_per_node / subgroup_size)); + + if (major_comm_size <= std::numeric_limits::max()) { // priority = uint8_t + std::tie(key_buffer, payload_buffer) = + filter_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + raft::device_span(d_vertex_partition_range_offsets.data(), + d_vertex_partition_range_offsets.size()), + std::min(static_cast(allreduce_size_per_rank / sizeof(uint8_t)), + min_vertex_partition_size), + subgroup_size); + } else { // priority = uint32_t + std::tie(key_buffer, payload_buffer) = + filter_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + raft::device_span(d_vertex_partition_range_offsets.data(), + d_vertex_partition_range_offsets.size()), + std::min(static_cast(allreduce_size_per_rank / sizeof(uint32_t)), + min_vertex_partition_size), + subgroup_size); + } + } + } + + rmm::device_uvector d_tx_buffer_last_boundaries(major_comm_size, handle.get_stream()); + auto key_v_first = + thrust_tuple_get_or_identity( + get_dataframe_buffer_begin(key_buffer)); + thrust::lower_bound(handle.get_thrust_policy(), + key_v_first, + key_v_first + size_dataframe_buffer(key_buffer), + d_vertex_partition_range_offsets.begin() + 1, + d_vertex_partition_range_offsets.end(), + d_tx_buffer_last_boundaries.begin()); + std::conditional_t>, + std::byte /* dummy */> + compressed_v_buffer{}; + if constexpr (try_compression) { + if (max_vertex_partition_size <= std::numeric_limits::max()) { + compressed_v_buffer = + rmm::device_uvector(size_dataframe_buffer(key_buffer), handle.get_stream()); + thrust::transform( + handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + (*compressed_v_buffer).begin(), + cuda::proclaim_return_type( + [firsts = raft::device_span(d_vertex_partition_range_offsets.data(), + static_cast(major_comm_size)), + lasts = raft::device_span( + d_vertex_partition_range_offsets.data() + 1, + static_cast(major_comm_size))] __device__(auto v) { + auto major_comm_rank = thrust::distance( + lasts.begin(), thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), v)); + return static_cast(v - firsts[major_comm_rank]); + })); + resize_dataframe_buffer(key_buffer, 0, 
handle.get_stream()); + shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); + } + } + std::vector h_tx_buffer_last_boundaries(d_tx_buffer_last_boundaries.size()); + raft::update_host(h_tx_buffer_last_boundaries.data(), + d_tx_buffer_last_boundaries.data(), + d_tx_buffer_last_boundaries.size(), + handle.get_stream()); + handle.sync_stream(); + std::vector tx_counts(h_tx_buffer_last_boundaries.size()); + std::adjacent_difference( + h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); + + size_t min_element_size{cache_line_size}; + if constexpr (std::is_same_v) { + if constexpr (try_compression) { + if (compressed_v_buffer) { + min_element_size = std::min(sizeof(uint32_t), min_element_size); + } else { + min_element_size = std::min(sizeof(key_t), min_element_size); + } + } else { + min_element_size = std::min(sizeof(key_t), min_element_size); + } + } else { + static_assert(is_thrust_tuple_of_arithmetic::value); + min_element_size = + std::min(cugraph::min_thrust_tuple_element_sizes(), min_element_size); + } + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + min_element_size = std::min(sizeof(payload_t), min_element_size); + } else { + static_assert(is_thrust_tuple_of_arithmetic::value); + min_element_size = + std::min(cugraph::min_thrust_tuple_element_sizes(), min_element_size); + } + } + assert((cache_line_size % min_element_size) == 0); + auto alignment = cache_line_size / min_element_size; + std::optional, key_t>> + invalid_key{std::nullopt}; + + if (avg_key_buffer_size >= alignment * size_t{128} /* 128 tuning parameter */) { + if constexpr (std::is_same_v) { + if constexpr (try_compression) { + if (compressed_v_buffer) { + invalid_key = std::numeric_limits::max(); + } else { + invalid_key = invalid_vertex_id_v; + } + } else { + invalid_key = invalid_vertex_id_v; + } + } else { + invalid_key = key_t{}; + thrust::get<0>(*invalid_key) = invalid_vertex_id_v; + } + + if constexpr (try_compression) { + if (compressed_v_buffer) { + auto rx_compressed_v_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_compressed_v_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = shuffle_values(major_comm, + get_dataframe_buffer_begin(*compressed_v_buffer), + tx_counts, + alignment, + std::make_optional(std::get<1>(*invalid_key)), + handle.get_stream()); + compressed_v_buffer = std::move(rx_compressed_v_buffer); + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = shuffle_values(major_comm, + get_dataframe_buffer_begin(key_buffer), + tx_counts, + alignment, + std::make_optional(std::get<0>(*invalid_key)), + handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + } + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = shuffle_values(major_comm, + get_dataframe_buffer_begin(key_buffer), + tx_counts, + alignment, + invalid_key, + handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + } + if constexpr (!std::is_same_v) { + auto rx_payload_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_payload_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = 
shuffle_values(major_comm, + get_dataframe_buffer_begin(payload_buffer), + tx_counts, + alignment, + std::nullopt, + handle.get_stream()); + payload_buffer = std::move(rx_payload_buffer); + } + } else { + if constexpr (try_compression) { + if (compressed_v_buffer) { + auto rx_compressed_v_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_compressed_v_buffer, std::ignore) = + shuffle_values(major_comm, + get_dataframe_buffer_begin(*compressed_v_buffer), + tx_counts, + handle.get_stream()); + compressed_v_buffer = std::move(rx_compressed_v_buffer); + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, std::ignore) = shuffle_values( + major_comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + } + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, std::ignore) = shuffle_values( + major_comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + } + + if constexpr (!std::is_same_v) { + auto rx_payload_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_payload_buffer, std::ignore) = shuffle_values( + major_comm, get_dataframe_buffer_begin(payload_buffer), tx_counts, handle.get_stream()); + payload_buffer = std::move(rx_payload_buffer); + } + } + + if constexpr (std::is_integral_v) { + vertex_range = std::make_tuple(graph_view.local_vertex_partition_range_first(), + graph_view.local_vertex_partition_range_last()); + } + if constexpr (try_compression) { + if (compressed_v_buffer) { + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, + std::move(*compressed_v_buffer), + std::move(payload_buffer), + reduce_op, + vertex_range, + invalid_key ? std::make_optional(std::get<1>(*invalid_key)) : std::nullopt); + } else { + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + reduce_op, + vertex_range, + invalid_key ? 
std::make_optional(std::get<0>(*invalid_key)) : std::nullopt); + } + } else { + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + reduce_op, + vertex_range, + invalid_key); + } + } + } + + if constexpr (!std::is_same_v) { + return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); + } else { + return std::move(key_buffer); + } +} + +} // namespace detail + +template +size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& frontier) +{ + static_assert(!GraphViewType::is_storage_transposed, + "GraphViewType should support the push model."); + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = typename KeyBucketType::key_type; + + size_t ret{0}; + + auto local_frontier_vertex_first = + thrust_tuple_get_or_identity(frontier.begin()); + + std::vector local_frontier_sizes{}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + local_frontier_sizes = host_scalar_allgather(minor_comm, frontier.size(), handle.get_stream()); + } else { + local_frontier_sizes = std::vector{static_cast(frontier.size())}; + } + + auto edge_mask_view = graph_view.edge_mask_view(); + + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(i)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, i) + : thrust::nullopt; + + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + + rmm::device_uvector edge_partition_frontier_vertices(local_frontier_sizes[i], + handle.get_stream()); + device_bcast(minor_comm, + local_frontier_vertex_first, + edge_partition_frontier_vertices.data(), + local_frontier_sizes[i], + static_cast(i), + handle.get_stream()); + + if (edge_partition_e_mask) { + ret += + edge_partition.compute_number_of_edges_with_mask((*edge_partition_e_mask).value_first(), + edge_partition_frontier_vertices.begin(), + edge_partition_frontier_vertices.end(), + handle.get_stream()); + } else { + ret += edge_partition.compute_number_of_edges(edge_partition_frontier_vertices.begin(), + edge_partition_frontier_vertices.end(), + handle.get_stream()); + } + } else { + assert(i == 0); + if (edge_partition_e_mask) { + ret += edge_partition.compute_number_of_edges_with_mask( + (*edge_partition_e_mask).value_first(), + local_frontier_vertex_first, + local_frontier_vertex_first + frontier.size(), + handle.get_stream()); + } else { + ret += edge_partition.compute_number_of_edges(local_frontier_vertex_first, + local_frontier_vertex_first + frontier.size(), + handle.get_stream()); + } + } + } + + return ret; +} + +/** + * @brief Iterate over outgoing edges from the current vertex frontier and reduce valid edge functor + * outputs by (tagged-)destination ID. + * + * Edge functor outputs are thrust::optional objects and invalid if thrust::nullopt. Vertices are + * assumed to be tagged if KeyBucketType::key_type is a tuple of a vertex type and a tag + * type (KeyBucketType::key_type is identical to a vertex type otherwise). 
+ *
+ * @tparam GraphViewType Type of the passed non-owning graph object.
+ * @tparam KeyBucketType Type of the vertex frontier bucket class which abstracts the
+ * current (tagged-)vertex frontier.
+ * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values.
+ * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values.
+ * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values.
+ * @tparam EdgeOp Type of the quinary edge operator.
+ * @tparam ReduceOp Type of the binary reduction operator.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Non-owning graph object.
+ * @param frontier KeyBucketType class object for the current vertex frontier.
+ * @param edge_src_value_input Wrapper used to access source input property values (for the edge
+ * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view()
+ * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view()
+ * (if @p e_op does not access source property values). Use update_edge_src_property to fill the
+ * wrapper.
+ * @param edge_dst_value_input Wrapper used to access destination input property values (for the
+ * edge destinations assigned to this process in multi-GPU). Use either
+ * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or
+ * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property
+ * values). Use update_edge_dst_property to fill the wrapper.
+ * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned
+ * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to
+ * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not
+ * access edge property values).
+ * @param e_op Quinary operator that takes an edge (tagged-)source, edge destination, and property
+ * values for the source, destination, and edge, and returns 1) thrust::nullopt (if invalid and to
+ * be discarded); 2) a dummy (but valid) thrust::optional object (e.g.
+ * thrust::optional<std::byte>{std::byte{0}}, if vertices are not tagged and ReduceOp::value_type
+ * is void); 3) a tag (if vertices are tagged and ReduceOp::value_type is void); 4) a value to be
+ * reduced using the @p reduce_op (if vertices are not tagged and ReduceOp::value_type is not
+ * void); or 5) a tuple of a tag and a value to be reduced (if vertices are tagged and
+ * ReduceOp::value_type is not void).
+ * @param reduce_op Binary operator that takes two input arguments and reduces the two values to
+ * one. There are pre-defined reduction operators in prims/reduce_op.cuh. It is recommended to use
+ * the pre-defined reduction operators whenever possible as the current (and future)
+ * implementations of graph primitives may check whether @p ReduceOp is a known type (or has known
+ * member variables) to take a more optimized code path. See the documentation in the reduce_op.cuh
+ * file for instructions on writing custom reduction operators.
+ * @return Tuple of key values and payload values (if ReduceOp::value_type is not void) or just key
+ * values (if ReduceOp::value_type is void). Keys in the return values are sorted in ascending order
+ * using a vertex ID as the primary key and a tag (if relevant) as the secondary key.
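+ *
+ * @note Illustrative usage sketch (editorial addition, not part of this PR). The names
+ * `handle`, `graph_view`, `vertex_frontier` (a cugraph::vertex_frontier_t), and `vertex_t` are
+ * assumed to be defined by the caller. A BFS-style traversal can record a predecessor per newly
+ * reached destination by returning the (tagged-)source as the payload and reducing with
+ * reduce_op::any:
+ * @code
+ *   auto new_frontier_key_and_predecessor_buffer =
+ *     cugraph::transform_reduce_v_frontier_outgoing_e_by_dst(
+ *       handle,
+ *       graph_view,
+ *       vertex_frontier.bucket(0),                      // current frontier bucket
+ *       cugraph::edge_src_dummy_property_t{}.view(),    // e_op ignores source properties
+ *       cugraph::edge_dst_dummy_property_t{}.view(),    // e_op ignores destination properties
+ *       cugraph::edge_dummy_property_t{}.view(),        // e_op ignores edge properties
+ *       [] __device__(vertex_t src, vertex_t dst, auto, auto, auto) {
+ *         return thrust::optional<vertex_t>{src};       // payload: predecessor candidate
+ *       },
+ *       cugraph::reduce_op::any<vertex_t>());
+ * @endcode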
+ */ +template +std::conditional_t< + !std::is_same_v, + std::tuple( + 0, rmm::cuda_stream_view{})), + decltype(detail::allocate_optional_dataframe_buffer( + 0, rmm::cuda_stream_view{}))>, + decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{}))> +transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& frontier, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + ReduceOp reduce_op, + bool do_expensive_check = false) +{ + return detail::transform_reduce_v_frontier_outgoing_e_by_dst(handle, + graph_view, + frontier, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + reduce_op, + do_expensive_check); +} + +} // namespace cugraph diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh deleted file mode 100644 index e58ab08fa9..0000000000 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh +++ /dev/null @@ -1,585 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include "detail/graph_partition_utils.cuh" -#include "prims/detail/extract_transform_v_frontier_e.cuh" -#include "prims/property_op_utils.cuh" -#include "prims/reduce_op.cuh" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -namespace cugraph { - -namespace detail { - -int32_t constexpr update_v_frontier_from_outgoing_e_kernel_block_size = 512; - -template -struct transform_reduce_v_frontier_call_e_op_t { - EdgeOp e_op{}; - - __device__ thrust::optional< - std::conditional_t && !std::is_same_v, - thrust::tuple, - std::conditional_t, key_t, payload_t>>> - operator()(key_t key, vertex_t dst, src_value_t sv, dst_value_t dv, e_value_t ev) const - { - auto e_op_result = e_op(key, dst, sv, dv, ev); - if (e_op_result.has_value()) { - auto reduce_by = reduce_by_src ? 
thrust_tuple_get_or_identity(key) : dst; - if constexpr (std::is_same_v && std::is_same_v) { - return reduce_by; - } else if constexpr (std::is_same_v && !std::is_same_v) { - return thrust::make_tuple(reduce_by, *e_op_result); - } else if constexpr (!std::is_same_v && std::is_same_v) { - return thrust::make_tuple(reduce_by, *e_op_result); - } else { - return thrust::make_tuple(thrust::make_tuple(reduce_by, thrust::get<0>(*e_op_result)), - thrust::get<1>(*e_op_result)); - } - } else { - return thrust::nullopt; - } - } -}; - -template -auto sort_and_reduce_buffer_elements( - raft::handle_t const& handle, - decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{}))&& key_buffer, - decltype(allocate_optional_dataframe_buffer(0, - rmm::cuda_stream_view{}))&& payload_buffer, - ReduceOp reduce_op) -{ - if constexpr (std::is_same_v) { - thrust::sort(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer)); - } else { - thrust::sort_by_key(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer), - get_optional_dataframe_buffer_begin(payload_buffer)); - } - - if constexpr (std::is_same_v) { - auto it = thrust::unique(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer)); - resize_dataframe_buffer( - key_buffer, - static_cast(thrust::distance(get_dataframe_buffer_begin(key_buffer), it)), - handle.get_stream()); - shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); - } else if constexpr (std::is_same_v>) { - auto it = thrust::unique_by_key(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer), - get_optional_dataframe_buffer_begin(payload_buffer)); - resize_dataframe_buffer(key_buffer, - static_cast(thrust::distance( - get_dataframe_buffer_begin(key_buffer), thrust::get<0>(it))), - handle.get_stream()); - resize_dataframe_buffer(payload_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); - shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); - shrink_to_fit_dataframe_buffer(payload_buffer, handle.get_stream()); - } else { - auto num_uniques = - thrust::count_if(handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(size_dataframe_buffer(key_buffer)), - is_first_in_run_t{ - get_dataframe_buffer_begin(key_buffer)}); - - auto new_key_buffer = allocate_dataframe_buffer(num_uniques, handle.get_stream()); - auto new_payload_buffer = - allocate_dataframe_buffer(num_uniques, handle.get_stream()); - - thrust::reduce_by_key(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer), - get_optional_dataframe_buffer_begin(payload_buffer), - get_dataframe_buffer_begin(new_key_buffer), - get_dataframe_buffer_begin(new_payload_buffer), - thrust::equal_to(), - reduce_op); - - key_buffer = std::move(new_key_buffer); - payload_buffer = std::move(new_payload_buffer); - } - - return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); -} - -template -std::conditional_t< - !std::is_same_v, - std::tuple( - 0, rmm::cuda_stream_view{})), - decltype(detail::allocate_optional_dataframe_buffer( - 0, rmm::cuda_stream_view{}))>, - decltype(allocate_dataframe_buffer( - 0, rmm::cuda_stream_view{}))> -transform_reduce_v_frontier_outgoing_e_by_src_dst(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, - 
EdgeSrcValueInputWrapper edge_src_value_input, - EdgeDstValueInputWrapper edge_dst_value_input, - EdgeValueInputWrapper edge_value_input, - EdgeOp e_op, - ReduceOp reduce_op, - bool do_expensive_check = false) -{ - static_assert(!GraphViewType::is_storage_transposed, - "GraphViewType should support the push model."); - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using key_t = typename VertexFrontierBucketType::key_type; - using payload_t = typename ReduceOp::value_type; - - if (do_expensive_check) { - // currently, nothing to do - } - - // 1. fill the buffer - - detail::transform_reduce_v_frontier_call_e_op_t - e_op_wrapper{e_op}; - - bool constexpr max_one_e_per_frontier_key = - reduce_by_src && std::is_same_v>; - auto [key_buffer, payload_buffer] = - detail::extract_transform_v_frontier_e( - handle, - graph_view, - frontier, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op_wrapper, - do_expensive_check); - - // 2. reduce the buffer - - std::tie(key_buffer, payload_buffer) = - detail::sort_and_reduce_buffer_elements( - handle, std::move(key_buffer), std::move(payload_buffer), reduce_op); - if constexpr (GraphViewType::is_multi_gpu) { - // FIXME: this step is unnecessary if major_comm_size== 1 - auto& comm = handle.get_comms(); - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_rank = major_comm.get_rank(); - auto const major_comm_size = major_comm.get_size(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); - - std::vector h_vertex_lasts(reduce_by_src ? minor_comm_size : major_comm_size); - for (size_t i = 0; i < h_vertex_lasts.size(); ++i) { - auto vertex_partition_id = - reduce_by_src - ? detail::compute_local_edge_partition_major_range_vertex_partition_id_t{major_comm_size, - minor_comm_size, - major_comm_rank, - minor_comm_rank}( - i) - : detail::compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - h_vertex_lasts[i] = graph_view.vertex_partition_range_last(vertex_partition_id); - } - - rmm::device_uvector d_vertex_lasts(h_vertex_lasts.size(), handle.get_stream()); - raft::update_device( - d_vertex_lasts.data(), h_vertex_lasts.data(), h_vertex_lasts.size(), handle.get_stream()); - rmm::device_uvector d_tx_buffer_last_boundaries(d_vertex_lasts.size(), - handle.get_stream()); - auto reduce_by_first = - thrust_tuple_get_or_identity( - get_dataframe_buffer_begin(key_buffer)); - thrust::lower_bound(handle.get_thrust_policy(), - reduce_by_first, - reduce_by_first + size_dataframe_buffer(key_buffer), - d_vertex_lasts.begin(), - d_vertex_lasts.end(), - d_tx_buffer_last_boundaries.begin()); - std::vector h_tx_buffer_last_boundaries(d_tx_buffer_last_boundaries.size()); - raft::update_host(h_tx_buffer_last_boundaries.data(), - d_tx_buffer_last_boundaries.data(), - d_tx_buffer_last_boundaries.size(), - handle.get_stream()); - handle.sync_stream(); - std::vector tx_counts(h_tx_buffer_last_boundaries.size()); - std::adjacent_difference( - h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); - - auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(rx_key_buffer, std::ignore) = shuffle_values(reduce_by_src ? 
minor_comm : major_comm, - get_dataframe_buffer_begin(key_buffer), - tx_counts, - handle.get_stream()); - key_buffer = std::move(rx_key_buffer); - - if constexpr (!std::is_same_v) { - auto rx_payload_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(rx_payload_buffer, std::ignore) = - shuffle_values(reduce_by_src ? minor_comm : major_comm, - get_dataframe_buffer_begin(payload_buffer), - tx_counts, - handle.get_stream()); - payload_buffer = std::move(rx_payload_buffer); - } - - std::tie(key_buffer, payload_buffer) = - detail::sort_and_reduce_buffer_elements( - handle, std::move(key_buffer), std::move(payload_buffer), reduce_op); - } - - if constexpr (!std::is_same_v) { - return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); - } else { - return std::move(key_buffer); - } -} - -} // namespace detail - -template -size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier) -{ - static_assert(!GraphViewType::is_storage_transposed, - "GraphViewType should support the push model."); - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using key_t = typename VertexFrontierBucketType::key_type; - - size_t ret{0}; - - auto local_frontier_vertex_first = - thrust_tuple_get_or_identity(frontier.begin()); - - std::vector local_frontier_sizes{}; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - local_frontier_sizes = host_scalar_allgather(minor_comm, frontier.size(), handle.get_stream()); - } else { - local_frontier_sizes = std::vector{static_cast(frontier.size())}; - } - - auto edge_mask_view = graph_view.edge_mask_view(); - - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(i)); - auto edge_partition_e_mask = - edge_mask_view - ? 
thrust::make_optional< - detail::edge_partition_edge_property_device_view_t>( - *edge_mask_view, i) - : thrust::nullopt; - - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - - rmm::device_uvector edge_partition_frontier_vertices(local_frontier_sizes[i], - handle.get_stream()); - device_bcast(minor_comm, - local_frontier_vertex_first, - edge_partition_frontier_vertices.data(), - local_frontier_sizes[i], - static_cast(i), - handle.get_stream()); - - if (edge_partition_e_mask) { - ret += - edge_partition.compute_number_of_edges_with_mask((*edge_partition_e_mask).value_first(), - edge_partition_frontier_vertices.begin(), - edge_partition_frontier_vertices.end(), - handle.get_stream()); - } else { - ret += edge_partition.compute_number_of_edges(edge_partition_frontier_vertices.begin(), - edge_partition_frontier_vertices.end(), - handle.get_stream()); - } - } else { - assert(i == 0); - if (edge_partition_e_mask) { - ret += edge_partition.compute_number_of_edges_with_mask( - (*edge_partition_e_mask).value_first(), - local_frontier_vertex_first, - local_frontier_vertex_first + frontier.size(), - handle.get_stream()); - } else { - ret += edge_partition.compute_number_of_edges(local_frontier_vertex_first, - local_frontier_vertex_first + frontier.size(), - handle.get_stream()); - } - } - } - - return ret; -} - -/** - * @brief Iterate over outgoing edges from the current vertex frontier and reduce valid edge functor - * outputs by (tagged-)source ID. - * - * Edge functor outputs are thrust::optional objects and invalid if thrust::nullopt. Vertices are - * assumed to be tagged if VertexFrontierBucketType::key_type is a tuple of a vertex type and a tag - * type (VertexFrontierBucketType::key_type is identical to a vertex type otherwise). - * - * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam VertexFrontierBucketType Type of the vertex frontier bucket class which abstracts the - * current (tagged-)vertex frontier. - * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. - * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. - * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. - * @tparam EdgeOp Type of the quinary edge operator. - * @tparam ReduceOp Type of the binary reduction operator. - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param graph_view Non-owning graph object. - * @param frontier VertexFrontierBucketType class object for the current vertex frontier. - * @param edge_src_value_input Wrapper used to access source input property values (for the edge - * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() - * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() - * (if @p e_op does not access source property values). Use update_edge_src_property to fill the - * wrapper. - * @param edge_dst_value_input Wrapper used to access destination input property values (for the - * edge destinations assigned to this process in multi-GPU). 
Use either - * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or - * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property - * values). Use update_edge_dst_property to fill the wrapper. - * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned - * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to - * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not - * access edge property values). - * @param e_op Quinary operator takes edge (tagged-)source, edge destination, property values for - * the source, destination, and edge and returns 1) thrust::nullopt (if invalid and to be - * discarded); 2) dummy (but valid) thrust::optional object (e.g. - * thrust::optional{std::byte{0}}, if vertices are not tagged and ReduceOp::value_type is - * void); 3) a tag (if vertices are tagged and ReduceOp::value_type is void); 4) a value to be - * reduced using the @p reduce_op (if vertices are not tagged and ReduceOp::value_type is not void); - * or 5) a tuple of a tag and a value to be reduced (if vertices are tagged and ReduceOp::value_type - * is not void). - * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. - * There are pre-defined reduction operators in prims/reduce_op.cuh. It is - * recommended to use the pre-defined reduction operators whenever possible as the current (and - * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has - * known member variables) to take a more optimized code path. See the documentation in the - * reduce_op.cuh file for instructions on writing custom reduction operators. - * @return Tuple of key values and payload values (if ReduceOp::value_type is not void) or just key - * values (if ReduceOp::value_type is void). Keys in the return values are sorted in ascending order - * using a vertex ID as the primary key and a tag (if relevant) as the secondary key. - */ -template -std::conditional_t< - !std::is_same_v, - std::tuple( - 0, rmm::cuda_stream_view{})), - decltype(detail::allocate_optional_dataframe_buffer( - 0, rmm::cuda_stream_view{}))>, - decltype(allocate_dataframe_buffer( - 0, rmm::cuda_stream_view{}))> -transform_reduce_v_frontier_outgoing_e_by_src(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, - EdgeSrcValueInputWrapper edge_src_value_input, - EdgeDstValueInputWrapper edge_dst_value_input, - EdgeValueInputWrapper edge_value_input, - EdgeOp e_op, - ReduceOp reduce_op, - bool do_expensive_check = false) -{ - return detail::transform_reduce_v_frontier_outgoing_e_by_src_dst(handle, - graph_view, - frontier, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - reduce_op, - do_expensive_check); -} - -/** - * @brief Iterate over outgoing edges from the current vertex frontier and reduce valid edge functor - * outputs by (tagged-)destination ID. - * - * Edge functor outputs are thrust::optional objects and invalid if thrust::nullopt. Vertices are - * assumed to be tagged if VertexFrontierBucketType::key_type is a tuple of a vertex type and a tag - * type (VertexFrontierBucketType::key_type is identical to a vertex type otherwise). - * - * @tparam GraphViewType Type of the passed non-owning graph object. 
- * @tparam VertexFrontierBucketType Type of the vertex frontier bucket class which abstracts the
- * current (tagged-)vertex frontier.
- * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values.
- * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values.
- * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values.
- * @tparam EdgeOp Type of the quinary edge operator.
- * @tparam ReduceOp Type of the binary reduction operator.
- * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
- * handles to various CUDA libraries) to run graph algorithms.
- * @param graph_view Non-owning graph object.
- * @param frontier VertexFrontierBucketType class object for the current vertex frontier.
- * @param edge_src_value_input Wrapper used to access source input property values (for the edge
- * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view()
- * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view()
- * (if @p e_op does not access source property values). Use update_edge_src_property to fill the
- * wrapper.
- * @param edge_dst_value_input Wrapper used to access destination input property values (for the
- * edge destinations assigned to this process in multi-GPU). Use either
- * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or
- * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property
- * values). Use update_edge_dst_property to fill the wrapper.
- * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned
- * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to
- * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not
- * access edge property values).
- * @param e_op Quinary operator takes edge (tagged-)source, edge destination, property values for
- * the source, destination, and edge and returns 1) thrust::nullopt (if invalid and to be
- * discarded); 2) dummy (but valid) thrust::optional object (e.g.
- * thrust::optional<std::byte>{std::byte{0}}, if vertices are not tagged and ReduceOp::value_type is
- * void); 3) a tag (if vertices are tagged and ReduceOp::value_type is void); 4) a value to be
- * reduced using the @p reduce_op (if vertices are not tagged and ReduceOp::value_type is not void);
- * or 5) a tuple of a tag and a value to be reduced (if vertices are tagged and ReduceOp::value_type
- * is not void).
- * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one.
- * There are pre-defined reduction operators in prims/reduce_op.cuh. It is
- * recommended to use the pre-defined reduction operators whenever possible as the current (and
- * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has
- * known member variables) to take a more optimized code path. See the documentation in the
- * reduce_op.cuh file for instructions on writing custom reduction operators.
- * @return Tuple of key values and payload values (if ReduceOp::value_type is not void) or just key
- * values (if ReduceOp::value_type is void). Keys in the return values are sorted in ascending order
- * using a vertex ID as the primary key and a tag (if relevant) as the secondary key.
- */ -template -std::conditional_t< - !std::is_same_v, - std::tuple( - 0, rmm::cuda_stream_view{})), - decltype(detail::allocate_optional_dataframe_buffer( - 0, rmm::cuda_stream_view{}))>, - decltype(allocate_dataframe_buffer( - 0, rmm::cuda_stream_view{}))> -transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, - EdgeSrcValueInputWrapper edge_src_value_input, - EdgeDstValueInputWrapper edge_dst_value_input, - EdgeValueInputWrapper edge_value_input, - EdgeOp e_op, - ReduceOp reduce_op, - bool do_expensive_check = false) -{ - return detail::transform_reduce_v_frontier_outgoing_e_by_src_dst(handle, - graph_view, - frontier, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - reduce_op, - do_expensive_check); -} - -} // namespace cugraph diff --git a/cpp/src/prims/update_edge_src_dst_property.cuh b/cpp/src/prims/update_edge_src_dst_property.cuh index 1bfdc23c66..2f842f710c 100644 --- a/cpp/src/prims/update_edge_src_dst_property.cuh +++ b/cpp/src/prims/update_edge_src_dst_property.cuh @@ -16,6 +16,7 @@ #pragma once #include "detail/graph_partition_utils.cuh" +#include "prims/vertex_frontier.cuh" #include #include @@ -265,8 +266,8 @@ template void update_edge_major_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, VertexPropertyInputIterator vertex_property_input_first, EdgeMajorPropertyOutputWrapper edge_major_property_output) { @@ -288,12 +289,12 @@ void update_edge_major_property(raft::handle_t const& handle, auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); - auto rx_counts = - host_scalar_allgather(minor_comm, - static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); - auto max_rx_size = - std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { + auto local_v_list_sizes = host_scalar_allgather( + minor_comm, + static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)), + handle.get_stream()); + auto max_rx_size = std::reduce( + local_v_list_sizes.begin(), local_v_list_sizes.end(), size_t{0}, [](auto lhs, auto rhs) { return std::max(lhs, rhs); }); rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); @@ -317,7 +318,7 @@ void update_edge_major_property(raft::handle_t const& handle, graph_view.local_vertex_partition_view()); if constexpr (contains_packed_bool_element) { auto bool_first = thrust::make_transform_iterator( - vertex_first, + sorted_unique_vertex_first, cuda::proclaim_return_type([vertex_property_input_first, vertex_partition] __device__(auto v) { auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); @@ -325,34 +326,41 @@ void update_edge_major_property(raft::handle_t const& handle, *(vertex_property_input_first + packed_bool_offset(v_offset)) & packed_bool_mask(v_offset)); })); - pack_bools(handle, - bool_first, - bool_first + thrust::distance(vertex_first, vertex_last), - rx_value_first); + pack_bools( + handle, + bool_first, + bool_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + rx_value_first); } else { auto map_first = thrust::make_transform_iterator( - vertex_first, + sorted_unique_vertex_first, cuda::proclaim_return_type([vertex_partition] __device__(auto v) { return 
vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); })); // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a // permutation iterator (and directly gathers to the internal buffer) - thrust::gather(handle.get_thrust_policy(), - map_first, - map_first + thrust::distance(vertex_first, vertex_last), - vertex_property_input_first, - rx_value_first); + thrust::gather( + handle.get_thrust_policy(), + map_first, + map_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + vertex_property_input_first, + rx_value_first); } } // FIXME: these broadcast operations can be placed between ncclGroupStart() and // ncclGroupEnd() - device_bcast( - minor_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(minor_comm, + sorted_unique_vertex_first, + rx_vertices.begin(), + local_v_list_sizes[i], + i, + handle.get_stream()); device_bcast(minor_comm, rx_value_first, rx_value_first, - contains_packed_bool_element ? packed_bool_size(rx_counts[i]) : rx_counts[i], + contains_packed_bool_element ? packed_bool_size(local_v_list_sizes[i]) + : local_v_list_sizes[i], i, handle.get_stream()); @@ -360,7 +368,7 @@ void update_edge_major_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_counts[i]), + thrust::make_counting_iterator(local_v_list_sizes[i]), [rx_vertex_first = rx_vertices.begin(), rx_value_first, edge_partition_key_first = ((*edge_partition_keys)[i]).begin(), @@ -386,7 +394,7 @@ void update_edge_major_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(rx_counts[i])), + thrust::make_counting_iterator(static_cast(local_v_list_sizes[i])), [edge_partition, rx_vertex_first = rx_vertices.begin(), rx_value_first, @@ -407,7 +415,7 @@ void update_edge_major_property(raft::handle_t const& handle, // directly scatters from the internal buffer) thrust::scatter(handle.get_thrust_policy(), rx_value_first, - rx_value_first + rx_counts[i], + rx_value_first + local_v_list_sizes[i], map_first, edge_partition_value_firsts[i]); } @@ -420,20 +428,22 @@ void update_edge_major_property(raft::handle_t const& handle, assert(edge_partition_value_firsts.size() == size_t{1}); if constexpr (contains_packed_bool_element) { thrust::for_each(handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [vertex_property_input_first, output_value_first = edge_partition_value_firsts[0]] __device__(auto v) { bool val = static_cast(*(vertex_property_input_first + v)); packed_bool_atomic_set(output_value_first, v, val); }); } else { - auto val_first = thrust::make_permutation_iterator(vertex_property_input_first, vertex_first); - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + thrust::distance(vertex_first, vertex_last), - vertex_first, - edge_partition_value_firsts[0]); + auto val_first = + thrust::make_permutation_iterator(vertex_property_input_first, sorted_unique_vertex_first); + thrust::scatter( + handle.get_thrust_policy(), + val_first, + val_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + sorted_unique_vertex_first, + edge_partition_value_firsts[0]); } } } @@ -455,13 +465,11 @@ void update_edge_minor_property(raft::handle_t const& handle, auto edge_partition_value_first = 
edge_minor_property_output.value_first(); if constexpr (GraphViewType::is_multi_gpu) { - using vertex_t = typename GraphViewType::vertex_type; - using bcast_buffer_type = - decltype(allocate_dataframe_buffer< - std::conditional_t>( - size_t{0}, handle.get_stream())); + using vertex_t = typename GraphViewType::vertex_type; + using bcast_buffer_type = dataframe_buffer_type_t< + std::conditional_t>; auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); @@ -487,8 +495,8 @@ void update_edge_minor_property(raft::handle_t const& handle, (static_cast(graph_view.compute_number_of_edges(handle) / comm_size) * sizeof(vertex_t)) / std::max(bcast_size, size_t{1}); - num_concurrent_bcasts = std::max(num_concurrent_bcasts, size_t{1}); - num_concurrent_bcasts = std::min(num_concurrent_bcasts, static_cast(major_comm_size)); + num_concurrent_bcasts = + std::min(std::max(num_concurrent_bcasts, size_t{1}), static_cast(major_comm_size)); auto num_rounds = (static_cast(major_comm_size) + num_concurrent_bcasts - size_t{1}) / num_concurrent_bcasts; @@ -532,15 +540,17 @@ void update_edge_minor_property(raft::handle_t const& handle, *(graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets()); } } else { - std::vector rx_counts(major_comm_size, size_t{0}); + std::vector local_v_list_sizes(major_comm_size, size_t{0}); for (int i = 0; i < major_comm_size; ++i) { auto minor_range_vertex_partition_id = compute_local_edge_partition_minor_range_vertex_partition_id_t{ major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - rx_counts[i] = graph_view.vertex_partition_range_size(minor_range_vertex_partition_id); + local_v_list_sizes[i] = + graph_view.vertex_partition_range_size(minor_range_vertex_partition_id); } std::vector rx_displacements(major_comm_size, size_t{0}); - std::exclusive_scan(rx_counts.begin(), rx_counts.end(), rx_displacements.begin(), size_t{0}); + std::exclusive_scan( + local_v_list_sizes.begin(), local_v_list_sizes.end(), rx_displacements.begin(), size_t{0}); key_offsets_or_rx_displacements = std::move(rx_displacements); } @@ -683,8 +693,8 @@ template void update_edge_minor_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, VertexPropertyInputIterator vertex_property_input_first, EdgeMinorPropertyOutputWrapper edge_minor_property_output) { @@ -706,22 +716,49 @@ void update_edge_minor_property(raft::handle_t const& handle, auto const major_comm_rank = major_comm.get_rank(); auto const major_comm_size = major_comm.get_size(); - auto rx_counts = - host_scalar_allgather(major_comm, - static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); - auto max_rx_size = - std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { - return std::max(lhs, rhs); - }); - rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); - auto rx_tmp_buffer = allocate_dataframe_buffer< - std::conditional_t>( - contains_packed_bool_element ? 
packed_bool_size(max_rx_size) : max_rx_size, - handle.get_stream()); - auto rx_value_first = get_dataframe_buffer_begin(rx_tmp_buffer); + auto v_list_size = + static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)); + std::array v_list_range = {vertex_t{0}, vertex_t{0}}; + if (v_list_size > 0) { + rmm::device_uvector tmps(2, handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + tmps.begin(), + tmps.end(), + [sorted_unique_vertex_first, v_list_size] __device__(size_t i) { + return (i == 0) ? *sorted_unique_vertex_first + : (*(sorted_unique_vertex_first + (v_list_size - 1)) + 1); + }); + raft::update_host(v_list_range.data(), tmps.data(), 2, handle.get_stream()); + handle.sync_stream(); + } + + auto local_v_list_sizes = host_scalar_allgather(major_comm, v_list_size, handle.get_stream()); + auto local_v_list_range_firsts = + host_scalar_allgather(major_comm, v_list_range[0], handle.get_stream()); + auto local_v_list_range_lasts = + host_scalar_allgather(major_comm, v_list_range[1], handle.get_stream()); + + std::optional> v_list_bitmap{std::nullopt}; + if (major_comm_size > 1) { + double avg_fill_ratio{0.0}; + for (int i = 0; i < major_comm_size; ++i) { + auto num_keys = static_cast(local_v_list_sizes[i]); + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + avg_fill_ratio += + (range_size > 0) ? (num_keys / static_cast(range_size)) : double{0.0}; + } + avg_fill_ratio /= static_cast(major_comm_size); + + constexpr double threshold_ratio = + 0.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); + if (avg_fill_ratio > threshold_ratio) { + v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_vertex_first, + sorted_unique_vertex_last, + local_v_list_range_firsts[major_comm_rank], + local_v_list_range_lasts[major_comm_rank], + handle.get_stream()); + } + } std::optional> key_offsets{}; if constexpr (GraphViewType::is_storage_transposed) { @@ -735,13 +772,23 @@ void update_edge_minor_property(raft::handle_t const& handle, graph_view.local_edge_partition_view(size_t{0})); auto edge_partition_keys = edge_minor_property_output.keys(); for (int i = 0; i < major_comm_size; ++i) { + rmm::device_uvector rx_vertices(local_v_list_sizes[i], handle.get_stream()); + auto rx_tmp_buffer = allocate_dataframe_buffer< + std::conditional_t>( + contains_packed_bool_element ? 
packed_bool_size(local_v_list_sizes[i]) + : local_v_list_sizes[i], + handle.get_stream()); + auto rx_value_first = get_dataframe_buffer_begin(rx_tmp_buffer); + if (i == major_comm_rank) { auto vertex_partition = vertex_partition_device_view_t( graph_view.local_vertex_partition_view()); if constexpr (contains_packed_bool_element) { auto bool_first = thrust::make_transform_iterator( - vertex_first, + sorted_unique_vertex_first, cuda::proclaim_return_type([vertex_property_input_first, vertex_partition] __device__(auto v) { auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); @@ -749,34 +796,53 @@ void update_edge_minor_property(raft::handle_t const& handle, *(vertex_property_input_first + packed_bool_offset(v_offset)) & packed_bool_mask(v_offset)); })); - pack_bools(handle, - bool_first, - bool_first + thrust::distance(vertex_first, vertex_last), - rx_value_first); + pack_bools( + handle, + bool_first, + bool_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + rx_value_first); } else { auto map_first = thrust::make_transform_iterator( - vertex_first, + sorted_unique_vertex_first, cuda::proclaim_return_type([vertex_partition] __device__(auto v) { return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); })); // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a // permutation iterator (and directly gathers to the internal buffer) - thrust::gather(handle.get_thrust_policy(), - map_first, - map_first + thrust::distance(vertex_first, vertex_last), - vertex_property_input_first, - rx_value_first); + thrust::gather( + handle.get_thrust_policy(), + map_first, + map_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + vertex_property_input_first, + rx_value_first); } } // FIXME: these broadcast operations can be placed between ncclGroupStart() and // ncclGroupEnd() - device_bcast( - major_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + std::variant, decltype(sorted_unique_vertex_first)> + v_list{}; + if (v_list_bitmap) { + v_list = + (i == major_comm_rank) + ? raft::device_span((*v_list_bitmap).data(), (*v_list_bitmap).size()) + : raft::device_span(static_cast(nullptr), size_t{0}); + } else { + v_list = sorted_unique_vertex_first; + } + device_bcast_vertex_list(major_comm, + v_list, + rx_vertices.begin(), + local_v_list_range_firsts[i], + local_v_list_range_lasts[i], + local_v_list_sizes[i], + i, + handle.get_stream()); device_bcast(major_comm, rx_value_first, rx_value_first, - contains_packed_bool_element ? packed_bool_size(rx_counts[i]) : rx_counts[i], + contains_packed_bool_element ? 
packed_bool_size(local_v_list_sizes[i]) + : local_v_list_sizes[i], i, handle.get_stream()); @@ -784,7 +850,7 @@ void update_edge_minor_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_counts[i]), + thrust::make_counting_iterator(local_v_list_sizes[i]), [rx_vertex_first = rx_vertices.begin(), rx_value_first, subrange_key_first = (*edge_partition_keys).begin() + (*key_offsets)[i], @@ -812,7 +878,7 @@ void update_edge_minor_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(rx_counts[i])), + thrust::make_counting_iterator(static_cast(local_v_list_sizes[i])), [edge_partition, rx_vertex_first = rx_vertices.begin(), rx_value_first, @@ -833,7 +899,7 @@ void update_edge_minor_property(raft::handle_t const& handle, // directly scatters from the internal buffer) thrust::scatter(handle.get_thrust_policy(), rx_value_first, - rx_value_first + rx_counts[i], + rx_value_first + local_v_list_sizes[i], map_first, edge_partition_value_first); } @@ -844,20 +910,22 @@ void update_edge_minor_property(raft::handle_t const& handle, graph_view.local_edge_partition_src_range_size()); if constexpr (contains_packed_bool_element) { thrust::for_each(handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [vertex_property_input_first, output_value_first = edge_partition_value_first] __device__(auto v) { bool val = static_cast(*(vertex_property_input_first + v)); packed_bool_atomic_set(output_value_first, v, val); }); } else { - auto val_first = thrust::make_permutation_iterator(vertex_property_input_first, vertex_first); - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + thrust::distance(vertex_first, vertex_last), - vertex_first, - edge_partition_value_first); + auto val_first = + thrust::make_permutation_iterator(vertex_property_input_first, sorted_unique_vertex_first); + thrust::scatter( + handle.get_thrust_policy(), + val_first, + val_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + sorted_unique_vertex_first, + edge_partition_value_first); } } } @@ -909,8 +977,9 @@ void update_edge_src_property(raft::handle_t const& handle, /** * @brief Update graph edge source property values from the input vertex property values. * - * This version updates only a subset of graph edge source property values. [@p vertex_first, @p - * vertex_last) specifies the vertices with new property values to be updated. + * This version updates only a subset of graph edge source property values. [@p + * sorted_unique_vertex_first, @p sorted_unique_vertex_last) specifies the vertices with new + * property values to be updated. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. @@ -919,10 +988,12 @@ void update_edge_src_property(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param vertex_first Iterator pointing to the first (inclusive) vertex with a new value to be - * updated. 
v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex - * partition assigned to this process in multi-GPU), otherwise undefined behavior. - * @param vertex_last Iterator pointing to the last (exclusive) vertex with a new value. + * @param sorted_unique_vertex_first Iterator pointing to the first (inclusive) vertex with a new + * value to be updated. v in [sorted_unique_vertex_first, sorted_unique_vertex_last) should be + * sorted & distinct (and should belong to the vertex partition assigned to this process in + * multi-GPU), otherwise undefined behavior. + * @param sorted_unique_vertex_last Iterator pointing to the last (exclusive) vertex with a new + * value. * @param vertex_property_input_first Iterator pointing to the vertex property value for the first * (inclusive) vertex (of the vertex partition assigned to this process in multi-GPU). * `vertex_property_input_last` (exclusive) is deduced as @p vertex_property_input_first + @p @@ -937,8 +1008,8 @@ template void update_edge_src_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, VertexPropertyInputIterator vertex_property_input_first, EdgeSrcValueOutputWrapper edge_src_property_output, bool do_expensive_check = false) @@ -946,8 +1017,8 @@ void update_edge_src_property(raft::handle_t const& handle, if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), local_vertex_partition_range_last = graph_view.local_vertex_partition_range_last()] __device__(auto v) { @@ -958,23 +1029,23 @@ void update_edge_src_property(raft::handle_t const& handle, num_invalids = host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); } - CUGRAPH_EXPECTS( - num_invalids == 0, - "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in " + "[sorted_unique_vertex_first, sorted_unique_vertex_last)."); } if constexpr (GraphViewType::is_storage_transposed) { detail::update_edge_minor_property(handle, graph_view, - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, vertex_property_input_first, edge_src_property_output); } else { detail::update_edge_major_property(handle, graph_view, - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, vertex_property_input_first, edge_src_property_output); } @@ -1026,8 +1097,9 @@ void update_edge_dst_property(raft::handle_t const& handle, /** * @brief Update graph edge destination property values from the input vertex property values. * - * This version updates only a subset of graph edge destination property values. [@p vertex_first, - * @p vertex_last) specifies the vertices with new property values to be updated. + * This version updates only a subset of graph edge destination property values. [@p + * sorted_unique_vertex_first, @p sorted_unique_vertex_last) specifies the vertices with new + * property values to be updated. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. 
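+ *
+ * @note Illustrative usage sketch (editorial addition, not part of this PR). The names `handle`,
+ * `graph_view`, `frontier_vertices` (a sorted & unique rmm::device_uvector of local vertices),
+ * `vertex_values`, and `edge_dst_prop` (a cugraph::edge_dst_property_t) are assumed to be defined
+ * by the caller:
+ * @code
+ *   // Push per-vertex values for only the frontier vertices to the edge destination property.
+ *   cugraph::update_edge_dst_property(handle,
+ *                                     graph_view,
+ *                                     frontier_vertices.begin(),  // sorted & distinct, local
+ *                                     frontier_vertices.end(),
+ *                                     vertex_values.begin(),
+ *                                     edge_dst_prop.mutable_view());
+ * @endcode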
@@ -1037,10 +1109,12 @@ void update_edge_dst_property(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param vertex_first Iterator pointing to the first (inclusive) vertex with a new value to be - * updated. v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex - * partition assigned to this process in multi-GPU), otherwise undefined behavior. - * @param vertex_last Iterator pointing to the last (exclusive) vertex with a new value. + * @param sorted_unique_vertex_first Iterator pointing to the first (inclusive) vertex with a new + * value to be updated. v in [sorted_unique_vertex_first, sorted_unique_vertex_last) should be + * sorted & distinct (and should belong to the vertex partition assigned to this process in + * multi-GPU), otherwise undefined behavior. + * @param sorted_unique_vertex_last Iterator pointing to the last (exclusive) vertex with a new + * value. * @param vertex_property_input_first Iterator pointing to the vertex property value for the first * (inclusive) vertex (of the vertex partition assigned to this process in multi-GPU). * `vertex_property_input_last` (exclusive) is deduced as @p vertex_property_input_first + @p @@ -1055,8 +1129,8 @@ template void update_edge_dst_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, VertexPropertyInputIterator vertex_property_input_first, EdgeDstValueOutputWrapper edge_dst_property_output, bool do_expensive_check = false) @@ -1064,8 +1138,8 @@ void update_edge_dst_property(raft::handle_t const& handle, if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), local_vertex_partition_range_last = graph_view.local_vertex_partition_range_last()] __device__(auto v) { @@ -1076,23 +1150,23 @@ void update_edge_dst_property(raft::handle_t const& handle, num_invalids = host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); } - CUGRAPH_EXPECTS( - num_invalids == 0, - "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in " + "[sorted_unique_vertex_first, sorted_unique_vertex_last)."); } if constexpr (GraphViewType::is_storage_transposed) { detail::update_edge_major_property(handle, graph_view, - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, vertex_property_input_first, edge_dst_property_output); } else { detail::update_edge_minor_property(handle, graph_view, - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, vertex_property_input_first, edge_dst_property_output); } diff --git a/cpp/src/prims/vertex_frontier.cuh b/cpp/src/prims/vertex_frontier.cuh index b13e6bfd45..6e7d8515be 100644 --- a/cpp/src/prims/vertex_frontier.cuh +++ b/cpp/src/prims/vertex_frontier.cuh @@ -15,15 +15,24 @@ */ #pragma once +#include "prims/detail/multi_stream_utils.cuh" + +#include #include #include +#include +#include #include +#include 
#include #include #include +#include +#include +#include #include #include #include @@ -48,6 +57,191 @@ namespace cugraph { +template +KeyIterator compute_key_lower_bound(KeyIterator sorted_unique_key_first, + KeyIterator sorted_unique_key_last, + vertex_t v_threshold, + rmm::cuda_stream_view stream_view) +{ + using key_t = typename thrust::iterator_traits::value_type; + + if constexpr (std::is_same_v) { + return thrust::lower_bound( + rmm::exec_policy(stream_view), sorted_unique_key_first, sorted_unique_key_last, v_threshold); + } else { + key_t k_threshold{}; + thrust::get<0>(k_threshold) = v_threshold; + return thrust::lower_bound( + rmm::exec_policy(stream_view), + sorted_unique_key_first, + sorted_unique_key_last, + k_threshold, + [] __device__(auto lhs, auto rhs) { return thrust::get<0>(lhs) < thrust::get<0>(rhs); }); + } +} + +template +std::vector compute_key_segment_offsets(KeyIterator sorted_key_first, + KeyIterator sorted_key_last, + raft::host_span segment_offsets, + vertex_t vertex_range_first, + rmm::cuda_stream_view stream_view) +{ + using key_t = typename thrust::iterator_traits::value_type; + + std::vector h_thresholds(segment_offsets.size() - 2); + for (size_t i = 0; i < h_thresholds.size(); ++i) { + h_thresholds[i] = vertex_range_first + segment_offsets[i + 1]; + } + + rmm::device_uvector d_thresholds(h_thresholds.size(), stream_view); + raft::update_device(d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), stream_view); + + rmm::device_uvector d_offsets(d_thresholds.size(), stream_view); + if constexpr (std::is_same_v) { + thrust::lower_bound(rmm::exec_policy_nosync(stream_view), + sorted_key_first, + sorted_key_last, + d_thresholds.begin(), + d_thresholds.end(), + d_offsets.begin()); + } else { + auto sorted_vertex_first = + thrust::make_transform_iterator(sorted_key_first, thrust_tuple_get{}); + thrust::lower_bound(rmm::exec_policy_nosync(stream_view), + sorted_vertex_first, + sorted_vertex_first + thrust::distance(sorted_key_first, sorted_key_last), + d_thresholds.begin(), + d_thresholds.end(), + d_offsets.begin()); + } + + std::vector h_offsets(d_offsets.size() + 2); + raft::update_host(h_offsets.data() + 1, d_offsets.data(), d_offsets.size(), stream_view); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view)); + h_offsets[0] = size_t{0}; + h_offsets.back() = static_cast(thrust::distance(sorted_key_first, sorted_key_last)); + + return h_offsets; +} + +template +rmm::device_uvector compute_vertex_list_bitmap_info( + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, + typename thrust::iterator_traits::value_type vertex_range_first, + typename thrust::iterator_traits::value_type vertex_range_last, + rmm::cuda_stream_view stream_view) +{ + using vertex_t = typename thrust::iterator_traits::value_type; + + auto bitmap = rmm::device_uvector( + packed_bool_size(vertex_range_last - vertex_range_first), stream_view); + rmm::device_uvector lasts(bitmap.size(), stream_view); + auto bdry_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{1}), + cuda::proclaim_return_type( + [vertex_range_first, + vertex_range_size = vertex_range_last - vertex_range_first] __device__(vertex_t i) { + return vertex_range_first + + static_cast( + std::min(packed_bools_per_word() * i, static_cast(vertex_range_size))); + })); + thrust::lower_bound(rmm::exec_policy_nosync(stream_view), + sorted_unique_vertex_first, + sorted_unique_vertex_last, + bdry_first, + bdry_first + bitmap.size(), + lasts.begin()); + thrust::tabulate( 
+ rmm::exec_policy_nosync(stream_view), + bitmap.begin(), + bitmap.end(), + cuda::proclaim_return_type( + [sorted_unique_vertex_first, + vertex_range_first, + lasts = raft::device_span(lasts.data(), lasts.size())] __device__(size_t i) { + auto offset_first = (i != 0) ? lasts[i - 1] : vertex_t{0}; + auto offset_last = lasts[i]; + auto ret = packed_bool_empty_mask(); + for (auto j = offset_first; j < offset_last; ++j) { + auto v_offset = *(sorted_unique_vertex_first + j) - vertex_range_first; + ret |= packed_bool_mask(v_offset); + } + return ret; + })); + + return bitmap; +} + +template +void device_bcast_vertex_list( + raft::comms::comms_t const& comm, + std::variant, InputVertexIterator> v_list, + OutputVertexIterator output_v_first, + typename thrust::iterator_traits::value_type vertex_range_first, + typename thrust::iterator_traits::value_type vertex_range_last, + size_t v_list_size, + int root, + rmm::cuda_stream_view stream_view) +{ + using vertex_t = typename thrust::iterator_traits::value_type; + + static_assert( + std::is_same_v::value_type, vertex_t>); + + if (v_list.index() == 0) { // bitmap + rmm::device_uvector tmp_bitmap( + packed_bool_size(vertex_range_last - vertex_range_first), stream_view); + assert((comm.get_rank() != root) || (std::get<0>(v_list).size() == tmp_bitmap.size())); + device_bcast( + comm, std::get<0>(v_list).data(), tmp_bitmap.data(), tmp_bitmap.size(), root, stream_view); + rmm::device_scalar dummy(size_t{0}, stream_view); // we already know the count + detail::copy_if_nosync( + thrust::make_counting_iterator(vertex_range_first), + thrust::make_counting_iterator(vertex_range_last), + thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type( + [bitmap = raft::device_span( + tmp_bitmap.data(), tmp_bitmap.size())] __device__(vertex_t v_offset) { + return ((bitmap[packed_bool_offset(v_offset)] & packed_bool_mask(v_offset)) != + packed_bool_empty_mask()); + })), + output_v_first, + raft::device_span(dummy.data(), size_t{1}), + stream_view); + } else { + device_bcast(comm, std::get<1>(v_list), output_v_first, v_list_size, root, stream_view); + } +} + +template +void retrieve_vertex_list_from_bitmap( + raft::device_span bitmap, + OutputVertexIterator output_v_first, + raft::device_span count /* size = 1 */, + typename thrust::iterator_traits::value_type vertex_range_first, + typename thrust::iterator_traits::value_type vertex_range_last, + rmm::cuda_stream_view stream_view) +{ + using vertex_t = typename thrust::iterator_traits::value_type; + + assert((comm.get_rank() != root) || + (bitmap.size() >= packed_bool_size(vertex_range_last - vertex_ragne_first))); + detail::copy_if_nosync(thrust::make_counting_iterator(vertex_range_first), + thrust::make_counting_iterator(vertex_range_last), + thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type([bitmap] __device__(vertex_t v_offset) { + return ((bitmap[packed_bool_offset(v_offset)] & + packed_bool_mask(v_offset)) != packed_bool_empty_mask()); + })), + output_v_first, + count, + stream_view); +} + // key type is either vertex_t (tag_t == void) or thrust::tuple (tag_t != void) // if sorted_unique is true, stores unique key objects in the sorted (non-descending) order. // if false, there can be duplicates and the elements may not be sorted. @@ -328,20 +522,6 @@ class key_bucket_t { } } - auto const begin() const - { - if constexpr (std::is_same_v) { - return vertices_.index() == 0 ? 
std::get<0>(vertices_).begin() - : std::get<1>(vertices_).begin(); - } else { - return vertices_.index() == 0 - ? thrust::make_zip_iterator( - thrust::make_tuple(std::get<0>(vertices_).begin(), std::get<0>(tags_).begin())) - : thrust::make_zip_iterator( - thrust::make_tuple(std::get<1>(vertices_).begin(), std::get<1>(tags_).begin())); - } - } - auto begin() { CUGRAPH_EXPECTS( @@ -355,12 +535,22 @@ class key_bucket_t { } } - auto const end() const + auto const cbegin() const { - return begin() + - (vertices_.index() == 0 ? std::get<0>(vertices_).size() : std::get<1>(vertices_).size()); + if constexpr (std::is_same_v) { + return vertices_.index() == 0 ? std::get<0>(vertices_).begin() + : std::get<1>(vertices_).begin(); + } else { + return vertices_.index() == 0 + ? thrust::make_zip_iterator( + thrust::make_tuple(std::get<0>(vertices_).begin(), std::get<0>(tags_).begin())) + : thrust::make_zip_iterator( + thrust::make_tuple(std::get<1>(vertices_).begin(), std::get<1>(tags_).begin())); + } } + auto const begin() const { return cbegin(); } + auto end() { CUGRAPH_EXPECTS( @@ -369,15 +559,13 @@ class key_bucket_t { return begin() + std::get<0>(vertices_).size(); } - auto const vertex_begin() const + auto const cend() const { - return vertices_.index() == 0 ? std::get<0>(vertices_).begin() : std::get<1>(vertices_).begin(); + return begin() + + (vertices_.index() == 0 ? std::get<0>(vertices_).size() : std::get<1>(vertices_).size()); } - auto const vertex_end() const - { - return vertices_.index() == 0 ? std::get<0>(vertices_).end() : std::get<1>(vertices_).end(); - } + auto const end() const { return cend(); } auto vertex_begin() { @@ -387,6 +575,13 @@ class key_bucket_t { return std::get<0>(vertices_).begin(); } + auto const vertex_cbegin() const + { + return vertices_.index() == 0 ? std::get<0>(vertices_).begin() : std::get<1>(vertices_).begin(); + } + + auto const vertex_begin() const { return vertex_cbegin(); } + auto vertex_end() { CUGRAPH_EXPECTS( @@ -395,6 +590,13 @@ class key_bucket_t { return std::get<0>(vertices_).end(); } + auto const vertex_cend() const + { + return vertices_.index() == 0 ? 
std::get<0>(vertices_).end() : std::get<1>(vertices_).end(); + } + + auto const vertex_end() const { return vertex_cend(); } + bool is_owning() { return (vertices_.index() == 0); } private: diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index 9796ddd60a..cd98db3165 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -44,6 +44,7 @@ #include #include +#include namespace cugraph { @@ -299,6 +300,121 @@ bool check_no_parallel_edge(raft::handle_t const& handle, (org_edge_first + edgelist_srcs.size()); } +template +std::vector> +split_edge_chunk_compressed_elements_to_local_edge_partitions( + raft::handle_t const& handle, + std::vector>&& edgelist_compressed_elements, + std::vector> const& edgelist_edge_offset_vectors, + std::vector const& edge_partition_edge_counts, + std::vector> const& edge_partition_intra_partition_segment_offset_vectors, + std::vector> const& + edge_partition_intra_segment_copy_output_displacement_vectors, + size_t element_size) +{ + auto num_chunks = edgelist_compressed_elements.size(); + auto num_edge_partitions = edge_partition_edge_counts.size(); + auto num_segments = edge_partition_intra_partition_segment_offset_vectors[0].size() - 1; + for (size_t i = 0; i < edge_partition_intra_partition_segment_offset_vectors.size(); ++i) { + assert(edge_partition_intra_partition_segment_offset_vectors[i].size() == (num_segments + 1)); + } + + std::vector> edge_partition_compressed_elements{}; + edge_partition_compressed_elements.reserve(num_edge_partitions); + for (size_t i = 0; i < num_edge_partitions; ++i) { + edge_partition_compressed_elements.push_back(rmm::device_uvector( + edge_partition_edge_counts[i] * element_size, handle.get_stream())); + } + + for (size_t i = 0; i < num_edge_partitions; ++i) { + for (size_t j = 0; j < num_segments; ++j) { + for (size_t k = 0; k < num_chunks; ++k) { + auto segment_offset = edgelist_edge_offset_vectors[k][i * num_segments + j]; + auto segment_size = edgelist_edge_offset_vectors[k][i * num_segments + j + 1] - + edgelist_edge_offset_vectors[k][i * num_segments + j]; + auto output_offset = + edge_partition_intra_partition_segment_offset_vectors[i][j] + + edge_partition_intra_segment_copy_output_displacement_vectors[i][j * num_chunks + k]; + thrust::copy( + handle.get_thrust_policy(), + edgelist_compressed_elements[k].begin() + segment_offset * element_size, + edgelist_compressed_elements[k].begin() + (segment_offset + segment_size) * element_size, + edge_partition_compressed_elements[i].begin() + output_offset * element_size); + } + } + } + edgelist_compressed_elements.clear(); + + return edge_partition_compressed_elements; +} + +template +std::vector> split_edge_chunk_elements_to_local_edge_partitions( + raft::handle_t const& handle, + std::vector>&& edgelist_elements, + std::vector> const& edgelist_edge_offset_vectors, + std::vector const& edge_partition_edge_counts, + std::vector> const& edge_partition_intra_partition_segment_offset_vectors, + std::vector> const& + edge_partition_intra_segment_copy_output_displacement_vectors) +{ + static_assert(std::is_arithmetic_v); // otherwise, unimplemented. 
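  // The copy loops below regroup per-chunk, per-segment runs into contiguous
  // per-partition buffers. A worked example of the index arithmetic, assuming
  // 2 input chunks, 1 edge partition, and 2 minor-range segments (all values
  // are illustrative only, not taken from the patch):
  //
  //   chunk 0 run lengths per (partition, segment): [3, 2]
  //   chunk 1 run lengths per (partition, segment): [1, 4]
  //
  //   intra-partition segment offsets (partition 0): [0, 4, 10]
  //     segment 0 holds 3 + 1 = 4 edges, segment 1 holds 2 + 4 = 6 edges
  //   intra-segment copy output displacements (partition 0), indexed by
  //   j * num_chunks + k: [0, 3, 0, 2]
  //     within each segment, chunk 0 writes first and chunk 1 follows it
  //
  //   e.g. chunk 1, segment 1 copies its 4 elements to
  //     output_offset = segment_offset(= 4) + displacement(= 2) = 6
  //   so the resulting partition buffer layout is
  //     [ c0/s0 (3) | c1/s0 (1) | c0/s1 (2) | c1/s1 (4) ]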
+ auto num_chunks = edgelist_elements.size(); + auto num_edge_partitions = edge_partition_edge_counts.size(); + auto num_segments = edge_partition_intra_partition_segment_offset_vectors[0].size() - 1; + for (size_t i = 0; i < edge_partition_intra_partition_segment_offset_vectors.size(); ++i) { + assert(edge_partition_intra_partition_segment_offset_vectors[i].size() == (num_segments + 1)); + } + + std::vector> edge_partition_elements{}; + edge_partition_elements.reserve(num_edge_partitions); + for (size_t i = 0; i < num_edge_partitions; ++i) { + edge_partition_elements.push_back( + rmm::device_uvector(edge_partition_edge_counts[i], handle.get_stream())); + } + + for (size_t i = 0; i < num_edge_partitions; ++i) { + for (size_t j = 0; j < num_segments; ++j) { + for (size_t k = 0; k < num_chunks; ++k) { + auto segment_offset = edgelist_edge_offset_vectors[k][i * num_segments + j]; + auto segment_size = edgelist_edge_offset_vectors[k][i * num_segments + j + 1] - + edgelist_edge_offset_vectors[k][i * num_segments + j]; + auto output_offset = + edge_partition_intra_partition_segment_offset_vectors[i][j] + + edge_partition_intra_segment_copy_output_displacement_vectors[i][j * num_chunks + k]; + thrust::copy(handle.get_thrust_policy(), + edgelist_elements[k].begin() + segment_offset, + edgelist_elements[k].begin() + (segment_offset + segment_size), + edge_partition_elements[i].begin() + output_offset); + } + } + } + edgelist_elements.clear(); + + return edge_partition_elements; +} + +template +void decompress_vertices(raft::handle_t const& handle, + raft::device_span compressed_vertices, + raft::device_span vertices, + size_t compressed_v_size) +{ + auto input_v_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [byte_first = compressed_vertices.begin(), compressed_v_size] __device__(size_t i) { + uint64_t v{0}; + for (size_t j = 0; j < compressed_v_size; ++j) { + auto b = *(byte_first + i * compressed_v_size + j); + v |= static_cast(b) << (8 * j); + } + return static_cast(v); + })); + thrust::copy( + handle.get_thrust_policy(), input_v_first, input_v_first + vertices.size(), vertices.begin()); +} + template >>&& edge_partition_edgelist_weights, std::optional>>&& edge_partition_edgelist_edge_ids, std::optional>>&& edge_partition_edgelist_edge_types, - std::vector> const& edgelist_intra_partition_segment_offsets, + std::vector> const& edgelist_intra_partition_segment_offset_vectors, graph_properties_t graph_properties, bool renumber) { @@ -347,14 +463,14 @@ create_graph_from_partitioned_edgelist( src_ptrs[i] = edge_partition_edgelist_srcs[i].begin(); dst_ptrs[i] = edge_partition_edgelist_dsts[i].begin(); } - auto [renumber_map_labels, meta] = - cugraph::renumber_edgelist(handle, - std::move(local_vertices), - src_ptrs, - dst_ptrs, - edgelist_edge_counts, - edgelist_intra_partition_segment_offsets, - store_transposed); + auto [renumber_map_labels, meta] = cugraph::renumber_edgelist( + handle, + std::move(local_vertices), + src_ptrs, + dst_ptrs, + edgelist_edge_counts, + edgelist_intra_partition_segment_offset_vectors, + store_transposed); auto num_segments_per_vertex_partition = static_cast(meta.edge_partition_segment_offsets.size() / minor_comm_size); @@ -369,7 +485,7 @@ create_graph_from_partitioned_edgelist( if (edge_partition_edgelist_edge_ids) { element_size += sizeof(edge_id_t); } if (edge_partition_edgelist_edge_types) { element_size += sizeof(edge_type_t); } auto constexpr mem_frugal_ratio = - 0.25; // if the expected 
temporary buffer size exceeds the mem_frugal_ratio of the + 0.05; // if the expected temporary buffer size exceeds the mem_frugal_ratio of the // total_global_mem, switch to the memory frugal approach auto mem_frugal_threshold = static_cast(static_cast(total_global_mem / element_size) * mem_frugal_ratio); @@ -684,11 +800,13 @@ create_graph_from_partitioned_edgelist( std::move(edge_partition_offsets), std::move(edge_partition_indices), std::move(edge_partition_dcs_nzd_vertices), - cugraph::graph_meta_t{meta.number_of_vertices, - meta.number_of_edges, - graph_properties, - meta.partition, - meta.edge_partition_segment_offsets}), + cugraph::graph_meta_t{ + meta.number_of_vertices, + meta.number_of_edges, + graph_properties, + meta.partition, + meta.edge_partition_segment_offsets, + meta.edge_partition_hypersparse_degree_offsets}), std::move(edge_weights), std::move(edge_ids), std::move(edge_types), @@ -790,7 +908,7 @@ create_graph_from_edgelist_impl( handle.sync_stream(); std::vector edgelist_edge_counts(minor_comm_size, edge_t{0}); - auto edgelist_intra_partition_segment_offsets = std::vector>( + auto edgelist_intra_partition_segment_offset_vectors = std::vector>( minor_comm_size, std::vector(major_comm_size + 1, edge_t{0})); for (int i = 0; i < minor_comm_size; ++i) { edgelist_edge_counts[i] = std::accumulate(h_edge_counts.begin() + major_comm_size * i, @@ -798,7 +916,7 @@ create_graph_from_edgelist_impl( edge_t{0}); std::partial_sum(h_edge_counts.begin() + major_comm_size * i, h_edge_counts.begin() + major_comm_size * (i + 1), - edgelist_intra_partition_segment_offsets[i].begin() + 1); + edgelist_intra_partition_segment_offset_vectors[i].begin() + 1); } std::vector edgelist_displacements(minor_comm_size, edge_t{0}); std::partial_sum(edgelist_edge_counts.begin(), @@ -898,7 +1016,7 @@ create_graph_from_edgelist_impl( std::move(edge_partition_edgelist_weights), std::move(edge_partition_edgelist_edge_ids), std::move(edge_partition_edgelist_edge_types), - edgelist_intra_partition_segment_offsets, + edgelist_intra_partition_segment_offset_vectors, graph_properties, renumber); } @@ -1021,30 +1139,66 @@ create_graph_from_edgelist_impl( } } - // 1. groupby each edge chunks to their target local adjacency matrix partition (and further + auto num_chunks = edgelist_srcs.size(); + + // 1. 
set whether to temporarily compress vertex IDs or not in splitting edge chunks + + size_t compressed_v_size = + sizeof(vertex_t); // if set to a value smaller than sizeof(vertex_t), temporarily store vertex + // IDs in compressed_v_size byte variables + + static_assert((sizeof(vertex_t) == 4) || (sizeof(vertex_t) == 8)); + if constexpr (sizeof(vertex_t) == 8) { // 64 bit vertex ID + static_assert(std::is_signed_v); // __clzll takes a signed integer + + auto total_global_mem = handle.get_device_properties().totalGlobalMem; + size_t element_size = sizeof(vertex_t) * 2; + if (edgelist_weights) { element_size += sizeof(weight_t); } + if (edgelist_edge_ids) { element_size += sizeof(edge_id_t); } + if (edgelist_edge_types) { element_size += sizeof(edge_type_t); } + edge_t num_edges{0}; + for (size_t i = 0; i < edgelist_srcs.size(); ++i) { + num_edges += edgelist_srcs[i].size(); + } + bool compress{false}; + if (static_cast(num_edges) * element_size > + static_cast(total_global_mem * 0.5 /* tuning parameter */)) { + compress = true; + } + + if (compress) { + size_t min_clz{sizeof(vertex_t) * 8}; + for (size_t i = 0; i < num_chunks; ++i) { + min_clz = + thrust::transform_reduce(handle.get_thrust_policy(), + edgelist_srcs[i].begin(), + edgelist_srcs[i].end(), + cuda::proclaim_return_type([] __device__(auto v) { + return static_cast(__clzll(v)); + }), + min_clz, + thrust::minimum{}); + min_clz = + thrust::transform_reduce(handle.get_thrust_policy(), + edgelist_dsts[i].begin(), + edgelist_dsts[i].end(), + cuda::proclaim_return_type([] __device__(auto v) { + return static_cast(__clzll(v)); + }), + min_clz, + thrust::minimum{}); + } + compressed_v_size = sizeof(vertex_t) - (min_clz / 8); + compressed_v_size = std::max(compressed_v_size, size_t{1}); + } + } + + // 2. groupby each edge chunks to their target local adjacency matrix partition (and further // groupby within the local partition by applying the compute_gpu_id_from_vertex_t to minor vertex // IDs). - std::vector>> edgelist_partitioned_srcs( - edgelist_srcs.size()); - std::vector>> edgelist_partitioned_dsts( - edgelist_srcs.size()); - auto edgelist_partitioned_weights = - edgelist_weights ? std::make_optional>>>( - edgelist_srcs.size()) - : std::nullopt; - auto edgelist_partitioned_edge_ids = - edgelist_edge_ids - ? std::make_optional>>>( - edgelist_srcs.size()) - : std::nullopt; - auto edgelist_partitioned_edge_types = - edgelist_edge_types - ? 
std::make_optional>>>( - edgelist_srcs.size()) - : std::nullopt; - - for (size_t i = 0; i < edgelist_srcs.size(); ++i) { // iterate over input edge chunks + std::vector> edgelist_edge_offset_vectors(num_chunks); + for (size_t i = 0; i < num_chunks; ++i) { // iterate over input edge chunks std::optional> this_chunk_weights{std::nullopt}; if (edgelist_weights) { this_chunk_weights = std::move((*edgelist_weights)[i]); } std::optional> this_chunk_edge_ids{std::nullopt}; @@ -1060,6 +1214,9 @@ create_graph_from_edgelist_impl( this_chunk_edge_ids, this_chunk_edge_types, true); + if (this_chunk_weights) { (*edgelist_weights)[i] = std::move(*this_chunk_weights); } + if (this_chunk_edge_ids) { (*edgelist_edge_ids)[i] = std::move(*this_chunk_edge_ids); } + if (this_chunk_edge_types) { (*edgelist_edge_types)[i] = std::move(*this_chunk_edge_types); } std::vector h_this_chunk_edge_counts(d_this_chunk_edge_counts.size()); raft::update_host(h_this_chunk_edge_counts.data(), @@ -1067,132 +1224,84 @@ create_graph_from_edgelist_impl( d_this_chunk_edge_counts.size(), handle.get_stream()); handle.sync_stream(); - std::vector h_this_chunk_edge_displacements(h_this_chunk_edge_counts.size()); - std::exclusive_scan(h_this_chunk_edge_counts.begin(), + std::vector h_this_chunk_edge_offsets( + h_this_chunk_edge_counts.size() + 1, + 0); // size = minor_comm_size (# local edge partitions) * major_comm_size (# segments in the + // local minor range) + std::inclusive_scan(h_this_chunk_edge_counts.begin(), h_this_chunk_edge_counts.end(), - h_this_chunk_edge_displacements.begin(), - size_t{0}); - - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_srcs(h_this_chunk_edge_counts[j], handle.get_stream()); - auto input_first = edgelist_srcs[i].begin() + h_this_chunk_edge_displacements[j]; - thrust::copy( - handle.get_thrust_policy(), input_first, input_first + tmp_srcs.size(), tmp_srcs.begin()); - edgelist_partitioned_srcs[i].push_back(std::move(tmp_srcs)); - } - edgelist_srcs[i].resize(0, handle.get_stream()); - edgelist_srcs[i].shrink_to_fit(handle.get_stream()); - - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_dsts(h_this_chunk_edge_counts[j], handle.get_stream()); - auto input_first = edgelist_dsts[i].begin() + h_this_chunk_edge_displacements[j]; - thrust::copy( - handle.get_thrust_policy(), input_first, input_first + tmp_dsts.size(), tmp_dsts.begin()); - edgelist_partitioned_dsts[i].push_back(std::move(tmp_dsts)); - } - edgelist_dsts[i].resize(0, handle.get_stream()); - edgelist_dsts[i].shrink_to_fit(handle.get_stream()); - - if (this_chunk_weights) { - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_weights(h_this_chunk_edge_counts[j], handle.get_stream()); - auto input_first = (*this_chunk_weights).begin() + h_this_chunk_edge_displacements[j]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + tmp_weights.size(), - tmp_weights.begin()); - (*edgelist_partitioned_weights)[i].push_back(std::move(tmp_weights)); - } - (*this_chunk_weights).resize(0, handle.get_stream()); - (*this_chunk_weights).shrink_to_fit(handle.get_stream()); - } - - if (this_chunk_edge_ids) { - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # 
segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_edge_ids(h_this_chunk_edge_counts[j], - handle.get_stream()); - auto input_first = (*this_chunk_edge_ids).begin() + h_this_chunk_edge_displacements[j]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + tmp_edge_ids.size(), - tmp_edge_ids.begin()); - (*edgelist_partitioned_edge_ids)[i].push_back(std::move(tmp_edge_ids)); - } - (*this_chunk_edge_ids).resize(0, handle.get_stream()); - (*this_chunk_edge_ids).shrink_to_fit(handle.get_stream()); - } + h_this_chunk_edge_offsets.begin() + 1); + edgelist_edge_offset_vectors[i] = std::move(h_this_chunk_edge_offsets); + } - if (this_chunk_edge_types) { - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_edge_types(h_this_chunk_edge_counts[j], - handle.get_stream()); - auto input_first = (*this_chunk_edge_types).begin() + h_this_chunk_edge_displacements[j]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + tmp_edge_types.size(), - tmp_edge_types.begin()); - (*edgelist_partitioned_edge_types)[i].push_back(std::move(tmp_edge_types)); - } - (*this_chunk_edge_types).resize(0, handle.get_stream()); - (*this_chunk_edge_types).shrink_to_fit(handle.get_stream()); + // 3. compress edge chunk source/destination vertices to cut intermediate peak memory requirement + + std::optional>> edgelist_compressed_srcs{std::nullopt}; + std::optional>> edgelist_compressed_dsts{std::nullopt}; + if (compressed_v_size < sizeof(vertex_t)) { + edgelist_compressed_srcs = std::vector>{}; + edgelist_compressed_dsts = std::vector>{}; + (*edgelist_compressed_srcs).reserve(num_chunks); + (*edgelist_compressed_dsts).reserve(num_chunks); + for (size_t i = 0; i < num_chunks; ++i) { // iterate over input edge chunks + // compress source values + auto tmp_srcs = rmm::device_uvector(edgelist_srcs[i].size() * compressed_v_size, + handle.get_stream()); + auto input_src_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [src_first = edgelist_srcs[i].begin(), compressed_v_size] __device__(size_t i) { + auto v = static_cast(*(src_first + (i / compressed_v_size))); + return static_cast((v >> (8 * (i % compressed_v_size))) & uint64_t{0xff}); + })); + thrust::copy(handle.get_thrust_policy(), + input_src_first, + input_src_first + edgelist_srcs[i].size() * compressed_v_size, + tmp_srcs.begin()); + edgelist_srcs[i].resize(0, handle.get_stream()); + edgelist_srcs[i].shrink_to_fit(handle.get_stream()); + (*edgelist_compressed_srcs).push_back(std::move(tmp_srcs)); + + // compress destination values + + auto tmp_dsts = rmm::device_uvector(edgelist_dsts[i].size() * compressed_v_size, + handle.get_stream()); + auto input_dst_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [dst_first = edgelist_dsts[i].begin(), compressed_v_size] __device__(size_t i) { + auto v = static_cast(*(dst_first + (i / compressed_v_size))); + return static_cast((v >> (8 * (i % compressed_v_size))) & uint64_t{0xff}); + })); + thrust::copy(handle.get_thrust_policy(), + input_dst_first, + input_dst_first + edgelist_dsts[i].size() * compressed_v_size, + tmp_dsts.begin()); + edgelist_dsts[i].resize(0, handle.get_stream()); + edgelist_dsts[i].shrink_to_fit(handle.get_stream()); + (*edgelist_compressed_dsts).push_back(std::move(tmp_dsts)); } } - edgelist_srcs.clear(); - 
edgelist_dsts.clear(); - if (edgelist_weights) { (*edgelist_weights).clear(); } - if (edgelist_edge_ids) { (*edgelist_edge_ids).clear(); } - if (edgelist_edge_types) { (*edgelist_edge_types).clear(); } - // 2. split the grouped edge chunks to local partitions + // 4. compute additional copy_offset vectors - auto edgelist_intra_partition_segment_offsets = std::vector>(minor_comm_size); - - std::vector> edge_partition_edgelist_srcs{}; - edge_partition_edgelist_srcs.reserve(minor_comm_size); - std::vector> edge_partition_edgelist_dsts{}; - edge_partition_edgelist_dsts.reserve(minor_comm_size); - auto edge_partition_edgelist_weights = - edgelist_partitioned_weights ? std::make_optional>>() - : std::nullopt; - if (edgelist_partitioned_weights) { (*edge_partition_edgelist_weights).reserve(minor_comm_size); } - auto edge_partition_edgelist_edge_ids = - edgelist_partitioned_edge_ids - ? std::make_optional>>() - : std::nullopt; - if (edgelist_partitioned_edge_ids) { - (*edge_partition_edgelist_edge_ids).reserve(minor_comm_size); - } - auto edge_partition_edgelist_edge_types = - edgelist_partitioned_edge_types - ? std::make_optional>>() - : std::nullopt; - if (edgelist_partitioned_edge_types) { - (*edge_partition_edgelist_edge_types).reserve(minor_comm_size); - } - - for (int i = 0; i < minor_comm_size; ++i) { // iterate over local edge partitions + std::vector edge_partition_edge_counts(minor_comm_size); + std::vector> edge_partition_intra_partition_segment_offset_vectors( + minor_comm_size); + std::vector> edge_partition_intra_segment_copy_output_displacement_vectors( + minor_comm_size); + for (int i = 0; i < minor_comm_size; ++i) { edge_t edge_count{0}; std::vector intra_partition_segment_sizes(major_comm_size, 0); - std::vector intra_segment_copy_output_displacements(major_comm_size * - edgelist_partitioned_srcs.size()); + std::vector intra_segment_copy_output_displacements(major_comm_size * num_chunks); for (int j = 0; j < major_comm_size /* # segments in the local minor range */; ++j) { edge_t displacement{0}; - for (size_t k = 0; k < edgelist_partitioned_srcs.size() /* # input edge chunks */; ++k) { - auto segment_size = edgelist_partitioned_srcs[k][i * major_comm_size + j].size(); + for (size_t k = 0; k < num_chunks; ++k) { + auto segment_size = edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - + edgelist_edge_offset_vectors[k][i * major_comm_size + j]; edge_count += segment_size; intra_partition_segment_sizes[j] += segment_size; - intra_segment_copy_output_displacements[j * edgelist_partitioned_srcs.size() + k] = - displacement; + intra_segment_copy_output_displacements[j * num_chunks + k] = displacement; displacement += segment_size; } } @@ -1201,93 +1310,133 @@ create_graph_from_edgelist_impl( intra_partition_segment_sizes.end(), intra_partition_segment_offsets.begin() + 1); - rmm::device_uvector tmp_srcs(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_srcs.size(); ++k) { - auto& input_buffer = edgelist_partitioned_srcs[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_srcs.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_srcs.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); - } - } - edge_partition_edgelist_srcs.push_back(std::move(tmp_srcs)); + edge_partition_edge_counts[i] = edge_count; + 
edge_partition_intra_partition_segment_offset_vectors[i] = + std::move(intra_partition_segment_offsets); + edge_partition_intra_segment_copy_output_displacement_vectors[i] = + std::move(intra_segment_copy_output_displacements); + } - rmm::device_uvector tmp_dsts(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { - auto& input_buffer = edgelist_partitioned_dsts[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_dsts.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); - } - } - edge_partition_edgelist_dsts.push_back(std::move(tmp_dsts)); + // 5. split the grouped edge chunks to local partitions - if (edge_partition_edgelist_weights) { - rmm::device_uvector tmp_weights(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { - auto& input_buffer = (*edgelist_partitioned_weights)[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_weights.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); - } - } - (*edge_partition_edgelist_weights).push_back(std::move(tmp_weights)); - } + std::vector> edge_partition_edgelist_srcs{}; + std::vector> edge_partition_edgelist_dsts{}; + std::optional>> edge_partition_edgelist_weights{ + std::nullopt}; + std::optional>> edge_partition_edgelist_edge_ids{ + std::nullopt}; + std::optional>> edge_partition_edgelist_edge_types{ + std::nullopt}; - if (edge_partition_edgelist_edge_ids) { - rmm::device_uvector tmp_edge_ids(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { - auto& input_buffer = (*edgelist_partitioned_edge_ids)[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_edge_ids.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); - } - } - (*edge_partition_edgelist_edge_ids).push_back(std::move(tmp_edge_ids)); - } + std::optional>> + edge_partition_edgelist_compressed_srcs{}; + std::optional>> + edge_partition_edgelist_compressed_dsts{}; - if (edge_partition_edgelist_edge_types) { - rmm::device_uvector tmp_edge_types(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { - auto& input_buffer = (*edgelist_partitioned_edge_types)[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_edge_types.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); - } - } - 
(*edge_partition_edgelist_edge_types).push_back(std::move(tmp_edge_types)); - } + if (compressed_v_size < sizeof(vertex_t)) { + edge_partition_edgelist_compressed_srcs = + split_edge_chunk_compressed_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_compressed_srcs), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors, + compressed_v_size); + + edge_partition_edgelist_compressed_dsts = + split_edge_chunk_compressed_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_compressed_dsts), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors, + compressed_v_size); + } else { + edge_partition_edgelist_srcs = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(edgelist_srcs), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + + edge_partition_edgelist_dsts = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(edgelist_dsts), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + } + + if (edgelist_weights) { + edge_partition_edgelist_weights = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_weights), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + } + if (edgelist_edge_ids) { + edge_partition_edgelist_edge_ids = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_edge_ids), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + } + if (edgelist_edge_types) { + edge_partition_edgelist_edge_types = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_edge_types), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + } - edgelist_intra_partition_segment_offsets[i] = std::move(intra_partition_segment_offsets); + // 6. 
decompress edge chunk source/destination vertices to cut intermediate peak memory + // requirement + + if (compressed_v_size < sizeof(vertex_t)) { + assert(edge_partition_edgelist_compressed_srcs); + assert(edge_partition_edgelist_compressed_dsts); + + edge_partition_edgelist_srcs.reserve(minor_comm_size); + edge_partition_edgelist_dsts.reserve(minor_comm_size); + + for (int i = 0; i < minor_comm_size; ++i) { + rmm::device_uvector tmp_srcs(edge_partition_edge_counts[i], handle.get_stream()); + decompress_vertices( + handle, + raft::device_span((*edge_partition_edgelist_compressed_srcs)[i].data(), + (*edge_partition_edgelist_compressed_srcs)[i].size()), + raft::device_span(tmp_srcs.data(), tmp_srcs.size()), + compressed_v_size); + edge_partition_edgelist_srcs.push_back(std::move(tmp_srcs)); + (*edge_partition_edgelist_compressed_srcs)[i].resize(0, handle.get_stream()); + (*edge_partition_edgelist_compressed_srcs)[i].shrink_to_fit(handle.get_stream()); + + rmm::device_uvector tmp_dsts(edge_partition_edge_counts[i], handle.get_stream()); + decompress_vertices( + handle, + raft::device_span((*edge_partition_edgelist_compressed_dsts)[i].data(), + (*edge_partition_edgelist_compressed_dsts)[i].size()), + raft::device_span(tmp_dsts.data(), tmp_dsts.size()), + compressed_v_size); + edge_partition_edgelist_dsts.push_back(std::move(tmp_dsts)); + (*edge_partition_edgelist_compressed_dsts)[i].resize(0, handle.get_stream()); + (*edge_partition_edgelist_compressed_dsts)[i].shrink_to_fit(handle.get_stream()); + } } return create_graph_from_partitioned_edgelist(edgelist_srcs.data(), edgelist_srcs.size()), raft::device_span(edgelist_dsts.data(), edgelist_dsts.size()))), - "Invalid input arguments: graph_properties.is_symmetric is true but the input edge list is " + "Invalid input arguments: graph_properties.is_symmetric is true but the input edge " + "list is " "not symmetric."); } @@ -1377,7 +1527,8 @@ create_graph_from_edgelist_impl( handle, raft::device_span(edgelist_srcs.data(), edgelist_srcs.size()), raft::device_span(edgelist_dsts.data(), edgelist_dsts.size())), - "Invalid input arguments: graph_properties.is_multigraph is false but the input edge list " + "Invalid input arguments: graph_properties.is_multigraph is false but the input edge " + "list " "has parallel edges."); } } @@ -1605,7 +1756,8 @@ create_graph_from_edgelist_impl( cugraph::graph_meta_t{ num_vertices, graph_properties, - renumber ? std::optional>{meta.segment_offsets} : std::nullopt}), + renumber ? 
std::optional>{meta.segment_offsets} : std::nullopt, + meta.hypersparse_degree_offsets}), std::move(edge_weights), std::move(edge_ids), std::move(edge_types), @@ -1759,15 +1911,15 @@ create_graph_from_edgelist_impl( renumber); if (graph_properties.is_symmetric) { - CUGRAPH_EXPECTS( - (check_symmetric( - handle, - raft::device_span(aggregate_edgelist_srcs.data(), - aggregate_edgelist_srcs.size()), - raft::device_span(aggregate_edgelist_dsts.data(), - aggregate_edgelist_dsts.size()))), - "Invalid input arguments: graph_properties.is_symmetric is true but the input edge list is " - "not symmetric."); + CUGRAPH_EXPECTS((check_symmetric( + handle, + raft::device_span(aggregate_edgelist_srcs.data(), + aggregate_edgelist_srcs.size()), + raft::device_span(aggregate_edgelist_dsts.data(), + aggregate_edgelist_dsts.size()))), + "Invalid input arguments: graph_properties.is_symmetric is true but the " + "input edge list is " + "not symmetric."); } if (!graph_properties.is_multigraph) { @@ -1777,7 +1929,8 @@ create_graph_from_edgelist_impl( aggregate_edgelist_srcs.size()), raft::device_span(aggregate_edgelist_dsts.data(), aggregate_edgelist_dsts.size())), - "Invalid input arguments: graph_properties.is_multigraph is false but the input edge list " + "Invalid input arguments: graph_properties.is_multigraph is false but " + "the input edge list " "has parallel edges."); } } diff --git a/cpp/src/structure/detail/structure_utils.cuh b/cpp/src/structure/detail/structure_utils.cuh index 1ef975c1de..86e3c45ca2 100644 --- a/cpp/src/structure/detail/structure_utils.cuh +++ b/cpp/src/structure/detail/structure_utils.cuh @@ -60,7 +60,8 @@ rmm::device_uvector compute_sparse_offsets( bool edgelist_major_sorted, rmm::cuda_stream_view stream_view) { - rmm::device_uvector offsets((major_range_last - major_range_first) + 1, stream_view); + rmm::device_uvector offsets(static_cast(major_range_last - major_range_first) + 1, + stream_view); if (edgelist_major_sorted) { offsets.set_element_to_zero_async(0, stream_view); thrust::upper_bound(rmm::exec_policy(stream_view), @@ -77,7 +78,9 @@ rmm::device_uvector compute_sparse_offsets( edgelist_major_first, edgelist_major_last, [offset_view, major_range_first] __device__(auto v) { - atomicAdd(&offset_view[v - major_range_first], edge_t{1}); + cuda::atomic_ref atomic_counter( + offset_view[v - major_range_first]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); }); thrust::exclusive_scan( @@ -246,30 +249,112 @@ sort_and_compress_edgelist(rmm::device_uvector&& edgelist_srcs, rmm::device_uvector offsets(0, stream_view); rmm::device_uvector indices(0, stream_view); - auto edge_first = thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_minors.begin()); if (edgelist_minors.size() > mem_frugal_threshold) { - offsets = compute_sparse_offsets(edgelist_majors.begin(), - edgelist_majors.end(), - major_range_first, - major_range_last, - false, - stream_view); + static_assert((sizeof(vertex_t) == 4) || (sizeof(vertex_t) == 8)); + if ((sizeof(vertex_t) == 8) && (static_cast(major_range_last - major_range_first) <= + static_cast(std::numeric_limits::max()))) { + rmm::device_uvector edgelist_major_offsets(edgelist_majors.size(), stream_view); + thrust::transform( + rmm::exec_policy_nosync(stream_view), + edgelist_majors.begin(), + edgelist_majors.end(), + edgelist_major_offsets.begin(), + cuda::proclaim_return_type([major_range_first] __device__(vertex_t major) { + return static_cast(major - major_range_first); + })); + edgelist_majors.resize(0, stream_view); + 
edgelist_majors.shrink_to_fit(stream_view); + + offsets = + compute_sparse_offsets(edgelist_major_offsets.begin(), + edgelist_major_offsets.end(), + uint32_t{0}, + static_cast(major_range_last - major_range_first), + false, + stream_view); + std::array pivots{}; + for (size_t i = 0; i < 3; ++i) { + pivots[i] = static_cast(thrust::distance( + offsets.begin(), + thrust::lower_bound(rmm::exec_policy(stream_view), + offsets.begin(), + offsets.end(), + static_cast((edgelist_major_offsets.size() * (i + 1)) / 4)))); + } - auto pivot = major_range_first + static_cast(thrust::distance( - offsets.begin(), - thrust::lower_bound(rmm::exec_policy(stream_view), - offsets.begin(), - offsets.end(), - edgelist_minors.size() / 2))); - auto second_first = - detail::mem_frugal_partition(edge_first, - edge_first + edgelist_minors.size(), - thrust_tuple_get, 0>{}, - pivot, - stream_view); - thrust::sort(rmm::exec_policy(stream_view), edge_first, second_first); - thrust::sort(rmm::exec_policy(stream_view), second_first, edge_first + edgelist_minors.size()); + auto pair_first = + thrust::make_zip_iterator(edgelist_major_offsets.begin(), edgelist_minors.begin()); + auto second_half_first = + detail::mem_frugal_partition(pair_first, + pair_first + edgelist_major_offsets.size(), + thrust_tuple_get, 0>{}, + pivots[1], + stream_view); + auto second_quarter_first = + detail::mem_frugal_partition(pair_first, + second_half_first, + thrust_tuple_get, 0>{}, + pivots[0], + stream_view); + auto last_quarter_first = + detail::mem_frugal_partition(second_half_first, + pair_first + edgelist_major_offsets.size(), + thrust_tuple_get, 0>{}, + pivots[2], + stream_view); + thrust::sort(rmm::exec_policy(stream_view), pair_first, second_quarter_first); + thrust::sort(rmm::exec_policy(stream_view), second_quarter_first, second_half_first); + thrust::sort(rmm::exec_policy(stream_view), second_half_first, last_quarter_first); + thrust::sort(rmm::exec_policy(stream_view), + last_quarter_first, + pair_first + edgelist_major_offsets.size()); + } else { + offsets = compute_sparse_offsets(edgelist_majors.begin(), + edgelist_majors.end(), + major_range_first, + major_range_last, + false, + stream_view); + std::array pivots{}; + for (size_t i = 0; i < 3; ++i) { + pivots[i] = + major_range_first + + static_cast(thrust::distance( + offsets.begin(), + thrust::lower_bound(rmm::exec_policy(stream_view), + offsets.begin(), + offsets.end(), + static_cast((edgelist_minors.size() * (i + 1)) / 4)))); + } + auto edge_first = thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_minors.begin()); + auto second_half_first = + detail::mem_frugal_partition(edge_first, + edge_first + edgelist_majors.size(), + thrust_tuple_get, 0>{}, + pivots[1], + stream_view); + auto second_quarter_first = + detail::mem_frugal_partition(edge_first, + second_half_first, + thrust_tuple_get, 0>{}, + pivots[0], + stream_view); + auto last_quarter_first = + detail::mem_frugal_partition(second_half_first, + edge_first + edgelist_majors.size(), + thrust_tuple_get, 0>{}, + pivots[2], + stream_view); + thrust::sort(rmm::exec_policy(stream_view), edge_first, second_quarter_first); + thrust::sort(rmm::exec_policy(stream_view), second_quarter_first, second_half_first); + thrust::sort(rmm::exec_policy(stream_view), second_half_first, last_quarter_first); + thrust::sort( + rmm::exec_policy(stream_view), last_quarter_first, edge_first + edgelist_majors.size()); + edgelist_majors.resize(0, stream_view); + edgelist_majors.shrink_to_fit(stream_view); + } } else { + auto edge_first = 
thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_minors.begin()); thrust::sort(rmm::exec_policy(stream_view), edge_first, edge_first + edgelist_minors.size()); offsets = compute_sparse_offsets(edgelist_majors.begin(), edgelist_majors.end(), @@ -277,12 +362,11 @@ sort_and_compress_edgelist(rmm::device_uvector&& edgelist_srcs, major_range_last, true, stream_view); + edgelist_majors.resize(0, stream_view); + edgelist_majors.shrink_to_fit(stream_view); } indices = std::move(edgelist_minors); - edgelist_majors.resize(0, stream_view); - edgelist_majors.shrink_to_fit(stream_view); - std::optional> dcs_nzd_vertices{std::nullopt}; if (major_hypersparse_first) { std::tie(offsets, dcs_nzd_vertices) = compress_hypersparse_offsets(std::move(offsets), diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index ef43b7b13e..6661f0488d 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -146,8 +146,7 @@ update_local_sorted_unique_edge_majors_minors( auto num_segments_per_vertex_partition = static_cast(meta.edge_partition_segment_offsets.size() / minor_comm_size); - auto use_dcs = - num_segments_per_vertex_partition > (detail::num_sparse_segments_per_vertex_partition + 2); + auto use_dcs = edge_partition_dcs_nzd_vertices.has_value(); std::optional>> local_sorted_unique_edge_majors{ std::nullopt}; @@ -166,14 +165,15 @@ update_local_sorted_unique_edge_majors_minors( // 1. Update local_sorted_unique_edge_minors & local_sorted_unique_edge_minor_offsets - { + if (detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold > 0.0) { auto [minor_range_first, minor_range_last] = meta.partition.local_edge_partition_minor_range(); auto minor_range_size = meta.partition.local_edge_partition_minor_range_size(); - rmm::device_uvector minor_bitmaps( - (minor_range_size + (sizeof(uint32_t) * 8 - 1)) / (sizeof(uint32_t) * 8), - handle.get_stream()); - thrust::fill( - handle.get_thrust_policy(), minor_bitmaps.begin(), minor_bitmaps.end(), uint32_t{0}); + rmm::device_uvector minor_bitmaps(packed_bool_size(minor_range_size), + handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + minor_bitmaps.begin(), + minor_bitmaps.end(), + packed_bool_empty_mask()); for (size_t i = 0; i < edge_partition_indices.size(); ++i) { thrust::for_each(handle.get_thrust_policy(), edge_partition_indices[i].begin(), @@ -281,92 +281,96 @@ update_local_sorted_unique_edge_majors_minors( // 2. 
Update local_sorted_unique_edge_majors & local_sorted_unique_edge_major_offsets - std::vector num_local_unique_edge_major_counts(edge_partition_offsets.size()); - for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { - num_local_unique_edge_major_counts[i] += thrust::count_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(edge_partition_offsets[i].size() - 1)), - has_nzd_t{edge_partition_offsets[i].data(), vertex_t{0}}); - } - auto num_local_unique_edge_majors = std::reduce(num_local_unique_edge_major_counts.begin(), - num_local_unique_edge_major_counts.end()); - - vertex_t aggregate_major_range_size{0}; - for (size_t i = 0; i < meta.partition.number_of_local_edge_partitions(); ++i) { - aggregate_major_range_size += meta.partition.local_edge_partition_major_range_size(i); - } - - auto max_major_properties_fill_ratio = - host_scalar_allreduce(comm, - static_cast(num_local_unique_edge_majors) / - static_cast(aggregate_major_range_size), - raft::comms::op_t::MAX, - handle.get_stream()); + if (detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold > 0.0) { + std::vector num_local_unique_edge_major_counts(edge_partition_offsets.size()); + for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { + num_local_unique_edge_major_counts[i] = thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(static_cast(edge_partition_offsets[i].size() - 1)), + has_nzd_t{edge_partition_offsets[i].data(), vertex_t{0}}); + } + auto num_local_unique_edge_majors = std::reduce(num_local_unique_edge_major_counts.begin(), + num_local_unique_edge_major_counts.end()); - if (max_major_properties_fill_ratio < - detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold) { - auto const chunk_size = - static_cast(std::min(1.0 / max_major_properties_fill_ratio, 1024.0)); + vertex_t aggregate_major_range_size{0}; + for (size_t i = 0; i < meta.partition.number_of_local_edge_partitions(); ++i) { + aggregate_major_range_size += meta.partition.local_edge_partition_major_range_size(i); + } - local_sorted_unique_edge_majors = std::vector>{}; - local_sorted_unique_edge_major_chunk_start_offsets = - std::vector>{}; + auto max_major_properties_fill_ratio = + host_scalar_allreduce(comm, + static_cast(num_local_unique_edge_majors) / + static_cast(aggregate_major_range_size), + raft::comms::op_t::MAX, + handle.get_stream()); - (*local_sorted_unique_edge_majors).reserve(edge_partition_offsets.size()); - (*local_sorted_unique_edge_major_chunk_start_offsets).reserve(edge_partition_offsets.size()); - for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { - auto [major_range_first, major_range_last] = - meta.partition.local_edge_partition_major_range(i); - auto sparse_range_last = - use_dcs - ? 
(major_range_first + - meta.edge_partition_segment_offsets[num_segments_per_vertex_partition * i + + if (max_major_properties_fill_ratio < + detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold) { + auto const chunk_size = + static_cast(std::min(1.0 / max_major_properties_fill_ratio, 1024.0)); + + local_sorted_unique_edge_majors = std::vector>{}; + local_sorted_unique_edge_major_chunk_start_offsets = + std::vector>{}; + + (*local_sorted_unique_edge_majors).reserve(edge_partition_offsets.size()); + (*local_sorted_unique_edge_major_chunk_start_offsets).reserve(edge_partition_offsets.size()); + for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { + auto [major_range_first, major_range_last] = + meta.partition.local_edge_partition_major_range(i); + auto sparse_range_last = + use_dcs + ? (major_range_first + + meta + .edge_partition_segment_offsets[num_segments_per_vertex_partition * i + detail::num_sparse_segments_per_vertex_partition]) - : major_range_last; - - rmm::device_uvector unique_edge_majors(num_local_unique_edge_major_counts[i], - handle.get_stream()); - CUGRAPH_EXPECTS( - sparse_range_last - major_range_first < std::numeric_limits::max(), - "copy_if will fail (https://github.com/NVIDIA/thrust/issues/1302), work-around required."); - auto cur_size = thrust::distance( - unique_edge_majors.begin(), - thrust::copy_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(major_range_first), - thrust::make_counting_iterator(sparse_range_last), + : major_range_last; + + rmm::device_uvector unique_edge_majors(num_local_unique_edge_major_counts[i], + handle.get_stream()); + CUGRAPH_EXPECTS(sparse_range_last - major_range_first < std::numeric_limits::max(), + "copy_if will fail (https://github.com/NVIDIA/thrust/issues/1302), " + "work-around required."); + auto cur_size = thrust::distance( unique_edge_majors.begin(), - has_nzd_t{edge_partition_offsets[i].data(), major_range_first})); - if (use_dcs) { - thrust::copy(handle.get_thrust_policy(), - (*edge_partition_dcs_nzd_vertices)[i].begin(), - (*edge_partition_dcs_nzd_vertices)[i].end(), - unique_edge_majors.begin() + cur_size); + thrust::copy_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(major_range_first), + thrust::make_counting_iterator(sparse_range_last), + unique_edge_majors.begin(), + has_nzd_t{edge_partition_offsets[i].data(), major_range_first})); + if (use_dcs) { + thrust::copy(handle.get_thrust_policy(), + (*edge_partition_dcs_nzd_vertices)[i].begin(), + (*edge_partition_dcs_nzd_vertices)[i].end(), + unique_edge_majors.begin() + cur_size); + } + + auto num_chunks = static_cast( + ((major_range_last - major_range_first) + (chunk_size - size_t{1})) / chunk_size); + rmm::device_uvector unique_edge_major_chunk_start_offsets(num_chunks + size_t{1}, + handle.get_stream()); + + auto chunk_start_vertex_first = + thrust::make_transform_iterator(thrust::make_counting_iterator(vertex_t{0}), + detail::multiply_and_add_t{ + static_cast(chunk_size), major_range_first}); + thrust::lower_bound(handle.get_thrust_policy(), + unique_edge_majors.begin(), + unique_edge_majors.end(), + chunk_start_vertex_first, + chunk_start_vertex_first + num_chunks, + unique_edge_major_chunk_start_offsets.begin()); + unique_edge_major_chunk_start_offsets.set_element( + num_chunks, static_cast(unique_edge_majors.size()), handle.get_stream()); + + (*local_sorted_unique_edge_majors).push_back(std::move(unique_edge_majors)); + (*local_sorted_unique_edge_major_chunk_start_offsets) + 
.push_back(std::move(unique_edge_major_chunk_start_offsets)); } - - auto num_chunks = static_cast( - ((major_range_last - major_range_first) + (chunk_size - size_t{1})) / chunk_size); - rmm::device_uvector unique_edge_major_chunk_start_offsets(num_chunks + size_t{1}, - handle.get_stream()); - - auto chunk_start_vertex_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(vertex_t{0}), - detail::multiply_and_add_t{static_cast(chunk_size), major_range_first}); - thrust::lower_bound(handle.get_thrust_policy(), - unique_edge_majors.begin(), - unique_edge_majors.end(), - chunk_start_vertex_first, - chunk_start_vertex_first + num_chunks, - unique_edge_major_chunk_start_offsets.begin()); - unique_edge_major_chunk_start_offsets.set_element( - num_chunks, static_cast(unique_edge_majors.size()), handle.get_stream()); - - (*local_sorted_unique_edge_majors).push_back(std::move(unique_edge_majors)); - (*local_sorted_unique_edge_major_chunk_start_offsets) - .push_back(std::move(unique_edge_major_chunk_start_offsets)); + local_sorted_unique_edge_major_chunk_size = chunk_size; } - local_sorted_unique_edge_major_chunk_size = chunk_size; } return std::make_tuple(std::move(local_sorted_unique_edge_majors), @@ -378,6 +382,50 @@ update_local_sorted_unique_edge_majors_minors( std::move(local_sorted_unique_edge_minor_vertex_partition_offsets)); } +template +std::enable_if_t>> +compute_edge_partition_dcs_nzd_range_bitmaps( + raft::handle_t const& handle, + graph_meta_t const& meta, + std::vector> const& edge_partition_dcs_nzd_vertices) +{ + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + auto num_segments_per_vertex_partition = + static_cast(meta.edge_partition_segment_offsets.size() / minor_comm_size); + + std::vector> edge_partition_dcs_nzd_range_bitmaps{}; + edge_partition_dcs_nzd_range_bitmaps.reserve(edge_partition_dcs_nzd_vertices.size()); + for (size_t i = 0; i < edge_partition_dcs_nzd_vertices.size(); ++i) { + raft::host_span segment_offsets( + meta.edge_partition_segment_offsets.data() + num_segments_per_vertex_partition * i, + num_segments_per_vertex_partition); + rmm::device_uvector bitmap( + packed_bool_size(segment_offsets[detail::num_sparse_segments_per_vertex_partition + 1] - + segment_offsets[detail::num_sparse_segments_per_vertex_partition]), + handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); + auto major_range_first = meta.partition.local_edge_partition_major_range_first(i); + auto major_hypersparse_first = + major_range_first + segment_offsets[detail::num_sparse_segments_per_vertex_partition]; + thrust::for_each(handle.get_thrust_policy(), + edge_partition_dcs_nzd_vertices[i].begin(), + edge_partition_dcs_nzd_vertices[i].end(), + [bitmap = raft::device_span(bitmap.data(), bitmap.size()), + major_hypersparse_first] __device__(auto major) { + auto offset = major - major_hypersparse_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(offset)]); + word.fetch_or(packed_bool_mask(offset), cuda::std::memory_order_relaxed); + }); + edge_partition_dcs_nzd_range_bitmaps.push_back(std::move(bitmap)); + } + + return edge_partition_dcs_nzd_range_bitmaps; +} + } // namespace template @@ -400,7 +448,8 @@ graph_t @@ -452,7 +506,8 @@ graph_t(indices.size()), meta.properties), offsets_(std::move(offsets)), indices_(std::move(indices)), - segment_offsets_(meta.segment_offsets) + 
segment_offsets_(meta.segment_offsets), + hypersparse_degree_offsets_(meta.hypersparse_degree_offsets) { } diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index f925a14273..31de9b1e5d 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -488,14 +488,18 @@ graph_view_t> const& edge_partition_indices, std::optional>> const& edge_partition_dcs_nzd_vertices, + std::optional>> const& + edge_partition_dcs_nzd_range_bitmaps, graph_view_meta_t meta) : detail::graph_base_t( meta.number_of_vertices, meta.number_of_edges, meta.properties), edge_partition_offsets_(edge_partition_offsets), edge_partition_indices_(edge_partition_indices), edge_partition_dcs_nzd_vertices_(edge_partition_dcs_nzd_vertices), + edge_partition_dcs_nzd_range_bitmaps_(edge_partition_dcs_nzd_range_bitmaps), partition_(meta.partition), edge_partition_segment_offsets_(meta.edge_partition_segment_offsets), + edge_partition_hypersparse_degree_offsets_(meta.edge_partition_hypersparse_degree_offsets), local_sorted_unique_edge_srcs_(meta.local_sorted_unique_edge_srcs), local_sorted_unique_edge_src_chunk_start_offsets_( meta.local_sorted_unique_edge_src_chunk_start_offsets), @@ -538,7 +542,8 @@ graph_view_t #include #include -#ifdef TIMING -#include -#endif #include @@ -127,10 +122,6 @@ extract_induced_subgraphs( raft::device_span subgraph_vertices, bool do_expensive_check) { -#ifdef TIMING - HighResTimer hr_timer; - hr_timer.start("extract_induced_subgraphs"); -#endif // 1. check input arguments if (do_expensive_check) { @@ -281,10 +272,6 @@ extract_induced_subgraphs( true, handle.get_stream()); -#ifdef TIMING - hr_timer.stop(); - hr_timer.display_and_clear(std::cout); -#endif return std::make_tuple(std::move(edge_majors), std::move(edge_minors), std::move(edge_weights), diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 41f81d72ab..bd7d48ac31 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -51,6 +51,8 @@ #include #include +#include + #include #include #include @@ -233,128 +235,299 @@ std::optional find_locally_unused_ext_vertex_id( : std::nullopt /* if the entire range of vertex_t is used */; } -// returns renumber map and segment_offsets +// returns renumber map, segment_offsets, and hypersparse_degree_offsets template -std::tuple, std::vector, vertex_t> compute_renumber_map( - raft::handle_t const& handle, - std::optional>&& local_vertices, - std::vector const& edgelist_majors, - std::vector const& edgelist_minors, - std::vector const& edgelist_edge_counts) +std::tuple, + std::vector, + std::optional>, + vertex_t> +compute_renumber_map(raft::handle_t const& handle, + std::optional>&& local_vertices, + std::vector const& edgelist_majors, + std::vector const& edgelist_minors, + std::vector const& edgelist_edge_counts) { - rmm::device_uvector sorted_local_vertices(0, handle.get_stream()); - - edge_t num_local_edges = std::reduce(edgelist_edge_counts.begin(), edgelist_edge_counts.end()); - - // 1. if local_vertices.has_value() is false, find unique vertices from edge majors (to construct - // local_vertices) + // 1. 
if local_vertices.has_value() is false, find unique vertices from edge majors & minors (to + // construct local_vertices) - rmm::device_uvector sorted_unique_majors(0, handle.get_stream()); + rmm::device_uvector sorted_local_vertices(0, handle.get_stream()); if (!local_vertices) { - sorted_unique_majors.resize(num_local_edges, handle.get_stream()); - size_t major_offset{0}; - for (size_t i = 0; i < edgelist_majors.size(); ++i) { - thrust::copy(handle.get_thrust_policy(), - edgelist_majors[i], - edgelist_majors[i] + edgelist_edge_counts[i], - sorted_unique_majors.begin() + major_offset); - thrust::sort(handle.get_thrust_policy(), - sorted_unique_majors.begin() + major_offset, - sorted_unique_majors.begin() + major_offset + edgelist_edge_counts[i]); - major_offset += static_cast(thrust::distance( - sorted_unique_majors.begin() + major_offset, - thrust::unique(handle.get_thrust_policy(), - sorted_unique_majors.begin() + major_offset, - sorted_unique_majors.begin() + major_offset + edgelist_edge_counts[i]))); + constexpr size_t num_bins{ + 8}; // increase the number of bins to cut peak memory usage (at the expense of additional + // computing), limit the maximum temporary memory usage to "size of local edge list + // majors|minors * 2 / # bins" + constexpr uint32_t hash_seed = + 1; // shouldn't be 0 (in that case this hash function will coincide with the hash function + // used to map vertices to GPUs, and we may not see the expected randomization) + + auto edge_major_count_vectors = num_bins > 1 + ? std::make_optional>>( + edgelist_majors.size(), std::vector(num_bins)) + : std::nullopt; + if (edge_major_count_vectors) { + for (size_t i = 0; i < edgelist_majors.size(); ++i) { + rmm::device_uvector d_edge_major_counts(num_bins, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + d_edge_major_counts.begin(), + d_edge_major_counts.end(), + edge_t{0}); + thrust::for_each( + handle.get_thrust_policy(), + edgelist_majors[i], + edgelist_majors[i] + edgelist_edge_counts[i], + [counts = raft::device_span(d_edge_major_counts.data(), + d_edge_major_counts.size())] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + cuda::atomic_ref atomic_counter( + counts[hash_func(v) % num_bins]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); + }); + raft::update_host((*edge_major_count_vectors)[i].data(), + d_edge_major_counts.data(), + d_edge_major_counts.size(), + handle.get_stream()); + } } - sorted_unique_majors.resize(major_offset, handle.get_stream()); - if (edgelist_majors.size() > 1) { - thrust::sort( - handle.get_thrust_policy(), sorted_unique_majors.begin(), sorted_unique_majors.end()); + auto edge_minor_count_vectors = num_bins > 1 + ? 
std::make_optional>>( + edgelist_minors.size(), std::vector(num_bins)) + : std::nullopt; + if (edge_minor_count_vectors) { + for (size_t i = 0; i < edgelist_minors.size(); ++i) { + rmm::device_uvector d_edge_minor_counts(num_bins, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + d_edge_minor_counts.begin(), + d_edge_minor_counts.end(), + edge_t{0}); + thrust::for_each( + handle.get_thrust_policy(), + edgelist_minors[i], + edgelist_minors[i] + edgelist_edge_counts[i], + [counts = raft::device_span(d_edge_minor_counts.data(), + d_edge_minor_counts.size())] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + cuda::atomic_ref atomic_counter( + counts[hash_func(v) % num_bins]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); + }); + raft::update_host((*edge_minor_count_vectors)[i].data(), + d_edge_minor_counts.data(), + d_edge_minor_counts.size(), + handle.get_stream()); + } } - sorted_unique_majors.shrink_to_fit(handle.get_stream()); - } - - // 2. if local_vertices.has_value() is false, find unique vertices from edge minors (to construct - // local_vertices) - rmm::device_uvector sorted_unique_minors(0, handle.get_stream()); - if (!local_vertices) { - sorted_unique_minors.resize(num_local_edges, handle.get_stream()); - size_t minor_offset{0}; - for (size_t i = 0; i < edgelist_minors.size(); ++i) { - thrust::copy(handle.get_thrust_policy(), - edgelist_minors[i], - edgelist_minors[i] + edgelist_edge_counts[i], - sorted_unique_minors.begin() + minor_offset); - thrust::sort(handle.get_thrust_policy(), - sorted_unique_minors.begin() + minor_offset, - sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]); - minor_offset += static_cast(thrust::distance( - sorted_unique_minors.begin() + minor_offset, - thrust::unique(handle.get_thrust_policy(), - sorted_unique_minors.begin() + minor_offset, - sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]))); - } - sorted_unique_minors.resize(minor_offset, handle.get_stream()); - if (edgelist_minors.size() > 1) { - thrust::sort( - handle.get_thrust_policy(), sorted_unique_minors.begin(), sorted_unique_minors.end()); - sorted_unique_minors.resize(thrust::distance(sorted_unique_minors.begin(), - thrust::unique(handle.get_thrust_policy(), - sorted_unique_minors.begin(), - sorted_unique_minors.end())), - handle.get_stream()); - } - sorted_unique_minors.shrink_to_fit(handle.get_stream()); - } + handle.sync_stream(); - // 3. update sorted_local_vertices. 
- // if local_vertices.has_value() is false, reconstruct local_vertices first + for (size_t i = 0; i < num_bins; ++i) { + rmm::device_uvector this_bin_sorted_unique_majors(0, handle.get_stream()); + { + std::vector> edge_partition_tmp_majors{}; // for bin "i" + edge_partition_tmp_majors.reserve(edgelist_majors.size()); + for (size_t j = 0; j < edgelist_majors.size(); ++j) { + rmm::device_uvector tmp_majors(0, handle.get_stream()); + if (num_bins > 1) { + tmp_majors.resize((*edge_major_count_vectors)[j][i], handle.get_stream()); + thrust::copy_if(handle.get_thrust_policy(), + edgelist_majors[j], + edgelist_majors[j] + edgelist_edge_counts[j], + tmp_majors.begin(), + [i] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + return (static_cast(hash_func(v) % num_bins) == i); + }); + } else { + tmp_majors.resize(edgelist_edge_counts[j], handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edgelist_majors[j], + edgelist_majors[j] + edgelist_edge_counts[j], + tmp_majors.begin()); + } + thrust::sort(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end()); + tmp_majors.resize( + thrust::distance( + tmp_majors.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end())), + handle.get_stream()); + tmp_majors.shrink_to_fit(handle.get_stream()); + + edge_partition_tmp_majors.push_back(std::move(tmp_majors)); + } + if constexpr (multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + std::vector tx_counts(minor_comm_size); + for (int j = 0; j < minor_comm_size; ++j) { + tx_counts[j] = edge_partition_tmp_majors[j].size(); + } + this_bin_sorted_unique_majors.resize(std::reduce(tx_counts.begin(), tx_counts.end()), + handle.get_stream()); + size_t output_offset{0}; + for (size_t j = 0; j < edge_partition_tmp_majors.size(); ++j) { + thrust::copy(handle.get_thrust_policy(), + edge_partition_tmp_majors[j].begin(), + edge_partition_tmp_majors[j].end(), + this_bin_sorted_unique_majors.begin() + output_offset); + output_offset += edge_partition_tmp_majors[j].size(); + } + this_bin_sorted_unique_majors = shuffle_and_unique_segment_sorted_values( + minor_comm, this_bin_sorted_unique_majors.begin(), tx_counts, handle.get_stream()); + } else { + this_bin_sorted_unique_majors = std::move(edge_partition_tmp_majors[0]); + } + } else { + this_bin_sorted_unique_majors = std::move(edge_partition_tmp_majors[0]); + } + } - if (local_vertices) { + rmm::device_uvector this_bin_sorted_unique_minors(0, handle.get_stream()); + { + std::vector> edge_partition_tmp_minors{}; // for bin "i" + edge_partition_tmp_minors.reserve(edgelist_minors.size()); + for (size_t j = 0; j < edgelist_minors.size(); ++j) { + rmm::device_uvector tmp_minors(0, handle.get_stream()); + if (num_bins > 1) { + tmp_minors.resize((*edge_minor_count_vectors)[j][i], handle.get_stream()); + thrust::copy_if(handle.get_thrust_policy(), + edgelist_minors[j], + edgelist_minors[j] + edgelist_edge_counts[j], + tmp_minors.begin(), + [i] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + return (static_cast(hash_func(v) % num_bins) == i); + }); + } else { + tmp_minors.resize(edgelist_edge_counts[j], handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edgelist_minors[j], + edgelist_minors[j] + edgelist_edge_counts[j], + tmp_minors.begin()); + } + thrust::sort(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end()); + 
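// note (illustrative comment): the sort above and the unique()/resize below deduplicate this
// bin's minors within a single edge partition only; duplicates that remain across edge
// partitions are removed after the per-partition results are concatenated, re-sorted, and
// uniqued further below.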
tmp_minors.resize( + thrust::distance( + tmp_minors.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end())), + handle.get_stream()); + tmp_minors.shrink_to_fit(handle.get_stream()); + + edge_partition_tmp_minors.push_back(std::move(tmp_minors)); + } + if (edge_partition_tmp_minors.size() == 1) { + this_bin_sorted_unique_minors = std::move(edge_partition_tmp_minors[0]); + } else { + edge_t aggregate_size{0}; + for (size_t j = 0; j < edge_partition_tmp_minors.size(); ++j) { + aggregate_size += edge_partition_tmp_minors[j].size(); + } + this_bin_sorted_unique_minors.resize(aggregate_size, handle.get_stream()); + size_t output_offset{0}; + for (size_t j = 0; j < edge_partition_tmp_minors.size(); ++j) { + thrust::copy(handle.get_thrust_policy(), + edge_partition_tmp_minors[j].begin(), + edge_partition_tmp_minors[j].end(), + this_bin_sorted_unique_minors.begin() + output_offset); + output_offset += edge_partition_tmp_minors[j].size(); + } + edge_partition_tmp_minors.clear(); + thrust::sort(handle.get_thrust_policy(), + this_bin_sorted_unique_minors.begin(), + this_bin_sorted_unique_minors.end()); + this_bin_sorted_unique_minors.resize( + thrust::distance(this_bin_sorted_unique_minors.begin(), + thrust::unique(handle.get_thrust_policy(), + this_bin_sorted_unique_minors.begin(), + this_bin_sorted_unique_minors.end())), + handle.get_stream()); + this_bin_sorted_unique_minors.shrink_to_fit(handle.get_stream()); + } + if constexpr (multi_gpu) { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + if (major_comm_size > 1) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + compute_gpu_id_from_ext_vertex_t gpu_id_func{ + comm_size, major_comm_size, minor_comm_size}; + auto d_tx_counts = groupby_and_count( + this_bin_sorted_unique_minors.begin(), + this_bin_sorted_unique_minors.end(), + [major_comm_size, minor_comm_size, gpu_id_func] __device__(auto v) { + return partition_manager::compute_major_comm_rank_from_global_comm_rank( + major_comm_size, minor_comm_size, gpu_id_func(v)); + }, + major_comm_size, + std::numeric_limits::max(), + handle.get_stream()); + std::vector h_tx_counts(d_tx_counts.size()); + raft::update_host( + h_tx_counts.data(), d_tx_counts.data(), d_tx_counts.size(), handle.get_stream()); + handle.sync_stream(); + std::vector tx_displacements(h_tx_counts.size()); + std::exclusive_scan( + h_tx_counts.begin(), h_tx_counts.end(), tx_displacements.begin(), size_t{0}); + for (int j = 0; j < major_comm_size; ++j) { + thrust::sort( + handle.get_thrust_policy(), + this_bin_sorted_unique_minors.begin() + tx_displacements[j], + this_bin_sorted_unique_minors.begin() + (tx_displacements[j] + h_tx_counts[j])); + } + this_bin_sorted_unique_minors = shuffle_and_unique_segment_sorted_values( + major_comm, this_bin_sorted_unique_minors.begin(), h_tx_counts, handle.get_stream()); + } + } + } + rmm::device_uvector this_bin_sorted_unique_vertices(0, handle.get_stream()); + { + rmm::device_uvector merged_vertices( + this_bin_sorted_unique_majors.size() + this_bin_sorted_unique_minors.size(), + handle.get_stream()); + thrust::merge(handle.get_thrust_policy(), + this_bin_sorted_unique_majors.begin(), + this_bin_sorted_unique_majors.end(), + this_bin_sorted_unique_minors.begin(), + 
this_bin_sorted_unique_minors.end(), + merged_vertices.begin()); + this_bin_sorted_unique_majors.resize(0, handle.get_stream()); + this_bin_sorted_unique_majors.shrink_to_fit(handle.get_stream()); + this_bin_sorted_unique_minors.resize(0, handle.get_stream()); + this_bin_sorted_unique_minors.shrink_to_fit(handle.get_stream()); + merged_vertices.resize(thrust::distance(merged_vertices.begin(), + thrust::unique(handle.get_thrust_policy(), + merged_vertices.begin(), + merged_vertices.end())), + handle.get_stream()); + merged_vertices.shrink_to_fit(handle.get_stream()); + this_bin_sorted_unique_vertices = std::move(merged_vertices); + } + if (sorted_local_vertices.size() == 0) { + sorted_local_vertices = std::move(this_bin_sorted_unique_vertices); + } else { + rmm::device_uvector merged_vertices( + sorted_local_vertices.size() + this_bin_sorted_unique_vertices.size(), + handle.get_stream()); + thrust::merge(handle.get_thrust_policy(), + sorted_local_vertices.begin(), + sorted_local_vertices.end(), + this_bin_sorted_unique_vertices.begin(), + this_bin_sorted_unique_vertices.end(), + merged_vertices.begin()); // merging two unique sets from different hash + // bins, so the merged set can't have duplicates + sorted_local_vertices = std::move(merged_vertices); + } + } + } else { sorted_local_vertices = std::move(*local_vertices); thrust::sort( handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); - } else { - sorted_local_vertices.resize(sorted_unique_majors.size() + sorted_unique_minors.size(), - handle.get_stream()); - - thrust::merge(handle.get_thrust_policy(), - sorted_unique_majors.begin(), - sorted_unique_majors.end(), - sorted_unique_minors.begin(), - sorted_unique_minors.end(), - sorted_local_vertices.begin()); - - sorted_unique_majors.resize(0, handle.get_stream()); - sorted_unique_majors.shrink_to_fit(handle.get_stream()); - sorted_unique_minors.resize(0, handle.get_stream()); - sorted_unique_minors.shrink_to_fit(handle.get_stream()); - - sorted_local_vertices.resize(thrust::distance(sorted_local_vertices.begin(), - thrust::unique(handle.get_thrust_policy(), - sorted_local_vertices.begin(), - sorted_local_vertices.end())), - handle.get_stream()); - sorted_local_vertices.shrink_to_fit(handle.get_stream()); - - if constexpr (multi_gpu) { - sorted_local_vertices = - cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning( - handle, std::move(sorted_local_vertices)); - thrust::sort( - handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); - sorted_local_vertices.resize(thrust::distance(sorted_local_vertices.begin(), - thrust::unique(handle.get_thrust_policy(), - sorted_local_vertices.begin(), - sorted_local_vertices.end())), - handle.get_stream()); - sorted_local_vertices.shrink_to_fit(handle.get_stream()); - } } + // 2. find an unused vertex ID + auto locally_unused_vertex_id = find_locally_unused_ext_vertex_id( handle, raft::device_span(sorted_local_vertices.data(), sorted_local_vertices.size()), @@ -363,17 +536,9 @@ std::tuple, std::vector, vertex_t> compu "Invalid input arguments: there is no unused value in the entire range of " "vertex_t, increase vertex_t to 64 bit."); - // 4. compute global degrees for the sorted local vertices + // 3. 
compute global degrees for the sorted local vertices rmm::device_uvector sorted_local_vertex_degrees(0, handle.get_stream()); - std::optional> stream_pool_indices{ - std::nullopt}; // FIXME: move this inside the if statement - - auto constexpr num_chunks = size_t{ - 2}; // tuning parameter, this trade-offs # binary searches (up to num_chunks times more binary - // searches can be necessary if num_unique_majors << edgelist_edge_counts[i]) and temporary - // buffer requirement (cut by num_chunks times), currently set to 2 to avoid peak memory - // usage happening in this part (especially when minor_comm_size is small) if constexpr (multi_gpu) { auto& comm = handle.get_comms(); @@ -386,94 +551,37 @@ std::tuple, std::vector, vertex_t> compu auto edge_partition_major_range_sizes = host_scalar_allgather(minor_comm, sorted_local_vertices.size(), handle.get_stream()); - if ((minor_comm_size >= 2) && (handle.get_stream_pool_size() >= 2)) { - auto vertex_edge_counts = host_scalar_allreduce( - comm, - thrust::make_tuple(static_cast(sorted_local_vertices.size()), num_local_edges), - raft::comms::op_t::SUM, - handle.get_stream()); - // memory footprint vs parallelism trade-off - // peak memory requirement per loop is approximately - // (V/P) * (sizeof(vertex_t) + sizeof(edge_t)) + - // (E / (comm_size * minor_comm_size)) / num_chunks * sizeof(vertex_t) * 2 + - // std::min(V/P, (E / (comm_size * minor_comm_size)) / num_chunks) * (sizeof(vertex_t) + - // sizeof(edge_t)) - // and limit temporary memory requirement to (E / comm_size) * sizeof(vertex_t) - auto avg_vertex_degree = thrust::get<0>(vertex_edge_counts) > 0 - ? static_cast(thrust::get<1>(vertex_edge_counts)) / - static_cast(thrust::get<0>(vertex_edge_counts)) - : double{0.0}; - auto num_streams = static_cast( - (avg_vertex_degree * sizeof(vertex_t)) / - (static_cast(sizeof(vertex_t) + sizeof(edge_t)) + - (((avg_vertex_degree / minor_comm_size) / num_chunks) * sizeof(vertex_t) * 2) + - (std::min(1.0, ((avg_vertex_degree / minor_comm_size) / num_chunks)) * - (sizeof(vertex_t) + sizeof(edge_t))))); - if (num_streams >= 2) { - stream_pool_indices = std::vector(num_streams); - std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); - handle.sync_stream(); - } - } - for (int i = 0; i < minor_comm_size; ++i) { - auto loop_stream = stream_pool_indices - ? 
handle.get_stream_from_stream_pool(i % (*stream_pool_indices).size()) - : handle.get_stream(); - - rmm::device_uvector sorted_majors(edge_partition_major_range_sizes[i], loop_stream); + rmm::device_uvector sorted_majors(edge_partition_major_range_sizes[i], + handle.get_stream()); device_bcast(minor_comm, sorted_local_vertices.data(), sorted_majors.data(), edge_partition_major_range_sizes[i], i, - loop_stream); + handle.get_stream()); - rmm::device_uvector sorted_major_degrees(sorted_majors.size(), loop_stream); - thrust::fill(rmm::exec_policy(loop_stream), + rmm::device_uvector sorted_major_degrees(sorted_majors.size(), handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), sorted_major_degrees.begin(), sorted_major_degrees.end(), edge_t{0}); - rmm::device_uvector tmp_majors(0, loop_stream); - tmp_majors.reserve( - (static_cast(edgelist_edge_counts[i]) + (num_chunks - 1)) / num_chunks, - loop_stream); - size_t offset{0}; - for (size_t j = 0; j < num_chunks; ++j) { - size_t this_chunk_size = - std::min(tmp_majors.capacity(), static_cast(edgelist_edge_counts[i]) - offset); - tmp_majors.resize(this_chunk_size, loop_stream); - thrust::copy(rmm::exec_policy(loop_stream), - edgelist_majors[i] + offset, - edgelist_majors[i] + offset + tmp_majors.size(), - tmp_majors.begin()); - thrust::sort(rmm::exec_policy(loop_stream), tmp_majors.begin(), tmp_majors.end()); - auto num_unique_majors = - thrust::count_if(rmm::exec_policy(loop_stream), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(tmp_majors.size()), - is_first_in_run_t{tmp_majors.data()}); - rmm::device_uvector tmp_keys(num_unique_majors, loop_stream); - rmm::device_uvector tmp_values(num_unique_majors, loop_stream); - thrust::reduce_by_key(rmm::exec_policy(loop_stream), - tmp_majors.begin(), - tmp_majors.end(), - thrust::make_constant_iterator(edge_t{1}), - tmp_keys.begin(), - tmp_values.begin()); - - auto kv_pair_first = - thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); - thrust::for_each(rmm::exec_policy(loop_stream), - kv_pair_first, - kv_pair_first + tmp_keys.size(), - search_and_increment_degree_t{ - sorted_majors.data(), - static_cast(sorted_majors.size()), - sorted_major_degrees.data()}); - offset += this_chunk_size; - } + thrust::for_each( + handle.get_thrust_policy(), + edgelist_majors[i], + edgelist_majors[i] + edgelist_edge_counts[i], + [sorted_majors = + raft::device_span(sorted_majors.data(), sorted_majors.size()), + sorted_major_degrees = raft::device_span( + sorted_major_degrees.data(), sorted_major_degrees.size())] __device__(auto major) { + auto it = + thrust::lower_bound(thrust::seq, sorted_majors.begin(), sorted_majors.end(), major); + assert((it != sorted_majors.end()) && (*it == major)); + cuda::atomic_ref atomic_counter( + sorted_major_degrees[thrust::distance(sorted_majors.begin(), it)]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); + }); device_reduce(minor_comm, sorted_major_degrees.begin(), @@ -481,11 +589,9 @@ std::tuple, std::vector, vertex_t> compu edge_partition_major_range_sizes[i], raft::comms::op_t::SUM, i, - loop_stream); + handle.get_stream()); if (i == minor_comm_rank) { sorted_local_vertex_degrees = std::move(sorted_major_degrees); } } - - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } } else { assert(edgelist_majors.size() == 1); @@ -495,47 +601,24 @@ std::tuple, std::vector, vertex_t> compu sorted_local_vertex_degrees.end(), edge_t{0}); - rmm::device_uvector tmp_majors(0, 
handle.get_stream()); - tmp_majors.reserve(static_cast(edgelist_edge_counts[0] + (num_chunks - 1)) / num_chunks, - handle.get_stream()); - size_t offset{0}; - for (size_t i = 0; i < num_chunks; ++i) { - size_t this_chunk_size = - std::min(tmp_majors.capacity(), static_cast(edgelist_edge_counts[0]) - offset); - tmp_majors.resize(this_chunk_size, handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - edgelist_majors[0] + offset, - edgelist_majors[0] + offset + tmp_majors.size(), - tmp_majors.begin()); - thrust::sort(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end()); - auto num_unique_majors = - thrust::count_if(handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(tmp_majors.size()), - is_first_in_run_t{tmp_majors.data()}); - rmm::device_uvector tmp_keys(num_unique_majors, handle.get_stream()); - rmm::device_uvector tmp_values(num_unique_majors, handle.get_stream()); - thrust::reduce_by_key(handle.get_thrust_policy(), - tmp_majors.begin(), - tmp_majors.end(), - thrust::make_constant_iterator(edge_t{1}), - tmp_keys.begin(), - tmp_values.begin()); - - auto kv_pair_first = - thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); - thrust::for_each(handle.get_thrust_policy(), - kv_pair_first, - kv_pair_first + tmp_keys.size(), - search_and_increment_degree_t{ - sorted_local_vertices.data(), - static_cast(sorted_local_vertices.size()), - sorted_local_vertex_degrees.data()}); - offset += this_chunk_size; - } + thrust::for_each(handle.get_thrust_policy(), + edgelist_majors[0], + edgelist_majors[0] + edgelist_edge_counts[0], + [sorted_majors = raft::device_span( + sorted_local_vertices.data(), sorted_local_vertices.size()), + sorted_major_degrees = raft::device_span( + sorted_local_vertex_degrees.data(), + sorted_local_vertex_degrees.size())] __device__(auto major) { + auto it = thrust::lower_bound( + thrust::seq, sorted_majors.begin(), sorted_majors.end(), major); + assert((it != sorted_majors.end()) && (*it == major)); + cuda::atomic_ref atomic_counter( + sorted_major_degrees[thrust::distance(sorted_majors.begin(), it)]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); + }); } - // 4. sort local vertices by degree (descending) + // 5. sort local vertices by degree (descending) thrust::sort_by_key(handle.get_thrust_policy(), sorted_local_vertex_degrees.begin(), @@ -543,7 +626,7 @@ std::tuple, std::vector, vertex_t> compu sorted_local_vertices.begin(), thrust::greater()); - // 5. compute segment_offsets + // 6. 
compute segment_offsets static_assert(detail::num_sparse_segments_per_vertex_partition == 3); static_assert((detail::low_degree_threshold <= detail::mid_degree_threshold) && @@ -553,57 +636,85 @@ std::tuple, std::vector, vertex_t> compu (detail::hypersparse_threshold_ratio <= 1.0)); size_t mid_degree_threshold{detail::mid_degree_threshold}; size_t low_degree_threshold{detail::low_degree_threshold}; - size_t hypersparse_degree_threshold{0}; + size_t hypersparse_degree_threshold{1}; if (multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); mid_degree_threshold *= minor_comm_size; low_degree_threshold *= minor_comm_size; - hypersparse_degree_threshold = - static_cast(minor_comm_size * detail::hypersparse_threshold_ratio); + hypersparse_degree_threshold = std::max( + static_cast(minor_comm_size * detail::hypersparse_threshold_ratio), size_t{1}); } - auto num_segments_per_vertex_partition = - detail::num_sparse_segments_per_vertex_partition + - (hypersparse_degree_threshold > 0 ? size_t{2} : size_t{1}); // last is 0-degree segment - rmm::device_uvector d_thresholds(num_segments_per_vertex_partition - 1, - handle.get_stream()); - auto h_thresholds = - hypersparse_degree_threshold > 0 - ? std::vector{static_cast(mid_degree_threshold), - static_cast(low_degree_threshold), - static_cast(hypersparse_degree_threshold), - std::min(static_cast(hypersparse_degree_threshold), edge_t{1})} - : std::vector{static_cast(mid_degree_threshold), - static_cast(low_degree_threshold), - edge_t{1}}; - raft::update_device( - d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), handle.get_stream()); - - rmm::device_uvector d_segment_offsets(num_segments_per_vertex_partition + 1, - handle.get_stream()); - auto vertex_count = static_cast(sorted_local_vertices.size()); - d_segment_offsets.set_element_to_zero_async(0, handle.get_stream()); - d_segment_offsets.set_element( - num_segments_per_vertex_partition, vertex_count, handle.get_stream()); + std::vector h_segment_offsets{}; + std::optional> h_hypersparse_degree_offsets{}; + { + auto num_partitions = detail::num_sparse_segments_per_vertex_partition /* high, mid, low */ + + (hypersparse_degree_threshold > 1 + ? 
hypersparse_degree_threshold - size_t{1} + /* one partition per each global degree in the hypersparse region */ + : size_t{0}) + + size_t{1} /* zero */; + rmm::device_uvector d_thresholds(num_partitions - 1, handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + d_thresholds.begin(), + d_thresholds.end(), + [mid_degree_threshold, + low_degree_threshold, + hypersparse_degree_threshold] __device__(size_t i) { + if (i == 0) { + return mid_degree_threshold; // high,mid boundary + } else if (i == 1) { + return low_degree_threshold; // mid, low boundary + } else { + assert(hypersparse_degree_threshold > (i - 2)); + return hypersparse_degree_threshold - (i - 2); + } + }); + rmm::device_uvector d_offsets(num_partitions + 1, handle.get_stream()); + d_offsets.set_element_to_zero_async(0, handle.get_stream()); + auto vertex_count = static_cast(sorted_local_vertices.size()); + d_offsets.set_element(num_partitions, vertex_count, handle.get_stream()); + thrust::upper_bound(handle.get_thrust_policy(), + sorted_local_vertex_degrees.begin(), + sorted_local_vertex_degrees.end(), + d_thresholds.begin(), + d_thresholds.end(), + d_offsets.begin() + 1, + thrust::greater{}); + std::vector h_offsets(d_offsets.size()); + raft::update_host(h_offsets.data(), d_offsets.data(), d_offsets.size(), handle.get_stream()); + handle.sync_stream(); - thrust::upper_bound(handle.get_thrust_policy(), - sorted_local_vertex_degrees.begin(), - sorted_local_vertex_degrees.end(), - d_thresholds.begin(), - d_thresholds.end(), - d_segment_offsets.begin() + 1, - thrust::greater{}); - - std::vector h_segment_offsets(d_segment_offsets.size()); - raft::update_host(h_segment_offsets.data(), - d_segment_offsets.data(), - d_segment_offsets.size(), - handle.get_stream()); - handle.sync_stream(); + auto num_segments_per_vertex_partition = + detail::num_sparse_segments_per_vertex_partition + + (hypersparse_degree_threshold > 1 ? 
size_t{2} : size_t{1}); // last is 0-degree segment + h_segment_offsets.resize(num_segments_per_vertex_partition + 1); + std::copy(h_offsets.begin(), + h_offsets.begin() + num_sparse_segments_per_vertex_partition + 1, + h_segment_offsets.begin()); + *(h_segment_offsets.rbegin()) = *(h_offsets.rbegin()); + if (hypersparse_degree_threshold > 1) { + *(h_segment_offsets.rbegin() + 1) = *(h_offsets.rbegin() + 1); + + h_hypersparse_degree_offsets = std::vector(hypersparse_degree_threshold); + std::copy(h_offsets.begin() + num_sparse_segments_per_vertex_partition, + h_offsets.begin() + num_sparse_segments_per_vertex_partition + + (hypersparse_degree_threshold - 1), + (*h_hypersparse_degree_offsets).begin()); + auto shift = (*h_hypersparse_degree_offsets)[0]; + std::transform((*h_hypersparse_degree_offsets).begin(), + (*h_hypersparse_degree_offsets).end(), + (*h_hypersparse_degree_offsets).begin(), + [shift](auto offset) { return offset - shift; }); + *((*h_hypersparse_degree_offsets).rbegin()) = *(h_offsets.rbegin() + 1); + } + } - return std::make_tuple( - std::move(sorted_local_vertices), h_segment_offsets, *locally_unused_vertex_id); + return std::make_tuple(std::move(sorted_local_vertices), + h_segment_offsets, + h_hypersparse_degree_offsets, + *locally_unused_vertex_id); } template @@ -789,32 +900,28 @@ void expensive_check_edgelist( } template -std::vector aggregate_segment_offsets(raft::handle_t const& handle, - std::vector const& segment_offsets) +std::vector aggregate_offset_vectors(raft::handle_t const& handle, + std::vector const& offsets) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); - rmm::device_uvector d_segment_offsets(segment_offsets.size(), handle.get_stream()); - raft::update_device( - d_segment_offsets.data(), segment_offsets.data(), segment_offsets.size(), handle.get_stream()); - rmm::device_uvector d_aggregate_segment_offsets( - minor_comm_size * d_segment_offsets.size(), handle.get_stream()); - minor_comm.allgather(d_segment_offsets.data(), - d_aggregate_segment_offsets.data(), - d_segment_offsets.size(), - handle.get_stream()); - - std::vector h_aggregate_segment_offsets(d_aggregate_segment_offsets.size(), - vertex_t{0}); - raft::update_host(h_aggregate_segment_offsets.data(), - d_aggregate_segment_offsets.data(), - d_aggregate_segment_offsets.size(), + rmm::device_uvector d_offsets(offsets.size(), handle.get_stream()); + raft::update_device(d_offsets.data(), offsets.data(), offsets.size(), handle.get_stream()); + rmm::device_uvector d_aggregate_offset_vectors(minor_comm_size * d_offsets.size(), + handle.get_stream()); + minor_comm.allgather( + d_offsets.data(), d_aggregate_offset_vectors.data(), d_offsets.size(), handle.get_stream()); + + std::vector h_aggregate_offset_vectors(d_aggregate_offset_vectors.size(), vertex_t{0}); + raft::update_host(h_aggregate_offset_vectors.data(), + d_aggregate_offset_vectors.data(), + d_aggregate_offset_vectors.size(), handle.get_stream()); handle.sync_stream(); // this is necessary as h_aggregate_offsets can be used right after return. 
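// illustrative example (hypothetical values): with minor_comm_size = 2 and per-rank offset
// vectors {0, 3, 5} and {0, 4, 6}, the allgather above yields {0, 3, 5, 0, 4, 6}, i.e. the
// per-rank offset vectors concatenated in minor_comm rank order.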
- return h_aggregate_segment_offsets; + return h_aggregate_offset_vectors; } } // namespace detail @@ -857,10 +964,10 @@ renumber_edgelist( (*edgelist_intra_partition_segment_offsets).size() == static_cast(minor_comm_size), "Invalid input arguments: erroneous (*edgelist_intra_partition_segment_offsets).size()."); for (size_t i = 0; i < edgelist_majors.size(); ++i) { - CUGRAPH_EXPECTS( - (*edgelist_intra_partition_segment_offsets)[i].size() == - static_cast(major_comm_size + 1), - "Invalid input arguments: erroneous (*edgelist_intra_partition_segment_offsets)[].size()."); + CUGRAPH_EXPECTS((*edgelist_intra_partition_segment_offsets)[i].size() == + static_cast(major_comm_size + 1), + "Invalid input arguments: erroneous " + "(*edgelist_intra_partition_segment_offsets)[].size()."); CUGRAPH_EXPECTS( std::is_sorted((*edgelist_intra_partition_segment_offsets)[i].begin(), (*edgelist_intra_partition_segment_offsets)[i].end()), @@ -868,8 +975,8 @@ renumber_edgelist( CUGRAPH_EXPECTS( ((*edgelist_intra_partition_segment_offsets)[i][0] == 0) && ((*edgelist_intra_partition_segment_offsets)[i].back() == edgelist_edge_counts[i]), - "Invalid input arguments: (*edgelist_intra_partition_segment_offsets)[][0] should be 0 and " - "(*edgelist_intra_partition_segment_offsets)[].back() should coincide with " + "Invalid input arguments: (*edgelist_intra_partition_segment_offsets)[][0] should be 0 " + "and (*edgelist_intra_partition_segment_offsets)[].back() should coincide with " "edgelist_edge_counts[]."); } } @@ -893,7 +1000,10 @@ renumber_edgelist( // 1. compute renumber map - auto [renumber_map_labels, vertex_partition_segment_offsets, locally_unused_vertex_id] = + auto [renumber_map_labels, + vertex_partition_segment_offsets, + vertex_partition_hypersparse_degree_offsets, + locally_unused_vertex_id] = detail::compute_renumber_map(handle, std::move(local_vertices), edgelist_const_majors, @@ -966,11 +1076,16 @@ renumber_edgelist( } } - if ((static_cast(partition.local_edge_partition_minor_range_size() * - 2.5 /* tuning parameter */) >= - static_cast(number_of_edges / comm_size)) && - edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) - // part than the O(E/P) part + double approx_mem_requirements = + static_cast(partition.local_edge_partition_minor_range_size()) * + (static_cast( + sizeof(vertex_t)) /* rmm::device_uvector renumber_map_minor_labels */ + + + static_cast(sizeof(vertex_t) * 2) * + 2.5 /* kv_store_t renumber_map, * 2.5 to consider load factor */); + if ((approx_mem_requirements > + static_cast(handle.get_device_properties().totalGlobalMem) * 0.05) && + edgelist_intra_partition_segment_offsets) { vertex_t max_segment_size{0}; for (int i = 0; i < major_comm_size; ++i) { auto minor_range_vertex_partition_id = @@ -1020,10 +1135,10 @@ renumber_edgelist( recvcounts[i] = partition.vertex_partition_range_size(minor_range_vertex_partition_id); } std::vector displacements(recvcounts.size(), 0); - std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + std::exclusive_scan(recvcounts.begin(), recvcounts.end(), displacements.begin(), size_t{0}); device_allgatherv(major_comm, - renumber_map_labels.begin(), - renumber_map_minor_labels.begin(), + renumber_map_labels.data(), + renumber_map_minor_labels.data(), recvcounts, displacements, handle.get_stream()); @@ -1045,12 +1160,20 @@ renumber_edgelist( } auto edge_partition_segment_offsets = - detail::aggregate_segment_offsets(handle, vertex_partition_segment_offsets); + 
detail::aggregate_offset_vectors(handle, vertex_partition_segment_offsets); + auto edge_partition_hypersparse_degree_offsets = + vertex_partition_hypersparse_degree_offsets + ? std::make_optional( + detail::aggregate_offset_vectors(handle, *vertex_partition_hypersparse_degree_offsets)) + : std::nullopt; return std::make_tuple( std::move(renumber_map_labels), - renumber_meta_t{ - number_of_vertices, number_of_edges, partition, edge_partition_segment_offsets}); + renumber_meta_t{number_of_vertices, + number_of_edges, + partition, + edge_partition_segment_offsets, + edge_partition_hypersparse_degree_offsets}); } template @@ -1078,7 +1201,10 @@ renumber_edgelist(raft::handle_t const& handle, std::nullopt); } - auto [renumber_map_labels, segment_offsets, locally_unused_vertex_id] = + auto [renumber_map_labels, + segment_offsets, + hypersparse_degree_offsets, + locally_unused_vertex_id] = detail::compute_renumber_map( handle, std::move(vertices), @@ -1099,8 +1225,9 @@ renumber_edgelist(raft::handle_t const& handle, renumber_map_view.find( edgelist_minors, edgelist_minors + num_edgelist_edges, edgelist_minors, handle.get_stream()); - return std::make_tuple(std::move(renumber_map_labels), - renumber_meta_t{segment_offsets}); + return std::make_tuple( + std::move(renumber_map_labels), + renumber_meta_t{segment_offsets, hypersparse_degree_offsets}); } } // namespace cugraph diff --git a/cpp/src/structure/renumber_utils_impl.cuh b/cpp/src/structure/renumber_utils_impl.cuh index 3efa58d963..69c7c556bd 100644 --- a/cpp/src/structure/renumber_utils_impl.cuh +++ b/cpp/src/structure/renumber_utils_impl.cuh @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -363,7 +364,7 @@ void renumber_ext_vertices(raft::handle_t const& handle, } std::unique_ptr> renumber_map_ptr{nullptr}; - if (multi_gpu) { + if constexpr (multi_gpu) { auto& comm = handle.get_comms(); auto const comm_size = comm.get_size(); auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); @@ -402,11 +403,12 @@ void renumber_ext_vertices(raft::handle_t const& handle, rmm::device_uvector int_vertices_for_sorted_unique_ext_vertices(0, handle.get_stream()); auto [unique_ext_vertices, int_vertices_for_unique_ext_vertices] = - collect_values_for_unique_keys(handle, + collect_values_for_unique_keys(comm, local_renumber_map.view(), std::move(sorted_unique_ext_vertices), detail::compute_gpu_id_from_ext_vertex_t{ - comm_size, major_comm_size, minor_comm_size}); + comm_size, major_comm_size, minor_comm_size}, + handle.get_stream()); renumber_map_ptr = std::make_unique>( unique_ext_vertices.begin(), @@ -573,7 +575,6 @@ void unrenumber_int_vertices(raft::handle_t const& handle, auto local_int_vertex_first = vertex_partition_id == 0 ? 
vertex_t{0} : vertex_partition_range_lasts[vertex_partition_id - 1]; - auto local_int_vertex_last = vertex_partition_range_lasts[vertex_partition_id]; rmm::device_uvector sorted_unique_int_vertices(num_vertices, handle.get_stream()); sorted_unique_int_vertices.resize( @@ -595,16 +596,20 @@ void unrenumber_int_vertices(raft::handle_t const& handle, sorted_unique_int_vertices.end())), handle.get_stream()); - auto [unique_int_vertices, ext_vertices_for_unique_int_vertices] = - collect_values_for_unique_int_vertices(handle, - std::move(sorted_unique_int_vertices), - renumber_map_labels, - vertex_partition_range_lasts); + auto ext_vertices_for_sorted_unique_int_vertices = + collect_values_for_sorted_unique_int_vertices( + comm, + raft::device_span(sorted_unique_int_vertices.data(), + sorted_unique_int_vertices.size()), + renumber_map_labels, + vertex_partition_range_lasts, + local_int_vertex_first, + handle.get_stream()); kv_store_t renumber_map( - unique_int_vertices.begin(), - unique_int_vertices.begin() + unique_int_vertices.size(), - ext_vertices_for_unique_int_vertices.begin(), + sorted_unique_int_vertices.begin(), + sorted_unique_int_vertices.end(), + ext_vertices_for_sorted_unique_int_vertices.begin(), invalid_vertex_id::value, invalid_vertex_id::value, handle.get_stream()); diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 8a18dedd2a..ba40db1f08 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -16,8 +16,9 @@ #pragma once #include "prims/fill_edge_src_dst_property.cuh" +#include "prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh" #include "prims/reduce_op.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -51,6 +52,24 @@ namespace cugraph { namespace { +template +struct direction_optimizing_info_t { + rmm::device_uvector + approx_out_degrees; // if graph_view.local_vertex_partition_segment_offsets().has_value() is + // true, holds approximate degrees only for the high and mid degree + // segments; otherwise, exact + rmm::device_uvector visited_bitmap; + std::optional> nzd_unvisited_vertices{ + std::nullopt}; // valid only during bottom-up iterations + std::optional num_nzd_unvisited_low_degree_vertices{ + std::nullopt}; // to decide between topdown vs bottomup, relevant only when + // graph_view.local_vertex_partition_segment_offsets().has_value() is true + std::optional num_nzd_unvisited_hypersparse_vertices{ + std::nullopt}; // to decide between topdown vs bottomup, relevant only when + // graph_view.local_vertex_partition_segment_offsets().has_value() && + // graph_view.use_dcs() are both true +}; + template struct topdown_e_op_t { detail::edge_partition_endpoint_property_device_view_t @@ -69,18 +88,25 @@ struct topdown_e_op_t { } }; -template +template struct bottomup_e_op_t { - detail::edge_partition_endpoint_property_device_view_t + __device__ vertex_t operator()( + vertex_t src, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const + { + return dst; + } +}; + +template +struct bottomup_pred_op_t { + detail::edge_partition_endpoint_property_device_view_t prev_visited_flags{}; // visited in the previous iterations vertex_t dst_first{}; - __device__ thrust::optional operator()( + __device__ bool operator()( vertex_t src, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const { - auto dst_offset = 
dst - dst_first; - auto old = prev_visited_flags.get(dst_offset); - return old ? thrust::optional{dst} : thrust::nullopt; + return prev_visited_flags.get(dst - dst_first); } }; @@ -144,14 +170,27 @@ void bfs(raft::handle_t const& handle, if constexpr (GraphViewType::is_multi_gpu) { is_sorted = static_cast(host_scalar_allreduce(handle.get_comms(), static_cast(is_sorted), - raft::comms::op_t::SUM, + raft::comms::op_t::MIN, handle.get_stream())); } - CUGRAPH_EXPECTS( is_sorted, "Invalid input arguments: input sources should be sorted in the non-descending order."); + bool no_duplicates = (static_cast(thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(n_sources), + is_first_in_run_t{sources})) == n_sources); + if constexpr (GraphViewType::is_multi_gpu) { + no_duplicates = static_cast(host_scalar_allreduce(handle.get_comms(), + static_cast(no_duplicates), + raft::comms::op_t::MIN, + handle.get_stream())); + } + CUGRAPH_EXPECTS(no_duplicates, + "Invalid input arguments: input sources should not have duplicates."); + auto num_invalid_vertices = thrust::count_if(handle.get_thrust_policy(), sources, @@ -189,34 +228,119 @@ void bfs(raft::handle_t const& handle, // 3. update meta data for direction optimizing BFS - constexpr edge_t direction_optimizing_alpha = 14; - constexpr vertex_t direction_optimizing_beta = 24; + auto segment_offsets = graph_view.local_vertex_partition_segment_offsets(); + + double direction_optimizing_alpha = + (graph_view.number_of_vertices() > 0) + ? ((static_cast(graph_view.compute_number_of_edges(handle)) / + static_cast(graph_view.number_of_vertices())) * + (1.0 / 3.75) /* tuning parametger */) + : double{1.0}; + constexpr vertex_t direction_optimizing_beta = 24; // tuning parameter - std::optional> out_degrees{std::nullopt}; - std::optional> nzd_unvisited_vertices{std::nullopt}; + std::optional> aux_info{std::nullopt}; if (direction_optimizing) { - out_degrees = graph_view.compute_out_degrees(handle); - nzd_unvisited_vertices = rmm::device_uvector( - graph_view.local_vertex_partition_range_size(), handle.get_stream()); - (*nzd_unvisited_vertices) - .resize(thrust::distance( - (*nzd_unvisited_vertices).begin(), - thrust::copy_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), - thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last()), - (*nzd_unvisited_vertices).begin(), - [vertex_partition, - sources = raft::device_span(sources, n_sources), - out_degrees = raft::device_span( - (*out_degrees).data(), (*out_degrees).size())] __device__(vertex_t v) { - auto v_offset = - vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - return (out_degrees[v_offset] > edge_t{0}) && - !thrust::binary_search(thrust::seq, sources.begin(), sources.end(), v); - })), - handle.get_stream()); - (*nzd_unvisited_vertices).shrink_to_fit(handle.get_stream()); + rmm::device_uvector approx_out_degrees(0, handle.get_stream()); + if (segment_offsets) { // exploit internal knowedge for exhaustive performance optimization for + // large-scale benchmarking (the else path is sufficient for small + // clusters with few tens of GPUs) + size_t partition_idx{0}; + size_t partition_size{1}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + 
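// descriptive note: sample a single edge partition (indexed by this GPU's minor_comm rank)
// and later scale its local degrees by minor_comm_size ("local_degrees => approximate global
// degrees" below); this approximation avoids the exact, all-partition
// graph_view.compute_out_degrees() call used in the else branch.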
partition_idx = static_cast(minor_comm_rank); + partition_size = static_cast(minor_comm_size); + } + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto edge_mask_view = graph_view.edge_mask_view(); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, partition_idx) + : thrust::nullopt; + auto high_and_mid_degree_segment_size = + (*segment_offsets)[2]; // compute local degrees for high & mid degree segments only, for + // low & hypersparse segments, use low_degree_threshold * + // partition_size * 0.5 & partition_size * + // hypersparse_threshold_ratio * 0.5 as approximate out degrees + if (edge_partition_e_mask) { + approx_out_degrees = edge_partition.compute_local_degrees_with_mask( + (*edge_partition_e_mask).value_first(), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()) + + high_and_mid_degree_segment_size, + handle.get_stream()); + } else { + approx_out_degrees = edge_partition.compute_local_degrees( + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()) + + high_and_mid_degree_segment_size, + handle.get_stream()); + } + thrust::transform(handle.get_thrust_policy(), + approx_out_degrees.begin(), + approx_out_degrees.end(), + approx_out_degrees.begin(), + multiplier_t{static_cast( + partition_size)}); // local_degrees => approximate global degrees + } else { + approx_out_degrees = graph_view.compute_out_degrees(handle); // exact + } + + rmm::device_uvector visited_bitmap( + packed_bool_size(graph_view.local_vertex_partition_range_size()), handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + visited_bitmap.begin(), + visited_bitmap.end(), + packed_bool_empty_mask()); + thrust::for_each( + handle.get_thrust_policy(), + sources, + sources + n_sources, + [bitmap = raft::device_span(visited_bitmap.data(), visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(v_offset)]); + word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + }); + + std::optional num_nzd_unvisited_low_degree_vertices{std::nullopt}; + std::optional num_nzd_unvisited_hypersparse_vertices{std::nullopt}; + if (segment_offsets) { + num_nzd_unvisited_low_degree_vertices = (*segment_offsets)[3] - (*segment_offsets)[2]; + if (graph_view.use_dcs()) { + num_nzd_unvisited_hypersparse_vertices = (*segment_offsets)[4] - (*segment_offsets)[3]; + } + if (n_sources > 0) { + std::vector h_sources(n_sources); + raft::update_host(h_sources.data(), sources, n_sources, handle.get_stream()); + handle.sync_stream(); + for (size_t i = 0; i < h_sources.size(); ++i) { + auto v_offset = h_sources[i] - graph_view.local_vertex_partition_range_first(); + if ((v_offset >= (*segment_offsets)[2]) && (v_offset < (*segment_offsets)[3])) { + --(*num_nzd_unvisited_low_degree_vertices); + } else if (graph_view.use_dcs()) { + if ((v_offset >= (*segment_offsets)[3]) && (v_offset < (*segment_offsets)[4])) { + --(*num_nzd_unvisited_hypersparse_vertices); + } + } + } + } + } + + aux_info = + direction_optimizing_info_t{std::move(approx_out_degrees), + std::move(visited_bitmap), + std::nullopt, + 
num_nzd_unvisited_low_degree_vertices, + num_nzd_unvisited_hypersparse_vertices}; } // 4. initialize BFS frontier @@ -237,7 +361,6 @@ void bfs(raft::handle_t const& handle, handle, graph_view); // this may mark some vertices visited in previous iterations as unvisited // (but this is OK as we check prev_dst_visited_flags first) fill_edge_dst_property(handle, graph_view, dst_visited_flags.mutable_view(), false); - fill_edge_dst_property(handle, graph_view, vertex_frontier.bucket(bucket_idx_cur).begin(), @@ -247,12 +370,12 @@ void bfs(raft::handle_t const& handle, // 4. BFS iteration vertex_t depth{0}; - bool top_down = true; - auto cur_aggregate_vertex_frontier_size = + bool topdown = true; + auto cur_aggregate_frontier_size = static_cast(vertex_frontier.bucket(bucket_idx_cur).aggregate_size()); while (true) { - vertex_t next_aggregate_vertex_frontier_size{}; - if (top_down) { + vertex_t next_aggregate_frontier_size{}; + if (topdown) { topdown_e_op_t e_op{}; e_op.prev_visited_flags = detail::edge_partition_endpoint_property_device_view_t( @@ -263,14 +386,15 @@ void bfs(raft::handle_t const& handle, e_op.dst_first = graph_view.local_edge_partition_dst_range_first(); auto [new_frontier_vertex_buffer, predecessor_buffer] = - transform_reduce_v_frontier_outgoing_e_by_dst(handle, - graph_view, - vertex_frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - e_op, - reduce_op::any()); + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op, + reduce_op::any()); auto input_pair_first = thrust::make_zip_iterator(thrust::make_constant_iterator(depth + 1), predecessor_buffer.begin()); @@ -286,9 +410,9 @@ void bfs(raft::handle_t const& handle, key_bucket_t( handle, std::move(new_frontier_vertex_buffer)); - next_aggregate_vertex_frontier_size = + next_aggregate_frontier_size = static_cast(vertex_frontier.bucket(bucket_idx_next).aggregate_size()); - if (next_aggregate_vertex_frontier_size == 0) { break; } + if (next_aggregate_frontier_size == 0) { break; } fill_edge_dst_property(handle, graph_view, @@ -298,65 +422,146 @@ void bfs(raft::handle_t const& handle, true); if (direction_optimizing) { - auto m_f = thrust::transform_reduce( - handle.get_thrust_policy(), - vertex_frontier.bucket(bucket_idx_next).begin(), - vertex_frontier.bucket(bucket_idx_next).end(), - cuda::proclaim_return_type( - [vertex_partition, - out_degrees = raft::device_span( - (*out_degrees).data(), (*out_degrees).size())] __device__(vertex_t v) { - auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - return out_degrees[v_offset]; - }), - edge_t{0}, - thrust::plus{}); + if (vertex_frontier.bucket(bucket_idx_next).size() > 0) { + thrust::for_each( + handle.get_thrust_policy(), + vertex_frontier.bucket(bucket_idx_next).begin(), + vertex_frontier.bucket(bucket_idx_next).end(), + [bitmap = raft::device_span((*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(v_offset)]); + word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + }); + } + double m_f{0.0}; + double m_u{0.0}; { - rmm::device_uvector 
tmp_vertices((*nzd_unvisited_vertices).size(), - handle.get_stream()); - tmp_vertices.resize( - thrust::distance(tmp_vertices.begin(), - thrust::set_difference(handle.get_thrust_policy(), - (*nzd_unvisited_vertices).begin(), - (*nzd_unvisited_vertices).end(), - vertex_frontier.bucket(bucket_idx_next).begin(), - vertex_frontier.bucket(bucket_idx_next).end(), - tmp_vertices.begin())), - handle.get_stream()); - nzd_unvisited_vertices = std::move(tmp_vertices); + size_t partition_size{1}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + partition_size = static_cast(minor_comm_size); + } + + auto f_vertex_first = vertex_frontier.bucket(bucket_idx_next).begin(); + auto f_vertex_last = vertex_frontier.bucket(bucket_idx_next).end(); + + if (segment_offsets) { + // FIXME: this actually over-estimates for graphs with power-law degree distribution + auto approx_low_segment_degree = + static_cast(low_degree_threshold * partition_size) * 0.5; + auto approx_hypersparse_segment_degree = + static_cast(partition_size) * hypersparse_threshold_ratio * 0.5; + auto f_segment_offsets = compute_key_segment_offsets( + vertex_frontier.bucket(bucket_idx_next).begin(), + vertex_frontier.bucket(bucket_idx_next).end(), + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); + *((*aux_info).num_nzd_unvisited_low_degree_vertices) -= + (f_segment_offsets[3] - f_segment_offsets[2]); + if (graph_view.use_dcs()) { + *((*aux_info).num_nzd_unvisited_hypersparse_vertices) -= + (f_segment_offsets[4] - f_segment_offsets[3]); + } + f_vertex_last = f_vertex_first + f_segment_offsets[2]; + m_f = static_cast((f_segment_offsets[3] - f_segment_offsets[2])) * + approx_low_segment_degree; + if (graph_view.use_dcs()) { + m_f += static_cast(f_segment_offsets[4] - f_segment_offsets[3]) * + approx_hypersparse_segment_degree; + } + + m_u = static_cast(*((*aux_info).num_nzd_unvisited_low_degree_vertices)) * + approx_low_segment_degree; + if (graph_view.use_dcs()) { + m_u += static_cast(*((*aux_info).num_nzd_unvisited_hypersparse_vertices)) * + approx_hypersparse_segment_degree; + } + } + + m_f += static_cast(thrust::transform_reduce( + handle.get_thrust_policy(), + f_vertex_first, + f_vertex_last, + cuda::proclaim_return_type( + [out_degrees = raft::device_span((*aux_info).approx_out_degrees.data(), + (*aux_info).approx_out_degrees.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(vertex_t v) { + auto v_offset = v - v_first; + return out_degrees[v_offset]; + }), + edge_t{0}, + thrust::plus{})); + + m_u += static_cast(thrust::transform_reduce( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(segment_offsets + ? 
(*segment_offsets)[2] + : graph_view.local_vertex_partition_range_size()), + cuda::proclaim_return_type( + [out_degrees = raft::device_span((*aux_info).approx_out_degrees.data(), + (*aux_info).approx_out_degrees.size()), + bitmap = raft::device_span( + (*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size())] __device__(vertex_t v_offset) { + auto word = bitmap[packed_bool_offset(v_offset)]; + if ((word & packed_bool_mask(v_offset)) != packed_bool_empty_mask()) { // visited + return edge_t{0}; + } else { + return out_degrees[v_offset]; + } + }), + edge_t{0}, + thrust::plus{})); } - auto m_u = thrust::transform_reduce( - handle.get_thrust_policy(), - (*nzd_unvisited_vertices).begin(), - (*nzd_unvisited_vertices).end(), - cuda::proclaim_return_type( - [vertex_partition, - out_degrees = raft::device_span( - (*out_degrees).data(), (*out_degrees).size())] __device__(vertex_t v) { - auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - return out_degrees[v_offset]; - }), - edge_t{0}, - thrust::plus{}); - auto aggregate_m_f = - GraphViewType::is_multi_gpu - ? host_scalar_allreduce( - handle.get_comms(), m_f, raft::comms::op_t::SUM, handle.get_stream()) - : m_f; - auto aggregate_m_u = - GraphViewType::is_multi_gpu - ? host_scalar_allreduce( - handle.get_comms(), m_u, raft::comms::op_t::SUM, handle.get_stream()) - : m_u; + auto aggregate_m_f = m_f; + auto aggregate_m_u = m_u; + if constexpr (GraphViewType::is_multi_gpu) { + auto tmp = host_scalar_allreduce(handle.get_comms(), + thrust::make_tuple(m_f, m_u), + raft::comms::op_t::SUM, + handle.get_stream()); + aggregate_m_f = thrust::get<0>(tmp); + aggregate_m_u = thrust::get<1>(tmp); + } if ((aggregate_m_f * direction_optimizing_alpha > aggregate_m_u) && - (next_aggregate_vertex_frontier_size >= cur_aggregate_vertex_frontier_size)) { - top_down = false; + (next_aggregate_frontier_size >= cur_aggregate_frontier_size)) { + topdown = false; + (*aux_info).nzd_unvisited_vertices = rmm::device_uvector( + segment_offsets ? *((*segment_offsets).rbegin() + 1) + : graph_view.local_vertex_partition_range_size(), + handle.get_stream()); + (*((*aux_info).nzd_unvisited_vertices)) + .resize( + thrust::distance( + (*((*aux_info).nzd_unvisited_vertices)).begin(), + thrust::copy_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), + thrust::make_counting_iterator( + segment_offsets ? 
graph_view.local_vertex_partition_range_first() + + *((*segment_offsets).rbegin() + 1) + : graph_view.local_vertex_partition_range_last()), + (*((*aux_info).nzd_unvisited_vertices)).begin(), + [bitmap = raft::device_span((*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + auto word = bitmap[packed_bool_offset(v_offset)]; + return ((word & packed_bool_mask(v_offset)) == packed_bool_empty_mask()); + })), + handle.get_stream()); } } - if (top_down) { // staying in top-down + if (topdown) { // staying in top-down vertex_frontier.bucket(bucket_idx_cur) = key_bucket_t(handle); vertex_frontier.swap_buckets(bucket_idx_cur, bucket_idx_next); @@ -364,63 +569,122 @@ void bfs(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_cur) = key_bucket_t( handle, - raft::device_span((*nzd_unvisited_vertices).data(), - (*nzd_unvisited_vertices).size())); + raft::device_span((*((*aux_info).nzd_unvisited_vertices)).data(), + (*((*aux_info).nzd_unvisited_vertices)).size())); vertex_frontier.bucket(bucket_idx_next) = key_bucket_t(handle); } } else { // bottom up - bottomup_e_op_t e_op{}; - e_op.prev_visited_flags = - detail::edge_partition_endpoint_property_device_view_t( - prev_dst_visited_flags.mutable_view()); - e_op.dst_first = graph_view.local_edge_partition_dst_range_first(); - auto [new_frontier_vertex_buffer, predecessor_buffer] = - transform_reduce_v_frontier_outgoing_e_by_src(handle, - graph_view, - vertex_frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - e_op, - reduce_op::any()); + rmm::device_uvector new_frontier_vertex_buffer(0, handle.get_stream()); + { + bottomup_e_op_t e_op{}; + bottomup_pred_op_t pred_op{}; + pred_op.prev_visited_flags = + detail::edge_partition_endpoint_property_device_view_t( + prev_dst_visited_flags.view()); + pred_op.dst_first = graph_view.local_edge_partition_dst_range_first(); + + rmm::device_uvector predecessor_buffer( + vertex_frontier.bucket(bucket_idx_cur).size(), handle.get_stream()); + per_v_transform_reduce_if_outgoing_e(handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op, + invalid_vertex, + reduce_op::any(), + pred_op, + predecessor_buffer.begin(), + true); + auto input_pair_first = thrust::make_zip_iterator(thrust::make_constant_iterator(depth + 1), + predecessor_buffer.begin()); + + // FIXME: this scatter_if and the resize below can be concurrently executed. 
+ thrust::scatter_if( + handle.get_thrust_policy(), + input_pair_first, + input_pair_first + predecessor_buffer.size(), + thrust::make_transform_iterator( + vertex_frontier.bucket(bucket_idx_cur).cbegin(), + detail::shift_left_t{graph_view.local_vertex_partition_range_first()}), + predecessor_buffer.begin(), + thrust::make_zip_iterator(distances, predecessor_first), + detail::is_not_equal_t{invalid_vertex}); + + new_frontier_vertex_buffer.resize(predecessor_buffer.size(), handle.get_stream()); + new_frontier_vertex_buffer.resize( + thrust::distance(new_frontier_vertex_buffer.begin(), + thrust::copy_if(handle.get_thrust_policy(), + vertex_frontier.bucket(bucket_idx_cur).cbegin(), + vertex_frontier.bucket(bucket_idx_cur).cend(), + predecessor_buffer.begin(), + new_frontier_vertex_buffer.begin(), + detail::is_not_equal_t{invalid_vertex})), + handle.get_stream()); - auto input_pair_first = thrust::make_zip_iterator(thrust::make_constant_iterator(depth + 1), - predecessor_buffer.begin()); - thrust::scatter( - handle.get_thrust_policy(), - input_pair_first, - input_pair_first + new_frontier_vertex_buffer.size(), - thrust::make_transform_iterator( + assert(direction_optimizing); + + thrust::for_each( + handle.get_thrust_policy(), new_frontier_vertex_buffer.begin(), - detail::shift_left_t{graph_view.local_vertex_partition_range_first()}), - thrust::make_zip_iterator(distances, predecessor_first)); + new_frontier_vertex_buffer.end(), + [bitmap = raft::device_span((*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(v_offset)]); + word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + }); + (*((*aux_info).nzd_unvisited_vertices)) + .resize( + thrust::distance( + (*((*aux_info).nzd_unvisited_vertices)).begin(), + thrust::remove_if( + handle.get_thrust_policy(), + (*((*aux_info).nzd_unvisited_vertices)).begin(), + (*((*aux_info).nzd_unvisited_vertices)).end(), + [bitmap = raft::device_span((*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + auto word = bitmap[packed_bool_offset(v_offset)]; + return ((word & packed_bool_mask(v_offset)) != packed_bool_empty_mask()); + })), + handle.get_stream()); - assert(direction_optimizing); + if (segment_offsets) { + auto key_segment_offsets = compute_key_segment_offsets( + new_frontier_vertex_buffer.begin(), + new_frontier_vertex_buffer.end(), + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); + *((*aux_info).num_nzd_unvisited_low_degree_vertices) -= + key_segment_offsets[3] - key_segment_offsets[2]; + if (graph_view.use_dcs()) { + *((*aux_info).num_nzd_unvisited_hypersparse_vertices) -= + key_segment_offsets[4] - key_segment_offsets[3]; + } + } + } - { - rmm::device_uvector tmp_vertices((*nzd_unvisited_vertices).size(), - handle.get_stream()); - tmp_vertices.resize( - thrust::distance(tmp_vertices.begin(), - thrust::set_difference(handle.get_thrust_policy(), - (*nzd_unvisited_vertices).begin(), - (*nzd_unvisited_vertices).end(), - new_frontier_vertex_buffer.begin(), - new_frontier_vertex_buffer.end(), - tmp_vertices.begin())), + next_aggregate_frontier_size = static_cast(new_frontier_vertex_buffer.size()); + auto 
aggregate_nzd_unvisited_vertices = + static_cast((*((*aux_info).nzd_unvisited_vertices)).size()); + if constexpr (GraphViewType::is_multi_gpu) { + auto tmp = host_scalar_allreduce( + handle.get_comms(), + thrust::make_tuple(next_aggregate_frontier_size, aggregate_nzd_unvisited_vertices), + raft::comms::op_t::SUM, handle.get_stream()); - nzd_unvisited_vertices = std::move(tmp_vertices); + next_aggregate_frontier_size = thrust::get<0>(tmp); + aggregate_nzd_unvisited_vertices = thrust::get<1>(tmp); } - next_aggregate_vertex_frontier_size = - GraphViewType::is_multi_gpu - ? host_scalar_allreduce(handle.get_comms(), - static_cast(new_frontier_vertex_buffer.size()), - raft::comms::op_t::SUM, - handle.get_stream()) - : static_cast(new_frontier_vertex_buffer.size()); - if (next_aggregate_vertex_frontier_size == 0) { break; } + if (next_aggregate_frontier_size == 0) { break; } fill_edge_dst_property(handle, graph_view, @@ -429,21 +693,13 @@ void bfs(raft::handle_t const& handle, prev_dst_visited_flags.mutable_view(), true); - auto aggregate_nzd_unvisted_vertices = - GraphViewType::is_multi_gpu - ? host_scalar_allreduce(handle.get_comms(), - static_cast((*nzd_unvisited_vertices).size()), - raft::comms::op_t::SUM, - handle.get_stream()) - : static_cast((*nzd_unvisited_vertices).size()); - - if ((next_aggregate_vertex_frontier_size * direction_optimizing_beta < - aggregate_nzd_unvisted_vertices) && - (next_aggregate_vertex_frontier_size < cur_aggregate_vertex_frontier_size)) { - top_down = true; + if ((next_aggregate_frontier_size * direction_optimizing_beta < + aggregate_nzd_unvisited_vertices) && + (next_aggregate_frontier_size < cur_aggregate_frontier_size)) { + topdown = true; } - if (top_down) { // swithcing to top-down + if (topdown) { // swithcing to top-down vertex_frontier.bucket(bucket_idx_cur) = key_bucket_t( handle, std::move(new_frontier_vertex_buffer)); @@ -451,11 +707,11 @@ void bfs(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_cur) = key_bucket_t( handle, - raft::device_span((*nzd_unvisited_vertices).data(), - (*nzd_unvisited_vertices).size())); + raft::device_span((*((*aux_info).nzd_unvisited_vertices)).data(), + ((*(*aux_info).nzd_unvisited_vertices)).size())); } } - cur_aggregate_vertex_frontier_size = next_aggregate_vertex_frontier_size; + cur_aggregate_frontier_size = next_aggregate_frontier_size; depth++; if (depth >= depth_limit) { break; } diff --git a/cpp/src/traversal/extract_bfs_paths_impl.cuh b/cpp/src/traversal/extract_bfs_paths_impl.cuh index 40030e2e39..d228460bec 100644 --- a/cpp/src/traversal/extract_bfs_paths_impl.cuh +++ b/cpp/src/traversal/extract_bfs_paths_impl.cuh @@ -220,11 +220,15 @@ std::tuple, vertex_t> extract_bfs_paths( detail::decrement_position{}); if constexpr (multi_gpu) { - current_frontier = collect_values_for_int_vertices(handle, - current_frontier.begin(), - current_frontier.end(), - predecessors, - h_vertex_partition_range_lasts); + auto& comm = handle.get_comms(); + current_frontier = + collect_values_for_int_vertices(comm, + current_frontier.begin(), + current_frontier.end(), + predecessors, + h_vertex_partition_range_lasts, + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); } else { thrust::transform(handle.get_thrust_policy(), current_frontier.begin(), diff --git a/cpp/src/traversal/k_hop_nbrs_impl.cuh b/cpp/src/traversal/k_hop_nbrs_impl.cuh index acf3cfe8fc..44fa21a525 100644 --- a/cpp/src/traversal/k_hop_nbrs_impl.cuh +++ b/cpp/src/traversal/k_hop_nbrs_impl.cuh @@ -16,7 +16,7 @@ #pragma once #include 
"prims/reduce_op.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/vertex_frontier.cuh" #include @@ -147,15 +147,15 @@ k_hop_nbrs(raft::handle_t const& handle, rmm::device_uvector nbrs(0, handle.get_stream()); for (size_t iter = 0; iter < k; ++iter) { auto new_frontier_key_buffer = - transform_reduce_v_frontier_outgoing_e_by_dst(handle, - push_graph_view, - frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - e_op_t{}, - reduce_op::null{}, - do_expensive_check); + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst(handle, + push_graph_view, + frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op_t{}, + reduce_op::null{}, + do_expensive_check); if (iter < (k - 1)) { frontier.bucket(bucket_idx_cur).clear(); frontier.bucket(bucket_idx_cur) diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index e1b7444b92..b3cd0d57c6 100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -22,7 +22,7 @@ #include "prims/kv_store.cuh" #include "prims/reduce_op.cuh" #include "prims/transform_reduce_e.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -641,7 +641,6 @@ rmm::device_uvector od_shortest_distances( cutoff, invalid_distance}; detail::transform_reduce_v_frontier_call_e_op_t< - false, thrust::tuple, weight_t, vertex_t, @@ -653,8 +652,8 @@ rmm::device_uvector od_shortest_distances( auto new_frontier_tagged_vertex_buffer = allocate_dataframe_buffer>(0, handle.get_stream()); - std::tie(new_frontier_tagged_vertex_buffer, distance_buffer) = detail:: - extract_transform_v_frontier_e, weight_t>( + std::tie(new_frontier_tagged_vertex_buffer, distance_buffer) = + detail::extract_transform_v_frontier_e, weight_t>( handle, graph_view, vertex_frontier.bucket(bucket_idx_near), @@ -675,12 +674,14 @@ rmm::device_uvector od_shortest_distances( resize_dataframe_buffer(new_frontier_tagged_vertex_buffer, 0, handle.get_stream()); shrink_to_fit_dataframe_buffer(new_frontier_tagged_vertex_buffer, handle.get_stream()); - std::tie(new_frontier_keys, distance_buffer) = - detail::sort_and_reduce_buffer_elements>( + std::tie(new_frontier_keys, distance_buffer) = detail:: + sort_and_reduce_buffer_elements>( handle, std::move(new_frontier_keys), std::move(distance_buffer), - reduce_op::minimum()); + reduce_op::minimum(), + std::make_tuple(vertex_t{0}, graph_view.number_of_vertices()), + std::nullopt); } vertex_frontier.bucket(bucket_idx_near).clear(); diff --git a/cpp/src/traversal/sssp_impl.cuh b/cpp/src/traversal/sssp_impl.cuh index 47908524fe..3429672b15 100644 --- a/cpp/src/traversal/sssp_impl.cuh +++ b/cpp/src/traversal/sssp_impl.cuh @@ -19,7 +19,7 @@ #include "prims/fill_edge_src_dst_property.cuh" #include "prims/reduce_op.cuh" #include "prims/transform_reduce_e.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include 
"prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -197,7 +197,7 @@ void sssp(raft::handle_t const& handle, push_graph_view.local_vertex_partition_view()); auto [new_frontier_vertex_buffer, distance_predecessor_buffer] = - transform_reduce_v_frontier_outgoing_e_by_dst( + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( handle, push_graph_view, vertex_frontier.bucket(bucket_idx_cur_near), diff --git a/cpp/src/utilities/collect_comm.cuh b/cpp/src/utilities/collect_comm.cuh index 2197409fe2..dc4267aac5 100644 --- a/cpp/src/utilities/collect_comm.cuh +++ b/cpp/src/utilities/collect_comm.cuh @@ -50,79 +50,73 @@ namespace cugraph { -// for the keys in kv_store_view, key_to_gpu_id_op(key) should coincide with comm.get_rank() -template -decltype(allocate_dataframe_buffer(0, - rmm::cuda_stream_view{})) -collect_values_for_keys(raft::handle_t const& handle, - KVStoreViewType kv_store_view, - KeyIterator collect_key_first, - KeyIterator collect_key_last, - KeyToGPUIdOp key_to_gpu_id_op) +// for the keys in kv_store_view, key_to_comm_rank_op(key) should coincide with comm.get_rank() +template +dataframe_buffer_type_t collect_values_for_keys( + raft::comms::comms_t const& comm, + KVStoreViewType kv_store_view, + KeyIterator collect_key_first, + KeyIterator collect_key_last, + KeyToCommRankOp key_to_comm_rank_op, + rmm::cuda_stream_view stream_view) { using key_t = typename KVStoreViewType::key_type; static_assert(std::is_same_v::value_type, key_t>); using value_t = typename KVStoreViewType::value_type; - auto& comm = handle.get_comms(); - // 1. collect values for the unique keys in [collect_key_first, collect_key_last) rmm::device_uvector unique_keys(thrust::distance(collect_key_first, collect_key_last), - handle.get_stream()); + stream_view); thrust::copy( - handle.get_thrust_policy(), collect_key_first, collect_key_last, unique_keys.begin()); - thrust::sort(handle.get_thrust_policy(), unique_keys.begin(), unique_keys.end()); + rmm::exec_policy_nosync(stream_view), collect_key_first, collect_key_last, unique_keys.begin()); + thrust::sort(rmm::exec_policy_nosync(stream_view), unique_keys.begin(), unique_keys.end()); unique_keys.resize( thrust::distance( unique_keys.begin(), - thrust::unique(handle.get_thrust_policy(), unique_keys.begin(), unique_keys.end())), - handle.get_stream()); + thrust::unique(rmm::exec_policy(stream_view), unique_keys.begin(), unique_keys.end())), + stream_view); - auto values_for_unique_keys = allocate_dataframe_buffer(0, handle.get_stream()); + auto values_for_unique_keys = allocate_dataframe_buffer(0, stream_view); { - rmm::device_uvector rx_unique_keys(0, handle.get_stream()); + rmm::device_uvector rx_unique_keys(0, stream_view); std::vector rx_value_counts{}; std::tie(rx_unique_keys, rx_value_counts) = groupby_gpu_id_and_shuffle_values( comm, unique_keys.begin(), unique_keys.end(), - [key_to_gpu_id_op] __device__(auto val) { return key_to_gpu_id_op(val); }, - handle.get_stream()); + [key_to_comm_rank_op] __device__(auto val) { return key_to_comm_rank_op(val); }, + stream_view); auto values_for_rx_unique_keys = - allocate_dataframe_buffer(rx_unique_keys.size(), handle.get_stream()); + allocate_dataframe_buffer(rx_unique_keys.size(), stream_view); kv_store_view.find(rx_unique_keys.begin(), rx_unique_keys.end(), get_dataframe_buffer_begin(values_for_rx_unique_keys), - handle.get_stream()); + stream_view); - auto rx_values_for_unique_keys = allocate_dataframe_buffer(0, handle.get_stream()); - std::tie(rx_values_for_unique_keys, std::ignore) = - 
shuffle_values(comm, - get_dataframe_buffer_begin(values_for_rx_unique_keys), - rx_value_counts, - handle.get_stream()); + auto rx_values_for_unique_keys = allocate_dataframe_buffer(0, stream_view); + std::tie(rx_values_for_unique_keys, std::ignore) = shuffle_values( + comm, get_dataframe_buffer_begin(values_for_rx_unique_keys), rx_value_counts, stream_view); values_for_unique_keys = std::move(rx_values_for_unique_keys); } // 2. build a kv_store_t object for the k, v pairs in unique_keys, values_for_unique_keys. - kv_store_t unique_key_value_store( - handle.get_stream()); + kv_store_t unique_key_value_store(stream_view); if constexpr (KVStoreViewType::binary_search) { unique_key_value_store = kv_store_t(std::move(unique_keys), std::move(values_for_unique_keys), kv_store_view.invalid_value(), false, - handle.get_stream()); + stream_view); } else { auto kv_pair_first = thrust::make_zip_iterator( thrust::make_tuple(unique_keys.begin(), get_dataframe_buffer_begin(values_for_unique_keys))); auto valid_kv_pair_last = - thrust::remove_if(handle.get_thrust_policy(), + thrust::remove_if(rmm::exec_policy(stream_view), kv_pair_first, kv_pair_first + unique_keys.size(), [invalid_value = kv_store_view.invalid_value()] __device__(auto pair) { @@ -136,176 +130,173 @@ collect_values_for_keys(raft::handle_t const& handle, get_dataframe_buffer_begin(values_for_unique_keys), kv_store_view.invalid_key(), kv_store_view.invalid_value(), - handle.get_stream()); + stream_view); - unique_keys.resize(0, handle.get_stream()); - resize_dataframe_buffer(values_for_unique_keys, 0, handle.get_stream()); - unique_keys.shrink_to_fit(handle.get_stream()); - shrink_to_fit_dataframe_buffer(values_for_unique_keys, handle.get_stream()); + unique_keys.resize(0, stream_view); + resize_dataframe_buffer(values_for_unique_keys, 0, stream_view); + unique_keys.shrink_to_fit(stream_view); + shrink_to_fit_dataframe_buffer(values_for_unique_keys, stream_view); } auto unique_key_value_store_view = unique_key_value_store.view(); // 3. 
find values for [collect_key_first, collect_key_last) auto value_buffer = allocate_dataframe_buffer( - thrust::distance(collect_key_first, collect_key_last), handle.get_stream()); - unique_key_value_store_view.find(collect_key_first, - collect_key_last, - get_dataframe_buffer_begin(value_buffer), - handle.get_stream()); + thrust::distance(collect_key_first, collect_key_last), stream_view); + unique_key_value_store_view.find( + collect_key_first, collect_key_last, get_dataframe_buffer_begin(value_buffer), stream_view); return value_buffer; } -// for the keys in kv_store_view, key_to_gpu_id_op(key) should coincide with comm.get_rank() -template +// for the keys in kv_store_view, key_to_comm_rank_op(key) should coincide with comm.get_rank() +template std::tuple, - decltype(allocate_dataframe_buffer( - 0, cudaStream_t{nullptr}))> + dataframe_buffer_type_t> collect_values_for_unique_keys( - raft::handle_t const& handle, + raft::comms::comms_t const& comm, KVStoreViewType kv_store_view, rmm::device_uvector&& collect_unique_keys, - KeyToGPUIdOp key_to_gpu_id_op) + KeyToCommRankOp key_to_comm_rank_op, + rmm::cuda_stream_view stream_view) { using key_t = typename KVStoreViewType::key_type; using value_t = typename KVStoreViewType::value_type; - auto& comm = handle.get_comms(); - - auto values_for_collect_unique_keys = allocate_dataframe_buffer(0, handle.get_stream()); + auto values_for_collect_unique_keys = allocate_dataframe_buffer(0, stream_view); { auto [rx_unique_keys, rx_value_counts] = groupby_gpu_id_and_shuffle_values( comm, collect_unique_keys.begin(), collect_unique_keys.end(), - [key_to_gpu_id_op] __device__(auto val) { return key_to_gpu_id_op(val); }, - handle.get_stream()); + [key_to_comm_rank_op] __device__(auto val) { return key_to_comm_rank_op(val); }, + stream_view); auto values_for_rx_unique_keys = - allocate_dataframe_buffer(rx_unique_keys.size(), handle.get_stream()); + allocate_dataframe_buffer(rx_unique_keys.size(), stream_view); kv_store_view.find(rx_unique_keys.begin(), rx_unique_keys.end(), get_dataframe_buffer_begin(values_for_rx_unique_keys), - handle.get_stream()); + stream_view); - std::tie(values_for_collect_unique_keys, std::ignore) = - shuffle_values(comm, - get_dataframe_buffer_begin(values_for_rx_unique_keys), - rx_value_counts, - handle.get_stream()); + std::tie(values_for_collect_unique_keys, std::ignore) = shuffle_values( + comm, get_dataframe_buffer_begin(values_for_rx_unique_keys), rx_value_counts, stream_view); } return std::make_tuple(std::move(collect_unique_keys), std::move(values_for_collect_unique_keys)); } template -std::tuple< - rmm::device_uvector, - decltype(allocate_dataframe_buffer::value_type>( - 0, cudaStream_t{nullptr}))> -collect_values_for_unique_int_vertices(raft::handle_t const& handle, - rmm::device_uvector&& collect_unique_int_vertices, - ValueIterator local_value_first, - std::vector const& vertex_partition_range_lasts) +dataframe_buffer_type_t::value_type> +collect_values_for_sorted_unique_int_vertices( + raft::comms::comms_t const& comm, + raft::device_span collect_sorted_unique_int_vertices, + ValueIterator local_value_first, + std::vector const& comm_rank_vertex_partition_range_lasts, + vertex_t local_vertex_partition_range_first, + rmm::cuda_stream_view stream_view) { using value_t = typename thrust::iterator_traits::value_type; - auto& comm = handle.get_comms(); - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_size = major_comm.get_size(); - auto const major_comm_rank = 
major_comm.get_rank(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - auto const minor_comm_rank = minor_comm.get_rank(); + // 1.find tx_counts - // 1. groupby and shuffle internal vertices + rmm::device_uvector d_range_lasts(comm_rank_vertex_partition_range_lasts.size(), + stream_view); + raft::update_device(d_range_lasts.data(), + comm_rank_vertex_partition_range_lasts.data(), + comm_rank_vertex_partition_range_lasts.size(), + stream_view); - rmm::device_uvector d_vertex_partition_range_lasts(vertex_partition_range_lasts.size(), - handle.get_stream()); - raft::update_device(d_vertex_partition_range_lasts.data(), - vertex_partition_range_lasts.data(), - vertex_partition_range_lasts.size(), - handle.get_stream()); + rmm::device_uvector d_offsets(d_range_lasts.size() - 1, stream_view); + thrust::lower_bound(rmm::exec_policy_nosync(stream_view), + collect_sorted_unique_int_vertices.begin(), + collect_sorted_unique_int_vertices.end(), + d_range_lasts.begin(), + d_range_lasts.begin() + (d_range_lasts.size() - 1), + d_offsets.begin()); - auto [rx_int_vertices, rx_int_vertex_counts] = groupby_gpu_id_and_shuffle_values( - comm, - collect_unique_int_vertices.begin(), - collect_unique_int_vertices.end(), - detail::compute_gpu_id_from_int_vertex_t{ - raft::device_span(d_vertex_partition_range_lasts.data(), - d_vertex_partition_range_lasts.size()), - major_comm_size, - minor_comm_size}, - handle.get_stream()); - - // 2: Lookup return values - - auto vertex_partition_id = - partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks( - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank); - auto local_int_vertex_first = - vertex_partition_id == 0 ? vertex_t{0} : vertex_partition_range_lasts[vertex_partition_id - 1]; - - auto value_buffer = - allocate_dataframe_buffer(rx_int_vertices.size(), handle.get_stream()); - thrust::transform(handle.get_thrust_policy(), + std::vector h_offsets(d_offsets.size() + 2); + raft::update_host(h_offsets.data() + 1, d_offsets.data(), d_offsets.size(), stream_view); + h_offsets[0] = 0; + h_offsets.back() = collect_sorted_unique_int_vertices.size(); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view)); + + std::vector tx_counts(comm_rank_vertex_partition_range_lasts.size()); + std::adjacent_difference(h_offsets.begin() + 1, h_offsets.end(), tx_counts.begin()); + + // 2. shuffle sorted unique internal vertices to the owning ranks + + auto [rx_int_vertices, rx_counts] = + shuffle_values(comm, collect_sorted_unique_int_vertices.begin(), tx_counts, stream_view); + + // 3.Lookup return values + + auto value_buffer = allocate_dataframe_buffer(rx_int_vertices.size(), stream_view); + thrust::transform(rmm::exec_policy_nosync(stream_view), rx_int_vertices.begin(), rx_int_vertices.end(), get_dataframe_buffer_begin(value_buffer), - [local_value_first, local_int_vertex_first] __device__(auto v) { - return local_value_first[v - local_int_vertex_first]; + [local_value_first, local_vertex_partition_range_first] __device__(auto v) { + return local_value_first[v - local_vertex_partition_range_first]; }); + rx_int_vertices.resize(0, stream_view); + rx_int_vertices.shrink_to_fit(stream_view); - // 3: Shuffle results back to original GPU + // 4. 
Shuffle results back to the original ranks - std::tie(value_buffer, std::ignore) = shuffle_values( - comm, get_dataframe_buffer_begin(value_buffer), rx_int_vertex_counts, handle.get_stream()); + std::tie(value_buffer, std::ignore) = + shuffle_values(comm, get_dataframe_buffer_begin(value_buffer), rx_counts, stream_view); - return std::make_tuple(std::move(collect_unique_int_vertices), std::move(value_buffer)); + return value_buffer; } template -decltype(allocate_dataframe_buffer::value_type>( - 0, cudaStream_t{nullptr})) +dataframe_buffer_type_t::value_type> collect_values_for_int_vertices( - raft::handle_t const& handle, + raft::comms::comms_t const& comm, VertexIterator collect_vertex_first, VertexIterator collect_vertex_last, ValueIterator local_value_first, std::vector::value_type> const& - vertex_partition_range_lasts) + comm_rank_vertex_partition_range_lasts, + typename thrust::iterator_traits::value_type local_vertex_partition_range_first, + rmm::cuda_stream_view stream_view) { using vertex_t = typename thrust::iterator_traits::value_type; using value_t = typename thrust::iterator_traits::value_type; size_t input_size = thrust::distance(collect_vertex_first, collect_vertex_last); - rmm::device_uvector sorted_unique_int_vertices(input_size, handle.get_stream()); + rmm::device_uvector sorted_unique_int_vertices(input_size, stream_view); - raft::copy( - sorted_unique_int_vertices.data(), collect_vertex_first, input_size, handle.get_stream()); + raft::copy(sorted_unique_int_vertices.data(), collect_vertex_first, input_size, stream_view); - thrust::sort(handle.get_thrust_policy(), + thrust::sort(rmm::exec_policy_nosync(stream_view), sorted_unique_int_vertices.begin(), sorted_unique_int_vertices.end()); - auto last = thrust::unique(handle.get_thrust_policy(), + auto last = thrust::unique(rmm::exec_policy(stream_view), sorted_unique_int_vertices.begin(), sorted_unique_int_vertices.end()); sorted_unique_int_vertices.resize(thrust::distance(sorted_unique_int_vertices.begin(), last), - handle.get_stream()); - - auto [unique_int_vertices, tmp_value_buffer] = collect_values_for_unique_int_vertices( - handle, std::move(sorted_unique_int_vertices), local_value_first, vertex_partition_range_lasts); + stream_view); - kv_store_t kv_map(std::move(unique_int_vertices), + auto tmp_value_buffer = collect_values_for_sorted_unique_int_vertices( + comm, + raft::device_span(sorted_unique_int_vertices.data(), + sorted_unique_int_vertices.size()), + local_value_first, + comm_rank_vertex_partition_range_lasts, + local_vertex_partition_range_first, + stream_view); + + kv_store_t kv_map(std::move(sorted_unique_int_vertices), std::move(tmp_value_buffer), invalid_vertex_id::value, false, - handle.get_stream()); + stream_view); auto device_view = detail::kv_binary_search_store_device_view_t(kv_map.view()); - auto value_buffer = allocate_dataframe_buffer(input_size, handle.get_stream()); - thrust::transform(handle.get_thrust_policy(), + auto value_buffer = allocate_dataframe_buffer(input_size, stream_view); + thrust::transform(rmm::exec_policy_nosync(stream_view), collect_vertex_first, collect_vertex_last, get_dataframe_buffer_begin(value_buffer), diff --git a/cpp/src/utilities/shuffle_vertex_pairs.cuh b/cpp/src/utilities/shuffle_vertex_pairs.cuh index 70327db5ff..1cf2493cd2 100644 --- a/cpp/src/utilities/shuffle_vertex_pairs.cuh +++ b/cpp/src/utilities/shuffle_vertex_pairs.cuh @@ -61,10 +61,10 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl( (edge_ids ? sizeof(edge_t) : size_t{0}) + (edge_types ? 
sizeof(edge_type_t) : size_t{0}); auto constexpr mem_frugal_ratio = - 0.1; // if the expected temporary buffer size exceeds the mem_frugal_ratio of the - // total_global_mem, switch to the memory frugal approach (thrust::sort is used to - // group-by by default, and thrust::sort requires temporary buffer comparable to the input - // data size) + 0.05; // if the expected temporary buffer size exceeds the mem_frugal_ratio of the + // total_global_mem, switch to the memory frugal approach (thrust::sort is used to + // group-by by default, and thrust::sort requires temporary buffer comparable to the + // input data size) auto mem_frugal_threshold = static_cast(static_cast(total_global_mem / element_size) * mem_frugal_ratio); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a2eeafea8c..44963f9151 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -698,9 +698,9 @@ if(BUILD_CUGRAPH_MG_TESTS) ConfigureTestMG(MG_COUNT_IF_V_TEST prims/mg_count_if_v.cu) ############################################################################################### - # - MG PRIMS TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_SRC_DST tests -------------------------- - ConfigureTestMG(MG_TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_SRC_DST_TEST - prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu) + # - MG PRIMS TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_DST tests ------------------------------ + ConfigureTestMG(MG_TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_DST_TEST + prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu) ############################################################################################### # - MG PRIMS REDUCE_V tests ------------------------------------------------------------------- diff --git a/cpp/tests/c_api/mg_test_utils.h b/cpp/tests/c_api/mg_test_utils.h index b040d8dc52..12ca16bfe3 100644 --- a/cpp/tests/c_api/mg_test_utils.h +++ b/cpp/tests/c_api/mg_test_utils.h @@ -36,6 +36,15 @@ } \ } while (0) +#define C_NCCL_TRY(call) \ + do { \ + ncclResult_t status = call; \ + if (ncclSuccess != status) { \ + printf("NCCL call='%s' at file=%s line=%d failed.", #call, __FILE__, __LINE__); \ + exit(1); \ + } \ + } while (0) + #define C_CUDA_TRY(call) \ do { \ cudaError_t const status = call; \ diff --git a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu similarity index 74% rename from cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu rename to cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu index 9b7e24856f..085077017b 100644 --- a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu +++ b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/vertex_frontier.cuh" #include "utilities/base_fixture.hpp" #include "utilities/conversion_utilities.hpp" @@ -203,48 +203,7 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement handle_->get_comms().barrier(); - hr_timer.start("MG transform_reduce_v_frontier_outgoing_e_by_src"); - } - - auto mg_reduce_by_src_new_frontier_key_buffer = - cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); - [[maybe_unused]] auto mg_reduce_by_src_payload_buffer = - cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); - - if constexpr (std::is_same_v) { - mg_reduce_by_src_new_frontier_key_buffer = - cugraph::transform_reduce_v_frontier_outgoing_e_by_src( - *handle_, - mg_graph_view, - mg_vertex_frontier.bucket(bucket_idx_cur), - mg_src_prop.view(), - mg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::null{}); - } else { - std::tie(mg_reduce_by_src_new_frontier_key_buffer, mg_reduce_by_src_payload_buffer) = - cugraph::transform_reduce_v_frontier_outgoing_e_by_src( - *handle_, - mg_graph_view, - mg_vertex_frontier.bucket(bucket_idx_cur), - mg_src_prop.view(), - mg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::plus{}); - } - - if (cugraph::test::g_perf) { - RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - handle_->get_comms().barrier(); - hr_timer.stop(); - hr_timer.display_and_clear(std::cout); - } - if (cugraph::test::g_perf) { - RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - handle_->get_comms().barrier(); - hr_timer.start("MG transform_reduce_v_frontier_outgoing_e_by_src"); + hr_timer.start("MG transform_reduce_v_frontier_outgoing_e_by_dst"); } auto mg_reduce_by_dst_new_frontier_key_buffer = @@ -286,64 +245,27 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst // 3. 
compare SG & MG results if (prims_usecase.check_correctness) { - if constexpr (std::is_same_v) { - cugraph::unrenumber_int_vertices( - *handle_, - mg_reduce_by_src_new_frontier_key_buffer.begin(), - mg_reduce_by_src_new_frontier_key_buffer.size(), - (*mg_renumber_map).data(), - mg_graph_view.vertex_partition_range_lasts()); - - cugraph::unrenumber_int_vertices( - *handle_, - mg_reduce_by_dst_new_frontier_key_buffer.begin(), - mg_reduce_by_dst_new_frontier_key_buffer.size(), - (*mg_renumber_map).data(), - mg_graph_view.vertex_partition_range_lasts()); - } else { - cugraph::unrenumber_int_vertices( - *handle_, - std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).begin(), - std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).size(), - (*mg_renumber_map).data(), - mg_graph_view.vertex_partition_range_lasts()); - - cugraph::unrenumber_int_vertices( - *handle_, - std::get<0>(mg_reduce_by_dst_new_frontier_key_buffer).begin(), - std::get<0>(mg_reduce_by_dst_new_frontier_key_buffer).size(), - (*mg_renumber_map).data(), - mg_graph_view.vertex_partition_range_lasts()); - } - - auto mg_reduce_by_src_aggregate_new_frontier_key_buffer = - cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); - if constexpr (std::is_same_v) { - mg_reduce_by_src_aggregate_new_frontier_key_buffer = - cugraph::test::device_gatherv(*handle_, - mg_reduce_by_src_new_frontier_key_buffer.data(), - mg_reduce_by_src_new_frontier_key_buffer.size()); - } else { - std::get<0>(mg_reduce_by_src_aggregate_new_frontier_key_buffer) = - cugraph::test::device_gatherv( - *handle_, - std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).data(), - std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).size()); - std::get<1>(mg_reduce_by_src_aggregate_new_frontier_key_buffer) = - cugraph::test::device_gatherv( - *handle_, - std::get<1>(mg_reduce_by_src_new_frontier_key_buffer).data(), - std::get<1>(mg_reduce_by_src_new_frontier_key_buffer).size()); - } - auto mg_reduce_by_dst_aggregate_new_frontier_key_buffer = cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); if constexpr (std::is_same_v) { + cugraph::unrenumber_local_int_vertices(*handle_, + mg_reduce_by_dst_new_frontier_key_buffer.data(), + mg_reduce_by_dst_new_frontier_key_buffer.size(), + (*mg_renumber_map).data(), + mg_graph_view.local_vertex_partition_range_first(), + mg_graph_view.local_vertex_partition_range_last()); mg_reduce_by_dst_aggregate_new_frontier_key_buffer = cugraph::test::device_gatherv(*handle_, mg_reduce_by_dst_new_frontier_key_buffer.data(), mg_reduce_by_dst_new_frontier_key_buffer.size()); } else { + cugraph::unrenumber_local_int_vertices( + *handle_, + std::get<0>(mg_reduce_by_dst_new_frontier_key_buffer).data(), + std::get<0>(mg_reduce_by_dst_new_frontier_key_buffer).size(), + (*mg_renumber_map).data(), + mg_graph_view.local_vertex_partition_range_first(), + mg_graph_view.local_vertex_partition_range_last()); std::get<0>(mg_reduce_by_dst_aggregate_new_frontier_key_buffer) = cugraph::test::device_gatherv( *handle_, @@ -356,26 +278,6 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst std::get<1>(mg_reduce_by_dst_new_frontier_key_buffer).size()); } - [[maybe_unused]] auto mg_reduce_by_src_aggregate_payload_buffer = - cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); - if constexpr (!std::is_same_v) { - if constexpr (std::is_arithmetic_v) { - mg_reduce_by_src_aggregate_payload_buffer = - cugraph::test::device_gatherv(*handle_, - mg_reduce_by_src_payload_buffer.data(), - mg_reduce_by_src_payload_buffer.size()); - 
} else { - std::get<0>(mg_reduce_by_src_aggregate_payload_buffer) = - cugraph::test::device_gatherv(*handle_, - std::get<0>(mg_reduce_by_src_payload_buffer).data(), - std::get<0>(mg_reduce_by_src_payload_buffer).size()); - std::get<1>(mg_reduce_by_src_aggregate_payload_buffer) = - cugraph::test::device_gatherv(*handle_, - std::get<1>(mg_reduce_by_src_payload_buffer).data(), - std::get<1>(mg_reduce_by_src_payload_buffer).size()); - } - } - [[maybe_unused]] auto mg_reduce_by_dst_aggregate_payload_buffer = cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); if constexpr (!std::is_same_v) { @@ -409,22 +311,11 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst if (handle_->get_comms().get_rank() == int{0}) { if constexpr (std::is_same_v) { - thrust::sort( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(mg_reduce_by_src_aggregate_new_frontier_key_buffer)); - thrust::sort( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(mg_reduce_by_dst_aggregate_new_frontier_key_buffer), cugraph::get_dataframe_buffer_end(mg_reduce_by_dst_aggregate_new_frontier_key_buffer)); } else { - thrust::sort_by_key( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(mg_reduce_by_src_aggregate_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_payload_buffer)); - thrust::sort_by_key( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(mg_reduce_by_dst_aggregate_new_frontier_key_buffer), @@ -471,34 +362,6 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst .insert(cugraph::get_dataframe_buffer_begin(sg_key_buffer), cugraph::get_dataframe_buffer_end(sg_key_buffer)); - auto sg_reduce_by_src_new_frontier_key_buffer = - cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); - [[maybe_unused]] auto sg_reduce_by_src_payload_buffer = - cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); - if constexpr (std::is_same_v) { - sg_reduce_by_src_new_frontier_key_buffer = - cugraph::transform_reduce_v_frontier_outgoing_e_by_src( - *handle_, - sg_graph_view, - sg_vertex_frontier.bucket(bucket_idx_cur), - sg_src_prop.view(), - sg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::null{}); - } else { - std::tie(sg_reduce_by_src_new_frontier_key_buffer, sg_reduce_by_src_payload_buffer) = - cugraph::transform_reduce_v_frontier_outgoing_e_by_src( - *handle_, - sg_graph_view, - sg_vertex_frontier.bucket(bucket_idx_cur), - sg_src_prop.view(), - sg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::plus{}); - } - auto sg_reduce_by_dst_new_frontier_key_buffer = cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); [[maybe_unused]] auto sg_reduce_by_dst_payload_buffer = @@ -528,22 +391,11 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst } if constexpr (std::is_same_v) { - thrust::sort( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(sg_reduce_by_src_new_frontier_key_buffer)); - thrust::sort( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_new_frontier_key_buffer), cugraph::get_dataframe_buffer_end(sg_reduce_by_dst_new_frontier_key_buffer)); } else { - 
thrust::sort_by_key( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_payload_buffer)); - thrust::sort_by_key( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_new_frontier_key_buffer), @@ -551,14 +403,7 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_payload_buffer)); } - bool key_passed = thrust::equal( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_new_frontier_key_buffer)); - ASSERT_TRUE(key_passed); - - key_passed = thrust::equal( + auto key_passed = thrust::equal( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_new_frontier_key_buffer), cugraph::get_dataframe_buffer_end(sg_reduce_by_dst_new_frontier_key_buffer), @@ -567,13 +412,6 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst if constexpr (!std::is_same_v) { bool payload_passed = thrust::equal( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_payload_buffer), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_payload_buffer), - cugraph::get_dataframe_buffer_end(mg_reduce_by_src_aggregate_payload_buffer)); - ASSERT_TRUE(payload_passed); - - payload_passed = thrust::equal( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_payload_buffer), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_payload_buffer), diff --git a/cpp/tests/traversal/mg_bfs_test.cpp b/cpp/tests/traversal/mg_bfs_test.cpp index 4d4b83e275..3cd712798e 100644 --- a/cpp/tests/traversal/mg_bfs_test.cpp +++ b/cpp/tests/traversal/mg_bfs_test.cpp @@ -100,16 +100,6 @@ class Tests_MGBFS : public ::testing::TestWithParam initialize_mg_handle(size_t pool_size = 64); +std::unique_ptr initialize_mg_handle( + size_t pool_size = 8 /* default value of CUDA_DEVICE_MAX_CONNECTIONS */); // NCCL lazily initializes for P2P, and this enforces P2P initialization for better performance // measurements diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index 0a706d1cf8..ed96ba2391 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -24,7 +24,6 @@ #include #include #include -#include // legacy coo_to_csr #include @@ -234,7 +233,8 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { construct_edgelist(raft::handle_t const& handle, bool test_weighted, bool store_transposed, - bool multi_gpu) const + bool multi_gpu, + bool shuffle = true) const { CUGRAPH_EXPECTS( (size_t{1} << scale_) <= static_cast(std::numeric_limits::max()), @@ -246,7 +246,7 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { // cuMemAddressReserve // (https://developer.nvidia.com/blog/introducing-low-level-gpu-virtual-memory-management), we // can reduce the temporary memory requirement to (1 / num_partitions) * (original data size) - size_t constexpr num_partitions_per_gpu = 4; + size_t constexpr num_partitions_per_gpu = 8; size_t num_partitions = num_partitions_per_gpu * static_cast(multi_gpu ? 
handle.get_comms().get_size() : 1); @@ -330,7 +330,7 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { handle, std::move(tmp_src_v), std::move(tmp_dst_v), std::move(tmp_weights_v)); } - if (multi_gpu) { + if (multi_gpu && shuffle) { std::tie(store_transposed ? tmp_dst_v : tmp_src_v, store_transposed ? tmp_src_v : tmp_dst_v, tmp_weights_v, @@ -375,7 +375,7 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { translate(handle, vertex_v); - if (multi_gpu) { + if (multi_gpu && shuffle) { vertex_v = cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning( handle, std::move(vertex_v)); } @@ -391,6 +391,8 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { void set_edge_factor(size_t edge_factor) { edge_factor_ = edge_factor; } + bool undirected() const { return undirected_; } + private: size_t scale_{}; size_t edge_factor_{}; @@ -762,39 +764,5 @@ construct_graph(raft::handle_t const& handle, return std::make_tuple(std::move(graph), std::move(edge_weights), std::move(renumber_map)); } -namespace legacy { - -template -std::unique_ptr> construct_graph_csr( - raft::handle_t const& handle, input_usecase_t const& input_usecase, bool test_weighted) -{ - auto [d_src_v, d_dst_v, d_weight_v, d_vertices_v, is_symmetric] = - input_usecase.template construct_edgelist( - handle, test_weighted, false, false); - vertex_t num_vertices{}; // assuming that vertex IDs are non-negative consecutive integers - if (d_vertices_v) { - num_vertices = - max_element( - handle, raft::device_span((*d_vertices_v).data(), (*d_vertices_v).size())) + - 1; - } else { - num_vertices = - std::max( - max_element(handle, raft::device_span(d_src_v.data(), d_src_v.size())), - max_element(handle, raft::device_span(d_dst_v.data(), d_dst_v.size()))) + - 1; - } - - cugraph::legacy::GraphCOOView cooview( - d_src_v.data(), - d_dst_v.data(), - d_weight_v ? d_weight_v->data() : nullptr, - num_vertices, - static_cast(d_src_v.size())); - - return cugraph::coo_to_csr(cooview); -} - -} // namespace legacy } // namespace test } // namespace cugraph From a97775568159d054cba05ed85f885cf2ae765409 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com> Date: Mon, 25 Nov 2024 10:17:29 -0800 Subject: [PATCH 4/7] Fix debug build failure (#4774) cugraph is no longer dependent on cugraph_ops and no longer pulls files from the cugrpahops repo. But we have two `assert` statements still assuming cugrpahops files are available. These assert statements are compiled only in the debug mode. This PR fixes build errors due to these assert statements in the debug build. 
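The debug-only failure mode comes from how `assert` behaves under `NDEBUG`: in release builds the macro discards its argument, so an expression that references a header or identifier from the dropped dependency never reaches the compiler, while in debug builds the expression is compiled and the stale reference becomes a hard error. Below is a minimal, self-contained sketch (not cugraph code; the identifier named in the commented-out line is hypothetical) illustrating the mechanism:

```cpp
// Build with -DNDEBUG (release): assert expands to ((void)0), so the expression
// inside is never compiled and a stale identifier there goes unnoticed.
// Build without -DNDEBUG (debug): the expression is compiled, and a missing
// identifier is a hard compile error. Uncomment the marked line to reproduce.
#include <cassert>
#include <cstdio>

int main() {
  int x = 42;
  assert(x == 42 && "only checked in debug builds");
  // assert(x != some_constant_from_a_removed_dependency);  // breaks debug builds only
#ifdef NDEBUG
  std::puts("release build: asserts compiled out");
#else
  std::puts("debug build: asserts active");
#endif
  return 0;
}
```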
Closes #4763 Authors: - Seunghwa Kang (https://github.com/seunghwak) - Chuck Hastings (https://github.com/ChuckHastings) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) - Joseph Nke (https://github.com/jnke2016) URL: https://github.com/rapidsai/cugraph/pull/4774 --- .../prims/detail/sample_and_compute_local_nbr_indices.cuh | 7 +------ cpp/src/sampling/neighbor_sampling_impl.hpp | 2 +- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh index 5ebc3dc8ae..dd0da77851 100644 --- a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh +++ b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh @@ -597,8 +597,6 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( raft::random::RngState& rng_state, size_t K) { - assert(cugraph::invalid_edge_id_v == cugraph::ops::graph::INVALID_ID); - edge_t mid_partition_degree_range_last = static_cast(K * 10); // tuning parameter assert(mid_partition_degree_range_last > K); size_t high_partition_oversampling_K = K * 2; // tuning parameter @@ -1567,10 +1565,7 @@ uniform_sample_and_compute_local_nbr_indices( size_t K, bool with_replacement) { - using edge_t = typename GraphViewType::edge_type; - - assert(cugraph::invalid_edge_id_v == cugraph::ops::graph::INVALID_ID); - + using edge_t = typename GraphViewType::edge_type; using vertex_t = typename GraphViewType::vertex_type; using key_t = typename thrust::iterator_traits::value_type; diff --git a/cpp/src/sampling/neighbor_sampling_impl.hpp b/cpp/src/sampling/neighbor_sampling_impl.hpp index ccca71cdf2..ce580ea9b4 100644 --- a/cpp/src/sampling/neighbor_sampling_impl.hpp +++ b/cpp/src/sampling/neighbor_sampling_impl.hpp @@ -179,7 +179,7 @@ neighbor_sample_impl(raft::handle_t const& handle, ? (fan_out.size() / num_edge_types) : ((fan_out.size() / num_edge_types) + 1); - for (auto hop = 0; hop < num_hops; hop++) { + for (size_t hop = 0; hop < num_hops; ++hop) { for (auto edge_type_id = 0; edge_type_id < num_edge_types; edge_type_id++) { auto k_level = fan_out[(hop * num_edge_types) + edge_type_id]; rmm::device_uvector srcs(0, handle.get_stream()); From e155a8f2e52ca71a89ac07f7fc205ebe4e062d55 Mon Sep 17 00:00:00 2001 From: Rick Ratzel <3039903+rlratzel@users.noreply.github.com> Date: Mon, 25 Nov 2024 16:18:34 -0600 Subject: [PATCH 5/7] Updates READMEs, updates `core_number` to properly ignore `degree_type`, minor cleanup (#4776) * updates READMEs to remove outdated nx-cugraph text * updates `core_number` docs, APIs, tests to properly ignore `degree_type` due to `core_number` not supporting directed graphs which `degree_type` is intended for - `degree_type` settings will be honored when directed graphs are supported. * renames test helper function for clarity * fixes issue with datasets API to properly recreate the edgelist for MG (dask) if previously created for SG. 
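The datasets fix in the last bullet hinges on checking the cached edgelist's type before reusing it on the MG path, as the `dataset.py` hunk later in this patch shows. Below is a minimal sketch (a hypothetical `DatasetSketch` class, not cugraph's actual `Dataset` implementation) of that guard pattern:

```python
# Sketch of the guard: the MG path re-creates the edgelist whenever the cached
# object is not a dask_cudf.DataFrame, e.g. because the SG path cached a plain
# cudf.DataFrame first.
import cudf
import dask_cudf


class DatasetSketch:
    def __init__(self, csv_path):
        self._csv_path = csv_path
        self._edgelist = None  # may hold a cudf or dask_cudf DataFrame

    def get_edgelist(self):
        # SG path: caches a cudf.DataFrame.
        if self._edgelist is None:
            self._edgelist = cudf.read_csv(
                self._csv_path, names=["src", "dst", "wgt"]
            )
        return self._edgelist

    def get_dask_edgelist(self):
        # MG path: an edgelist cached by the SG path must not be reused here.
        if self._edgelist is None or not isinstance(
            self._edgelist, dask_cudf.DataFrame
        ):
            self._edgelist = dask_cudf.read_csv(
                self._csv_path, names=["src", "dst", "wgt"]
            )
        return self._edgelist
```

With this guard, calling the SG path first (which caches a `cudf.DataFrame`) no longer prevents a later MG call from building a proper `dask_cudf.DataFrame`.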
Authors: - Rick Ratzel (https://github.com/rlratzel) Approvers: - Don Acosta (https://github.com/acostadon) - Alex Barghi (https://github.com/alexbarghi-nv) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/4776 --- README.md | 10 ----- python/cugraph/cugraph/cores/core_number.py | 41 +++++++++++-------- .../cugraph/cugraph/dask/cores/core_number.py | 29 +++++++------ python/cugraph/cugraph/datasets/dataset.py | 4 +- .../cugraph/tests/core/test_core_number.py | 24 ++++++----- .../cugraph/tests/core/test_core_number_mg.py | 33 ++++++--------- .../tests/link_prediction/test_jaccard.py | 22 +++++----- .../pylibcugraph/pylibcugraph/core_number.pyx | 24 +++++------ 8 files changed, 90 insertions(+), 97 deletions(-) diff --git a/README.md b/README.md index 857406075e..f77b5702f7 100644 --- a/README.md +++ b/README.md @@ -34,16 +34,6 @@ ------ -## News - -___NEW!___ _[nx-cugraph](https://rapids.ai/nx-cugraph/)_, a NetworkX backend that provides GPU acceleration to NetworkX with zero code change. -``` -> pip install nx-cugraph-cu11 --extra-index-url https://pypi.nvidia.com -> export NETWORKX_AUTOMATIC_BACKENDS=cugraph -``` -That's it. NetworkX now leverages cuGraph for accelerated graph algorithms. - ----- ## Table of contents diff --git a/python/cugraph/cugraph/cores/core_number.py b/python/cugraph/cugraph/cores/core_number.py index 0b411c2eed..d84069ddec 100644 --- a/python/cugraph/cugraph/cores/core_number.py +++ b/python/cugraph/cugraph/cores/core_number.py @@ -23,19 +23,16 @@ def core_number(G, degree_type="bidirectional"): """ Compute the core numbers for the nodes of the graph G. A k-core of a graph - is a maximal subgraph that contains nodes of degree k or more. - A node has a core number of k if it belongs a k-core but not to k+1-core. - This call does not support a graph with self-loops and parallel - edges. + is a maximal subgraph that contains nodes of degree k or more. A node has + a core number of k if it belongs to a k-core but not to k+1-core. This + call does not support a graph with self-loops and parallel edges. Parameters ---------- G : cuGraph.Graph or networkx.Graph - The graph should contain undirected edges where undirected edges are - represented as directed edges in both directions. While this graph - can contain edge weights, they don't participate in the calculation + The current implementation only supports undirected graphs. The graph + can contain edge weights, but they don't participate in the calculation of the core numbers. - The current implementation only supports undirected graphs. .. deprecated:: 24.12 Accepting a ``networkx.Graph`` is deprecated and will be removed in a @@ -43,9 +40,10 @@ def core_number(G, degree_type="bidirectional"): the ``nx-cugraph`` backend. See: https://rapids.ai/nx-cugraph/ degree_type: str, (default="bidirectional") - This option determines if the core number computation should be based - on input, output, or both directed edges, with valid values being - "incoming", "outgoing", and "bidirectional" respectively. + This option is currently ignored. This option may eventually determine + if the core number computation should be based on input, output, or + both directed edges, with valid values being "incoming", "outgoing", + and "bidirectional" respectively. 
Returns ------- @@ -63,7 +61,13 @@ def core_number(G, degree_type="bidirectional"): >>> from cugraph.datasets import karate >>> G = karate.get_graph(download=True) >>> df = cugraph.core_number(G) - + >>> df.head() + vertex core_number + 0 33 4 + 1 0 4 + 2 32 4 + 3 2 4 + 4 1 4 """ G, isNx = ensure_cugraph_obj_for_nx(G) @@ -71,11 +75,14 @@ def core_number(G, degree_type="bidirectional"): if G.is_directed(): raise ValueError("input graph must be undirected") - if degree_type not in ["incoming", "outgoing", "bidirectional"]: - raise ValueError( - f"'degree_type' must be either incoming, " - f"outgoing or bidirectional, got: {degree_type}" - ) + # degree_type is currently ignored until libcugraph supports directed + # graphs for core_number. Once supporteed, degree_type should be checked + # like so: + # if degree_type not in ["incoming", "outgoing", "bidirectional"]: + # raise ValueError( + # f"'degree_type' must be either incoming, " + # f"outgoing or bidirectional, got: {degree_type}" + # ) vertex, core_number = pylibcugraph_core_number( resource_handle=ResourceHandle(), diff --git a/python/cugraph/cugraph/dask/cores/core_number.py b/python/cugraph/cugraph/dask/cores/core_number.py index 4ae1fb547d..3266348f73 100644 --- a/python/cugraph/cugraph/dask/cores/core_number.py +++ b/python/cugraph/cugraph/dask/cores/core_number.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -53,15 +53,15 @@ def core_number(input_graph, degree_type="bidirectional"): Parameters ---------- input_graph : cugraph.graph - cuGraph graph descriptor, should contain the connectivity information, - (edge weights are not used in this algorithm). - The current implementation only supports undirected graphs. + The current implementation only supports undirected graphs. The graph + can contain edge weights, but they don't participate in the calculation + of the core numbers. degree_type: str, (default="bidirectional") - This option determines if the core number computation should be based - on input, output, or both directed edges, with valid values being - "incoming", "outgoing", and "bidirectional" respectively. - + This option is currently ignored. This option may eventually determine + if the core number computation should be based on input, output, or + both directed edges, with valid values being "incoming", "outgoing", + and "bidirectional" respectively. Returns ------- @@ -77,11 +77,14 @@ def core_number(input_graph, degree_type="bidirectional"): if input_graph.is_directed(): raise ValueError("input graph must be undirected") - if degree_type not in ["incoming", "outgoing", "bidirectional"]: - raise ValueError( - f"'degree_type' must be either incoming, " - f"outgoing or bidirectional, got: {degree_type}" - ) + # degree_type is currently ignored until libcugraph supports directed + # graphs for core_number. 
Once supported, degree_type should be checked + # like so: + # if degree_type not in ["incoming", "outgoing", "bidirectional"]: + # raise ValueError( + # f"'degree_type' must be either incoming, " + # f"outgoing or bidirectional, got: {degree_type}" + # ) # Initialize dask client client = default_client() diff --git a/python/cugraph/cugraph/datasets/dataset.py b/python/cugraph/cugraph/datasets/dataset.py index 15c30700fc..63389cbc16 100644 --- a/python/cugraph/cugraph/datasets/dataset.py +++ b/python/cugraph/cugraph/datasets/dataset.py @@ -352,7 +352,9 @@ def get_dask_graph( If True, stores the transpose of the adjacency matrix. Required for certain algorithms. """ - if self._edgelist is None: + if self._edgelist is None or not isinstance( + self._edgelist, dask_cudf.DataFrame + ): self.get_dask_edgelist(download=download) if create_using is None: diff --git a/python/cugraph/cugraph/tests/core/test_core_number.py b/python/cugraph/cugraph/tests/core/test_core_number.py index a01b837ff6..b50e60ceb8 100644 --- a/python/cugraph/cugraph/tests/core/test_core_number.py +++ b/python/cugraph/cugraph/tests/core/test_core_number.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -32,11 +32,15 @@ def setup_function(): # ============================================================================= # Pytest fixtures # ============================================================================= -degree_type = ["incoming", "outgoing"] +# FIXME: degree_type is currently unsupported (ignored) +# degree_type = ["incoming", "outgoing"] +# fixture_params = gen_fixture_params_product( +# (UNDIRECTED_DATASETS, "graph_file"), +# (degree_type, "degree_type"), +# ) fixture_params = gen_fixture_params_product( (UNDIRECTED_DATASETS, "graph_file"), - (degree_type, "degree_type"), ) @@ -46,7 +50,9 @@ def input_combo(request): This fixture returns a dictionary containing all input params required to run a Core number algo """ - parameters = dict(zip(("graph_file", "degree_type"), request.param)) + # FIXME: degree_type is not supported so do not test with different values + # parameters = dict(zip(("graph_file", "degree_type"), request.param)) + parameters = {"graph_file": request.param[0]} graph_file = parameters["graph_file"] G = graph_file.get_graph() @@ -69,7 +75,8 @@ def input_combo(request): def test_core_number(input_combo): G = input_combo["G"] Gnx = input_combo["Gnx"] - degree_type = input_combo["degree_type"] + # FIXME: degree_type is currently unsupported (ignored) + # degree_type = input_combo["degree_type"] nx_core_number_results = cudf.DataFrame() dic_results = nx.core_number(Gnx) @@ -80,7 +87,7 @@ def test_core_number(input_combo): ) core_number_results = ( - cugraph.core_number(G, degree_type) + cugraph.core_number(G) .sort_values("vertex") .reset_index(drop=True) .rename(columns={"core_number": "cugraph_core_number"}) @@ -109,8 +116,3 @@ def test_core_number_invalid_input(input_combo): with pytest.raises(ValueError): cugraph.core_number(G) - - invalid_degree_type = "invalid" - G = input_combo["G"] - with pytest.raises(ValueError): - cugraph.core_number(G, invalid_degree_type) diff --git a/python/cugraph/cugraph/tests/core/test_core_number_mg.py b/python/cugraph/cugraph/tests/core/test_core_number_mg.py index 1138c1dc48..2c2c7e40a2 100644 ---
a/python/cugraph/cugraph/tests/core/test_core_number_mg.py +++ b/python/cugraph/cugraph/tests/core/test_core_number_mg.py @@ -17,7 +17,7 @@ import cugraph import cugraph.dask as dcg -from cugraph.datasets import karate, dolphins, karate_asymmetric +from cugraph.datasets import karate, dolphins # ============================================================================= @@ -35,7 +35,8 @@ def setup_function(): DATASETS = [karate, dolphins] -DEGREE_TYPE = ["incoming", "outgoing", "bidirectional"] +# FIXME: degree_type is currently unsupported (ignored) +# DEGREE_TYPE = ["incoming", "outgoing", "bidirectional"] # ============================================================================= @@ -43,9 +44,9 @@ def setup_function(): # ============================================================================= -def get_sg_results(dataset, degree_type): +def get_sg_results(dataset): G = dataset.get_graph(create_using=cugraph.Graph(directed=False)) - res = cugraph.core_number(G, degree_type) + res = cugraph.core_number(G) res = res.sort_values("vertex").reset_index(drop=True) return res @@ -57,23 +58,23 @@ def get_sg_results(dataset, degree_type): @pytest.mark.mg @pytest.mark.parametrize("dataset", DATASETS) -@pytest.mark.parametrize("degree_type", DEGREE_TYPE) -def test_sg_core_number(dask_client, dataset, degree_type, benchmark): +# @pytest.mark.parametrize("degree_type", DEGREE_TYPE) +def test_sg_core_number(dask_client, dataset, benchmark): # This test is only for benchmark purposes. sg_core_number_results = None G = dataset.get_graph(create_using=cugraph.Graph(directed=False)) - sg_core_number_results = benchmark(cugraph.core_number, G, degree_type) + sg_core_number_results = benchmark(cugraph.core_number, G) assert sg_core_number_results is not None @pytest.mark.mg @pytest.mark.parametrize("dataset", DATASETS) -@pytest.mark.parametrize("degree_type", DEGREE_TYPE) -def test_core_number(dask_client, dataset, degree_type, benchmark): +# @pytest.mark.parametrize("degree_type", DEGREE_TYPE) +def test_core_number(dask_client, dataset, benchmark): dataset.get_dask_edgelist(download=True) # reload with MG edgelist dg = dataset.get_dask_graph(create_using=cugraph.Graph(directed=False)) - result_core_number = benchmark(dcg.core_number, dg, degree_type) + result_core_number = benchmark(dcg.core_number, dg) result_core_number = ( result_core_number.drop_duplicates() .compute() @@ -82,7 +83,7 @@ def test_core_number(dask_client, dataset, degree_type, benchmark): .rename(columns={"core_number": "mg_core_number"}) ) - expected_output = get_sg_results(dataset, degree_type) + expected_output = get_sg_results(dataset) # Update the mg core number with sg core number results # for easy comparison using cuDF DataFrame methods. 
@@ -90,13 +91,3 @@ def test_core_number(dask_client, dataset, degree_type, benchmark): counts_diffs = result_core_number.query("mg_core_number != sg_core_number") assert len(counts_diffs) == 0 - - -@pytest.mark.mg -def test_core_number_invalid_input(): - dg = karate_asymmetric.get_graph(create_using=cugraph.Graph(directed=True)) - - invalid_degree_type = 3 - - with pytest.raises(ValueError): - dcg.core_number(dg, invalid_degree_type) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py index c9fb73babb..ed3a796121 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py @@ -156,12 +156,10 @@ def networkx_call(M, benchmark_callable=None): # FIXME: This compare is shared across several tests... it should be # a general utility -def compare(src1, dst1, val1, src2, dst2, val2): - # +def assert_results_equal(src1, dst1, val1, src2, dst2, val2): # We will do comparison computations by using dataframe # merge functions (essentially doing fast joins). We # start by making two data frames - # df1 = cudf.DataFrame() df1["src1"] = src1 df1["dst1"] = dst1 @@ -174,19 +172,18 @@ def compare(src1, dst1, val1, src2, dst2, val2): if val2 is not None: df2["val2"] = val2 - # - # Check to see if all pairs in the original data frame - # still exist in the new data frame. If we join (merge) - # the data frames where (src1[i]=src2[i]) and (dst1[i]=dst2[i]) - # then we should get exactly the same number of entries in - # the data frame if we did not lose any data. - # + # Check to see if all pairs in df1 still exist in the new (merged) data + # frame. If we join (merge) the data frames where (src1[i]=src2[i]) and + # (dst1[i]=dst2[i]) then we should get exactly the same number of entries + # in the data frame if we did not lose any data. join = df1.merge(df2, left_on=["src1", "dst1"], right_on=["src2", "dst2"]) + # Print detailed differences on test failure if len(df1) != len(join): join2 = df1.merge( df2, how="left", left_on=["src1", "dst1"], right_on=["src2", "dst2"] ) + orig_option = pd.get_option("display.max_rows") pd.set_option("display.max_rows", 500) print("df1 = \n", df1.sort_values(["src1", "dst1"])) print("df2 = \n", df2.sort_values(["src2", "dst2"])) @@ -196,6 +193,7 @@ def compare(src1, dst1, val1, src2, dst2, val2): .to_pandas() .query("src2.isnull()", engine="python"), ) + pd.set_option("display.max_rows", orig_option) assert len(df1) == len(join) @@ -485,7 +483,7 @@ def test_all_pairs_jaccard_with_topk(): worst_coeff = all_pairs_jaccard_results["jaccard_coeff"].min() better_than_k = jaccard_results[jaccard_results["jaccard_coeff"] > worst_coeff] - compare( + assert_results_equal( all_pairs_jaccard_results["first"], all_pairs_jaccard_results["second"], all_pairs_jaccard_results["jaccard_coeff"], @@ -494,7 +492,7 @@ def test_all_pairs_jaccard_with_topk(): jaccard_results["jaccard_coeff"], ) - compare( + assert_results_equal( better_than_k["first"], better_than_k["second"], better_than_k["jaccard_coeff"], diff --git a/python/pylibcugraph/pylibcugraph/core_number.pyx b/python/pylibcugraph/pylibcugraph/core_number.pyx index e754ef2c65..48d9c5de42 100644 --- a/python/pylibcugraph/pylibcugraph/core_number.pyx +++ b/python/pylibcugraph/pylibcugraph/core_number.pyx @@ -66,14 +66,14 @@ def core_number(ResourceHandle resource_handle, referencing data and running algorithms. 
graph : SGGraph or MGGraph - The input graph, for either Single or Multi-GPU operations. + The input graph, for either single or multi-GPU operations. The input + graph must be symmetric (the is_symmetric property must be True). degree_type: str - This option determines if the core number computation should be based - on input, output, or both directed edges, with valid values being - "incoming", "outgoing", and "bidirectional" respectively. - This option is currently ignored in this release, and setting it will - result in a warning. + This option is currently ignored. This option may eventually determine + if the core number computation should be based on input, output, or + both directed edges, with valid values being "incoming", "outgoing", + and "bidirectional" respectively. do_expensive_check: bool If True, performs more extensive tests on the inputs to ensure @@ -98,14 +98,14 @@ def core_number(ResourceHandle resource_handle, cdef cugraph_error_code_t error_code cdef cugraph_error_t* error_ptr - degree_type_map = { - "incoming": cugraph_k_core_degree_type_t.K_CORE_DEGREE_TYPE_IN, - "outgoing": cugraph_k_core_degree_type_t.K_CORE_DEGREE_TYPE_OUT, - "bidirectional": cugraph_k_core_degree_type_t.K_CORE_DEGREE_TYPE_INOUT} - + # When supported, degree_type string should be mapped to constant like so: + # degree_type_map = { + # "incoming": cugraph_k_core_degree_type_t.K_CORE_DEGREE_TYPE_IN, + # "outgoing": cugraph_k_core_degree_type_t.K_CORE_DEGREE_TYPE_OUT, + # "bidirectional": cugraph_k_core_degree_type_t.K_CORE_DEGREE_TYPE_INOUT} error_code = cugraph_core_number(c_resource_handle_ptr, c_graph_ptr, - degree_type_map[degree_type], + cugraph_k_core_degree_type_t.K_CORE_DEGREE_TYPE_IN, do_expensive_check, &result_ptr, &error_ptr) From 8388aca66d7beb3a4b173bdaca05685c34f49f2e Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Mon, 25 Nov 2024 16:20:11 -0600 Subject: [PATCH 6/7] Drop support for NetworkX 3.0 and 3.1 for nx-cugraph (#4766) This updates docs. Code changes are here: https://github.com/rapidsai/nx-cugraph/pull/27 See: https://github.com/rapidsai/cugraph/issues/4760 Authors: - Erik Welch (https://github.com/eriknw) Approvers: - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/4766 --- docs/cugraph/source/nx_cugraph/how-it-works.md | 2 +- docs/cugraph/source/nx_cugraph/installation.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/cugraph/source/nx_cugraph/how-it-works.md b/docs/cugraph/source/nx_cugraph/how-it-works.md index 88788f3c0c..0061b0445d 100644 --- a/docs/cugraph/source/nx_cugraph/how-it-works.md +++ b/docs/cugraph/source/nx_cugraph/how-it-works.md @@ -10,7 +10,7 @@ While NetworkX is a pure-Python implementation, backends may be written to use o ## Enabling nx-cugraph -It is recommended to use `networkx>=3.4` for optimal zero code change performance, but `nx-cugraph` will also work with `networkx 3.0+`. +It is recommended to use `networkx>=3.4` for optimal zero code change performance, but `nx-cugraph` will also work with `networkx 3.2+`. NetworkX will use `nx-cugraph` as the backend if any of the following are used: diff --git a/docs/cugraph/source/nx_cugraph/installation.md b/docs/cugraph/source/nx_cugraph/installation.md index a816801d00..9675306c47 100644 --- a/docs/cugraph/source/nx_cugraph/installation.md +++ b/docs/cugraph/source/nx_cugraph/installation.md @@ -10,7 +10,7 @@ This guide describes how to install ``nx-cugraph`` and use it in your workflows. 
- **Volta architecture or later NVIDIA GPU, with [compute capability](https://developer.nvidia.com/cuda-gpus) 7.0+** - **[CUDA](https://docs.nvidia.com/cuda/index.html) 11.2, 11.4, 11.5, 11.8, 12.0, 12.2, or 12.5** - **Python >= 3.10** - - **[NetworkX](https://networkx.org/documentation/stable/install.html#) >= 3.0 (version 3.4 or higher recommended)** + - **[NetworkX](https://networkx.org/documentation/stable/install.html#) >= 3.2 (version 3.4 or higher recommended)** More details about system requirements can be found in the [RAPIDS System Requirements Documentation](https://docs.rapids.ai/install#system-req). From 3d681cc9e6d85336822ccb6faf0d1cba5f759ead Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Mon, 25 Nov 2024 16:21:19 -0600 Subject: [PATCH 7/7] Add sphinx-lint pre-commit and some docs fixes (#4771) I noticed the `Traversals` table is not showing up in the [nx-cugraph docs](https://docs.rapids.ai/api/cugraph/stable/nx_cugraph/supported-algorithms/). `sphinx-lint` catches the underlying issue, and it also caught a couple other minor issues that this PR fixes. `sphinx-lint` is used by other notable repos such as `pandas` ([here](https://github.com/pandas-dev/pandas/blob/7fe270c8e7656c0c187260677b3b16a17a1281dc/.pre-commit-config.yaml#L92-L96)) and CPython ([here](https://github.com/python/cpython/blob/8fe1926164932f868e6e907ad72a74c2f2372b07/.pre-commit-config.yaml#L68-L73)). Authors: - Erik Welch (https://github.com/eriknw) Approvers: - Jake Awe (https://github.com/AyodeAwe) - Ralph Liu (https://github.com/nv-rliu) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/4771 --- .pre-commit-config.yaml | 5 +++++ docs/cugraph/source/api_docs/cugraph-ops/index.rst | 2 +- docs/cugraph/source/api_docs/cugraph-ops/python/index.rst | 2 +- docs/cugraph/source/nx_cugraph/index.rst | 2 +- docs/cugraph/source/nx_cugraph/supported-algorithms.rst | 2 +- docs/cugraph/source/tutorials/cugraph_blogs.rst | 4 ++-- 6 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4bb037b5fd..28f83a967c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -53,6 +53,11 @@ repos: meta[.]yaml$| setup[.]cfg$ - id: verify-alpha-spec + - repo: https://github.com/sphinx-contrib/sphinx-lint + rev: v1.0.0 + hooks: + - id: sphinx-lint + args: ["--enable=all", "--disable=line-too-long"] - repo: https://github.com/rapidsai/dependency-file-generator rev: v1.16.0 hooks: diff --git a/docs/cugraph/source/api_docs/cugraph-ops/index.rst b/docs/cugraph/source/api_docs/cugraph-ops/index.rst index 0f6a6c937d..41ae941652 100644 --- a/docs/cugraph/source/api_docs/cugraph-ops/index.rst +++ b/docs/cugraph/source/api_docs/cugraph-ops/index.rst @@ -1,7 +1,7 @@ cugraph-ops API reference ========================= -This page provides a list of all publicly accessible modules, methods and classes through `pylibcugraphops.*` namespace. +This page provides a list of all publicly accessible modules, methods and classes through ``pylibcugraphops.*`` namespace. ..
toctree:: :maxdepth: 2 diff --git a/docs/cugraph/source/api_docs/cugraph-ops/python/index.rst b/docs/cugraph/source/api_docs/cugraph-ops/python/index.rst index fb25f2fa00..f1f332cb01 100644 --- a/docs/cugraph/source/api_docs/cugraph-ops/python/index.rst +++ b/docs/cugraph/source/api_docs/cugraph-ops/python/index.rst @@ -1,7 +1,7 @@ cugraph-ops Python API reference ================================ -This page provides a list of all publicly accessible modules, methods and classes through `pylibcugraphops.*` namespace. +This page provides a list of all publicly accessible modules, methods and classes through ``pylibcugraphops.*`` namespace. .. toctree:: :maxdepth: 2 diff --git a/docs/cugraph/source/nx_cugraph/index.rst b/docs/cugraph/source/nx_cugraph/index.rst index 50565c805a..0eb8907b39 100644 --- a/docs/cugraph/source/nx_cugraph/index.rst +++ b/docs/cugraph/source/nx_cugraph/index.rst @@ -49,7 +49,7 @@ Users can have GPU-based, large-scale performance **without** changing their fam +--------------------------------------------------------------------------------------------------------+ | **Run the same code on CPU or GPU** | | | -| Nothing changes, not even your `import` statements, when going from CPU to GPU. | +| Nothing changes, not even your ``import`` statements, when going from CPU to GPU. | +--------------------------------------------------------------------------------------------------------+ diff --git a/docs/cugraph/source/nx_cugraph/supported-algorithms.rst b/docs/cugraph/source/nx_cugraph/supported-algorithms.rst index ae32bc330f..ed31ec73ae 100644 --- a/docs/cugraph/source/nx_cugraph/supported-algorithms.rst +++ b/docs/cugraph/source/nx_cugraph/supported-algorithms.rst @@ -180,7 +180,7 @@ Algorithms +---------------------------------------+ +---------------------------+ -| **Traversal** | +| **Traversal** | +===========================+ | bfs_edges | +---------------------------+ diff --git a/docs/cugraph/source/tutorials/cugraph_blogs.rst b/docs/cugraph/source/tutorials/cugraph_blogs.rst index 3665f425e3..57fa011ab5 100644 --- a/docs/cugraph/source/tutorials/cugraph_blogs.rst +++ b/docs/cugraph/source/tutorials/cugraph_blogs.rst @@ -65,9 +65,9 @@ Academic Papers * Alex Fender, Brad Rees, Joe Eaton (2022) `Massive Graph Analytics `_ Bader, D. (Editor) CRC Press - * S Kang, A. Fender, J. Eaton, B. Rees:`Computing PageRank Scores of Web Crawl Data Using DGX A100 Clusters`. In IEEE HPEC, Sep. 2020 + * S Kang, A. Fender, J. Eaton, B. Rees. `Computing PageRank Scores of Web Crawl Data Using DGX A100 Clusters `_. In IEEE HPEC, Sep. 2020 - * Hricik, T., Bader, D., & Green, O. (2020, September). `Using RAPIDS AI to accelerate graph data science workflows`. In 2020 IEEE High Performance Extreme Computing Conference (HPEC) (pp. 1-4). IEEE. + * Hricik, T., Bader, D., & Green, O. (2020, September). `Using RAPIDS AI to accelerate graph data science workflows `_. In 2020 IEEE High Performance Extreme Computing Conference (HPEC) (pp. 1-4). IEEE. * Richardson, B., Rees, B., Drabas, T., Oldridge, E., Bader, D. A., & Allen, R. (2020, August). Accelerating and Expanding End-to-End Data Science Workflows with DL/ML Interoperability Using RAPIDS. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (pp. 3503-3504).
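A brief usage note on the `sphinx-lint` hook added in the final patch above: once the hook is part of `.pre-commit-config.yaml`, it can be run locally through pre-commit before pushing documentation changes. The commands below are a minimal sketch and assume `pre-commit` is already installed in the development environment; only the hook id `sphinx-lint` and its arguments come from the config entry introduced by this patch.

```
# One-time setup: install the git hooks defined in .pre-commit-config.yaml
pre-commit install

# Run only the sphinx-lint hook against every file in the repo; this applies
# the --enable=all / --disable=line-too-long arguments from the config above
pre-commit run sphinx-lint --all-files
```

Run this way, the hook reports reStructuredText issues such as the table-width problem fixed in supported-algorithms.rst above.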