From 6b7c1143356be46fdcfd6445cc968bb72a6aa8dd Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com>
Date: Fri, 23 Feb 2024 09:45:55 -0800
Subject: [PATCH] Enable edge masking in additional primitives (#4126)

List of the primitives updated in this PR.

* compute_in_weight_sums()
* compute_out_weight_sums()
* detail::extract_transform_v_frontier_e()
* extract_transform_e
* extract_transform_v_frontier_outgoing_e
* transform_reduce_v_frontier_outgoing_e_by_dst

List of primitives added in this PR
* edge_partition_device_view_t::compute_number_of_edges_with_mask, compute_local_degrees_with_mask

Authors:
  - Seunghwa Kang (https://github.com/seunghwak)

Approvers:
  - Naim (https://github.com/naimnv)
  - Joseph Nke (https://github.com/jnke2016)
  - Chuck Hastings (https://github.com/ChuckHastings)

URL: https://github.com/rapidsai/cugraph/pull/4126
---
 .../cugraph/edge_partition_device_view.cuh    | 230 ++++++++
 cpp/include/cugraph/utilities/mask_utils.cuh  |   3 +-
 .../betweenness_centrality_impl.cuh           |   4 +-
 .../detail/extract_transform_v_frontier_e.cuh | 535 +++++++-----------
 cpp/src/prims/detail/nbr_intersection.cuh     |   3 +-
 cpp/src/prims/detail/prim_functors.cuh        |  86 ++-
 cpp/src/prims/extract_transform_e.cuh         |   4 +-
 ...xtract_transform_v_frontier_outgoing_e.cuh |   4 +-
 cpp/src/prims/fill_edge_src_dst_property.cuh  |   6 +-
 ...v_transform_reduce_incoming_outgoing_e.cuh |   4 +
 cpp/src/prims/transform_e.cuh                 |   3 +-
 cpp/src/prims/transform_reduce_e.cuh          |   4 +
 ...rm_reduce_v_frontier_outgoing_e_by_dst.cuh |  41 +-
 .../detail/gather_one_hop_edgelist_impl.cuh   |   4 +-
 cpp/src/structure/graph_weight_utils_impl.cuh |  12 +-
 cpp/src/structure/induced_subgraph_impl.cuh   |   6 +-
 cpp/tests/prims/mg_extract_transform_e.cu     |  17 +-
 ...extract_transform_v_frontier_outgoing_e.cu |  17 +-
 ...transform_dst_nbr_weighted_intersection.cu |  63 ++-
 ...orm_reduce_v_frontier_outgoing_e_by_dst.cu |  17 +-
 cpp/tests/structure/weight_sum_test.cpp       | 253 ++++++---
 21 files changed, 808 insertions(+), 508 deletions(-)
diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh
index d1c2cf3df52..c286c81683e 100644
--- a/cpp/include/cugraph/edge_partition_device_view.cuh
+++ b/cpp/include/cugraph/edge_partition_device_view.cuh
@@ -17,6 +17,7 @@
 
 #include <cugraph/edge_partition_view.hpp>
 #include <cugraph/utilities/error.hpp>
+#include <cugraph/utilities/mask_utils.cuh>
 #include <cugraph/utilities/misc_utils.cuh>
 
 #include <raft/core/device_span.hpp>
@@ -92,6 +93,54 @@ struct local_degree_op_t {
   }
 };
 
+template <typename vertex_t,
+          typename edge_t,
+          typename return_type_t,
+          bool multi_gpu,
+          bool use_dcs,
+          typename MaskIterator>
+struct local_degree_with_mask_op_t {
+  raft::device_span<edge_t const> offsets{};
+  std::conditional_t<multi_gpu, vertex_t, std::byte /* dummy */> major_range_first{};
+
+  std::conditional_t<use_dcs, raft::device_span<vertex_t const>, std::byte /* dummy */>
+    dcs_nzd_vertices{};
+  std::conditional_t<use_dcs, vertex_t, std::byte /* dummy */> major_hypersparse_first{};
+
+  MaskIterator mask_first{};
+
+  __device__ return_type_t operator()(vertex_t major) const
+  {
+    if constexpr (multi_gpu) {
+      vertex_t idx{};
+      if constexpr (use_dcs) {
+        if (major < major_hypersparse_first) {
+          idx = major - major_range_first;
+          return static_cast<return_type_t>(
+            count_set_bits(mask_first, offsets[idx], offsets[idx + 1] - offsets[idx]));
+        } else {
+          auto major_hypersparse_idx =
+            major_hypersparse_idx_from_major_nocheck_impl(dcs_nzd_vertices, major);
+          if (major_hypersparse_idx) {
+            idx = (major_hypersparse_first - major_range_first) + *major_hypersparse_idx;
+            return static_cast<return_type_t>(
+              count_set_bits(mask_first, offsets[idx], offsets[idx + 1] - offsets[idx]));
+          } else {
+            return return_type_t{0};
+          }
+        }
+      } else {
+        idx = major - major_range_first;
+        return static_cast<return_type_t>(
+          count_set_bits(mask_first, offsets[idx], offsets[idx + 1] - offsets[idx]));
+      }
+    } else {
+      return static_cast<return_type_t>(
+        count_set_bits(mask_first, offsets[major], offsets[major + 1] - offsets[major]));
+    }
+  }
+};
+
 template <typename vertex_t, typename edge_t>
 class edge_partition_device_view_base_t {
  public:
@@ -255,6 +304,122 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
     return local_degrees;
   }
 
+  template <typename MaskIterator, typename MajorIterator>
+  size_t compute_number_of_edges_with_mask(MaskIterator mask_first,
+                                           MajorIterator major_first,
+                                           MajorIterator major_last,
+                                           rmm::cuda_stream_view stream) const
+  {
+    return dcs_nzd_vertices_ ? thrust::transform_reduce(
+                                 rmm::exec_policy(stream),
+                                 major_first,
+                                 major_last,
+                                 detail::local_degree_with_mask_op_t<
+                                   vertex_t,
+                                   edge_t,
+                                   size_t /* no limit on majors.size(), so edge_t can overflow */,
+                                   multi_gpu,
+                                   true,
+                                   MaskIterator>{this->offsets_,
+                                                 major_range_first_,
+                                                 *dcs_nzd_vertices_,
+                                                 *major_hypersparse_first_,
+                                                 mask_first},
+                                 size_t{0},
+                                 thrust::plus<size_t>())
+                             : thrust::transform_reduce(
+                                 rmm::exec_policy(stream),
+                                 major_first,
+                                 major_last,
+                                 detail::local_degree_with_mask_op_t<
+                                   vertex_t,
+                                   edge_t,
+                                   size_t /* no limit on majors.size(), so edge_t can overflow */,
+                                   multi_gpu,
+                                   false,
+                                   MaskIterator>{this->offsets_,
+                                                 major_range_first_,
+                                                 std::byte{0} /* dummy */,
+                                                 std::byte{0} /* dummy */,
+                                                 mask_first},
+                                 size_t{0},
+                                 thrust::plus<size_t>());
+  }
+
+  template <typename MaskIterator>
+  rmm::device_uvector<edge_t> compute_local_degrees_with_mask(MaskIterator mask_first,
+                                                              rmm::cuda_stream_view stream) const
+  {
+    rmm::device_uvector<edge_t> local_degrees(this->major_range_size(), stream);
+    if (dcs_nzd_vertices_) {
+      assert(major_hypersparse_first_);
+      thrust::transform(
+        rmm::exec_policy(stream),
+        thrust::make_counting_iterator(this->major_range_first()),
+        thrust::make_counting_iterator(this->major_range_last()),
+        local_degrees.begin(),
+        detail::
+          local_degree_with_mask_op_t<vertex_t, edge_t, edge_t, multi_gpu, true, MaskIterator>{
+            this->offsets_,
+            major_range_first_,
+            *dcs_nzd_vertices_,
+            major_hypersparse_first_.value_or(vertex_t{0}),
+            mask_first});
+    } else {
+      thrust::transform(
+        rmm::exec_policy(stream),
+        thrust::make_counting_iterator(this->major_range_first()),
+        thrust::make_counting_iterator(this->major_range_last()),
+        local_degrees.begin(),
+        detail::
+          local_degree_with_mask_op_t<vertex_t, edge_t, edge_t, multi_gpu, false, MaskIterator>{
+            this->offsets_,
+            major_range_first_,
+            std::byte{0} /* dummy */,
+            std::byte{0} /* dummy */,
+            mask_first});
+    }
+    return local_degrees;
+  }
+
+  template <typename MaskIterator, typename MajorIterator>
+  rmm::device_uvector<edge_t> compute_local_degrees_with_mask(MaskIterator mask_first,
+                                                              MajorIterator major_first,
+                                                              MajorIterator major_last,
+                                                              rmm::cuda_stream_view stream) const
+  {
+    rmm::device_uvector<edge_t> local_degrees(thrust::distance(major_first, major_last), stream);
+    if (dcs_nzd_vertices_) {
+      assert(major_hypersparse_first_);
+      thrust::transform(
+        rmm::exec_policy(stream),
+        major_first,
+        major_last,
+        local_degrees.begin(),
+        detail::
+          local_degree_with_mask_op_t<vertex_t, edge_t, edge_t, multi_gpu, true, MaskIterator>{
+            this->offsets_,
+            major_range_first_,
+            dcs_nzd_vertices_.value(),
+            major_hypersparse_first_.value_or(vertex_t{0}),
+            mask_first});
+    } else {
+      thrust::transform(
+        rmm::exec_policy(stream),
+        major_first,
+        major_last,
+        local_degrees.begin(),
+        detail::
+          local_degree_with_mask_op_t<vertex_t, edge_t, edge_t, multi_gpu, false, MaskIterator>{
+            this->offsets_,
+            major_range_first_,
+            std::byte{0} /* dummy */,
+            std::byte{0} /* dummy */,
+            mask_first});
+    }
+    return local_degrees;
+  }
+
   __host__ __device__ vertex_t major_value_start_offset() const
   {
     return major_value_start_offset_;
@@ -440,6 +605,71 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
     return local_degrees;
   }
 
+  template <typename MaskIterator, typename MajorIterator>
+  size_t compute_number_of_edges_with_mask(MaskIterator mask_first,
+                                           MajorIterator major_first,
+                                           MajorIterator major_last,
+                                           rmm::cuda_stream_view stream) const
+  {
+    return thrust::transform_reduce(
+      rmm::exec_policy(stream),
+      major_first,
+      major_last,
+      detail::local_degree_with_mask_op_t<
+        vertex_t,
+        edge_t,
+        size_t /* no limit on majors.size(), so edge_t can overflow */,
+        multi_gpu,
+        false,
+        MaskIterator>{this->offsets_,
+                      std::byte{0} /* dummy */,
+                      std::byte{0} /* dummy */,
+                      std::byte{0} /* dummy */,
+                      mask_first},
+      size_t{0},
+      thrust::plus<size_t>());
+  }
+
+  template <typename MaskIterator>
+  rmm::device_uvector<edge_t> compute_local_degrees_with_mask(MaskIterator mask_first,
+                                                              rmm::cuda_stream_view stream) const
+  {
+    rmm::device_uvector<edge_t> local_degrees(this->major_range_size(), stream);
+    thrust::transform(
+      rmm::exec_policy(stream),
+      thrust::make_counting_iterator(this->major_range_first()),
+      thrust::make_counting_iterator(this->major_range_last()),
+      local_degrees.begin(),
+      detail::local_degree_with_mask_op_t<vertex_t, edge_t, edge_t, multi_gpu, false, MaskIterator>{
+        this->offsets_,
+        std::byte{0} /* dummy */,
+        std::byte{0} /* dummy */,
+        std::byte{0} /* dummy */,
+        mask_first});
+    return local_degrees;
+  }
+
+  template <typename MaskIterator, typename MajorIterator>
+  rmm::device_uvector<edge_t> compute_local_degrees_with_mask(MaskIterator mask_first,
+                                                              MajorIterator major_first,
+                                                              MajorIterator major_last,
+                                                              rmm::cuda_stream_view stream) const
+  {
+    rmm::device_uvector<edge_t> local_degrees(thrust::distance(major_first, major_last), stream);
+    thrust::transform(
+      rmm::exec_policy(stream),
+      major_first,
+      major_last,
+      local_degrees.begin(),
+      detail::local_degree_with_mask_op_t<vertex_t, edge_t, edge_t, multi_gpu, false, MaskIterator>{
+        this->offsets_,
+        std::byte{0} /* dummy */,
+        std::byte{0} /* dummy */,
+        std::byte{0} /* dummy */,
+        mask_first});
+    return local_degrees;
+  }
+
   __host__ __device__ vertex_t major_value_start_offset() const { return vertex_t{0}; }
 
   __host__ __device__ thrust::optional<vertex_t> major_hypersparse_first() const noexcept
diff --git a/cpp/include/cugraph/utilities/mask_utils.cuh b/cpp/include/cugraph/utilities/mask_utils.cuh
index ab1403d019b..5621b1267e9 100644
--- a/cpp/include/cugraph/utilities/mask_utils.cuh
+++ b/cpp/include/cugraph/utilities/mask_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/transform.h>
+#include <thrust/transform_reduce.h>
 
 namespace cugraph {
 
diff --git a/cpp/src/centrality/betweenness_centrality_impl.cuh b/cpp/src/centrality/betweenness_centrality_impl.cuh
index e496344583c..08907716412 100644
--- a/cpp/src/centrality/betweenness_centrality_impl.cuh
+++ b/cpp/src/centrality/betweenness_centrality_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -69,7 +69,7 @@ struct extract_edge_e_op_t {
     vertex_t dst,
     thrust::tuple<vertex_t, edge_t, weight_t> src_props,
     thrust::tuple<vertex_t, edge_t, weight_t> dst_props,
-    weight_t edge_centrality)
+    weight_t edge_centrality) const
   {
     return ((thrust::get<0>(dst_props) == d) && (thrust::get<0>(src_props) == (d - 1)))
              ? thrust::optional<thrust::tuple<vertex_t, vertex_t>>{thrust::make_tuple(src, dst)}
diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
index 2d77d64e1ff..608a824c57e 100644
--- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
+++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <prims/detail/optional_dataframe_buffer.hpp>
+#include <prims/detail/prim_functors.cuh>
 #include <prims/property_op_utils.cuh>
 
 #include <cugraph/edge_partition_device_view.cuh>
@@ -61,13 +62,13 @@ namespace detail {
 
 int32_t constexpr extract_transform_v_frontier_e_kernel_block_size = 512;
 
-template <typename e_op_result_t,
-          typename BufferKeyOutputIterator,
-          typename BufferValueOutputIterator>
-__device__ void push_buffer_element(e_op_result_t e_op_result,
-                                    BufferKeyOutputIterator buffer_key_output_first,
+template <typename BufferKeyOutputIterator,
+          typename BufferValueOutputIterator,
+          typename e_op_result_t>
+__device__ void push_buffer_element(BufferKeyOutputIterator buffer_key_output_first,
                                     BufferValueOutputIterator buffer_value_output_first,
-                                    size_t buffer_idx)
+                                    size_t buffer_idx,
+                                    e_op_result_t e_op_result)
 {
   using output_key_t =
     typename optional_dataframe_buffer_value_type_t<BufferKeyOutputIterator>::value;
@@ -87,15 +88,45 @@ __device__ void push_buffer_element(e_op_result_t e_op_result,
   }
 }
 
-template <typename GraphViewType,
+template <typename BufferKeyOutputIterator,
+          typename BufferValueOutputIterator,
+          typename e_op_result_t>
+__device__ void warp_push_buffer_elements(
+  BufferKeyOutputIterator buffer_key_output_first,
+  BufferValueOutputIterator buffer_value_output_first,
+  cuda::atomic_ref<size_t, cuda::thread_scope_device>& buffer_idx,
+  int lane_id,
+  e_op_result_t e_op_result)
+{
+  auto ballot = __ballot_sync(raft::warp_full_mask(), e_op_result ? uint32_t{1} : uint32_t{0});
+  if (ballot > 0) {
+    size_t warp_buffer_start_idx{};
+    if (lane_id == 0) {
+      auto increment        = __popc(ballot);
+      warp_buffer_start_idx = buffer_idx.fetch_add(increment, cuda::std::memory_order_relaxed);
+    }
+    warp_buffer_start_idx = __shfl_sync(raft::warp_full_mask(), warp_buffer_start_idx, int{0});
+    if (e_op_result) {
+      auto buffer_warp_offset = __popc(ballot & ~(raft::warp_full_mask() << lane_id));
+      push_buffer_element(buffer_key_output_first,
+                          buffer_value_output_first,
+                          warp_buffer_start_idx + buffer_warp_offset,
+                          e_op_result);
+    }
+  }
+}
+
+template <bool hypersparse,
+          typename GraphViewType,
           typename KeyIterator,
           typename EdgePartitionSrcValueInputWrapper,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
           typename BufferKeyOutputIterator,
           typename BufferValueOutputIterator,
           typename EdgeOp>
-__global__ void extract_transform_v_frontier_e_hypersparse(
+__global__ void extract_transform_v_frontier_e_hypersparse_or_low_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -104,6 +135,7 @@ __global__ void extract_transform_v_frontier_e_hypersparse(
   EdgePartitionSrcValueInputWrapper edge_partition_src_value_input,
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
+  thrust::optional<EdgePartitionEdgeMaskWrapper> edge_partition_e_mask,
   BufferKeyOutputIterator buffer_key_output_first,
   BufferValueOutputIterator buffer_value_output_first,
   size_t* buffer_idx_ptr,
@@ -127,6 +159,8 @@ __global__ void extract_transform_v_frontier_e_hypersparse(
                                                 edge_partition.major_range_first());
   auto idx                = static_cast<size_t>(tid);
 
+  cuda::atomic_ref<size_t, cuda::thread_scope_device> buffer_idx(*buffer_idx_ptr);
+
   __shared__ edge_t
     warp_local_degree_inclusive_sums[extract_transform_v_frontier_e_kernel_block_size];
   __shared__ edge_t warp_key_local_edge_offsets[extract_transform_v_frontier_e_kernel_block_size];
@@ -134,9 +168,6 @@ __global__ void extract_transform_v_frontier_e_hypersparse(
   using WarpScan = cub::WarpScan<edge_t, raft::warp_size()>;
   __shared__ typename WarpScan::TempStorage temp_storage;
 
-  __shared__ size_t
-    buffer_warp_start_indices[extract_transform_v_frontier_e_kernel_block_size / raft::warp_size()];
-
   auto indices = edge_partition.indices();
 
   vertex_t num_keys = static_cast<size_t>(thrust::distance(key_first, key_last));
@@ -160,251 +191,93 @@ __global__ void extract_transform_v_frontier_e_hypersparse(
       } else {
         major = thrust::get<0>(key);
       }
-      auto major_hypersparse_idx = edge_partition.major_hypersparse_idx_from_major_nocheck(major);
-      if (major_hypersparse_idx) {
-        auto major_idx                           = major_start_offset + *major_hypersparse_idx;
-        local_degree                             = edge_partition.local_degree(major_idx);
-        warp_key_local_edge_offsets[threadIdx.x] = edge_partition.local_offset(major_idx);
-      } else {
-        local_degree                             = edge_t{0};
-        warp_key_local_edge_offsets[threadIdx.x] = edge_t{0};  // dummy
-      }
-    }
-    WarpScan(temp_storage)
-      .InclusiveSum(local_degree, warp_local_degree_inclusive_sums[threadIdx.x]);
-    __syncwarp();
-
-    // process local edges for the keys in [key_first + min_key_idx, key_first + max_key_idx)
-
-    auto num_edges_this_warp = warp_local_degree_inclusive_sums[warp_id * raft::warp_size() +
-                                                                (max_key_idx - min_key_idx) - 1];
-    auto rounded_up_num_edges_this_warp =
-      ((static_cast<size_t>(num_edges_this_warp) + (raft::warp_size() - 1)) / raft::warp_size()) *
-      raft::warp_size();
-
-    for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) {
-      e_op_result_t e_op_result{};
-
-      if (i < static_cast<size_t>(num_edges_this_warp)) {
-        auto key_idx_this_warp = static_cast<vertex_t>(thrust::distance(
-          warp_local_degree_inclusive_sums + warp_id * raft::warp_size(),
-          thrust::upper_bound(thrust::seq,
-                              warp_local_degree_inclusive_sums + warp_id * raft::warp_size(),
-                              warp_local_degree_inclusive_sums + warp_id * raft::warp_size() +
-                                (max_key_idx - min_key_idx),
-                              i)));
-        auto local_edge_offset =
-          warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] +
-          static_cast<edge_t>(i -
-                              ((key_idx_this_warp == 0)
-                                 ? edge_t{0}
-                                 : warp_local_degree_inclusive_sums[warp_id * raft::warp_size() +
-                                                                    key_idx_this_warp - 1]));
-        auto key = *(key_first + (min_key_idx + key_idx_this_warp));
-        vertex_t major{};
-        if constexpr (std::is_same_v<key_t, vertex_t>) {
-          major = key;
+      if constexpr (hypersparse) {
+        auto major_hypersparse_idx = edge_partition.major_hypersparse_idx_from_major_nocheck(major);
+        if (major_hypersparse_idx) {
+          auto major_idx                           = major_start_offset + *major_hypersparse_idx;
+          local_degree                             = edge_partition.local_degree(major_idx);
+          warp_key_local_edge_offsets[threadIdx.x] = edge_partition.local_offset(major_idx);
         } else {
-          major = thrust::get<0>(key);
-        }
-        auto minor        = indices[local_edge_offset];
-        auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
-        auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-        std::conditional_t<GraphViewType::is_storage_transposed, vertex_t, key_t>
-          key_or_src{};  // key if major
-        std::conditional_t<GraphViewType::is_storage_transposed, key_t, vertex_t>
-          key_or_dst{};  // key if major
-        if constexpr (GraphViewType::is_storage_transposed) {
-          key_or_src = minor;
-          key_or_dst = key;
-        } else {
-          key_or_src = key;
-          key_or_dst = minor;
-        }
-        auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
-        auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
-        e_op_result     = e_op(key_or_src,
-                           key_or_dst,
-                           edge_partition_src_value_input.get(src_offset),
-                           edge_partition_dst_value_input.get(dst_offset),
-                           edge_partition_e_value_input.get(local_edge_offset));
-      }
-      auto ballot_e_op =
-        __ballot_sync(uint32_t{0xffffffff}, e_op_result ? uint32_t{1} : uint32_t{0});
-      if (ballot_e_op) {
-        if (lane_id == 0) {
-          auto increment = __popc(ballot_e_op);
-          static_assert(sizeof(unsigned long long int) == sizeof(size_t));
-          buffer_warp_start_indices[warp_id] =
-            static_cast<size_t>(atomicAdd(reinterpret_cast<unsigned long long int*>(buffer_idx_ptr),
-                                          static_cast<unsigned long long int>(increment)));
-        }
-        __syncwarp();
-        if (e_op_result) {
-          auto buffer_warp_offset =
-            static_cast<edge_t>(__popc(ballot_e_op & ~(uint32_t{0xffffffff} << lane_id)));
-          push_buffer_element(e_op_result,
-                              buffer_key_output_first,
-                              buffer_value_output_first,
-                              buffer_warp_start_indices[warp_id] + buffer_warp_offset);
+          local_degree                             = edge_t{0};
+          warp_key_local_edge_offsets[threadIdx.x] = edge_t{0};  // dummy
         }
-      }
-    }
-    idx += gridDim.x * blockDim.x;
-  }
-}
-
-template <typename GraphViewType,
-          typename KeyIterator,
-          typename EdgePartitionSrcValueInputWrapper,
-          typename EdgePartitionDstValueInputWrapper,
-          typename EdgePartitionEdgeValueInputWrapper,
-          typename BufferKeyOutputIterator,
-          typename BufferValueOutputIterator,
-          typename EdgeOp>
-__global__ void extract_transform_v_frontier_e_low_degree(
-  edge_partition_device_view_t<typename GraphViewType::vertex_type,
-                               typename GraphViewType::edge_type,
-                               GraphViewType::is_multi_gpu> edge_partition,
-  KeyIterator key_first,
-  KeyIterator key_last,
-  EdgePartitionSrcValueInputWrapper edge_partition_src_value_input,
-  EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
-  EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
-  BufferKeyOutputIterator buffer_key_output_first,
-  BufferValueOutputIterator buffer_value_output_first,
-  size_t* buffer_idx_ptr,
-  EdgeOp e_op)
-{
-  using vertex_t = typename GraphViewType::vertex_type;
-  using edge_t   = typename GraphViewType::edge_type;
-  using key_t    = typename thrust::iterator_traits<KeyIterator>::value_type;
-  using e_op_result_t =
-    typename edge_op_result_type<key_t,
-                                 typename GraphViewType::vertex_type,
-                                 typename EdgePartitionSrcValueInputWrapper::value_type,
-                                 typename EdgePartitionDstValueInputWrapper::value_type,
-                                 typename EdgePartitionEdgeValueInputWrapper::value_type,
-                                 EdgeOp>::type;
-
-  auto const tid     = threadIdx.x + blockIdx.x * blockDim.x;
-  auto const warp_id = threadIdx.x / raft::warp_size();
-  auto const lane_id = tid % raft::warp_size();
-  auto idx           = static_cast<size_t>(tid);
-
-  __shared__ edge_t
-    warp_local_degree_inclusive_sums[extract_transform_v_frontier_e_kernel_block_size];
-  __shared__ edge_t warp_key_local_edge_offsets[extract_transform_v_frontier_e_kernel_block_size];
-
-  using WarpScan = cub::WarpScan<edge_t, raft::warp_size()>;
-  __shared__ typename WarpScan::TempStorage temp_storage;
-
-  __shared__ size_t
-    buffer_warp_start_indices[extract_transform_v_frontier_e_kernel_block_size / raft::warp_size()];
-
-  auto indices = edge_partition.indices();
-
-  vertex_t num_keys = static_cast<size_t>(thrust::distance(key_first, key_last));
-  auto rounded_up_num_keys =
-    ((static_cast<size_t>(num_keys) + (raft::warp_size() - 1)) / raft::warp_size()) *
-    raft::warp_size();
-  while (idx < rounded_up_num_keys) {
-    auto min_key_idx = static_cast<vertex_t>(idx - (idx % raft::warp_size()));  // inclusive
-    auto max_key_idx =
-      static_cast<vertex_t>(std::min(static_cast<size_t>(min_key_idx) + raft::warp_size(),
-                                     static_cast<size_t>(num_keys)));  // exclusive
-
-    // update warp_local_degree_inclusive_sums & warp_key_local_edge_offsets
-
-    edge_t local_degree{0};
-    if (lane_id < static_cast<int32_t>(max_key_idx - min_key_idx)) {
-      auto key = *(key_first + idx);
-      vertex_t major{};
-      if constexpr (std::is_same_v<key_t, vertex_t>) {
-        major = key;
       } else {
-        major = thrust::get<0>(key);
+        auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
+        local_degree      = edge_partition.local_degree(major_offset);
+        warp_key_local_edge_offsets[threadIdx.x] = edge_partition.local_offset(major_offset);
       }
-      auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
-      local_degree      = edge_partition.local_degree(major_offset);
-      warp_key_local_edge_offsets[threadIdx.x] = edge_partition.local_offset(major_offset);
     }
     WarpScan(temp_storage)
       .InclusiveSum(local_degree, warp_local_degree_inclusive_sums[threadIdx.x]);
     __syncwarp();
 
-    // processes local edges for the keys in [key_first + min_key_idx, key_first + max_key_idx)
+    // all the threads in a warp collectively process local edges for the keys in [key_first +
+    // min_key_idx, key_first + max_key_idx)
 
     auto num_edges_this_warp = warp_local_degree_inclusive_sums[warp_id * raft::warp_size() +
                                                                 (max_key_idx - min_key_idx) - 1];
     auto rounded_up_num_edges_this_warp =
       ((static_cast<size_t>(num_edges_this_warp) + (raft::warp_size() - 1)) / raft::warp_size()) *
       raft::warp_size();
-    for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) {
-      e_op_result_t e_op_result{};
-
-      if (i < static_cast<size_t>(num_edges_this_warp)) {
-        auto key_idx_this_warp = static_cast<vertex_t>(thrust::distance(
-          warp_local_degree_inclusive_sums + warp_id * raft::warp_size(),
-          thrust::upper_bound(thrust::seq,
-                              warp_local_degree_inclusive_sums + warp_id * raft::warp_size(),
-                              warp_local_degree_inclusive_sums + warp_id * raft::warp_size() +
-                                (max_key_idx - min_key_idx),
-                              i)));
-        auto local_edge_offset =
-          warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] +
-          static_cast<edge_t>(i -
-                              ((key_idx_this_warp == 0)
-                                 ? edge_t{0}
-                                 : warp_local_degree_inclusive_sums[warp_id * raft::warp_size() +
-                                                                    key_idx_this_warp - 1]));
-        auto key = *(key_first + (min_key_idx + key_idx_this_warp));
-        vertex_t major{};
-        if constexpr (std::is_same_v<key_t, vertex_t>) {
-          major = key;
-        } else {
-          major = thrust::get<0>(key);
-        }
-        auto minor        = indices[local_edge_offset];
-        auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
-        auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-        std::conditional_t<GraphViewType::is_storage_transposed, vertex_t, key_t>
-          key_or_src{};  // key if major
-        std::conditional_t<GraphViewType::is_storage_transposed, key_t, vertex_t>
-          key_or_dst{};  // key if major
-        if constexpr (GraphViewType::is_storage_transposed) {
-          key_or_src = minor;
-          key_or_dst = key;
-        } else {
-          key_or_src = key;
-          key_or_dst = minor;
+
+    auto call_e_op = call_e_op_with_key_t<GraphViewType,
+                                          key_t,
+                                          EdgePartitionSrcValueInputWrapper,
+                                          EdgePartitionDstValueInputWrapper,
+                                          EdgePartitionEdgeValueInputWrapper,
+                                          EdgeOp>{edge_partition,
+                                                  edge_partition_src_value_input,
+                                                  edge_partition_dst_value_input,
+                                                  edge_partition_e_value_input,
+                                                  e_op};
+
+    auto this_warp_inclusive_sum_first =
+      warp_local_degree_inclusive_sums + warp_id * raft::warp_size();
+    auto this_warp_inclusive_sum_last = this_warp_inclusive_sum_first + (max_key_idx - min_key_idx);
+
+    if (edge_partition_e_mask) {
+      for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) {
+        e_op_result_t e_op_result{thrust::nullopt};
+
+        if (i < static_cast<size_t>(num_edges_this_warp)) {
+          auto key_idx_this_warp = static_cast<vertex_t>(thrust::distance(
+            this_warp_inclusive_sum_first,
+            thrust::upper_bound(
+              thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i)));
+          auto local_edge_offset =
+            warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] +
+            static_cast<edge_t>(i - ((key_idx_this_warp == 0) ? edge_t{0}
+                                                              : *(this_warp_inclusive_sum_first +
+                                                                  (key_idx_this_warp - 1))));
+          if ((*edge_partition_e_mask).get(local_edge_offset)) {
+            auto key    = *(key_first + (min_key_idx + key_idx_this_warp));
+            e_op_result = call_e_op(key, local_edge_offset);
+          }
         }
-        auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
-        auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
-        e_op_result     = e_op(key_or_src,
-                           key_or_dst,
-                           edge_partition_src_value_input.get(src_offset),
-                           edge_partition_dst_value_input.get(dst_offset),
-                           edge_partition_e_value_input.get(local_edge_offset));
+
+        warp_push_buffer_elements(
+          buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result);
       }
-      auto ballot = __ballot_sync(uint32_t{0xffffffff}, e_op_result ? uint32_t{1} : uint32_t{0});
-      if (ballot > 0) {
-        if (lane_id == 0) {
-          auto increment = __popc(ballot);
-          static_assert(sizeof(unsigned long long int) == sizeof(size_t));
-          buffer_warp_start_indices[warp_id] =
-            static_cast<size_t>(atomicAdd(reinterpret_cast<unsigned long long int*>(buffer_idx_ptr),
-                                          static_cast<unsigned long long int>(increment)));
-        }
-        __syncwarp();
-        if (e_op_result) {
-          auto buffer_warp_offset =
-            static_cast<edge_t>(__popc(ballot & ~(uint32_t{0xffffffff} << lane_id)));
-          push_buffer_element(e_op_result,
-                              buffer_key_output_first,
-                              buffer_value_output_first,
-                              buffer_warp_start_indices[warp_id] + buffer_warp_offset);
+    } else {
+      for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) {
+        e_op_result_t e_op_result{thrust::nullopt};
+
+        if (i < static_cast<size_t>(num_edges_this_warp)) {
+          auto key_idx_this_warp = static_cast<vertex_t>(thrust::distance(
+            this_warp_inclusive_sum_first,
+            thrust::upper_bound(
+              thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i)));
+          auto local_edge_offset =
+            warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] +
+            static_cast<edge_t>(i - ((key_idx_this_warp == 0) ? edge_t{0}
+                                                              : *(this_warp_inclusive_sum_first +
+                                                                  (key_idx_this_warp - 1))));
+          auto key    = *(key_first + (min_key_idx + key_idx_this_warp));
+          e_op_result = call_e_op(key, local_edge_offset);
         }
+
+        warp_push_buffer_elements(
+          buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result);
       }
     }
 
@@ -417,6 +290,7 @@ template <typename GraphViewType,
           typename EdgePartitionSrcValueInputWrapper,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
           typename BufferKeyOutputIterator,
           typename BufferValueOutputIterator,
           typename EdgeOp>
@@ -429,6 +303,7 @@ __global__ void extract_transform_v_frontier_e_mid_degree(
   EdgePartitionSrcValueInputWrapper edge_partition_src_value_input,
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
+  thrust::optional<EdgePartitionEdgeMaskWrapper> edge_partition_e_mask,
   BufferKeyOutputIterator buffer_key_output_first,
   BufferValueOutputIterator buffer_value_output_first,
   size_t* buffer_idx_ptr,
@@ -451,8 +326,8 @@ __global__ void extract_transform_v_frontier_e_mid_degree(
   auto const lane_id = tid % raft::warp_size();
   auto idx           = static_cast<size_t>(tid / raft::warp_size());
 
-  __shared__ size_t
-    buffer_warp_start_indices[extract_transform_v_frontier_e_kernel_block_size / raft::warp_size()];
+  cuda::atomic_ref<size_t, cuda::thread_scope_device> buffer_idx(*buffer_idx_ptr);
+
   while (idx < static_cast<size_t>(thrust::distance(key_first, key_last))) {
     auto key = *(key_first + idx);
     vertex_t major{};
@@ -470,48 +345,40 @@ __global__ void extract_transform_v_frontier_e_mid_degree(
     auto rounded_up_local_out_degree =
       ((static_cast<size_t>(local_out_degree) + (raft::warp_size() - 1)) / raft::warp_size()) *
       raft::warp_size();
-    for (size_t i = lane_id; i < rounded_up_local_out_degree; i += raft::warp_size()) {
-      e_op_result_t e_op_result{};
-      if (i < static_cast<size_t>(local_out_degree)) {
-        auto minor        = indices[i];
-        auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-        std::conditional_t<GraphViewType::is_storage_transposed, vertex_t, key_t>
-          key_or_src{};  // key if major
-        std::conditional_t<GraphViewType::is_storage_transposed, key_t, vertex_t>
-          key_or_dst{};  // key if major
-        if constexpr (GraphViewType::is_storage_transposed) {
-          key_or_src = minor;
-          key_or_dst = key;
-        } else {
-          key_or_src = key;
-          key_or_dst = minor;
+
+    auto call_e_op = call_e_op_t<GraphViewType,
+                                 key_t,
+                                 EdgePartitionSrcValueInputWrapper,
+                                 EdgePartitionDstValueInputWrapper,
+                                 EdgePartitionEdgeValueInputWrapper,
+                                 EdgeOp>{edge_partition,
+                                         edge_partition_src_value_input,
+                                         edge_partition_dst_value_input,
+                                         edge_partition_e_value_input,
+                                         e_op,
+                                         key,
+                                         major_offset,
+                                         indices,
+                                         local_edge_offset};
+
+    if (edge_partition_e_mask) {
+      for (size_t i = lane_id; i < rounded_up_local_out_degree; i += raft::warp_size()) {
+        e_op_result_t e_op_result{thrust::nullopt};
+        if ((i < static_cast<size_t>(local_out_degree)) &&
+            ((*edge_partition_e_mask).get(local_edge_offset + i))) {
+          e_op_result = call_e_op(i);
         }
-        auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
-        auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
-        e_op_result     = e_op(key_or_src,
-                           key_or_dst,
-                           edge_partition_src_value_input.get(src_offset),
-                           edge_partition_dst_value_input.get(dst_offset),
-                           edge_partition_e_value_input.get(local_edge_offset + i));
+
+        warp_push_buffer_elements(
+          buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result);
       }
-      auto ballot = __ballot_sync(uint32_t{0xffffffff}, e_op_result ? uint32_t{1} : uint32_t{0});
-      if (ballot > 0) {
-        if (lane_id == 0) {
-          auto increment = __popc(ballot);
-          static_assert(sizeof(unsigned long long int) == sizeof(size_t));
-          buffer_warp_start_indices[warp_id] =
-            static_cast<size_t>(atomicAdd(reinterpret_cast<unsigned long long int*>(buffer_idx_ptr),
-                                          static_cast<unsigned long long int>(increment)));
-        }
-        __syncwarp();
-        if (e_op_result) {
-          auto buffer_warp_offset =
-            static_cast<edge_t>(__popc(ballot & ~(uint32_t{0xffffffff} << lane_id)));
-          push_buffer_element(e_op_result,
-                              buffer_key_output_first,
-                              buffer_value_output_first,
-                              buffer_warp_start_indices[warp_id] + buffer_warp_offset);
-        }
+    } else {
+      for (size_t i = lane_id; i < rounded_up_local_out_degree; i += raft::warp_size()) {
+        e_op_result_t e_op_result{thrust::nullopt};
+        if (i < static_cast<size_t>(local_out_degree)) { e_op_result = call_e_op(i); }
+
+        warp_push_buffer_elements(
+          buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result);
       }
     }
 
@@ -524,6 +391,7 @@ template <typename GraphViewType,
           typename EdgePartitionSrcValueInputWrapper,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
           typename BufferKeyOutputIterator,
           typename BufferValueOutputIterator,
           typename EdgeOp>
@@ -536,6 +404,7 @@ __global__ void extract_transform_v_frontier_e_high_degree(
   EdgePartitionSrcValueInputWrapper edge_partition_src_value_input,
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
+  thrust::optional<EdgePartitionEdgeMaskWrapper> edge_partition_e_mask,
   BufferKeyOutputIterator buffer_key_output_first,
   BufferValueOutputIterator buffer_value_output_first,
   size_t* buffer_idx_ptr,
@@ -552,11 +421,11 @@ __global__ void extract_transform_v_frontier_e_high_degree(
                                  typename EdgePartitionEdgeValueInputWrapper::value_type,
                                  EdgeOp>::type;
 
-  auto idx = static_cast<size_t>(blockIdx.x);
+  auto const warp_id = threadIdx.x / raft::warp_size();
+  auto const lane_id = threadIdx.x % raft::warp_size();
+  auto idx           = static_cast<size_t>(blockIdx.x);
 
-  using BlockScan = cub::BlockScan<edge_t, extract_transform_v_frontier_e_kernel_block_size>;
-  __shared__ typename BlockScan::TempStorage temp_storage;
-  __shared__ size_t buffer_block_start_idx;
+  cuda::atomic_ref<size_t, cuda::thread_scope_device> buffer_idx(*buffer_idx_ptr);
 
   while (idx < static_cast<size_t>(thrust::distance(key_first, key_last))) {
     auto key = *(key_first + idx);
@@ -576,49 +445,40 @@ __global__ void extract_transform_v_frontier_e_high_degree(
                                          (extract_transform_v_frontier_e_kernel_block_size - 1)) /
                                         extract_transform_v_frontier_e_kernel_block_size) *
                                        extract_transform_v_frontier_e_kernel_block_size;
-    for (size_t i = threadIdx.x; i < rounded_up_local_out_degree; i += blockDim.x) {
-      e_op_result_t e_op_result{};
-      edge_t buffer_block_offset{0};
-
-      if (i < static_cast<size_t>(local_out_degree)) {
-        auto minor        = indices[i];
-        auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-        std::conditional_t<GraphViewType::is_storage_transposed, vertex_t, key_t>
-          key_or_src{};  // key if major
-        std::conditional_t<GraphViewType::is_storage_transposed, key_t, vertex_t>
-          key_or_dst{};  // key if major
-        if constexpr (GraphViewType::is_storage_transposed) {
-          key_or_src = minor;
-          key_or_dst = key;
-        } else {
-          key_or_src = key;
-          key_or_dst = minor;
+
+    auto call_e_op = call_e_op_t<GraphViewType,
+                                 key_t,
+                                 EdgePartitionSrcValueInputWrapper,
+                                 EdgePartitionDstValueInputWrapper,
+                                 EdgePartitionEdgeValueInputWrapper,
+                                 EdgeOp>{edge_partition,
+                                         edge_partition_src_value_input,
+                                         edge_partition_dst_value_input,
+                                         edge_partition_e_value_input,
+                                         e_op,
+                                         key,
+                                         major_offset,
+                                         indices,
+                                         local_edge_offset};
+
+    if (edge_partition_e_mask) {
+      for (size_t i = threadIdx.x; i < rounded_up_local_out_degree; i += blockDim.x) {
+        e_op_result_t e_op_result{thrust::nullopt};
+        if ((i < static_cast<size_t>(local_out_degree)) &&
+            ((*edge_partition_e_mask).get(local_edge_offset + i))) {
+          e_op_result = call_e_op(i);
         }
-        auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
-        auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
-        e_op_result     = e_op(key_or_src,
-                           key_or_dst,
-                           edge_partition_src_value_input.get(src_offset),
-                           edge_partition_dst_value_input.get(dst_offset),
-                           edge_partition_e_value_input.get(local_edge_offset + i));
-      }
-      BlockScan(temp_storage)
-        .ExclusiveSum(e_op_result ? edge_t{1} : edge_t{0}, buffer_block_offset);
-      if (threadIdx.x == (blockDim.x - 1)) {
-        auto increment = buffer_block_offset + (e_op_result ? edge_t{1} : edge_t{0});
-        static_assert(sizeof(unsigned long long int) == sizeof(size_t));
-        buffer_block_start_idx = increment > 0
-                                   ? static_cast<size_t>(atomicAdd(
-                                       reinterpret_cast<unsigned long long int*>(buffer_idx_ptr),
-                                       static_cast<unsigned long long int>(increment)))
-                                   : size_t{0} /* dummy */;
+
+        warp_push_buffer_elements(
+          buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result);
       }
-      __syncthreads();
-      if (e_op_result) {
-        push_buffer_element(e_op_result,
-                            buffer_key_output_first,
-                            buffer_value_output_first,
-                            buffer_block_start_idx + buffer_block_offset);
+    } else {
+      for (size_t i = threadIdx.x; i < rounded_up_local_out_degree; i += blockDim.x) {
+        e_op_result_t e_op_result{thrust::nullopt};
+        if (i < static_cast<size_t>(local_out_degree)) { e_op_result = call_e_op(i); }
+
+        warp_push_buffer_elements(
+          buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result);
       }
     }
 
@@ -757,10 +617,18 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
       static_cast<vertex_t>(thrust::distance(frontier_key_first, frontier_key_last)))};
   }
 
+  auto edge_mask_view = graph_view.edge_mask_view();
+
   for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
     auto edge_partition =
       edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
         graph_view.local_edge_partition_view(i));
+    auto edge_partition_e_mask =
+      edge_mask_view
+        ? thrust::make_optional<
+            detail::edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool>>(
+            *edge_mask_view, i)
+        : thrust::nullopt;
 
     auto edge_partition_frontier_key_buffer =
       allocate_dataframe_buffer<key_t>(size_t{0}, handle.get_stream());
@@ -846,8 +714,8 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
       RAFT_CUDA_TRY(cudaStreamSynchronize(handle.get_stream()));
       h_offsets.push_back(edge_partition_frontier_size);
       // FIXME: we may further improve performance by 1) concurrently running kernels on different
-      // segments; 2) individually tuning block sizes for different segments; and 3) adding one more
-      // segment for very high degree vertices and running segmented reduction
+      // segments; 2) individually tuning block sizes for different segments; and 3) adding one
+      // more segment for very high degree vertices and running segmented reduction
       if (h_offsets[0] > 0) {
         raft::grid_1d_block_t update_grid(h_offsets[0],
                                           extract_transform_v_frontier_e_kernel_block_size,
@@ -860,6 +728,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
             edge_partition_src_value_input,
             edge_partition_dst_value_input,
             edge_partition_e_value_input,
+            edge_partition_e_mask,
             get_optional_dataframe_buffer_begin<output_key_t>(key_buffer),
             get_optional_dataframe_buffer_begin<output_value_t>(value_buffer),
             buffer_idx.data(),
@@ -877,6 +746,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
             edge_partition_src_value_input,
             edge_partition_dst_value_input,
             edge_partition_e_value_input,
+            edge_partition_e_mask,
             get_optional_dataframe_buffer_begin<output_key_t>(key_buffer),
             get_optional_dataframe_buffer_begin<output_value_t>(value_buffer),
             buffer_idx.data(),
@@ -886,7 +756,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
         raft::grid_1d_thread_t update_grid(h_offsets[2] - h_offsets[1],
                                            extract_transform_v_frontier_e_kernel_block_size,
                                            handle.get_device_properties().maxGridSize[0]);
-        extract_transform_v_frontier_e_low_degree<GraphViewType>
+        extract_transform_v_frontier_e_hypersparse_or_low_degree<false, GraphViewType>
           <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
             edge_partition,
             edge_partition_frontier_key_first + h_offsets[1],
@@ -894,6 +764,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
             edge_partition_src_value_input,
             edge_partition_dst_value_input,
             edge_partition_e_value_input,
+            edge_partition_e_mask,
             get_optional_dataframe_buffer_begin<output_key_t>(key_buffer),
             get_optional_dataframe_buffer_begin<output_value_t>(value_buffer),
             buffer_idx.data(),
@@ -903,7 +774,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
         raft::grid_1d_thread_t update_grid(h_offsets[3] - h_offsets[2],
                                            extract_transform_v_frontier_e_kernel_block_size,
                                            handle.get_device_properties().maxGridSize[0]);
-        extract_transform_v_frontier_e_hypersparse<GraphViewType>
+        extract_transform_v_frontier_e_hypersparse_or_low_degree<true, GraphViewType>
           <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
             edge_partition,
             edge_partition_frontier_key_first + h_offsets[2],
@@ -911,6 +782,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
             edge_partition_src_value_input,
             edge_partition_dst_value_input,
             edge_partition_e_value_input,
+            edge_partition_e_mask,
             get_optional_dataframe_buffer_begin<output_key_t>(key_buffer),
             get_optional_dataframe_buffer_begin<output_value_t>(value_buffer),
             buffer_idx.data(),
@@ -922,7 +794,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
                                            extract_transform_v_frontier_e_kernel_block_size,
                                            handle.get_device_properties().maxGridSize[0]);
 
-        extract_transform_v_frontier_e_low_degree<GraphViewType>
+        extract_transform_v_frontier_e_hypersparse_or_low_degree<false, GraphViewType>
           <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
             edge_partition,
             edge_partition_frontier_key_first,
@@ -930,6 +802,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
             edge_partition_src_value_input,
             edge_partition_dst_value_input,
             edge_partition_e_value_input,
+            edge_partition_e_mask,
             get_optional_dataframe_buffer_begin<output_key_t>(key_buffer),
             get_optional_dataframe_buffer_begin<output_value_t>(value_buffer),
             buffer_idx.data(),
diff --git a/cpp/src/prims/detail/nbr_intersection.cuh b/cpp/src/prims/detail/nbr_intersection.cuh
index 26b87f21dbb..e3453ffdec5 100644
--- a/cpp/src/prims/detail/nbr_intersection.cuh
+++ b/cpp/src/prims/detail/nbr_intersection.cuh
@@ -213,8 +213,7 @@ struct update_rx_major_local_nbrs_t {
         auto mask_first = (*edge_partition_e_mask).value_first();
         if constexpr (!std::is_same_v<edge_property_value_t, thrust::nullopt_t>) {
           auto input_first =
-            thrust::make_zip_iterator(indices, edge_partition_e_value_input.value_first()) +
-            edge_offset;
+            thrust::make_zip_iterator(indices, edge_partition_e_value_input.value_first());
           copy_if_mask_set(input_first,
                            mask_first,
                            thrust::make_zip_iterator(local_nbrs_for_rx_majors.begin(),
diff --git a/cpp/src/prims/detail/prim_functors.cuh b/cpp/src/prims/detail/prim_functors.cuh
index 2785ba38dfd..d142aed1051 100644
--- a/cpp/src/prims/detail/prim_functors.cuh
+++ b/cpp/src/prims/detail/prim_functors.cuh
@@ -22,6 +22,7 @@ namespace cugraph {
 namespace detail {
 
 template <typename GraphViewType,
+          typename key_t,
           typename EdgePartitionSrcValueInputWrapper,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
@@ -34,27 +35,94 @@ struct call_e_op_t {
   EdgePartitionDstValueInputWrapper const& edge_partition_dst_value_input{};
   EdgePartitionEdgeValueInputWrapper const& edge_partition_e_value_input{};
   EdgeOp const& e_op{};
-  typename GraphViewType::vertex_type major{};
+  key_t key{};
   typename GraphViewType::vertex_type major_offset{};
-  typename GraphViewType::vertex_type const* indices{nullptr};
+  typename GraphViewType::vertex_type const* indices{
+    nullptr};  // indices = edge_partition.incies() + edge_offset
   typename GraphViewType::edge_type edge_offset{};
 
-  __device__ auto operator()(typename GraphViewType::edge_type i) const
+  __device__ auto operator()(
+    typename GraphViewType::edge_type i /* index in key's neighbor list */) const
   {
     auto minor        = indices[i];
     auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-    auto src          = GraphViewType::is_storage_transposed ? minor : major;
-    auto dst          = GraphViewType::is_storage_transposed ? major : minor;
-    auto src_offset   = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
-    auto dst_offset   = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
-    return e_op(src,
-                dst,
+    std::conditional_t<GraphViewType::is_storage_transposed,
+                       typename GraphViewType::vertex_type,
+                       key_t>
+      key_or_src{};
+    std::conditional_t<GraphViewType::is_storage_transposed,
+                       key_t,
+                       typename GraphViewType::vertex_type>
+      key_or_dst{};
+    if constexpr (GraphViewType::is_storage_transposed) {
+      key_or_src = minor;
+      key_or_dst = key;
+    } else {
+      key_or_src = key;
+      key_or_dst = minor;
+    }
+    auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
+    auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
+    return e_op(key_or_src,
+                key_or_dst,
                 edge_partition_src_value_input.get(src_offset),
                 edge_partition_dst_value_input.get(dst_offset),
                 edge_partition_e_value_input.get(edge_offset + i));
   }
 };
 
+template <typename GraphViewType,
+          typename key_t,
+          typename EdgePartitionSrcValueInputWrapper,
+          typename EdgePartitionDstValueInputWrapper,
+          typename EdgePartitionEdgeValueInputWrapper,
+          typename EdgeOp>
+struct call_e_op_with_key_t {
+  edge_partition_device_view_t<typename GraphViewType::vertex_type,
+                               typename GraphViewType::edge_type,
+                               GraphViewType::is_multi_gpu> const& edge_partition{};
+  EdgePartitionSrcValueInputWrapper const& edge_partition_src_value_input{};
+  EdgePartitionDstValueInputWrapper const& edge_partition_dst_value_input{};
+  EdgePartitionEdgeValueInputWrapper const& edge_partition_e_value_input{};
+  EdgeOp const& e_op{};
+
+  __device__ auto operator()(
+    key_t key, typename GraphViewType::edge_type i /* index in edge_partition's edge list */) const
+  {
+    typename GraphViewType::vertex_type major{};
+    if constexpr (std::is_same_v<key_t, typename GraphViewType::vertex_type>) {
+      major = key;
+    } else {
+      major = thrust::get<0>(key);
+    }
+    auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
+    auto minor        = *(edge_partition.indices() + i);
+    auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
+    std::conditional_t<GraphViewType::is_storage_transposed,
+                       typename GraphViewType::vertex_type,
+                       key_t>
+      key_or_src{};
+    std::conditional_t<GraphViewType::is_storage_transposed,
+                       key_t,
+                       typename GraphViewType::vertex_type>
+      key_or_dst{};
+    if constexpr (GraphViewType::is_storage_transposed) {
+      key_or_src = minor;
+      key_or_dst = key;
+    } else {
+      key_or_src = key;
+      key_or_dst = minor;
+    }
+    auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
+    auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
+    return e_op(key_or_src,
+                key_or_dst,
+                edge_partition_src_value_input.get(src_offset),
+                edge_partition_dst_value_input.get(dst_offset),
+                edge_partition_e_value_input.get(i));
+  }
+};
+
 }  // namespace detail
 
 }  // namespace cugraph
diff --git a/cpp/src/prims/extract_transform_e.cuh b/cpp/src/prims/extract_transform_e.cuh
index f135b76d6e3..25e04fff83a 100644
--- a/cpp/src/prims/extract_transform_e.cuh
+++ b/cpp/src/prims/extract_transform_e.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -107,8 +107,6 @@ extract_transform_e(raft::handle_t const& handle,
   static_assert(!std::is_same_v<e_op_result_t, void>);
   using payload_t = typename e_op_result_t::value_type;
 
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   // FIXME: Consider updating detail::extract_transform_v_forntier_e to take std::nullopt to as a
   // frontier or create a new key bucket type that just stores [vertex_first, vertex_last) for
   // further optimization. Better revisit this once this becomes a performance bottleneck and after
diff --git a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh
index 42af8a1164d..f3b85da53ea 100644
--- a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh
+++ b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -98,8 +98,6 @@ extract_transform_v_frontier_outgoing_e(raft::handle_t const& handle,
   static_assert(!std::is_same_v<e_op_result_t, void>);
   using payload_t = typename e_op_result_t::value_type;
 
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   auto value_buffer = allocate_dataframe_buffer<payload_t>(size_t{0}, handle.get_stream());
   std::tie(std::ignore, value_buffer) =
     detail::extract_transform_v_frontier_e<false, void, payload_t>(handle,
diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh
index 86e23a1a04e..5a7fe6b5044 100644
--- a/cpp/src/prims/fill_edge_src_dst_property.cuh
+++ b/cpp/src/prims/fill_edge_src_dst_property.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -123,8 +123,6 @@ void fill_edge_src_property(raft::handle_t const& handle,
                             edge_src_property_t<GraphViewType, T>& edge_src_property_output,
                             bool do_expensive_check = false)
 {
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   if (do_expensive_check) {
     // currently, nothing to do
   }
@@ -161,8 +159,6 @@ void fill_edge_dst_property(raft::handle_t const& handle,
                             edge_dst_property_t<GraphViewType, T>& edge_dst_property_output,
                             bool do_expensive_check = false)
 {
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   if (do_expensive_check) {
     // currently, nothing to do
   }
diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
index 24b4f0857b1..c519d8aefed 100644
--- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
@@ -190,6 +190,7 @@ __global__ void per_v_transform_reduce_e_hypersparse(
       edge_partition.local_edges(static_cast<vertex_t>(major_idx));
 
     auto call_e_op = call_e_op_t<GraphViewType,
+                                 vertex_t,
                                  EdgePartitionSrcValueInputWrapper,
                                  EdgePartitionDstValueInputWrapper,
                                  EdgePartitionEdgeValueInputWrapper,
@@ -288,6 +289,7 @@ __global__ void per_v_transform_reduce_e_low_degree(
       edge_partition.local_edges(static_cast<vertex_t>(major_offset));
 
     auto call_e_op = call_e_op_t<GraphViewType,
+                                 vertex_t,
                                  EdgePartitionSrcValueInputWrapper,
                                  EdgePartitionDstValueInputWrapper,
                                  EdgePartitionEdgeValueInputWrapper,
@@ -393,6 +395,7 @@ __global__ void per_v_transform_reduce_e_mid_degree(
     thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset);
 
     auto call_e_op = call_e_op_t<GraphViewType,
+                                 vertex_t,
                                  EdgePartitionSrcValueInputWrapper,
                                  EdgePartitionDstValueInputWrapper,
                                  EdgePartitionEdgeValueInputWrapper,
@@ -504,6 +507,7 @@ __global__ void per_v_transform_reduce_e_high_degree(
     thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset);
 
     auto call_e_op = call_e_op_t<GraphViewType,
+                                 vertex_t,
                                  EdgePartitionSrcValueInputWrapper,
                                  EdgePartitionDstValueInputWrapper,
                                  EdgePartitionEdgeValueInputWrapper,
diff --git a/cpp/src/prims/transform_e.cuh b/cpp/src/prims/transform_e.cuh
index 5c83e0f1b71..c4b4b8d3745 100644
--- a/cpp/src/prims/transform_e.cuh
+++ b/cpp/src/prims/transform_e.cuh
@@ -107,8 +107,7 @@ __global__ void transform_e_packed_bool(
                             : int{0};
       }
     }
-
-    uint32_t new_val = __ballot_sync(uint32_t{0xffffffff}, predicate);
+    uint32_t new_val = __ballot_sync(raft::warp_full_mask(), predicate);
     if (lane_id == 0) {
       if constexpr (check_edge_mask) {
         *(edge_partition_e_value_output.value_first() + idx) = new_val;
diff --git a/cpp/src/prims/transform_reduce_e.cuh b/cpp/src/prims/transform_reduce_e.cuh
index 7acc7461268..109d10a8b1c 100644
--- a/cpp/src/prims/transform_reduce_e.cuh
+++ b/cpp/src/prims/transform_reduce_e.cuh
@@ -99,6 +99,7 @@ __global__ void transform_reduce_e_hypersparse(
     thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_idx);
 
     auto call_e_op = call_e_op_t<GraphViewType,
+                                 vertex_t,
                                  EdgePartitionSrcValueInputWrapper,
                                  EdgePartitionDstValueInputWrapper,
                                  EdgePartitionEdgeValueInputWrapper,
@@ -187,6 +188,7 @@ __global__ void transform_reduce_e_low_degree(
     thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset);
 
     auto call_e_op = call_e_op_t<GraphViewType,
+                                 vertex_t,
                                  EdgePartitionSrcValueInputWrapper,
                                  EdgePartitionDstValueInputWrapper,
                                  EdgePartitionEdgeValueInputWrapper,
@@ -276,6 +278,7 @@ __global__ void transform_reduce_e_mid_degree(
     thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset);
 
     auto call_e_op = call_e_op_t<GraphViewType,
+                                 vertex_t,
                                  EdgePartitionSrcValueInputWrapper,
                                  EdgePartitionDstValueInputWrapper,
                                  EdgePartitionEdgeValueInputWrapper,
@@ -350,6 +353,7 @@ __global__ void transform_reduce_e_high_degree(
     thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset);
 
     auto call_e_op = call_e_op_t<GraphViewType,
+                                 vertex_t,
                                  EdgePartitionSrcValueInputWrapper,
                                  EdgePartitionDstValueInputWrapper,
                                  EdgePartitionEdgeValueInputWrapper,
diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh
index 18e722d62cc..0315b3808e0 100644
--- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh
+++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh
@@ -189,8 +189,6 @@ size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle,
   using edge_t   = typename GraphViewType::edge_type;
   using key_t    = typename VertexFrontierBucketType::key_type;
 
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   size_t ret{0};
 
   vertex_t const* local_frontier_vertex_first{nullptr};
@@ -207,10 +205,19 @@ size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle,
   } else {
     local_frontier_sizes = std::vector<size_t>{static_cast<size_t>(frontier.size())};
   }
+
+  auto edge_mask_view = graph_view.edge_mask_view();
+
   for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
     auto edge_partition =
       edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
         graph_view.local_edge_partition_view(i));
+    auto edge_partition_e_mask =
+      edge_mask_view
+        ? thrust::make_optional<
+            detail::edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool>>(
+            *edge_mask_view, i)
+        : thrust::nullopt;
 
     if constexpr (GraphViewType::is_multi_gpu) {
       auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
@@ -225,14 +232,30 @@ size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle,
                    static_cast<int>(i),
                    handle.get_stream());
 
-      ret += edge_partition.compute_number_of_edges(edge_partition_frontier_vertices.begin(),
-                                                    edge_partition_frontier_vertices.end(),
-                                                    handle.get_stream());
+      if (edge_partition_e_mask) {
+        ret +=
+          edge_partition.compute_number_of_edges_with_mask((*edge_partition_e_mask).value_first(),
+                                                           edge_partition_frontier_vertices.begin(),
+                                                           edge_partition_frontier_vertices.end(),
+                                                           handle.get_stream());
+      } else {
+        ret += edge_partition.compute_number_of_edges(edge_partition_frontier_vertices.begin(),
+                                                      edge_partition_frontier_vertices.end(),
+                                                      handle.get_stream());
+      }
     } else {
       assert(i == 0);
-      ret += edge_partition.compute_number_of_edges(local_frontier_vertex_first,
-                                                    local_frontier_vertex_first + frontier.size(),
-                                                    handle.get_stream());
+      if (edge_partition_e_mask) {
+        ret += edge_partition.compute_number_of_edges_with_mask(
+          (*edge_partition_e_mask).value_first(),
+          local_frontier_vertex_first,
+          local_frontier_vertex_first + frontier.size(),
+          handle.get_stream());
+      } else {
+        ret += edge_partition.compute_number_of_edges(local_frontier_vertex_first,
+                                                      local_frontier_vertex_first + frontier.size(),
+                                                      handle.get_stream());
+      }
     }
   }
 
@@ -323,8 +346,6 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle,
   using key_t     = typename VertexFrontierBucketType::key_type;
   using payload_t = typename ReduceOp::value_type;
 
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   if (do_expensive_check) {
     // currently, nothing to do
   }
diff --git a/cpp/src/sampling/detail/gather_one_hop_edgelist_impl.cuh b/cpp/src/sampling/detail/gather_one_hop_edgelist_impl.cuh
index 74267d02b38..cac648079b0 100644
--- a/cpp/src/sampling/detail/gather_one_hop_edgelist_impl.cuh
+++ b/cpp/src/sampling/detail/gather_one_hop_edgelist_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -42,7 +42,7 @@ struct return_edges_with_properties_e_op {
                                       vertex_t dst,
                                       thrust::nullopt_t,
                                       thrust::nullopt_t,
-                                      EdgeProperties edge_properties)
+                                      EdgeProperties edge_properties) const
   {
     static_assert(std::is_same_v<key_t, vertex_t> ||
                   std::is_same_v<key_t, thrust::tuple<vertex_t, int32_t>>);
diff --git a/cpp/src/structure/graph_weight_utils_impl.cuh b/cpp/src/structure/graph_weight_utils_impl.cuh
index 1e386792b21..173b4df207b 100644
--- a/cpp/src/structure/graph_weight_utils_impl.cuh
+++ b/cpp/src/structure/graph_weight_utils_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -89,8 +89,6 @@ rmm::device_uvector<weight_t> compute_in_weight_sums(
   graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
   edge_property_view_t<edge_t, weight_t const*> edge_weight_view)
 {
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   if (store_transposed) {
     return compute_weight_sums<true>(handle, graph_view, edge_weight_view);
   } else {
@@ -108,8 +106,6 @@ rmm::device_uvector<weight_t> compute_out_weight_sums(
   graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
   edge_property_view_t<edge_t, weight_t const*> edge_weight_view)
 {
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   if (store_transposed) {
     return compute_weight_sums<false>(handle, graph_view, edge_weight_view);
   } else {
@@ -127,8 +123,6 @@ weight_t compute_max_in_weight_sum(
   graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
   edge_property_view_t<edge_t, weight_t const*> edge_weight_view)
 {
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   auto in_weight_sums = compute_in_weight_sums(handle, graph_view, edge_weight_view);
   auto it =
     thrust::max_element(handle.get_thrust_policy(), in_weight_sums.begin(), in_weight_sums.end());
@@ -153,8 +147,6 @@ weight_t compute_max_out_weight_sum(
   graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
   edge_property_view_t<edge_t, weight_t const*> edge_weight_view)
 {
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   auto out_weight_sums = compute_out_weight_sums(handle, graph_view, edge_weight_view);
   auto it =
     thrust::max_element(handle.get_thrust_policy(), out_weight_sums.begin(), out_weight_sums.end());
@@ -179,8 +171,6 @@ weight_t compute_total_edge_weight(
   graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
   edge_property_view_t<edge_t, weight_t const*> edge_weight_view)
 {
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   return transform_reduce_e(
     handle,
     graph_view,
diff --git a/cpp/src/structure/induced_subgraph_impl.cuh b/cpp/src/structure/induced_subgraph_impl.cuh
index 18e1af32a71..fa4c5e0f6f2 100644
--- a/cpp/src/structure/induced_subgraph_impl.cuh
+++ b/cpp/src/structure/induced_subgraph_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -71,7 +71,7 @@ struct induced_subgraph_weighted_edge_op {
                                     vertex_t dst,
                                     property_t sv,
                                     property_t dv,
-                                    weight_t wgt)
+                                    weight_t wgt) const
   {
     size_t subgraph = thrust::get<1>(tagged_src);
     return thrust::binary_search(thrust::seq,
@@ -95,7 +95,7 @@ struct induced_subgraph_unweighted_edge_op {
                                     vertex_t dst,
                                     property_t sv,
                                     property_t dv,
-                                    thrust::nullopt_t)
+                                    thrust::nullopt_t) const
   {
     size_t subgraph = thrust::get<1>(tagged_src);
     return thrust::binary_search(thrust::seq,
diff --git a/cpp/tests/prims/mg_extract_transform_e.cu b/cpp/tests/prims/mg_extract_transform_e.cu
index bca6471a5bb..caa00e13640 100644
--- a/cpp/tests/prims/mg_extract_transform_e.cu
+++ b/cpp/tests/prims/mg_extract_transform_e.cu
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -116,6 +116,7 @@ struct e_op_t {
 };
 
 struct Prims_Usecase {
+  bool edge_masking{false};
   bool check_correctness{true};
 };
 
@@ -180,6 +181,13 @@ class Tests_MGExtractTransformE
 
     auto mg_graph_view = mg_graph.view();
 
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
+    if (prims_usecase.edge_masking) {
+      edge_mask =
+        cugraph::test::generate<vertex_t, bool>::edge_property(*handle_, mg_graph_view, 2);
+      mg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
     // 2. run MG extract_transform_e
 
     const int hash_bin_count = 5;
@@ -400,7 +408,7 @@ INSTANTIATE_TEST_SUITE_P(
   file_test,
   Tests_MGExtractTransformE_File,
   ::testing::Combine(
-    ::testing::Values(Prims_Usecase{true}),
+    ::testing::Values(Prims_Usecase{false, true}, Prims_Usecase{true, true}),
     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
                       cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
                       cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
@@ -408,7 +416,8 @@ INSTANTIATE_TEST_SUITE_P(
 
 INSTANTIATE_TEST_SUITE_P(rmat_small_test,
                          Tests_MGExtractTransformE_Rmat,
-                         ::testing::Combine(::testing::Values(Prims_Usecase{true}),
+                         ::testing::Combine(::testing::Values(Prims_Usecase{false, true},
+                                                              Prims_Usecase{true, true}),
                                             ::testing::Values(cugraph::test::Rmat_Usecase(
                                               10, 16, 0.57, 0.19, 0.19, 0, false, false))));
 
@@ -420,7 +429,7 @@ INSTANTIATE_TEST_SUITE_P(
                           factor (to avoid running same benchmarks more than once) */
   Tests_MGExtractTransformE_Rmat,
   ::testing::Combine(
-    ::testing::Values(Prims_Usecase{false}),
+    ::testing::Values(Prims_Usecase{false, false}, Prims_Usecase{true, false}),
     ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
 
 CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu b/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu
index 4d9435dd344..09c2fc0b2cb 100644
--- a/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu
+++ b/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -115,6 +115,7 @@ struct e_op_t {
 };
 
 struct Prims_Usecase {
+  bool edge_masking{false};
   bool check_correctness{true};
 };
 
@@ -180,6 +181,13 @@ class Tests_MGExtractTransformVFrontierOutgoingE
 
     auto mg_graph_view = mg_graph.view();
 
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
+    if (prims_usecase.edge_masking) {
+      edge_mask =
+        cugraph::test::generate<vertex_t, bool>::edge_property(*handle_, mg_graph_view, 2);
+      mg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
     // 2. run MG extract_transform_v_frontier_outgoing_e
 
     const int hash_bin_count = 5;
@@ -458,7 +466,7 @@ INSTANTIATE_TEST_SUITE_P(
   file_test,
   Tests_MGExtractTransformVFrontierOutgoingE_File,
   ::testing::Combine(
-    ::testing::Values(Prims_Usecase{true}),
+    ::testing::Values(Prims_Usecase{false, true}, Prims_Usecase{true, true}),
     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
                       cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
                       cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
@@ -466,7 +474,8 @@ INSTANTIATE_TEST_SUITE_P(
 
 INSTANTIATE_TEST_SUITE_P(rmat_small_test,
                          Tests_MGExtractTransformVFrontierOutgoingE_Rmat,
-                         ::testing::Combine(::testing::Values(Prims_Usecase{true}),
+                         ::testing::Combine(::testing::Values(Prims_Usecase{false, true},
+                                                              Prims_Usecase{true, true}),
                                             ::testing::Values(cugraph::test::Rmat_Usecase(
                                               10, 16, 0.57, 0.19, 0.19, 0, false, false))));
 
@@ -478,7 +487,7 @@ INSTANTIATE_TEST_SUITE_P(
                           factor (to avoid running same benchmarks more than once) */
   Tests_MGExtractTransformVFrontierOutgoingE_Rmat,
   ::testing::Combine(
-    ::testing::Values(Prims_Usecase{false}),
+    ::testing::Values(Prims_Usecase{false, false}, Prims_Usecase{true, false}),
     ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
 
 CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu
index 4d05b0c9e65..3e59bf3bf20 100644
--- a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu
+++ b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "property_generator.cuh"
+
 #include <utilities/base_fixture.hpp>
 #include <utilities/device_comm_wrapper.hpp>
 #include <utilities/mg_utilities.hpp>
@@ -84,6 +86,7 @@ struct intersection_op_t {
 
 struct Prims_Usecase {
   size_t num_vertex_pairs{0};
+  bool edge_masking{false};
   bool check_correctness{true};
 };
 
@@ -109,6 +112,13 @@ class Tests_MGPerVPairTransformDstNbrIntersection
     auto const comm_rank = handle_->get_comms().get_rank();
     auto const comm_size = handle_->get_comms().get_size();
 
+    constexpr bool store_transposed = false;
+
+    constexpr bool test_weighted    = true;
+    constexpr bool renumber         = true;
+    constexpr bool drop_self_loops  = false;
+    constexpr bool drop_multi_edges = true;
+
     // 1. create MG graph
 
     if (cugraph::test::g_perf) {
@@ -117,29 +127,10 @@ class Tests_MGPerVPairTransformDstNbrIntersection
       hr_timer.start("MG Construct graph");
     }
 
-    constexpr bool store_transposed = false;
-    constexpr bool multi_gpu        = true;
-
-    cugraph::graph_t<vertex_t, edge_t, false, true> mg_graph(*handle_);
-    std::optional<
-      cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu>,
-                               weight_t>>
-      mg_edge_weight{std::nullopt};
-
-    std::optional<rmm::device_uvector<vertex_t>> mg_renumber_map{std::nullopt};
-
-    constexpr bool test_weighted    = true;
-    constexpr bool renumber         = true;
-    constexpr bool drop_self_loops  = false;
-    constexpr bool drop_multi_edges = true;
-
-    std::tie(mg_graph, mg_edge_weight, mg_renumber_map) =
-      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
+    auto [mg_graph, mg_edge_weight, mg_renumber_map] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, store_transposed, true>(
         *handle_, input_usecase, test_weighted, renumber, drop_self_loops, drop_multi_edges);
 
-    auto mg_graph_view       = mg_graph.view();
-    auto mg_edge_weight_view = (*mg_edge_weight).view();
-
     if (cugraph::test::g_perf) {
       RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
       handle_->get_comms().barrier();
@@ -147,6 +138,16 @@ class Tests_MGPerVPairTransformDstNbrIntersection
       hr_timer.display_and_clear(std::cout);
     }
 
+    auto mg_graph_view       = mg_graph.view();
+    auto mg_edge_weight_view = (*mg_edge_weight).view();
+
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
+    if (prims_usecase.edge_masking) {
+      edge_mask =
+        cugraph::test::generate<vertex_t, bool>::edge_property(*handle_, mg_graph_view, 2);
+      mg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
     // 2. run MG per_v_pair_transform_dst_nbr_intersection primitive
 
     ASSERT_TRUE(
@@ -355,15 +356,18 @@ INSTANTIATE_TEST_SUITE_P(
   file_test,
   Tests_MGPerVPairTransformDstNbrIntersection_File,
   ::testing::Combine(
-    ::testing::Values(Prims_Usecase{size_t{10}, true}),
+    ::testing::Values(Prims_Usecase{size_t{10}, false, true},
+                      Prims_Usecase{size_t{10}, true, true}),
     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
                       cugraph::test::File_Usecase("test/datasets/netscience.mtx"))));
 
-INSTANTIATE_TEST_SUITE_P(rmat_small_test,
-                         Tests_MGPerVPairTransformDstNbrIntersection_Rmat,
-                         ::testing::Combine(::testing::Values(Prims_Usecase{size_t{1024}, true}),
-                                            ::testing::Values(cugraph::test::Rmat_Usecase(
-                                              10, 16, 0.57, 0.19, 0.19, 0, false, false))));
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_MGPerVPairTransformDstNbrIntersection_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Prims_Usecase{size_t{1024}, false, true},
+                      Prims_Usecase{size_t{1024}, true, true}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false))));
 
 INSTANTIATE_TEST_SUITE_P(
   rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
@@ -373,7 +377,8 @@ INSTANTIATE_TEST_SUITE_P(
                           factor (to avoid running same benchmarks more than once) */
   Tests_MGPerVPairTransformDstNbrIntersection_Rmat,
   ::testing::Combine(
-    ::testing::Values(Prims_Usecase{size_t{1024 * 1024}, false}),
+    ::testing::Values(Prims_Usecase{size_t{1024 * 1024}, false, false},
+                      Prims_Usecase{size_t{1024 * 1024}, true, false}),
     ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
 
 CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu
index d0b97065da7..7d1b2dd9412 100644
--- a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu
+++ b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -88,6 +88,7 @@ struct e_op_t {
 };
 
 struct Prims_Usecase {
+  bool edge_masking{false};
   bool check_correctness{true};
 };
 
@@ -152,6 +153,13 @@ class Tests_MGTransformReduceVFrontierOutgoingEByDst
 
     auto mg_graph_view = mg_graph.view();
 
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
+    if (prims_usecase.edge_masking) {
+      edge_mask =
+        cugraph::test::generate<vertex_t, bool>::edge_property(*handle_, mg_graph_view, 2);
+      mg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
     // 2. run MG transform reduce
 
     const int hash_bin_count = 5;
@@ -533,7 +541,7 @@ INSTANTIATE_TEST_SUITE_P(
   file_test,
   Tests_MGTransformReduceVFrontierOutgoingEByDst_File,
   ::testing::Combine(
-    ::testing::Values(Prims_Usecase{true}),
+    ::testing::Values(Prims_Usecase{false, true}, Prims_Usecase{true, true}),
     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
                       cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
                       cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
@@ -541,7 +549,8 @@ INSTANTIATE_TEST_SUITE_P(
 
 INSTANTIATE_TEST_SUITE_P(rmat_small_test,
                          Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat,
-                         ::testing::Combine(::testing::Values(Prims_Usecase{true}),
+                         ::testing::Combine(::testing::Values(Prims_Usecase{false, true},
+                                                              Prims_Usecase{true, true}),
                                             ::testing::Values(cugraph::test::Rmat_Usecase(
                                               10, 16, 0.57, 0.19, 0.19, 0, false, false))));
 
@@ -553,7 +562,7 @@ INSTANTIATE_TEST_SUITE_P(
                           factor (to avoid running same benchmarks more than once) */
   Tests_MGTransformReduceVFrontierOutgoingEByDst_Rmat,
   ::testing::Combine(
-    ::testing::Values(Prims_Usecase{false}),
+    ::testing::Values(Prims_Usecase{false, false}, Prims_Usecase{true, false}),
     ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
 
 CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/structure/weight_sum_test.cpp b/cpp/tests/structure/weight_sum_test.cpp
index a61c0d4eeb4..30de0092a5a 100644
--- a/cpp/tests/structure/weight_sum_test.cpp
+++ b/cpp/tests/structure/weight_sum_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cugraph/graph.hpp>
 #include <cugraph/graph_functions.hpp>
 #include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
 
 #include <raft/core/handle.hpp>
 #include <raft/util/cudart_utils.hpp>
@@ -46,8 +47,7 @@ void weight_sum_reference(edge_t const* offsets,
   if (!major) { std::fill(weight_sums, weight_sums + num_vertices, weight_t{0.0}); }
   for (vertex_t i = 0; i < num_vertices; ++i) {
     if (major) {
-      weight_sums[i] =
-        std::accumulate(weights + offsets[i], weights + offsets[i + 1], weight_t{0.0});
+      weight_sums[i] = std::reduce(weights + offsets[i], weights + offsets[i + 1], weight_t{0.0});
     } else {
       for (auto j = offsets[i]; j < offsets[i + 1]; ++j) {
         auto nbr = indices[j];
@@ -60,19 +60,13 @@ void weight_sum_reference(edge_t const* offsets,
 }
 
 typedef struct WeightSum_Usecase_t {
-  std::string graph_file_full_path{};
-
-  WeightSum_Usecase_t(std::string const& graph_file_path)
-  {
-    if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) {
-      graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path;
-    } else {
-      graph_file_full_path = graph_file_path;
-    }
-  };
+  bool edge_masking{false};
+  bool check_correctness{true};
 } WeightSum_Usecase;
 
-class Tests_WeightSum : public ::testing::TestWithParam<WeightSum_Usecase> {
+template <typename input_usecase_t>
+class Tests_WeightSum
+  : public ::testing::TestWithParam<std::tuple<WeightSum_Usecase, input_usecase_t>> {
  public:
   Tests_WeightSum() {}
 
@@ -83,96 +77,189 @@ class Tests_WeightSum : public ::testing::TestWithParam<WeightSum_Usecase> {
   virtual void TearDown() {}
 
   template <typename vertex_t, typename edge_t, typename weight_t, bool store_transposed>
-  void run_current_test(WeightSum_Usecase const& configuration)
+  void run_current_test(WeightSum_Usecase const& weight_sum_usecase,
+                        input_usecase_t const& input_usecase)
   {
+    constexpr bool renumber      = true;
+    constexpr bool test_weighted = true;
+
     raft::handle_t handle{};
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Construct graph");
+    }
+
+    auto [graph, edge_weights, d_renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, store_transposed, false>(
+        handle, input_usecase, test_weighted, renumber);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
 
-    cugraph::graph_t<vertex_t, edge_t, store_transposed, false> graph(handle);
-    std::optional<
-      cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, store_transposed, false>,
-                               weight_t>>
-      edge_weights{std::nullopt};
-    std::tie(graph, edge_weights, std::ignore) = cugraph::test::
-      read_graph_from_matrix_market_file<vertex_t, edge_t, weight_t, store_transposed, false>(
-        handle, configuration.graph_file_full_path, true, false);
     auto graph_view = graph.view();
     auto edge_weight_view =
       edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt;
 
-    auto h_offsets =
-      cugraph::test::to_host(handle, graph_view.local_edge_partition_view().offsets());
-    auto h_indices =
-      cugraph::test::to_host(handle, graph_view.local_edge_partition_view().indices());
-    auto h_weights = cugraph::test::to_host(
-      handle,
-      raft::device_span<weight_t const>((*edge_weight_view).value_firsts()[0],
-                                        (*edge_weight_view).edge_counts()[0]));
-
-    std::vector<weight_t> h_reference_in_weight_sums(graph_view.number_of_vertices());
-    std::vector<weight_t> h_reference_out_weight_sums(graph_view.number_of_vertices());
-
-    weight_sum_reference(h_offsets.data(),
-                         h_indices.data(),
-                         h_weights.data(),
-                         h_reference_in_weight_sums.data(),
-                         graph_view.number_of_vertices(),
-                         store_transposed);
-
-    weight_sum_reference(h_offsets.data(),
-                         h_indices.data(),
-                         h_weights.data(),
-                         h_reference_out_weight_sums.data(),
-                         graph_view.number_of_vertices(),
-                         !store_transposed);
-
-    RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Compute in-weight sums");
+    }
 
     auto d_in_weight_sums = cugraph::compute_in_weight_sums(handle, graph_view, *edge_weight_view);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Compute out-weight sums");
+    }
+
     auto d_out_weight_sums =
       cugraph::compute_out_weight_sums(handle, graph_view, *edge_weight_view);
 
-    RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
-
-    auto h_cugraph_in_weight_sums  = cugraph::test::to_host(handle, d_in_weight_sums);
-    auto h_cugraph_out_weight_sums = cugraph::test::to_host(handle, d_out_weight_sums);
-
-    auto threshold_ratio     = weight_t{1e-4};
-    auto threshold_magnitude = std::numeric_limits<weight_t>::min();
-    auto nearly_equal        = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) {
-      return std::abs(lhs - rhs) <
-             std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
-    };
-
-    ASSERT_TRUE(std::equal(h_reference_in_weight_sums.begin(),
-                           h_reference_in_weight_sums.end(),
-                           h_cugraph_in_weight_sums.begin(),
-                           nearly_equal))
-      << "In-weight-sum values do not match with the reference values.";
-    ASSERT_TRUE(std::equal(h_reference_out_weight_sums.begin(),
-                           h_reference_out_weight_sums.end(),
-                           h_cugraph_out_weight_sums.begin(),
-                           nearly_equal))
-      << "Out-weight-sum values do not match with the reference values.";
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (weight_sum_usecase.check_correctness) {
+      auto [h_offsets, h_indices, h_weights] =
+        cugraph::test::graph_to_host_csr(handle, graph_view, edge_weight_view);
+
+      std::vector<weight_t> h_reference_in_weight_sums(graph_view.number_of_vertices());
+      std::vector<weight_t> h_reference_out_weight_sums(graph_view.number_of_vertices());
+
+      weight_sum_reference(h_offsets.data(),
+                           h_indices.data(),
+                           (*h_weights).data(),
+                           h_reference_in_weight_sums.data(),
+                           graph_view.number_of_vertices(),
+                           store_transposed);
+
+      weight_sum_reference(h_offsets.data(),
+                           h_indices.data(),
+                           (*h_weights).data(),
+                           h_reference_out_weight_sums.data(),
+                           graph_view.number_of_vertices(),
+                           !store_transposed);
+
+      auto h_cugraph_in_weight_sums  = cugraph::test::to_host(handle, d_in_weight_sums);
+      auto h_cugraph_out_weight_sums = cugraph::test::to_host(handle, d_out_weight_sums);
+
+      auto threshold_ratio     = weight_t{2.0 * 1e-4};
+      auto threshold_magnitude = std::numeric_limits<weight_t>::min();
+      auto nearly_equal        = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) {
+        auto ret =
+          std::abs(lhs - rhs) < std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
+        return std::abs(lhs - rhs) <
+               std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
+      };
+
+      ASSERT_TRUE(std::equal(h_reference_in_weight_sums.begin(),
+                             h_reference_in_weight_sums.end(),
+                             h_cugraph_in_weight_sums.begin(),
+                             nearly_equal))
+        << "In-weight-sum values do not match with the reference values.";
+
+      ASSERT_TRUE(std::equal(h_reference_out_weight_sums.begin(),
+                             h_reference_out_weight_sums.end(),
+                             h_cugraph_out_weight_sums.begin(),
+                             nearly_equal))
+        << "Out-weight-sum values do not match with the reference values.";
+    }
   }
 };
 
-// FIXME: add tests for type combinations
+using Tests_WeightSum_File = Tests_WeightSum<cugraph::test::File_Usecase>;
+using Tests_WeightSum_Rmat = Tests_WeightSum<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_WeightSum_File, CheckInt32Int32FloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, false>(std::get<0>(param), std::get<1>(param));
+}
+
+TEST_P(Tests_WeightSum_File, CheckInt32Int32FloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, true>(std::get<0>(param), std::get<1>(param));
+}
+
+TEST_P(Tests_WeightSum_Rmat, CheckInt32Int32FloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, false>(std::get<0>(param), std::get<1>(param));
+}
 
-TEST_P(Tests_WeightSum, CheckInt32Int32FloatTransposeFalse)
+TEST_P(Tests_WeightSum_Rmat, CheckInt32Int32FloatTransposeTrue)
 {
-  run_current_test<int32_t, int32_t, float, false>(GetParam());
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, true>(std::get<0>(param), std::get<1>(param));
 }
 
-TEST_P(Tests_WeightSum, CheckInt32Int32FloatTransposeTrue)
+TEST_P(Tests_WeightSum_Rmat, CheckInt32Int64FloatTransposeFalse)
 {
-  run_current_test<int32_t, int32_t, float, true>(GetParam());
+  auto param = GetParam();
+  run_current_test<int32_t, int64_t, float, false>(std::get<0>(param), std::get<1>(param));
 }
 
-INSTANTIATE_TEST_SUITE_P(simple_test,
-                         Tests_WeightSum,
-                         ::testing::Values(WeightSum_Usecase("test/datasets/karate.mtx"),
-                                           WeightSum_Usecase("test/datasets/web-Google.mtx"),
-                                           WeightSum_Usecase("test/datasets/ljournal-2008.mtx"),
-                                           WeightSum_Usecase("test/datasets/webbase-1M.mtx")));
+TEST_P(Tests_WeightSum_Rmat, CheckInt32Int64FloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int64_t, float, true>(std::get<0>(param), std::get<1>(param));
+}
+
+TEST_P(Tests_WeightSum_Rmat, CheckInt64Int64FloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int64_t, int64_t, float, false>(std::get<0>(param), std::get<1>(param));
+}
+
+TEST_P(Tests_WeightSum_Rmat, CheckInt64Int64FloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int64_t, int64_t, float, true>(std::get<0>(param), std::get<1>(param));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_WeightSum_File,
+  ::testing::Combine(
+    // enable correctness checks
+    ::testing::Values(WeightSum_Usecase{false}, WeightSum_Usecase{true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_WeightSum_Rmat,
+  ::testing::Combine(
+    // enable correctness checks
+    ::testing::Values(WeightSum_Usecase{false}, WeightSum_Usecase{true}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_WeightSum_Rmat,
+  ::testing::Combine(
+    // disable correctness checks for large graphs
+    ::testing::Values(WeightSum_Usecase{false, false}, WeightSum_Usecase{true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
 
 CUGRAPH_TEST_PROGRAM_MAIN()