From ff76a385c4048ac75d1408c3bcf9d4e3018a88b0 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com>
Date: Thu, 25 Jan 2024 13:13:58 -0800
Subject: [PATCH] Update per_v_transform_reduce_incoming|outgoing_e to support
 edge masking (#4085)

per_v_transform_reduce_(incoming|outgoing)_e now supports edge masking.

Authors:
  - Seunghwa Kang (https://github.com/seunghwak)

Approvers:
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Naim (https://github.com/naimnv)
  - Joseph Nke (https://github.com/jnke2016)

URL: https://github.com/rapidsai/cugraph/pull/4085
---
 .../eigenvector_centrality_impl.cuh           |   4 +-
 cpp/src/link_analysis/pagerank_impl.cuh       |   4 +-
 cpp/src/prims/detail/prim_functors.cuh        |  60 +++
 ...v_transform_reduce_incoming_outgoing_e.cuh | 442 +++++++++++-------
 ...rm_reduce_v_frontier_outgoing_e_by_dst.cuh |  18 +-
 .../traversal/od_shortest_distances_impl.cuh  |  17 +-
 ..._v_transform_reduce_incoming_outgoing_e.cu |  27 +-
 7 files changed, 381 insertions(+), 191 deletions(-)
 create mode 100644 cpp/src/prims/detail/prim_functors.cuh

diff --git a/cpp/src/centrality/eigenvector_centrality_impl.cuh b/cpp/src/centrality/eigenvector_centrality_impl.cuh
index 8d1bea4004d..2129dca6985 100644
--- a/cpp/src/centrality/eigenvector_centrality_impl.cuh
+++ b/cpp/src/centrality/eigenvector_centrality_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -117,7 +117,7 @@ rmm::device_uvector<weight_t> eigenvector_centrality(
       edge_src_centralities.view(),
       edge_dst_dummy_property_t{}.view(),
       edge_dummy_property_t{}.view(),
-      [] __device__(vertex_t, vertex_t, auto src_val, auto, auto) { return src_val * 1.0; },
+      [] __device__(vertex_t, vertex_t, auto src_val, auto, auto) { return src_val; },
       weight_t{0},
       reduce_op::plus<weight_t>{},
       centralities.begin());
diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh
index 92c70fcff20..9a76ba73f92 100644
--- a/cpp/src/link_analysis/pagerank_impl.cuh
+++ b/cpp/src/link_analysis/pagerank_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -288,7 +288,7 @@ centrality_algorithm_metadata_t pagerank(
         edge_dst_dummy_property_t{}.view(),
         edge_dummy_property_t{}.view(),
         [alpha] __device__(vertex_t, vertex_t, auto src_val, auto, auto) {
-          return src_val * 1.0 * alpha;
+          return src_val * alpha;
         },
         unvarying_part,
         reduce_op::plus<result_t>{},
diff --git a/cpp/src/prims/detail/prim_functors.cuh b/cpp/src/prims/detail/prim_functors.cuh
new file mode 100644
index 00000000000..2785ba38dfd
--- /dev/null
+++ b/cpp/src/prims/detail/prim_functors.cuh
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cugraph/edge_partition_device_view.cuh>
+
+namespace cugraph {
+
+namespace detail {
+
+template <typename GraphViewType,
+          typename EdgePartitionSrcValueInputWrapper,
+          typename EdgePartitionDstValueInputWrapper,
+          typename EdgePartitionEdgeValueInputWrapper,
+          typename EdgeOp>
+struct call_e_op_t {
+  edge_partition_device_view_t<typename GraphViewType::vertex_type,
+                               typename GraphViewType::edge_type,
+                               GraphViewType::is_multi_gpu> const& edge_partition{};
+  EdgePartitionSrcValueInputWrapper const& edge_partition_src_value_input{};
+  EdgePartitionDstValueInputWrapper const& edge_partition_dst_value_input{};
+  EdgePartitionEdgeValueInputWrapper const& edge_partition_e_value_input{};
+  EdgeOp const& e_op{};
+  typename GraphViewType::vertex_type major{};
+  typename GraphViewType::vertex_type major_offset{};
+  typename GraphViewType::vertex_type const* indices{nullptr};
+  typename GraphViewType::edge_type edge_offset{};
+
+  __device__ auto operator()(typename GraphViewType::edge_type i) const
+  {
+    auto minor        = indices[i];
+    auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
+    auto src          = GraphViewType::is_storage_transposed ? minor : major;
+    auto dst          = GraphViewType::is_storage_transposed ? major : minor;
+    auto src_offset   = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
+    auto dst_offset   = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
+    return e_op(src,
+                dst,
+                edge_partition_src_value_input.get(src_offset),
+                edge_partition_dst_value_input.get(dst_offset),
+                edge_partition_e_value_input.get(edge_offset + i));
+  }
+};
+
+}  // namespace detail
+
+}  // namespace cugraph
diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
index 1a7fc0130c4..24b4f0857b1 100644
--- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 #pragma once
 
 #include 
+#include <prims/detail/prim_functors.cuh>
 #include 
 #include 
 #include 
@@ -63,11 +64,84 @@ namespace detail {
 
 int32_t constexpr per_v_transform_reduce_e_kernel_block_size = 512;
 
+template <typename vertex_t,
+          typename edge_t,
+          bool multi_gpu,
+          typename result_t,
+          typename TransformOp,
+          typename ReduceOp,
+          typename ResultValueOutputIteratorOrWrapper>
+struct transform_and_atomic_reduce_t {
+  edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> const& edge_partition{};
+  result_t identity_element{};
+  vertex_t const* indices{nullptr};
+  TransformOp const& transform_op{};
+  ResultValueOutputIteratorOrWrapper& result_value_output{};
+
+  __device__ void operator()(edge_t i) const
+  {
+    auto e_op_result = transform_op(i);
+    if (e_op_result != identity_element) {
+      auto minor        = indices[i];
+      auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
+      if constexpr (multi_gpu) {
+        reduce_op::atomic_reduce<ReduceOp>(result_value_output, minor_offset, e_op_result);
+      } else {
+        reduce_op::atomic_reduce<ReduceOp>(result_value_output + minor_offset, e_op_result);
+      }
+    }
+  }
+};
+
+template <bool update_major,
+          typename vertex_t,
+          typename edge_t,
+          bool multi_gpu,
+          typename result_t,
+          typename TransformOp,
+          typename ReduceOp,
+          typename ResultValueOutputIteratorOrWrapper>
+__device__ void update_result_value_output(
+  edge_partition_device_view_t<vertex_t, edge_t, multi_gpu> const& edge_partition,
+  vertex_t const* indices,
+  edge_t local_degree,
+  TransformOp const& transform_op,
+  result_t init,
+  ReduceOp const& reduce_op,
+  size_t output_idx /* relevant only when update_major == true */,
+  result_t identity_element,
+  ResultValueOutputIteratorOrWrapper& result_value_output)
+{
+  if constexpr (update_major) {
+    *(result_value_output + output_idx) =
+      thrust::transform_reduce(thrust::seq,
+                               thrust::make_counting_iterator(edge_t{0}),
+                               thrust::make_counting_iterator(local_degree),
+                               transform_op,
+                               init,
+                               reduce_op);
+  } else {
+    thrust::for_each(
+      thrust::seq,
+      thrust::make_counting_iterator(edge_t{0}),
+      thrust::make_counting_iterator(local_degree),
+      transform_and_atomic_reduce_t<vertex_t,
+                                    edge_t,
+                                    multi_gpu,
+                                    result_t,
+                                    TransformOp,
+                                    ReduceOp,
+                                    ResultValueOutputIteratorOrWrapper>{
+        edge_partition, identity_element, indices, transform_op, result_value_output});
+  }
+}
+
 template <bool update_major,
           typename GraphViewType,
           typename EdgePartitionSrcValueInputWrapper,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
           typename ResultValueOutputIteratorOrWrapper,
           typename EdgeOp,
           typename ReduceOp,
           typename T>
 __global__ void per_v_transform_reduce_e_hypersparse(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
   EdgePartitionSrcValueInputWrapper edge_partition_src_value_input,
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
+  thrust::optional<EdgePartitionEdgeMaskWrapper> edge_partition_e_mask,
   ResultValueOutputIteratorOrWrapper result_value_output,
   EdgeOp e_op,
   T init /* relevant only if update_major == true */,
+  T identity_element /* relevant only if update_major == true */,
   ReduceOp reduce_op)
 {
   static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v<
@@ -104,6 +180,7 @@ __global__ void per_v_transform_reduce_e_hypersparse(
   while (idx < static_cast<size_t>(dcs_nzd_vertex_count)) {
     auto major =
       *(edge_partition.major_from_major_hypersparse_idx_nocheck(static_cast<vertex_t>(idx)));
+    auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
     auto major_idx =
       major_start_offset + idx;  // major_offset != major_idx in the hypersparse region
     vertex_t const* indices{nullptr};
@@ -111,60 +188,50 @@ __global__ void per_v_transform_reduce_e_hypersparse(
     edge_t edge_offset{};
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) =
       edge_partition.local_edges(static_cast<vertex_t>(major_idx));
-    auto transform_op = [&edge_partition,
-                         &edge_partition_src_value_input,
-                         &edge_partition_dst_value_input,
-                         &edge_partition_e_value_input,
-                         &e_op,
-                         major,
-                         indices,
-                         edge_offset] __device__(auto i) {
-      auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
-      auto minor        = indices[i];
-      auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-      auto src          = GraphViewType::is_storage_transposed ? minor : major;
-      auto dst          = GraphViewType::is_storage_transposed ? major : minor;
-      auto src_offset   = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
-      auto dst_offset   = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
-      return e_op(src,
-                  dst,
-                  edge_partition_src_value_input.get(src_offset),
-                  edge_partition_dst_value_input.get(dst_offset),
-                  edge_partition_e_value_input.get(edge_offset + i));
-    };
-
-    if constexpr (update_major) {
-      *(result_value_output + (major - *(edge_partition.major_hypersparse_first()))) =
-        thrust::transform_reduce(thrust::seq,
-                                 thrust::make_counting_iterator(edge_t{0}),
-                                 thrust::make_counting_iterator(local_degree),
-                                 transform_op,
-                                 init,
-                                 reduce_op);
+    auto call_e_op = call_e_op_t<GraphViewType,
+                                 EdgePartitionSrcValueInputWrapper,
+                                 EdgePartitionDstValueInputWrapper,
+                                 EdgePartitionEdgeValueInputWrapper,
+                                 EdgeOp>{edge_partition,
+                                         edge_partition_src_value_input,
+                                         edge_partition_dst_value_input,
+                                         edge_partition_e_value_input,
+                                         e_op,
+                                         major,
+                                         major_offset,
+                                         indices,
+                                         edge_offset};
+
+    if (edge_partition_e_mask) {
+      auto transform_op =
+        [&edge_partition_e_mask, &call_e_op, identity_element, edge_offset] __device__(auto i) {
+          if ((*edge_partition_e_mask).get(edge_offset + i)) {
+            return call_e_op(i);
+          } else {
+            return identity_element;
+          }
+        };
+
+      update_result_value_output<update_major>(edge_partition,
+                                               indices,
+                                               local_degree,
+                                               transform_op,
+                                               init,
+                                               reduce_op,
+                                               major - *(edge_partition).major_hypersparse_first(),
+                                               identity_element,
+                                               result_value_output);
     } else {
-      if constexpr (GraphViewType::is_multi_gpu) {
-        thrust::for_each(
-          thrust::seq,
-          thrust::make_counting_iterator(edge_t{0}),
-          thrust::make_counting_iterator(local_degree),
-          [&edge_partition, indices, &result_value_output, &transform_op] __device__(auto i) {
-            auto e_op_result  = transform_op(i);
-            auto minor        = indices[i];
-            auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-            reduce_op::atomic_reduce<ReduceOp>(result_value_output, minor_offset, e_op_result);
-          });
-      } else {
-        thrust::for_each(
-          thrust::seq,
-          thrust::make_counting_iterator(edge_t{0}),
-          thrust::make_counting_iterator(local_degree),
-          [&edge_partition, indices, &result_value_output, &transform_op] __device__(auto i) {
-            auto e_op_result  = transform_op(i);
-            auto minor        = indices[i];
-            auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-            reduce_op::atomic_reduce<ReduceOp>(result_value_output + minor_offset, e_op_result);
-          });
-      }
+      update_result_value_output<update_major>(edge_partition,
+                                               indices,
+                                               local_degree,
+                                               call_e_op,
+                                               init,
+                                               reduce_op,
+                                               major - *(edge_partition).major_hypersparse_first(),
+                                               identity_element,
+                                               result_value_output);
     }
     idx += gridDim.x * blockDim.x;
   }
@@ -175,6 +242,7 @@ template <bool update_major,
           typename GraphViewType,
           typename EdgePartitionSrcValueInputWrapper,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
           typename ResultValueOutputIteratorOrWrapper,
           typename EdgeOp,
           typename ReduceOp,
           typename T>
 __global__ void per_v_transform_reduce_e_low_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
   EdgePartitionSrcValueInputWrapper edge_partition_src_value_input,
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
+  thrust::optional<EdgePartitionEdgeMaskWrapper> edge_partition_e_mask,
   ResultValueOutputIteratorOrWrapper result_value_output,
   EdgeOp e_op,
   T init /* relevant only if update_major == true */,
+  T identity_element /* relevant only if update_major == true */,
   ReduceOp reduce_op)
 {
   static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v<
@@ -209,71 +279,57 @@ __global__ void per_v_transform_reduce_e_low_degree(
   auto idx = static_cast<size_t>(tid);
 
   while (idx < static_cast<size_t>(major_range_last - major_range_first)) {
-    auto major_offset = major_start_offset + idx;
+    auto major_offset = static_cast<vertex_t>(major_start_offset + idx);
+    auto major        = edge_partition.major_from_major_offset_nocheck(major_offset);
     vertex_t const* indices{nullptr};
     edge_t edge_offset{};
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) =
       edge_partition.local_edges(static_cast<vertex_t>(major_offset));
-    auto transform_op = [&edge_partition,
-                         &edge_partition_src_value_input,
-                         &edge_partition_dst_value_input,
-                         &edge_partition_e_value_input,
-                         &e_op,
-                         major_offset,
-                         indices,
-                         edge_offset] __device__(auto i) {
-      auto minor        = indices[i];
-      auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-      auto src          = GraphViewType::is_storage_transposed
-                            ? minor
-                            : edge_partition.major_from_major_offset_nocheck(major_offset);
-      auto dst          = GraphViewType::is_storage_transposed
-                            ? edge_partition.major_from_major_offset_nocheck(major_offset)
-                            : minor;
-      auto src_offset =
-        GraphViewType::is_storage_transposed ? minor_offset : static_cast<vertex_t>(major_offset);
-      auto dst_offset =
-        GraphViewType::is_storage_transposed ? static_cast<vertex_t>(major_offset) : minor_offset;
-      return e_op(src,
-                  dst,
-                  edge_partition_src_value_input.get(src_offset),
-                  edge_partition_dst_value_input.get(dst_offset),
-                  edge_partition_e_value_input.get(edge_offset + i));
-    };
-
-    if constexpr (update_major) {
-      *(result_value_output + idx) =
-        thrust::transform_reduce(thrust::seq,
-                                 thrust::make_counting_iterator(edge_t{0}),
-                                 thrust::make_counting_iterator(local_degree),
-                                 transform_op,
-                                 init,
-                                 reduce_op);
+    auto call_e_op = call_e_op_t<GraphViewType,
+                                 EdgePartitionSrcValueInputWrapper,
+                                 EdgePartitionDstValueInputWrapper,
+                                 EdgePartitionEdgeValueInputWrapper,
+                                 EdgeOp>{edge_partition,
+                                         edge_partition_src_value_input,
+                                         edge_partition_dst_value_input,
+                                         edge_partition_e_value_input,
+                                         e_op,
+                                         major,
+                                         major_offset,
+                                         indices,
+                                         edge_offset};
+
+    if (edge_partition_e_mask) {
+      auto transform_op =
+        [&edge_partition_e_mask, &call_e_op, identity_element, edge_offset] __device__(auto i) {
+          if ((*edge_partition_e_mask).get(edge_offset + i)) {
+            return call_e_op(i);
+          } else {
+            return identity_element;
+          }
+        };
+
+      update_result_value_output<update_major>(edge_partition,
+                                               indices,
+                                               local_degree,
+                                               transform_op,
+                                               init,
+                                               reduce_op,
+                                               idx,
+                                               identity_element,
+                                               result_value_output);
     } else {
-      if constexpr (GraphViewType::is_multi_gpu) {
-        thrust::for_each(
-          thrust::seq,
-          thrust::make_counting_iterator(edge_t{0}),
-          thrust::make_counting_iterator(local_degree),
-          [&edge_partition, indices, &result_value_output, &transform_op] __device__(auto i) {
-            auto e_op_result  = transform_op(i);
-            auto minor        = indices[i];
-            auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-            reduce_op::atomic_reduce<ReduceOp>(result_value_output, minor_offset, e_op_result);
-          });
-      } else {
-        thrust::for_each(
-          thrust::seq,
-          thrust::make_counting_iterator(edge_t{0}),
-          thrust::make_counting_iterator(local_degree),
-          [&edge_partition, indices, &result_value_output, &transform_op] __device__(auto i) {
-            auto e_op_result  = transform_op(i);
-            auto minor        = indices[i];
-            auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-            reduce_op::atomic_reduce<ReduceOp>(result_value_output + minor_offset, e_op_result);
-          });
-      }
+      update_result_value_output<update_major>(edge_partition,
+                                               indices,
+                                               local_degree,
+                                               call_e_op,
+                                               init,
+                                               reduce_op,
+                                               idx,
+                                               identity_element,
+                                               result_value_output);
     }
     idx += gridDim.x * blockDim.x;
   }
@@ -284,6 +340,7 @@ template <bool update_major,
           typename GraphViewType,
           typename EdgePartitionSrcValueInputWrapper,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
           typename ResultValueOutputIteratorOrWrapper,
           typename EdgeOp,
           typename ReduceOp,
           typename T>
 __global__ void per_v_transform_reduce_e_mid_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
   EdgePartitionSrcValueInputWrapper edge_partition_src_value_input,
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
+  thrust::optional<EdgePartitionEdgeMaskWrapper> edge_partition_e_mask,
   ResultValueOutputIteratorOrWrapper result_value_output,
   EdgeOp e_op,
   T init /* relevant only if update_major == true */,
@@ -327,41 +385,61 @@ __global__ void per_v_transform_reduce_e_mid_degree(
     raft::warp_size()];  // relevant only if update_major == true
 
   while (idx < static_cast<size_t>(major_range_last - major_range_first)) {
-    auto major_offset = major_start_offset + idx;
+    auto major_offset = static_cast<vertex_t>(major_start_offset + idx);
+    auto major        = edge_partition.major_from_major_offset_nocheck(major_offset);
     vertex_t const* indices{nullptr};
     edge_t edge_offset{};
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset);
+
+    auto call_e_op = call_e_op_t<GraphViewType,
+                                 EdgePartitionSrcValueInputWrapper,
+                                 EdgePartitionDstValueInputWrapper,
+                                 EdgePartitionEdgeValueInputWrapper,
+                                 EdgeOp>{edge_partition,
+                                         edge_partition_src_value_input,
+                                         edge_partition_dst_value_input,
+                                         edge_partition_e_value_input,
+                                         e_op,
+                                         major,
+                                         major_offset,
+                                         indices,
+                                         edge_offset};
+
     [[maybe_unused]] auto reduced_e_op_result =
       lane_id == 0 ? init : identity_element;  // relevant only if update_major == true
-    for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
-      auto minor        = indices[i];
-      auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-      auto src          = GraphViewType::is_storage_transposed
-                            ? minor
-                            : edge_partition.major_from_major_offset_nocheck(major_offset);
-      auto dst          = GraphViewType::is_storage_transposed
-                            ? edge_partition.major_from_major_offset_nocheck(major_offset)
-                            : minor;
-      auto src_offset =
-        GraphViewType::is_storage_transposed ? minor_offset : static_cast<vertex_t>(major_offset);
-      auto dst_offset =
-        GraphViewType::is_storage_transposed ? static_cast<vertex_t>(major_offset) : minor_offset;
-      auto e_op_result = e_op(src,
-                              dst,
-                              edge_partition_src_value_input.get(src_offset),
-                              edge_partition_dst_value_input.get(dst_offset),
-                              edge_partition_e_value_input.get(edge_offset + i));
-      if constexpr (update_major) {
-        reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result);
-      } else {
-        if constexpr (GraphViewType::is_multi_gpu) {
-          reduce_op::atomic_reduce<ReduceOp>(result_value_output, minor_offset, e_op_result);
+    if (edge_partition_e_mask) {
+      for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
+        if ((*edge_partition_e_mask).get(edge_offset + i)) {
+          auto e_op_result = call_e_op(i);
+          if constexpr (update_major) {
+            reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result);
+          } else {
+            auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]);
+            if constexpr (GraphViewType::is_multi_gpu) {
+              reduce_op::atomic_reduce<ReduceOp>(result_value_output, minor_offset, e_op_result);
+            } else {
+              reduce_op::atomic_reduce<ReduceOp>(result_value_output + minor_offset, e_op_result);
+            }
+          }
+        }
+      }
+    } else {
+      for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
+        auto e_op_result = call_e_op(i);
+        if constexpr (update_major) {
+          reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result);
         } else {
-          reduce_op::atomic_reduce<ReduceOp>(result_value_output + minor_offset, e_op_result);
+          auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]);
+          if constexpr (GraphViewType::is_multi_gpu) {
+            reduce_op::atomic_reduce<ReduceOp>(result_value_output, minor_offset, e_op_result);
+          } else {
+            reduce_op::atomic_reduce<ReduceOp>(result_value_output + minor_offset, e_op_result);
+          }
         }
       }
     }
+
     if constexpr (update_major) {
       reduced_e_op_result = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()])
                               .Reduce(reduced_e_op_result, reduce_op);
@@ -377,6 +455,7 @@ template <bool update_major,
           typename GraphViewType,
           typename EdgePartitionSrcValueInputWrapper,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
           typename ResultValueOutputIteratorOrWrapper,
           typename EdgeOp,
           typename ReduceOp,
           typename T>
 __global__ void per_v_transform_reduce_e_high_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
   EdgePartitionSrcValueInputWrapper edge_partition_src_value_input,
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
+  thrust::optional<EdgePartitionEdgeMaskWrapper> edge_partition_e_mask,
   ResultValueOutputIteratorOrWrapper result_value_output,
   EdgeOp e_op,
   T init /* relevant only if update_major == true */,
@@ -416,41 +496,61 @@ __global__ void per_v_transform_reduce_e_high_degree(
   typename BlockReduce::TempStorage temp_storage;  // relevant only if update_major == true
 
   while (idx < static_cast<size_t>(major_range_last - major_range_first)) {
-    auto major_offset = major_start_offset + idx;
+    auto major_offset = static_cast<vertex_t>(major_start_offset + idx);
+    auto major        = edge_partition.major_from_major_offset_nocheck(major_offset);
     vertex_t const* indices{nullptr};
     edge_t edge_offset{};
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset);
+
+    auto call_e_op = call_e_op_t<GraphViewType,
+                                 EdgePartitionSrcValueInputWrapper,
+                                 EdgePartitionDstValueInputWrapper,
+                                 EdgePartitionEdgeValueInputWrapper,
+                                 EdgeOp>{edge_partition,
+                                         edge_partition_src_value_input,
+                                         edge_partition_dst_value_input,
+                                         edge_partition_e_value_input,
+                                         e_op,
+                                         major,
+                                         major_offset,
+                                         indices,
+                                         edge_offset};
+
     [[maybe_unused]] auto reduced_e_op_result =
       threadIdx.x == 0 ? init : identity_element;  // relevant only if update_major == true
-    for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
-      auto minor        = indices[i];
-      auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-      auto src          = GraphViewType::is_storage_transposed
-                            ? minor
-                            : edge_partition.major_from_major_offset_nocheck(major_offset);
-      auto dst          = GraphViewType::is_storage_transposed
-                            ? edge_partition.major_from_major_offset_nocheck(major_offset)
-                            : minor;
-      auto src_offset =
-        GraphViewType::is_storage_transposed ? minor_offset : static_cast<vertex_t>(major_offset);
-      auto dst_offset =
-        GraphViewType::is_storage_transposed ? static_cast<vertex_t>(major_offset) : minor_offset;
-      auto e_op_result = e_op(src,
-                              dst,
-                              edge_partition_src_value_input.get(src_offset),
-                              edge_partition_dst_value_input.get(dst_offset),
-                              edge_partition_e_value_input.get(edge_offset + i));
-      if constexpr (update_major) {
-        reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result);
-      } else {
-        if constexpr (GraphViewType::is_multi_gpu) {
-          reduce_op::atomic_reduce<ReduceOp>(result_value_output, minor_offset, e_op_result);
+    if (edge_partition_e_mask) {
+      for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
+        if ((*edge_partition_e_mask).get(edge_offset + i)) {
+          auto e_op_result = call_e_op(i);
+          if constexpr (update_major) {
+            reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result);
+          } else {
+            auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]);
+            if constexpr (GraphViewType::is_multi_gpu) {
+              reduce_op::atomic_reduce<ReduceOp>(result_value_output, minor_offset, e_op_result);
+            } else {
+              reduce_op::atomic_reduce<ReduceOp>(result_value_output + minor_offset, e_op_result);
+            }
+          }
+        }
+      }
+    } else {
+      for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
+        auto e_op_result = call_e_op(i);
+        if constexpr (update_major) {
+          reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result);
        } else {
-          reduce_op::atomic_reduce<ReduceOp>(result_value_output + minor_offset, e_op_result);
+          auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]);
+          if constexpr (GraphViewType::is_multi_gpu) {
+            reduce_op::atomic_reduce<ReduceOp>(result_value_output, minor_offset, e_op_result);
+          } else {
+            reduce_op::atomic_reduce<ReduceOp>(result_value_output + minor_offset, e_op_result);
+          }
         }
       }
     }
+
     if constexpr (update_major) {
       reduced_e_op_result = BlockReduce(temp_storage).Reduce(reduced_e_op_result, reduce_op);
       if (threadIdx.x == 0) { *(result_value_output + idx) = reduced_e_op_result; }
@@ -656,10 +756,18 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
 
   if (stream_pool_indices) { handle.sync_stream(); }
 
+  auto edge_mask_view = graph_view.edge_mask_view();
+
   for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
     auto edge_partition =
       edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
         graph_view.local_edge_partition_view(i));
+    auto edge_partition_e_mask =
+      edge_mask_view
+        ? thrust::make_optional<
+            detail::edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool>>(
+            *edge_mask_view, i)
+        : thrust::nullopt;
 
     auto major_init = ReduceOp::identity_element;
     if constexpr (update_major) {
@@ -737,9 +845,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
             edge_partition_src_value_input,
             edge_partition_dst_value_input,
             edge_partition_e_value_input,
+            edge_partition_e_mask,
             segment_output_buffer,
             e_op,
             major_init,
+            ReduceOp::identity_element,
             reduce_op);
       }
     }
@@ -761,9 +871,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
             edge_partition_src_value_input,
             edge_partition_dst_value_input,
             edge_partition_e_value_input,
+            edge_partition_e_mask,
             segment_output_buffer,
             e_op,
             major_init,
+            ReduceOp::identity_element,
             reduce_op);
       }
       if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) {
@@ -784,6 +896,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
             edge_partition_src_value_input,
             edge_partition_dst_value_input,
             edge_partition_e_value_input,
+            edge_partition_e_mask,
             segment_output_buffer,
             e_op,
             major_init,
@@ -806,6 +919,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
             edge_partition_src_value_input,
             edge_partition_dst_value_input,
             edge_partition_e_value_input,
+            edge_partition_e_mask,
             output_buffer,
             e_op,
             major_init,
@@ -825,9 +939,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
           edge_partition_src_value_input,
           edge_partition_dst_value_input,
           edge_partition_e_value_input,
+          edge_partition_e_mask,
           output_buffer,
           e_op,
           major_init,
+          ReduceOp::identity_element,
           reduce_op);
     }
   }
@@ -1056,8 +1172,6 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle,
                                        VertexValueOutputIterator vertex_value_output_first,
                                        bool do_expensive_check = false)
 {
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   if (do_expensive_check) {
     // currently, nothing to do
   }
@@ -1137,8 +1251,6 @@ void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle,
                                        VertexValueOutputIterator vertex_value_output_first,
                                        bool do_expensive_check = false)
 {
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   if (do_expensive_check) {
     // currently, nothing to do
   }
diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh
index 745f1a8fd8e..18e722d62cc 100644
--- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh
+++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -80,7 +80,7 @@ template <typename key_t,
           typename e_value_t,
           typename EdgeOp>
-struct call_e_op_t {
+struct transform_reduce_v_frontier_call_e_op_t {
   EdgeOp e_op{};
 
   __device__ thrust::optional<
@@ -331,13 +331,13 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle,
 
   // 1. fill the buffer
 
-  detail::call_e_op_t
+  detail::transform_reduce_v_frontier_call_e_op_t
     e_op_wrapper{e_op};
 
   auto [key_buffer, payload_buffer] =
diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh
index cc69cb5f67f..58fae83bca0 100644
--- a/cpp/src/traversal/od_shortest_distances_impl.cuh
+++ b/cpp/src/traversal/od_shortest_distances_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -639,13 +639,14 @@ rmm::device_uvector<weight_t> od_shortest_distances(
       static_cast<vertex_t>(origins.size()),
       cutoff,
       invalid_distance};
-    detail::call_e_op_t<thrust::tuple<vertex_t, vertex_t>,
-                        weight_t,
-                        vertex_t,
-                        thrust::nullopt_t,
-                        thrust::nullopt_t,
-                        weight_t,
-                        e_op_t>
+    detail::transform_reduce_v_frontier_call_e_op_t<
+      thrust::tuple<vertex_t, vertex_t>,
+      weight_t,
+      vertex_t,
+      thrust::nullopt_t,
+      thrust::nullopt_t,
+      weight_t,
+      e_op_t>
       e_op_wrapper{e_op};
 
     auto new_frontier_tagged_vertex_buffer =
diff --git a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu
index 677d6ce5022..fc8114a4652 100644
--- a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu
+++ b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -150,8 +150,9 @@ struct result_compare {
 };
 
 struct Prims_Usecase {
-  bool check_correctness{true};
   bool test_weighted{false};
+  bool edge_masking{false};
+  bool check_correctness{true};
 };
 
 template <typename input_usecase_t>
@@ -200,6 +201,13 @@ class Tests_MGPerVTransformReduceIncomingOutgoingE
 
     auto mg_graph_view = mg_graph.view();
 
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
+    if (prims_usecase.edge_masking) {
+      edge_mask = cugraph::test::generate<decltype(mg_graph_view), bool>::edge_property(
+        *handle_, mg_graph_view, 2);
+      mg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
     // 2. run MG transform reduce
 
     const int hash_bin_count = 5;
@@ -674,7 +682,10 @@ INSTANTIATE_TEST_SUITE_P(
   file_test,
   Tests_MGPerVTransformReduceIncomingOutgoingE_File,
   ::testing::Combine(
-    ::testing::Values(Prims_Usecase{true}),
+    ::testing::Values(Prims_Usecase{false, false, true},
+                      Prims_Usecase{false, true, true},
+                      Prims_Usecase{true, false, true},
+                      Prims_Usecase{true, true, true}),
     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
                       cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
                       cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
 
 INSTANTIATE_TEST_SUITE_P(rmat_small_test,
                          Tests_MGPerVTransformReduceIncomingOutgoingE_Rmat,
-                         ::testing::Combine(::testing::Values(Prims_Usecase{true}),
+                         ::testing::Combine(::testing::Values(Prims_Usecase{false, false, true},
+                                                              Prims_Usecase{false, true, true},
+                                                              Prims_Usecase{true, false, true},
+                                                              Prims_Usecase{true, true, true}),
                                             ::testing::Values(cugraph::test::Rmat_Usecase(
                                               10, 16, 0.57, 0.19, 0.19, 0, false, false))));
 
@@ -694,7 +708,10 @@ INSTANTIATE_TEST_SUITE_P(
  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
                          --gtest_filter to select only the rmat_benchmark_test with a specific
                          vertex & edge type combination) by command line arguments and do not
                          include more than one Rmat_Usecase that differ only in scale or edge
                          factor (to avoid running same benchmarks more than once) */
   Tests_MGPerVTransformReduceIncomingOutgoingE_Rmat,
   ::testing::Combine(
-    ::testing::Values(Prims_Usecase{false}),
+    ::testing::Values(Prims_Usecase{false, false, false},
+                      Prims_Usecase{false, true, false},
+                      Prims_Usecase{true, false, false},
+                      Prims_Usecase{true, true, false}),
     ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
 
 CUGRAPH_MG_TEST_PROGRAM_MAIN()
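
Usage sketch (editor's addition, not part of the patch): the test changes above show the calling
pattern this PR enables -- build a Boolean edge property, attach it to the graph view as a mask,
then invoke the prim unchanged. The helper below is a minimal, hypothetical example of that
pattern; `masked_out_degrees`, its parameters, and the degree-counting e_op are illustrative and
assume the cugraph 24.02 API, they are not part of this PR.

  // Counts, per local vertex, the outgoing edges that survive the mask.  With a mask
  // attached, edges whose mask bit is 0 simply stop contributing to the reduction.
  #include <cugraph/edge_property.hpp>
  #include <cugraph/graph_view.hpp>

  #include <prims/per_v_transform_reduce_incoming_outgoing_e.cuh>
  #include <prims/reduce_op.cuh>

  #include <raft/core/handle.hpp>
  #include <rmm/device_uvector.hpp>

  #include <cstdint>

  template <typename vertex_t, typename edge_t, bool multi_gpu>
  rmm::device_uvector<edge_t> masked_out_degrees(
    raft::handle_t const& handle,
    cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu>& graph_view,
    cugraph::edge_property_view_t<edge_t, uint32_t const*, bool> edge_mask_view)
  {
    graph_view.attach_edge_mask(edge_mask_view);  // 0-bit edges are skipped from here on

    rmm::device_uvector<edge_t> degrees(graph_view.local_vertex_partition_range_size(),
                                        handle.get_stream());
    cugraph::per_v_transform_reduce_outgoing_e(
      handle,
      graph_view,
      cugraph::edge_src_dummy_property_t{}.view(),
      cugraph::edge_dst_dummy_property_t{}.view(),
      cugraph::edge_dummy_property_t{}.view(),
      [] __device__(vertex_t, vertex_t, auto, auto, auto) { return edge_t{1}; },
      edge_t{0},
      cugraph::reduce_op::plus<edge_t>{},
      degrees.begin());
    return degrees;
  }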
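The masked kernels above substitute identity_element for masked-out edges before handing
transform_op to thrust::transform_reduce, so the reduction result is unaffected by skipped edges.
A host-side analogue of that contract, in plain C++ and for illustration only (the vectors and
values here are made up):

  #include <cassert>
  #include <cstddef>
  #include <cstdint>
  #include <vector>

  int main()
  {
    std::vector<int> edge_values{3, 5, 7, 9};
    std::vector<std::uint8_t> mask{1, 0, 1, 0};  // 1 = edge passes the mask

    int const identity = 0;  // identity of the plus reduction
    int sum            = 0;
    for (std::size_t i = 0; i < edge_values.size(); ++i) {
      // same shape as the masked transform_op: masked-out edges return identity
      sum += mask[i] ? edge_values[i] : identity;
    }
    assert(sum == 10);  // only the unmasked edges (3 + 7) contribute
    return 0;
  }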