From 0e0b36371bfc5d3055007d596e383d5e12ddb457 Mon Sep 17 00:00:00 2001
From: Anthony Mahanna <43019056+aMahanna@users.noreply.github.com>
Date: Tue, 12 Mar 2024 12:40:08 -0400
Subject: [PATCH 1/5] [docs] fix GNN typo (#4196)

Authors:
  - Anthony Mahanna (https://github.com/aMahanna)
  - Alex Barghi (https://github.com/alexbarghi-nv)
  - Ralph Liu (https://github.com/nv-rliu)
  - Brad Rees (https://github.com/BradReesWork)

Approvers:
  - Don Acosta (https://github.com/acostadon)

URL: https://github.com/rapidsai/cugraph/pull/4196

From e4b58ec04e4add07578ce779977835cfe28f516a Mon Sep 17 00:00:00 2001
From: Naim <110031745+naimnv@users.noreply.github.com>
Date: Tue, 12 Mar 2024 19:00:31 +0100
Subject: [PATCH 2/5] Make external vertex and edge shuffling function public
 (#4227)

Make vertex and edge shuffling function public

Authors:
  - Naim (https://github.com/naimnv)
  - Ralph Liu (https://github.com/nv-rliu)

Approvers:
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Seunghwa Kang (https://github.com/seunghwak)

URL: https://github.com/rapidsai/cugraph/pull/4227
---
 cpp/CMakeLists.txt                            |   4 +-
 cpp/include/cugraph/graph_functions.hpp       |  63 +++++++++++
 .../shuffle_vertex_pairs.cu                   | 102 ++++++++++++++++++
 .../{detail => utilities}/shuffle_vertices.cu |  70 ++++++++++++
 4 files changed, 237 insertions(+), 2 deletions(-)
 rename cpp/src/{detail => utilities}/shuffle_vertex_pairs.cu (80%)
 rename cpp/src/{detail => utilities}/shuffle_vertices.cu (73%)
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index a3392627fb8..3131404712f 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -186,9 +186,9 @@ endif()
 #       which should give us a better parallel schedule.
 
 set(CUGRAPH_SOURCES
-    src/detail/shuffle_vertices.cu
+    src/utilities/shuffle_vertices.cu
     src/detail/permute_range.cu
-    src/detail/shuffle_vertex_pairs.cu
+    src/utilities/shuffle_vertex_pairs.cu
     src/detail/collect_local_vertex_values.cu
     src/detail/groupby_and_count.cu
     src/detail/collect_comm_wrapper.cu
diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp
index 90425f86bef..6d4470e8251 100644
--- a/cpp/include/cugraph/graph_functions.hpp
+++ b/cpp/include/cugraph/graph_functions.hpp
@@ -1052,4 +1052,67 @@ remove_multi_edges(raft::handle_t const& handle,
                    std::optional<rmm::device_uvector<edge_type_t>>&& edgelist_edge_types,
                    bool keep_min_value_edge = false);
 
+/**
+ * @brief Shuffle external vertex ids to the proper GPU.
+ *
+ * @tparam vertex_t    Type of vertex identifiers. Needs to be an integral type.
+ *
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices  List of vertex ids
+ * @return Vector of vertex ids mapped to this GPU.
+ */
+template <typename vertex_t>
+rmm::device_uvector<vertex_t> shuffle_external_vertices(raft::handle_t const& handle,
+                                                        rmm::device_uvector<vertex_t>&& vertices);
+
+/**
+ * @brief Shuffle external vertex ids and values to the proper GPU.
+ *
+ * @tparam vertex_t   Type of vertex identifiers. Needs to be an integral type.
+ * @tparam value_t    Type of values. currently supported types are int32_t,
+ * int64_t, size_t, float and double.
+ *
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices  List of vertex ids
+ * @param values List of values
+ * @return Tuple of vectors storing vertex ids and values mapped to this GPU.
+ */
+template <typename vertex_t, typename value_t>
+std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<value_t>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<vertex_t>&& vertices,
+                                    rmm::device_uvector<value_t>&& values);
+
+/**
+ * @brief Shuffle external edges to the proper GPU.
+ *
+ * @tparam vertex_t    Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t      Type of edge identifiers. Needs to be an integral type.
+ * @tparam weight_t    Type of edge weight. Currently float and double are supported.
+ *
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param edge_srcs  List of source vertex ids
+ * @param edge_dsts  List of destination vertex ids
+ * @param edge_weights  Optional list of edge weights
+ * @param edge_ids  Optional list of edge ids
+ * @param edge_types Optional list of edge types
+ * @return Tuple of vectors storing edge sources, destinations, optional weights,
+ *          optional edge ids, optional edge types mapped to this GPU.
+ */
+template <typename vertex_t, typename edge_t, typename weight_t, typename edge_type_t>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<vertex_t>&& edge_srcs,
+                       rmm::device_uvector<vertex_t>&& edge_dsts,
+                       std::optional<rmm::device_uvector<weight_t>>&& edge_weights,
+                       std::optional<rmm::device_uvector<edge_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<edge_type_t>>&& edge_types);
+
 }  // namespace cugraph
diff --git a/cpp/src/detail/shuffle_vertex_pairs.cu b/cpp/src/utilities/shuffle_vertex_pairs.cu
similarity index 80%
rename from cpp/src/detail/shuffle_vertex_pairs.cu
rename to cpp/src/utilities/shuffle_vertex_pairs.cu
index 33a7834f5ff..b473796aa9d 100644
--- a/cpp/src/detail/shuffle_vertex_pairs.cu
+++ b/cpp/src/utilities/shuffle_vertex_pairs.cu
@@ -519,4 +519,106 @@ shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
   std::vector<int64_t> const& vertex_partition_range_lasts);
 
 }  // namespace detail
+
+template <typename vertex_t, typename edge_t, typename weight_t, typename edge_type_t>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<vertex_t>&& edge_srcs,
+                       rmm::device_uvector<vertex_t>&& edge_dsts,
+                       std::optional<rmm::device_uvector<weight_t>>&& edge_weights,
+                       std::optional<rmm::device_uvector<edge_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<edge_type_t>>&& edge_types)
+{
+  auto& comm                 = handle.get_comms();
+  auto const comm_size       = comm.get_size();
+  auto& major_comm           = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
+  auto const major_comm_size = major_comm.get_size();
+  auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+  auto const minor_comm_size = minor_comm.get_size();
+
+  return detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
+    handle,
+    std::move(edge_srcs),
+    std::move(edge_dsts),
+    std::move(edge_weights),
+    std::move(edge_ids),
+    std::move(edge_types));
+}
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<int32_t>&& majors,
+                       rmm::device_uvector<int32_t>&& minors,
+                       std::optional<rmm::device_uvector<float>>&& weights,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_types);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<int32_t>&& majors,
+                       rmm::device_uvector<int32_t>&& minors,
+                       std::optional<rmm::device_uvector<double>>&& weights,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_types);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<int32_t>&& majors,
+                       rmm::device_uvector<int32_t>&& minors,
+                       std::optional<rmm::device_uvector<float>>&& weights,
+                       std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_types);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<int32_t>&& majors,
+                       rmm::device_uvector<int32_t>&& minors,
+                       std::optional<rmm::device_uvector<double>>&& weights,
+                       std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_types);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<int64_t>&& majors,
+                       rmm::device_uvector<int64_t>&& minors,
+                       std::optional<rmm::device_uvector<float>>&& weights,
+                       std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_types);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<int64_t>&& majors,
+                       rmm::device_uvector<int64_t>&& minors,
+                       std::optional<rmm::device_uvector<double>>&& weights,
+                       std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_types);
+
 }  // namespace cugraph
diff --git a/cpp/src/detail/shuffle_vertices.cu b/cpp/src/utilities/shuffle_vertices.cu
similarity index 73%
rename from cpp/src/detail/shuffle_vertices.cu
rename to cpp/src/utilities/shuffle_vertices.cu
index be6875f1073..b396201f509 100644
--- a/cpp/src/detail/shuffle_vertices.cu
+++ b/cpp/src/utilities/shuffle_vertices.cu
@@ -249,4 +249,74 @@ shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
   rmm::device_uvector<double>&& values);
 
 }  // namespace detail
+
+template <typename vertex_t, typename value_t>
+std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<value_t>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<vertex_t>&& vertices,
+                                    rmm::device_uvector<value_t>&& values)
+{
+  return detail::shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
+    handle, std::move(vertices), std::move(values));
+}
+
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int32_t>&& vertices,
+                                    rmm::device_uvector<int32_t>&& values);
+
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<size_t>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int32_t>&& vertices,
+                                    rmm::device_uvector<size_t>&& values);
+
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<float>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int32_t>&& vertices,
+                                    rmm::device_uvector<float>&& values);
+
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<double>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int32_t>&& vertices,
+                                    rmm::device_uvector<double>&& values);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int32_t>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int64_t>&& vertices,
+                                    rmm::device_uvector<int32_t>&& values);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int64_t>&& vertices,
+                                    rmm::device_uvector<int64_t>&& values);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<size_t>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int64_t>&& vertices,
+                                    rmm::device_uvector<size_t>&& values);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<float>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int64_t>&& vertices,
+                                    rmm::device_uvector<float>&& values);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<double>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int64_t>&& vertices,
+                                    rmm::device_uvector<double>&& values);
+
+template <typename vertex_t>
+rmm::device_uvector<vertex_t> shuffle_external_vertices(raft::handle_t const& handle,
+                                                        rmm::device_uvector<vertex_t>&& vertices)
+{
+  return detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning(handle,
+                                                                          std::move(vertices));
+}
+
+template rmm::device_uvector<int32_t> shuffle_external_vertices(
+  raft::handle_t const& handle, rmm::device_uvector<int32_t>&& d_vertices);
+
+template rmm::device_uvector<int64_t> shuffle_external_vertices(
+  raft::handle_t const& handle, rmm::device_uvector<int64_t>&& d_vertices);
+
 }  // namespace cugraph

From a4eab99f8f0ff50aab980dfab1fffa68e8983942 Mon Sep 17 00:00:00 2001
From: Naim <110031745+naimnv@users.noreply.github.com>
Date: Tue, 12 Mar 2024 19:01:20 +0100
Subject: [PATCH 3/5] Graph coloring and MIS (#4211)

This PR implements
- MNMG Graph Coloring
- MNMG MIS for general graphs

Closes #4230
Closes #4231

Authors:
  - Naim (https://github.com/naimnv)

Approvers:
  - Seunghwa Kang (https://github.com/seunghwak)
  - Chuck Hastings (https://github.com/ChuckHastings)

URL: https://github.com/rapidsai/cugraph/pull/4211
---
 cpp/CMakeLists.txt                            |   8 +-
 cpp/include/cugraph/algorithms.hpp            |  30 +-
 .../detail/maximal_independent_moves.cuh      | 313 ++++++++++++++++++
 .../maximal_independent_moves.hpp}            |   6 +-
 .../detail/maximal_independent_moves_mg.cu    |  38 +++
 .../detail/maximal_independent_moves_sg.cu    |  37 +++
 cpp/src/community/detail/refine_impl.cuh      |   6 +-
 .../detail => components}/mis_impl.cuh        |  37 ++-
 .../detail => components}/mis_mg.cu           |   8 +-
 .../detail => components}/mis_sg.cu           |   7 +-
 cpp/src/components/vertex_coloring_impl.cuh   | 150 +++++++++
 cpp/src/components/vertex_coloring_mg.cu      |  35 ++
 cpp/src/components/vertex_coloring_sg.cu      |  35 ++
 cpp/tests/CMakeLists.txt                      |  20 +-
 .../{community => components}/mg_mis_test.cu  |  19 +-
 .../components/mg_vertex_coloring_test.cu     | 275 +++++++++++++++
 cpp/tests/components/mis_test.cu              | 240 ++++++++++++++
 cpp/tests/components/vertex_coloring_test.cu  | 244 ++++++++++++++
 18 files changed, 1468 insertions(+), 40 deletions(-)
 create mode 100644 cpp/src/community/detail/maximal_independent_moves.cuh
 rename cpp/src/community/{mis.hpp => detail/maximal_independent_moves.hpp} (91%)
 create mode 100644 cpp/src/community/detail/maximal_independent_moves_mg.cu
 create mode 100644 cpp/src/community/detail/maximal_independent_moves_sg.cu
 rename cpp/src/{community/detail => components}/mis_impl.cuh (90%)
 rename cpp/src/{community/detail => components}/mis_mg.cu (83%)
 rename cpp/src/{community/detail => components}/mis_sg.cu (82%)
 create mode 100644 cpp/src/components/vertex_coloring_impl.cuh
 create mode 100644 cpp/src/components/vertex_coloring_mg.cu
 create mode 100644 cpp/src/components/vertex_coloring_sg.cu
 rename cpp/tests/{community => components}/mg_mis_test.cu (95%)
 create mode 100644 cpp/tests/components/mg_vertex_coloring_test.cu
 create mode 100644 cpp/tests/components/mis_test.cu
 create mode 100644 cpp/tests/components/vertex_coloring_test.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 3131404712f..b12403710ab 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -197,8 +197,8 @@ set(CUGRAPH_SOURCES
     src/community/detail/common_methods_sg.cu
     src/community/detail/refine_sg.cu
     src/community/detail/refine_mg.cu
-    src/community/detail/mis_sg.cu
-    src/community/detail/mis_mg.cu
+    src/community/detail/maximal_independent_moves_sg.cu
+    src/community/detail/maximal_independent_moves_mg.cu
     src/detail/utility_wrappers.cu
     src/structure/graph_view_mg.cu
     src/structure/remove_self_loops.cu
@@ -295,6 +295,10 @@ set(CUGRAPH_SOURCES
     src/tree/legacy/mst.cu
     src/components/weakly_connected_components_sg.cu
     src/components/weakly_connected_components_mg.cu
+    src/components/mis_sg.cu
+    src/components/mis_mg.cu
+    src/components/vertex_coloring_sg.cu
+    src/components/vertex_coloring_mg.cu
     src/structure/create_graph_from_edgelist_sg.cu
     src/structure/create_graph_from_edgelist_mg.cu
     src/structure/symmetrize_edgelist_sg.cu
diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp
index 5c29604a5a7..1471d340cec 100644
--- a/cpp/include/cugraph/algorithms.hpp
+++ b/cpp/include/cugraph/algorithms.hpp
@@ -2340,15 +2340,41 @@ std::tuple<rmm::device_uvector<size_t>, rmm::device_uvector<vertex_t>> k_hop_nbr
  * handles to various CUDA libraries) to run graph algorithms.
  * @param graph_view Graph view object.
  * @param rng_state The RngState instance holding pseudo-random number generator state.
- * @return A device vector containing vertices found in the maximal independent set
+ * @return A device vector containing vertices in the maximal independent set.
  */
-
 template <typename vertex_t, typename edge_t, bool multi_gpu>
 rmm::device_uvector<vertex_t> maximal_independent_set(
   raft::handle_t const& handle,
   graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
   raft::random::RngState& rng_state);
 
+/*
+ * @brief Find a Greedy Vertex Coloring
+ *
+ * A vertex coloring is an assignment of colors or labels to each vertex of a graph so that
+ * no two adjacent vertices have the same color or label. Finding the minimum number of colors
+ * needed to color the vertices of a graph is an NP-hard problem and therefore for practical
+ * use cases greedy coloring is used. Here we provide an implementation of greedy vertex
+ * coloring based on maximal independent set.
+ * See
+ * https://research.nvidia.com/sites/default/files/pubs/2015-05_Parallel-Graph-Coloring/nvr-2015-001.pdf
+ * for further information.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Graph view object.
+ * @param rng_state The RngState instance holding pseudo-random number generator state.
+ * @return A device vector containing color for each vertex.
+ */
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+rmm::device_uvector<vertex_t> vertex_coloring(
+  raft::handle_t const& handle,
+  graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+  raft::random::RngState& rng_state);
+
 }  // namespace cugraph
 
 /**
diff --git a/cpp/src/community/detail/maximal_independent_moves.cuh b/cpp/src/community/detail/maximal_independent_moves.cuh
new file mode 100644
index 00000000000..82d20a04203
--- /dev/null
+++ b/cpp/src/community/detail/maximal_independent_moves.cuh
@@ -0,0 +1,313 @@
+
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "maximal_independent_moves.hpp"
+#include "prims/fill_edge_src_dst_property.cuh"
+#include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh"
+#include "prims/update_edge_src_dst_property.cuh"
+
+#include <cugraph/edge_property.hpp>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/host_scalar_comm.hpp>
+
+#include <cuda/functional>
+#include <thrust/count.h>
+#include <thrust/distance.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/merge.h>
+#include <thrust/optional.h>
+#include <thrust/remove.h>
+#include <thrust/set_operations.h>
+#include <thrust/transform.h>
+#include <thrust/transform_reduce.h>
+
+#include <cmath>
+
+namespace cugraph {
+
+namespace detail {
+
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+rmm::device_uvector<vertex_t> maximal_independent_moves(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+  raft::random::RngState& rng_state)
+{
+  using GraphViewType = cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu>;
+
+  vertex_t local_vtx_partitoin_size = graph_view.local_vertex_partition_range_size();
+
+  rmm::device_uvector<vertex_t> remaining_vertices(local_vtx_partitoin_size, handle.get_stream());
+
+  auto vertex_begin =
+    thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first());
+  auto vertex_end = thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last());
+
+  // Compute out-degree
+  auto out_degrees = graph_view.compute_out_degrees(handle);
+
+  // Only vertices with non-zero out-degree are possible can move
+  remaining_vertices.resize(
+    thrust::distance(remaining_vertices.begin(),
+                     thrust::copy_if(handle.get_thrust_policy(),
+                                     vertex_begin,
+                                     vertex_end,
+                                     out_degrees.begin(),
+                                     remaining_vertices.begin(),
+                                     [] __device__(auto deg) { return deg > 0; })),
+    handle.get_stream());
+
+  // Set ID of each vertex as its rank
+  rmm::device_uvector<vertex_t> ranks(local_vtx_partitoin_size, handle.get_stream());
+  thrust::copy(handle.get_thrust_policy(), vertex_begin, vertex_end, ranks.begin());
+
+  // Set ranks of zero out-degree vetices to std::numeric_limits<vertex_t>::lowest()
+  thrust::transform_if(handle.get_thrust_policy(),
+                       out_degrees.begin(),
+                       out_degrees.end(),
+                       ranks.begin(),
+                       cuda::proclaim_return_type<vertex_t>(
+                         [] __device__(auto) { return std::numeric_limits<vertex_t>::lowest(); }),
+                       [] __device__(auto deg) { return deg == 0; });
+
+  out_degrees.resize(0, handle.get_stream());
+  out_degrees.shrink_to_fit(handle.get_stream());
+
+  size_t loop_counter = 0;
+  while (true) {
+    loop_counter++;
+
+    // Copy ranks into temporary vector to begin with
+
+    rmm::device_uvector<vertex_t> temporary_ranks(local_vtx_partitoin_size, handle.get_stream());
+    thrust::copy(handle.get_thrust_policy(), ranks.begin(), ranks.end(), temporary_ranks.begin());
+
+    // Select a random set of candidate vertices
+
+    vertex_t nr_remaining_vertices_to_check = remaining_vertices.size();
+    if (multi_gpu) {
+      nr_remaining_vertices_to_check = host_scalar_allreduce(handle.get_comms(),
+                                                             nr_remaining_vertices_to_check,
+                                                             raft::comms::op_t::SUM,
+                                                             handle.get_stream());
+    }
+
+    vertex_t nr_candidates = (nr_remaining_vertices_to_check < 1024)
+                               ? nr_remaining_vertices_to_check
+                               : std::min(static_cast<vertex_t>((0.50 + 0.25 * loop_counter) *
+                                                                nr_remaining_vertices_to_check),
+                                          nr_remaining_vertices_to_check);
+
+    // FIXME: Can we improve performance here?
+    // FIXME: if(nr_remaining_vertices_to_check < 1024), may avoid calling select_random_vertices
+    auto d_sampled_vertices =
+      cugraph::select_random_vertices(handle,
+                                      graph_view,
+                                      std::make_optional(raft::device_span<vertex_t const>{
+                                        remaining_vertices.data(), remaining_vertices.size()}),
+                                      rng_state,
+                                      nr_candidates,
+                                      false,
+                                      true);
+
+    rmm::device_uvector<vertex_t> non_candidate_vertices(
+      remaining_vertices.size() - d_sampled_vertices.size(), handle.get_stream());
+
+    thrust::set_difference(handle.get_thrust_policy(),
+                           remaining_vertices.begin(),
+                           remaining_vertices.end(),
+                           d_sampled_vertices.begin(),
+                           d_sampled_vertices.end(),
+                           non_candidate_vertices.begin());
+
+    // Set temporary ranks of non-candidate vertices to std::numeric_limits<vertex_t>::lowest()
+    thrust::for_each(
+      handle.get_thrust_policy(),
+      non_candidate_vertices.begin(),
+      non_candidate_vertices.end(),
+      [temporary_ranks =
+         raft::device_span<vertex_t>(temporary_ranks.data(), temporary_ranks.size()),
+       v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) {
+        //
+        // if rank of a non-candidate vertex is not std::numeric_limits<vertex_t>::max() (i.e. the
+        // vertex is not already in MIS), set it to std::numeric_limits<vertex_t>::lowest()
+        //
+        auto v_offset = v - v_first;
+        if (temporary_ranks[v_offset] < std::numeric_limits<vertex_t>::max()) {
+          temporary_ranks[v_offset] = std::numeric_limits<vertex_t>::lowest();
+        }
+      });
+
+    // Caches for ranks
+    edge_src_property_t<GraphViewType, vertex_t> src_rank_cache(handle);
+    edge_dst_property_t<GraphViewType, vertex_t> dst_rank_cache(handle);
+
+    // Update rank caches with temporary ranks
+    if constexpr (multi_gpu) {
+      src_rank_cache = edge_src_property_t<GraphViewType, vertex_t>(handle, graph_view);
+      dst_rank_cache = edge_dst_property_t<GraphViewType, vertex_t>(handle, graph_view);
+      update_edge_src_property(handle, graph_view, temporary_ranks.begin(), src_rank_cache);
+      update_edge_dst_property(handle, graph_view, temporary_ranks.begin(), dst_rank_cache);
+    }
+
+    //
+    // Find maximum rank outgoing neighbor for each vertex
+    //
+
+    rmm::device_uvector<vertex_t> max_outgoing_ranks(local_vtx_partitoin_size, handle.get_stream());
+
+    per_v_transform_reduce_outgoing_e(
+      handle,
+      graph_view,
+      multi_gpu
+        ? src_rank_cache.view()
+        : detail::edge_major_property_view_t<vertex_t, vertex_t const*>(temporary_ranks.data()),
+      multi_gpu ? dst_rank_cache.view()
+                : detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(
+                    temporary_ranks.data(), vertex_t{0}),
+      edge_dummy_property_t{}.view(),
+      [] __device__(auto src, auto dst, auto src_rank, auto dst_rank, auto wt) { return dst_rank; },
+      std::numeric_limits<vertex_t>::lowest(),
+      cugraph::reduce_op::maximum<vertex_t>{},
+      max_outgoing_ranks.begin());
+
+    //
+    // Find maximum rank incoming neighbor for each vertex
+    //
+
+    rmm::device_uvector<vertex_t> max_incoming_ranks(local_vtx_partitoin_size, handle.get_stream());
+
+    per_v_transform_reduce_incoming_e(
+      handle,
+      graph_view,
+      multi_gpu
+        ? src_rank_cache.view()
+        : detail::edge_major_property_view_t<vertex_t, vertex_t const*>(temporary_ranks.data()),
+      multi_gpu ? dst_rank_cache.view()
+                : detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(
+                    temporary_ranks.data(), vertex_t{0}),
+      edge_dummy_property_t{}.view(),
+      [] __device__(auto src, auto dst, auto src_rank, auto dst_rank, auto wt) { return src_rank; },
+      std::numeric_limits<vertex_t>::lowest(),
+      cugraph::reduce_op::maximum<vertex_t>{},
+      max_incoming_ranks.begin());
+
+    temporary_ranks.resize(0, handle.get_stream());
+    temporary_ranks.shrink_to_fit(handle.get_stream());
+
+    //
+    // Compute max of outgoing and incoming neighbors
+    //
+    thrust::transform(handle.get_thrust_policy(),
+                      max_incoming_ranks.begin(),
+                      max_incoming_ranks.end(),
+                      max_outgoing_ranks.begin(),
+                      max_outgoing_ranks.begin(),
+                      thrust::maximum<vertex_t>());
+
+    max_incoming_ranks.resize(0, handle.get_stream());
+    max_incoming_ranks.shrink_to_fit(handle.get_stream());
+
+    //
+    // If the max neighbor of a vertex is already in MIS (i.e. has rank
+    // std::numeric_limits<vertex_t>::max()), discard it, otherwise,
+    // include the vertex if it has larger rank than its maximum rank neighbor
+    //
+    auto last = thrust::remove_if(
+      handle.get_thrust_policy(),
+      d_sampled_vertices.begin(),
+      d_sampled_vertices.end(),
+      [max_rank_neighbor_first = max_outgoing_ranks.begin(),
+       ranks                   = raft::device_span<vertex_t>(ranks.data(), ranks.size()),
+       v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) {
+        auto v_offset          = v - v_first;
+        auto max_neighbor_rank = *(max_rank_neighbor_first + v_offset);
+        auto rank_of_v         = ranks[v_offset];
+
+        if (max_neighbor_rank >= std::numeric_limits<vertex_t>::max()) {
+          // Maximum rank neighbor is alreay in MIS
+          // Discard current vertex by setting its rank to
+          // std::numeric_limits<vertex_t>::lowest()
+          ranks[v_offset] = std::numeric_limits<vertex_t>::lowest();
+          return true;
+        }
+
+        if (rank_of_v >= max_neighbor_rank) {
+          // Include v and set its rank to std::numeric_limits<vertex_t>::max()
+          ranks[v_offset] = std::numeric_limits<vertex_t>::max();
+          return true;
+        }
+        return false;
+      });
+
+    max_outgoing_ranks.resize(0, handle.get_stream());
+    max_outgoing_ranks.shrink_to_fit(handle.get_stream());
+
+    d_sampled_vertices.resize(thrust::distance(d_sampled_vertices.begin(), last),
+                              handle.get_stream());
+    d_sampled_vertices.shrink_to_fit(handle.get_stream());
+
+    remaining_vertices.resize(non_candidate_vertices.size() + d_sampled_vertices.size(),
+                              handle.get_stream());
+    remaining_vertices.shrink_to_fit(handle.get_stream());
+
+    // merge non-candidate and remaining candidate vertices
+    thrust::merge(handle.get_thrust_policy(),
+                  non_candidate_vertices.begin(),
+                  non_candidate_vertices.end(),
+                  d_sampled_vertices.begin(),
+                  d_sampled_vertices.end(),
+                  remaining_vertices.begin());
+
+    nr_remaining_vertices_to_check = remaining_vertices.size();
+    if (multi_gpu) {
+      nr_remaining_vertices_to_check = host_scalar_allreduce(handle.get_comms(),
+                                                             nr_remaining_vertices_to_check,
+                                                             raft::comms::op_t::SUM,
+                                                             handle.get_stream());
+    }
+
+    if (nr_remaining_vertices_to_check == 0) { break; }
+  }
+
+  // Count number of vertices included in MIS
+
+  vertex_t nr_vertices_included_in_mis = thrust::count_if(
+    handle.get_thrust_policy(), ranks.begin(), ranks.end(), [] __device__(auto v_rank) {
+      return v_rank >= std::numeric_limits<vertex_t>::max();
+    });
+
+  // Build MIS and return
+  rmm::device_uvector<vertex_t> mis(nr_vertices_included_in_mis, handle.get_stream());
+  thrust::copy_if(
+    handle.get_thrust_policy(),
+    vertex_begin,
+    vertex_end,
+    ranks.begin(),
+    mis.begin(),
+    [] __device__(auto v_rank) { return v_rank >= std::numeric_limits<vertex_t>::max(); });
+
+  ranks.resize(0, handle.get_stream());
+  ranks.shrink_to_fit(handle.get_stream());
+  return mis;
+}
+}  // namespace detail
+
+}  // namespace cugraph
diff --git a/cpp/src/community/mis.hpp b/cpp/src/community/detail/maximal_independent_moves.hpp
similarity index 91%
rename from cpp/src/community/mis.hpp
rename to cpp/src/community/detail/maximal_independent_moves.hpp
index 83c0d9775f9..b5588b11ef7 100644
--- a/cpp/src/community/mis.hpp
+++ b/cpp/src/community/detail/maximal_independent_moves.hpp
@@ -23,9 +23,13 @@
 #include <rmm/device_uvector.hpp>
 
 namespace cugraph {
+namespace detail {
+
 template <typename vertex_t, typename edge_t, bool multi_gpu>
-rmm::device_uvector<vertex_t> maximal_independent_set(
+rmm::device_uvector<vertex_t> maximal_independent_moves(
   raft::handle_t const& handle,
   graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
   raft::random::RngState& rng_state);
+
+}  // namespace detail
 }  // namespace cugraph
diff --git a/cpp/src/community/detail/maximal_independent_moves_mg.cu b/cpp/src/community/detail/maximal_independent_moves_mg.cu
new file mode 100644
index 00000000000..577253cdf58
--- /dev/null
+++ b/cpp/src/community/detail/maximal_independent_moves_mg.cu
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "maximal_independent_moves.cuh"
+
+namespace cugraph {
+namespace detail {
+
+template rmm::device_uvector<int32_t> maximal_independent_moves(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, true> const& decision_graph_view,
+  raft::random::RngState& rng_state);
+
+template rmm::device_uvector<int32_t> maximal_independent_moves(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int64_t, false, true> const& decision_graph_view,
+  raft::random::RngState& rng_state);
+
+template rmm::device_uvector<int64_t> maximal_independent_moves(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, true> const& decision_graph_view,
+  raft::random::RngState& rng_state);
+
+}  // namespace detail
+
+}  // namespace cugraph
diff --git a/cpp/src/community/detail/maximal_independent_moves_sg.cu b/cpp/src/community/detail/maximal_independent_moves_sg.cu
new file mode 100644
index 00000000000..18527c1ce48
--- /dev/null
+++ b/cpp/src/community/detail/maximal_independent_moves_sg.cu
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "maximal_independent_moves.cuh"
+
+namespace cugraph {
+namespace detail {
+
+template rmm::device_uvector<int32_t> maximal_independent_moves(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, false> const& decision_graph_view,
+  raft::random::RngState& rng_state);
+
+template rmm::device_uvector<int32_t> maximal_independent_moves(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int64_t, false, false> const& decision_graph_view,
+  raft::random::RngState& rng_state);
+
+template rmm::device_uvector<int64_t> maximal_independent_moves(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, false> const& decision_graph_view,
+  raft::random::RngState& rng_state);
+
+}  // namespace detail
+}  // namespace cugraph
diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh
index b767ce7d8bb..ef34ad90584 100644
--- a/cpp/src/community/detail/refine_impl.cuh
+++ b/cpp/src/community/detail/refine_impl.cuh
@@ -16,8 +16,8 @@
 #pragma once
 
 #include "common_methods.hpp"
-#include "community/mis.hpp"
 #include "detail/graph_partition_utils.cuh"
+#include "maximal_independent_moves.hpp"
 #include "prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh"
 #include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh"
 #include "prims/reduce_op.cuh"
@@ -660,8 +660,8 @@ refine_clustering(
     // Determine a set of moves using MIS of the decision_graph
     //
 
-    auto vertices_in_mis =
-      maximal_independent_set<vertex_t, edge_t, multi_gpu>(handle, decision_graph_view, rng_state);
+    auto vertices_in_mis = maximal_independent_moves<vertex_t, edge_t, multi_gpu>(
+      handle, decision_graph_view, rng_state);
 
     rmm::device_uvector<vertex_t> numbering_indices((*renumber_map).size(), handle.get_stream());
     detail::sequence_fill(handle.get_stream(),
diff --git a/cpp/src/community/detail/mis_impl.cuh b/cpp/src/components/mis_impl.cuh
similarity index 90%
rename from cpp/src/community/detail/mis_impl.cuh
rename to cpp/src/components/mis_impl.cuh
index 217d64eb1c1..550edf9807a 100644
--- a/cpp/src/community/detail/mis_impl.cuh
+++ b/cpp/src/components/mis_impl.cuh
@@ -16,11 +16,12 @@
  */
 #pragma once
 
-#include "community/mis.hpp"
+#include "prims/fill_edge_property.cuh"
 #include "prims/fill_edge_src_dst_property.cuh"
 #include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh"
 #include "prims/update_edge_src_dst_property.cuh"
 
+#include <cugraph/algorithms.hpp>
 #include <cugraph/edge_property.hpp>
 #include <cugraph/edge_src_dst_property.hpp>
 #include <cugraph/graph_functions.hpp>
@@ -60,36 +61,46 @@ rmm::device_uvector<vertex_t> maximal_independent_set(
     thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first());
   auto vertex_end = thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last());
 
-  // Compute out-degree
   auto out_degrees = graph_view.compute_out_degrees(handle);
+  auto in_degrees  = graph_view.compute_in_degrees(handle);
 
-  // Vertices with non-zero out-degree are possible candidates for MIS.
+  // Vertices with degree zero are always part of MIS
   remaining_vertices.resize(
     thrust::distance(remaining_vertices.begin(),
                      thrust::copy_if(handle.get_thrust_policy(),
                                      vertex_begin,
                                      vertex_end,
-                                     out_degrees.begin(),
+                                     thrust::make_zip_iterator(
+                                       thrust::make_tuple(out_degrees.begin(), in_degrees.begin())),
                                      remaining_vertices.begin(),
-                                     [] __device__(auto deg) { return deg > 0; })),
+                                     [] __device__(auto out_deg_and_in_deg) {
+                                       return !((thrust::get<0>(out_deg_and_in_deg) == 0) &&
+                                                (thrust::get<1>(out_deg_and_in_deg) == 0));
+                                     })),
     handle.get_stream());
 
   // Set ID of each vertex as its rank
   rmm::device_uvector<vertex_t> ranks(local_vtx_partitoin_size, handle.get_stream());
   thrust::copy(handle.get_thrust_policy(), vertex_begin, vertex_end, ranks.begin());
 
-  // Set ranks of zero out-degree vetices to std::numeric_limits<vertex_t>::lowest()
-  thrust::transform_if(handle.get_thrust_policy(),
-                       out_degrees.begin(),
-                       out_degrees.end(),
-                       ranks.begin(),
-                       cuda::proclaim_return_type<vertex_t>(
-                         [] __device__(auto) { return std::numeric_limits<vertex_t>::lowest(); }),
-                       [] __device__(auto deg) { return deg == 0; });
+  // Set ranks of zero degree vetices to std::numeric_limits<vertex_t>::max()
+  thrust::transform_if(
+    handle.get_thrust_policy(),
+    thrust::make_zip_iterator(thrust::make_tuple(out_degrees.begin(), in_degrees.begin())),
+    thrust::make_zip_iterator(thrust::make_tuple(out_degrees.end(), in_degrees.end())),
+    ranks.begin(),
+    cuda::proclaim_return_type<vertex_t>(
+      [] __device__(auto) { return std::numeric_limits<vertex_t>::max(); }),
+    [] __device__(auto in_out_degree) {
+      return (thrust::get<0>(in_out_degree) == 0) && (thrust::get<1>(in_out_degree) == 0);
+    });
 
   out_degrees.resize(0, handle.get_stream());
   out_degrees.shrink_to_fit(handle.get_stream());
 
+  in_degrees.resize(0, handle.get_stream());
+  in_degrees.shrink_to_fit(handle.get_stream());
+
   size_t loop_counter = 0;
   while (true) {
     loop_counter++;
diff --git a/cpp/src/community/detail/mis_mg.cu b/cpp/src/components/mis_mg.cu
similarity index 83%
rename from cpp/src/community/detail/mis_mg.cu
rename to cpp/src/components/mis_mg.cu
index 0fc5eeb63c0..2418b38dd0b 100644
--- a/cpp/src/community/detail/mis_mg.cu
+++ b/cpp/src/components/mis_mg.cu
@@ -14,21 +14,21 @@
  * limitations under the License.
  */
 #include "mis_impl.cuh"
-
 namespace cugraph {
+
 template rmm::device_uvector<int32_t> maximal_independent_set(
   raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, true> const& decision_graph_view,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
   raft::random::RngState& rng_state);
 
 template rmm::device_uvector<int32_t> maximal_independent_set(
   raft::handle_t const& handle,
-  graph_view_t<int32_t, int64_t, false, true> const& decision_graph_view,
+  graph_view_t<int32_t, int64_t, false, true> const& graph_view,
   raft::random::RngState& rng_state);
 
 template rmm::device_uvector<int64_t> maximal_independent_set(
   raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, true> const& decision_graph_view,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
   raft::random::RngState& rng_state);
 
 }  // namespace cugraph
diff --git a/cpp/src/community/detail/mis_sg.cu b/cpp/src/components/mis_sg.cu
similarity index 82%
rename from cpp/src/community/detail/mis_sg.cu
rename to cpp/src/components/mis_sg.cu
index 8a20b31d157..fea4c4f2765 100644
--- a/cpp/src/community/detail/mis_sg.cu
+++ b/cpp/src/components/mis_sg.cu
@@ -16,19 +16,20 @@
 #include "mis_impl.cuh"
 
 namespace cugraph {
+
 template rmm::device_uvector<int32_t> maximal_independent_set(
   raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, false> const& decision_graph_view,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
   raft::random::RngState& rng_state);
 
 template rmm::device_uvector<int32_t> maximal_independent_set(
   raft::handle_t const& handle,
-  graph_view_t<int32_t, int64_t, false, false> const& decision_graph_view,
+  graph_view_t<int32_t, int64_t, false, false> const& graph_view,
   raft::random::RngState& rng_state);
 
 template rmm::device_uvector<int64_t> maximal_independent_set(
   raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, false> const& decision_graph_view,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
   raft::random::RngState& rng_state);
 
 }  // namespace cugraph
diff --git a/cpp/src/components/vertex_coloring_impl.cuh b/cpp/src/components/vertex_coloring_impl.cuh
new file mode 100644
index 00000000000..ce445ab3809
--- /dev/null
+++ b/cpp/src/components/vertex_coloring_impl.cuh
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "prims/fill_edge_property.cuh"
+#include "prims/transform_e.cuh"
+#include "prims/update_edge_src_dst_property.cuh"
+
+#include <cugraph/algorithms.hpp>
+
+#include <raft/core/handle.hpp>
+#include <raft/random/rng_state.hpp>
+
+namespace cugraph {
+
+namespace detail {
+
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+rmm::device_uvector<vertex_t> vertex_coloring(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+  raft::random::RngState& rng_state)
+{
+  using graph_view_t = cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu>;
+  graph_view_t current_graph_view(graph_view);
+
+  // edge mask
+  cugraph::edge_property_t<graph_view_t, bool> edge_masks_even(handle, current_graph_view);
+  cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_even);
+
+  cugraph::edge_property_t<graph_view_t, bool> edge_masks_odd(handle, current_graph_view);
+  cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_odd);
+
+  cugraph::transform_e(
+    handle,
+    current_graph_view,
+    edge_src_dummy_property_t{}.view(),
+    edge_dst_dummy_property_t{}.view(),
+    cugraph::edge_dummy_property_t{}.view(),
+    [] __device__(auto src, auto dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) {
+      return !(src == dst);  // mask out self-loop
+    },
+    edge_masks_even.mutable_view());
+
+  current_graph_view.attach_edge_mask(edge_masks_even.view());
+
+  // device vector to store colors of vertices
+  rmm::device_uvector<vertex_t> colors = rmm::device_uvector<vertex_t>(
+    current_graph_view.local_vertex_partition_range_size(), handle.get_stream());
+  thrust::fill(
+    handle.get_thrust_policy(), colors.begin(), colors.end(), std::numeric_limits<vertex_t>::max());
+
+  vertex_t color_id = 0;
+  while (true) {
+    auto mis = cugraph::maximal_independent_set<vertex_t, edge_t, multi_gpu>(
+      handle, current_graph_view, rng_state);
+
+    using flag_t                                 = uint8_t;
+    rmm::device_uvector<flag_t> is_vertex_in_mis = rmm::device_uvector<flag_t>(
+      current_graph_view.local_vertex_partition_range_size(), handle.get_stream());
+    thrust::fill(handle.get_thrust_policy(), is_vertex_in_mis.begin(), is_vertex_in_mis.end(), 0);
+
+    thrust::for_each(
+      handle.get_thrust_policy(),
+      mis.begin(),
+      mis.end(),
+      [color_id,
+       colors           = colors.data(),
+       is_vertex_in_mis = is_vertex_in_mis.data(),
+       v_first = current_graph_view.local_vertex_partition_range_first()] __device__(vertex_t v) {
+        auto v_offset              = v - v_first;
+        is_vertex_in_mis[v_offset] = flag_t{1};
+        vertex_t initial_color_id  = colors[v_offset];
+        colors[v_offset]           = (color_id < initial_color_id) ? color_id : initial_color_id;
+      });
+
+    if (current_graph_view.compute_number_of_edges(handle) == 0) { break; }
+
+    cugraph::edge_src_property_t<graph_view_t, flag_t> src_mis_flags(handle, current_graph_view);
+    cugraph::edge_dst_property_t<graph_view_t, flag_t> dst_mis_flags(handle, current_graph_view);
+
+    cugraph::update_edge_src_property(
+      handle, current_graph_view, is_vertex_in_mis.begin(), src_mis_flags);
+
+    cugraph::update_edge_dst_property(
+      handle, current_graph_view, is_vertex_in_mis.begin(), dst_mis_flags);
+
+    if (color_id % 2 == 0) {
+      cugraph::transform_e(
+        handle,
+        current_graph_view,
+        src_mis_flags.view(),
+        dst_mis_flags.view(),
+        cugraph::edge_dummy_property_t{}.view(),
+        [color_id] __device__(
+          auto src, auto dst, auto is_src_in_mis, auto is_dst_in_mis, thrust::nullopt_t) {
+          return !((is_src_in_mis == uint8_t{true}) || (is_dst_in_mis == uint8_t{true}));
+        },
+        edge_masks_odd.mutable_view());
+
+      if (current_graph_view.has_edge_mask()) current_graph_view.clear_edge_mask();
+      cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_even);
+      current_graph_view.attach_edge_mask(edge_masks_odd.view());
+    } else {
+      cugraph::transform_e(
+        handle,
+        current_graph_view,
+        src_mis_flags.view(),
+        dst_mis_flags.view(),
+        cugraph::edge_dummy_property_t{}.view(),
+        [color_id] __device__(
+          auto src, auto dst, auto is_src_in_mis, auto is_dst_in_mis, thrust::nullopt_t) {
+          return !((is_src_in_mis == uint8_t{true}) || (is_dst_in_mis == uint8_t{true}));
+        },
+        edge_masks_even.mutable_view());
+
+      if (current_graph_view.has_edge_mask()) current_graph_view.clear_edge_mask();
+      cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_odd);
+      current_graph_view.attach_edge_mask(edge_masks_even.view());
+    }
+
+    color_id++;
+  }
+  return colors;
+}
+}  // namespace detail
+
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+rmm::device_uvector<vertex_t> vertex_coloring(
+  raft::handle_t const& handle,
+  graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+  raft::random::RngState& rng_state)
+{
+  return detail::vertex_coloring(handle, graph_view, rng_state);
+}
+
+}  // namespace cugraph
diff --git a/cpp/src/components/vertex_coloring_mg.cu b/cpp/src/components/vertex_coloring_mg.cu
new file mode 100644
index 00000000000..8f87e8bd534
--- /dev/null
+++ b/cpp/src/components/vertex_coloring_mg.cu
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "vertex_coloring_impl.cuh"
+
+namespace cugraph {
+
+template rmm::device_uvector<int32_t> vertex_coloring(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  raft::random::RngState& rng_state);
+
+template rmm::device_uvector<int32_t> vertex_coloring(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int64_t, false, true> const& graph_view,
+  raft::random::RngState& rng_state);
+
+template rmm::device_uvector<int64_t> vertex_coloring(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  raft::random::RngState& rng_state);
+
+}  // namespace cugraph
diff --git a/cpp/src/components/vertex_coloring_sg.cu b/cpp/src/components/vertex_coloring_sg.cu
new file mode 100644
index 00000000000..427bc0b2c81
--- /dev/null
+++ b/cpp/src/components/vertex_coloring_sg.cu
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "vertex_coloring_impl.cuh"
+
+namespace cugraph {
+
+template rmm::device_uvector<int32_t> vertex_coloring(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  raft::random::RngState& rng_state);
+
+template rmm::device_uvector<int32_t> vertex_coloring(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int64_t, false, false> const& graph_view,
+  raft::random::RngState& rng_state);
+
+template rmm::device_uvector<int64_t> vertex_coloring(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  raft::random::RngState& rng_state);
+
+}  // namespace cugraph
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 46a895536ef..af0dffcbf65 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -372,6 +372,14 @@ ConfigureTest(EDGE_BETWEENNESS_CENTRALITY_TEST centrality/edge_betweenness_centr
 # - WEAKLY CONNECTED COMPONENTS tests -------------------------------------------------------------
 ConfigureTest(WEAKLY_CONNECTED_COMPONENTS_TEST components/weakly_connected_components_test.cpp)
 
+###############################################################################################
+# - MIS tests ------------------------------------------------------------------------------
+ConfigureTest(MIS_TEST components/mis_test.cu)
+
+###############################################################################################
+# - VERTEX COLORING tests -------------------------------------------------------------------
+ConfigureTest(VERTEX_COLORING_TEST components/vertex_coloring_test.cu)
+
 ###################################################################################################
 # - SIMILARITY tests ------------------------------------------------------------------------------
 ConfigureTest(SIMILARITY_TEST link_prediction/similarity_test.cu)
@@ -535,10 +543,6 @@ if(BUILD_CUGRAPH_MG_TESTS)
     # - MG ECG tests --------------------------------------------------------------------------
     ConfigureTestMG(MG_ECG_TEST community/mg_ecg_test.cpp)
 
-    ###############################################################################################
-    # - MG MIS tests ------------------------------------------------------------------------------
-    ConfigureTestMG(MG_MIS_TEST community/mg_mis_test.cu)
-
     ###############################################################################################
     # - MG SELECT RANDOM VERTICES tests -----------------------------------------------------------
     ConfigureTestMG(MG_SELECT_RANDOM_VERTICES structure/mg_select_random_vertices_test.cpp)
@@ -552,6 +556,14 @@ if(BUILD_CUGRAPH_MG_TESTS)
     ConfigureTestMG(MG_WEAKLY_CONNECTED_COMPONENTS_TEST
                     components/mg_weakly_connected_components_test.cpp)
 
+    ###############################################################################################
+    # - MG MIS tests ------------------------------------------------------------------------------
+    ConfigureTestMG(MG_MIS_TEST components/mg_mis_test.cu)
+
+    ###############################################################################################
+    # - MG VERTEX COLORING tests -------------------------------------------------------------------
+    ConfigureTestMG(MG_VERTEX_COLORING_TEST components/mg_vertex_coloring_test.cu)
+
     ###############################################################################################
     # - MG Core Number tests ----------------------------------------------------------------------
     ConfigureTestMG(MG_CORE_NUMBER_TEST cores/mg_core_number_test.cpp)
diff --git a/cpp/tests/community/mg_mis_test.cu b/cpp/tests/components/mg_mis_test.cu
similarity index 95%
rename from cpp/tests/community/mg_mis_test.cu
rename to cpp/tests/components/mg_mis_test.cu
index 1240cf812f9..04c346b0f00 100644
--- a/cpp/tests/community/mg_mis_test.cu
+++ b/cpp/tests/components/mg_mis_test.cu
@@ -245,18 +245,20 @@ TEST_P(Tests_MGMaximalIndependentSet_Rmat, CheckInt64Int64FloatFloat)
     override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
 }
 
+bool constexpr check_correctness = false;
 INSTANTIATE_TEST_SUITE_P(
   file_test,
   Tests_MGMaximalIndependentSet_File,
-  ::testing::Combine(::testing::Values(MaximalIndependentSet_Usecase{false},
-                                       MaximalIndependentSet_Usecase{false}),
+  ::testing::Combine(::testing::Values(MaximalIndependentSet_Usecase{check_correctness},
+                                       MaximalIndependentSet_Usecase{check_correctness}),
                      ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
 
-INSTANTIATE_TEST_SUITE_P(rmat_small_test,
-                         Tests_MGMaximalIndependentSet_Rmat,
-                         ::testing::Combine(::testing::Values(MaximalIndependentSet_Usecase{false}),
-                                            ::testing::Values(cugraph::test::Rmat_Usecase(
-                                              3, 4, 0.57, 0.19, 0.19, 0, true, false))));
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_MGMaximalIndependentSet_Rmat,
+  ::testing::Combine(
+    ::testing::Values(MaximalIndependentSet_Usecase{check_correctness}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(3, 4, 0.57, 0.19, 0.19, 0, true, false))));
 
 INSTANTIATE_TEST_SUITE_P(
   rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
@@ -266,7 +268,8 @@ INSTANTIATE_TEST_SUITE_P(
                           factor (to avoid running same benchmarks more than once) */
   Tests_MGMaximalIndependentSet_Rmat,
   ::testing::Combine(
-    ::testing::Values(MaximalIndependentSet_Usecase{false}, MaximalIndependentSet_Usecase{false}),
+    ::testing::Values(MaximalIndependentSet_Usecase{check_correctness},
+                      MaximalIndependentSet_Usecase{check_correctness}),
     ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
 
 CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/components/mg_vertex_coloring_test.cu b/cpp/tests/components/mg_vertex_coloring_test.cu
new file mode 100644
index 00000000000..ce4dadaa786
--- /dev/null
+++ b/cpp/tests/components/mg_vertex_coloring_test.cu
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governin_from_mtxg permissions and
+ * limitations under the License.
+ */
+
+#include "prims/fill_edge_src_dst_property.cuh"
+#include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh"
+#include "prims/property_generator.cuh"
+#include "prims/reduce_op.cuh"
+#include "prims/transform_reduce_e.cuh"
+#include "prims/update_edge_src_dst_property.cuh"
+#include "utilities/base_fixture.hpp"
+#include "utilities/test_graphs.hpp"
+#include "utilities/test_utilities.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/edge_partition_view.hpp>
+#include <cugraph/edge_property.hpp>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/dataframe_buffer.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+#include <cugraph/utilities/host_scalar_comm.hpp>
+
+#include <raft/random/rng_state.hpp>
+
+#include <gtest/gtest.h>
+
+#include <chrono>
+#include <iostream>
+#include <random>
+
+struct GraphColoring_UseCase {
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_MGGraphColoring
+  : public ::testing::TestWithParam<std::tuple<GraphColoring_UseCase, input_usecase_t>> {
+ public:
+  Tests_MGGraphColoring() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+  void run_current_test(std::tuple<GraphColoring_UseCase, input_usecase_t> const& param)
+  {
+    auto [coloring_usecase, input_usecase] = param;
+
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      handle_->get_comms().barrier();
+      hr_timer.start("MG Construct graph");
+    }
+
+    constexpr bool multi_gpu = true;
+
+    auto [mg_graph, mg_edge_weights, mg_renumber_map] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, multi_gpu>(
+        *handle_, input_usecase, false, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+    auto mg_edge_weight_view =
+      mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt;
+
+    raft::random::RngState rng_state(multi_gpu ? handle_->get_comms().get_rank() : 0);
+    auto d_colors =
+      cugraph::vertex_coloring<vertex_t, edge_t, multi_gpu>(*handle_, mg_graph_view, rng_state);
+
+    // Test Graph Coloring
+
+    if (coloring_usecase.check_correctness) {
+      std::vector<vertex_t> h_colors(d_colors.size());
+      raft::update_host(h_colors.data(), d_colors.data(), d_colors.size(), handle_->get_stream());
+
+      std::for_each(h_colors.begin(),
+                    h_colors.end(),
+                    [num_vertices = mg_graph_view.number_of_vertices()](vertex_t color_id) {
+                      ASSERT_TRUE(color_id <= num_vertices);
+                    });
+
+      using GraphViewType = cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu>;
+      cugraph::edge_src_property_t<GraphViewType, vertex_t> src_color_cache(*handle_);
+      cugraph::edge_dst_property_t<GraphViewType, vertex_t> dst_color_cache(*handle_);
+
+      if constexpr (multi_gpu) {
+        src_color_cache =
+          cugraph::edge_src_property_t<GraphViewType, vertex_t>(*handle_, mg_graph_view);
+        dst_color_cache =
+          cugraph::edge_dst_property_t<GraphViewType, vertex_t>(*handle_, mg_graph_view);
+        update_edge_src_property(*handle_, mg_graph_view, d_colors.begin(), src_color_cache);
+        update_edge_dst_property(*handle_, mg_graph_view, d_colors.begin(), dst_color_cache);
+      }
+
+      rmm::device_uvector<uint8_t> d_color_conflicts(
+        mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
+
+      per_v_transform_reduce_outgoing_e(
+        *handle_,
+        mg_graph_view,
+        multi_gpu
+          ? src_color_cache.view()
+          : cugraph::detail::edge_major_property_view_t<vertex_t, vertex_t const*>(d_colors.data()),
+        multi_gpu ? dst_color_cache.view()
+                  : cugraph::detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(
+                      d_colors.data(), vertex_t{0}),
+        cugraph::edge_dummy_property_t{}.view(),
+        [] __device__(auto src, auto dst, auto src_color, auto dst_color, thrust::nullopt_t) {
+          if ((src != dst) && (src_color == dst_color)) {
+            return uint8_t{1};
+          } else {
+            return uint8_t{0};
+          }
+        },
+        uint8_t{0},
+        cugraph::reduce_op::maximum<uint8_t>{},
+        d_color_conflicts.begin());
+
+      std::vector<uint8_t> h_color_conflicts(d_color_conflicts.size());
+      raft::update_host(h_color_conflicts.data(),
+                        d_color_conflicts.data(),
+                        d_color_conflicts.size(),
+                        handle_->get_stream());
+
+      std::vector<vertex_t> h_vertices_in_this_proces((*mg_renumber_map).size());
+
+      raft::update_host(h_vertices_in_this_proces.data(),
+                        (*mg_renumber_map).data(),
+                        (*mg_renumber_map).size(),
+                        handle_->get_stream());
+      handle_->sync_stream();
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      size_t nr_conflicts = cugraph::transform_reduce_e(
+        *handle_,
+        mg_graph_view,
+        multi_gpu ? src_color_cache.view()
+                  : cugraph::detail::edge_major_property_view_t<vertex_t, vertex_t const*>(
+                      d_colors.begin()),
+        multi_gpu ? dst_color_cache.view()
+                  : cugraph::detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(
+                      d_colors.begin(), vertex_t{0}),
+        cugraph::edge_dummy_property_t{}.view(),
+        [renumber_map = (*mg_renumber_map).data()] __device__(
+          auto src, auto dst, auto src_color, auto dst_color, thrust::nullopt_t) {
+          if ((src != dst) && (src_color == dst_color)) {
+            return vertex_t{1};
+          } else {
+            return vertex_t{0};
+          }
+        },
+        vertex_t{0});
+
+      ASSERT_TRUE(nr_conflicts == edge_t{0})
+        << "adjacent vertices can't have same color." << std::endl;
+
+      {
+        thrust::for_each(
+          thrust::host,
+          thrust::make_zip_iterator(thrust::make_tuple(
+            h_colors.begin(), h_vertices_in_this_proces.begin(), h_color_conflicts.begin())),
+          thrust::make_zip_iterator(thrust::make_tuple(
+            h_colors.end(), h_vertices_in_this_proces.end(), h_color_conflicts.end())),
+          [](auto color_vetex_and_conflict_flag) {
+            auto color         = thrust::get<0>(color_vetex_and_conflict_flag);
+            auto v             = thrust::get<1>(color_vetex_and_conflict_flag);
+            auto conflict_flag = thrust::get<2>(color_vetex_and_conflict_flag);
+            ASSERT_TRUE(conflict_flag == 0)
+              << v << " got same color as one of its neighbor" << std::endl;
+          });
+      }
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t> Tests_MGGraphColoring<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGGraphColoring_File = Tests_MGGraphColoring<cugraph::test::File_Usecase>;
+using Tests_MGGraphColoring_Rmat = Tests_MGGraphColoring<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGGraphColoring_File, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGGraphColoring_File, CheckInt32Int64FloatFloat)
+{
+  run_current_test<int32_t, int64_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGGraphColoring_File, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGGraphColoring_Rmat, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGGraphColoring_Rmat, CheckInt32Int64FloatFloat)
+{
+  run_current_test<int32_t, int64_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGGraphColoring_Rmat, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+bool constexpr check_correctness = false;
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_MGGraphColoring_File,
+  ::testing::Combine(::testing::Values(GraphColoring_UseCase{check_correctness},
+                                       GraphColoring_UseCase{check_correctness}),
+                     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_MGGraphColoring_Rmat,
+  ::testing::Combine(
+    ::testing::Values(GraphColoring_UseCase{check_correctness}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(3, 4, 0.57, 0.19, 0.19, 0, true, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_MGGraphColoring_Rmat,
+  ::testing::Combine(
+    ::testing::Values(GraphColoring_UseCase{check_correctness},
+                      GraphColoring_UseCase{check_correctness}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/components/mis_test.cu b/cpp/tests/components/mis_test.cu
new file mode 100644
index 00000000000..f3bdd3d0e8b
--- /dev/null
+++ b/cpp/tests/components/mis_test.cu
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governin_from_mtxg permissions and
+ * limitations under the License.
+ */
+
+#include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh"
+#include "prims/reduce_op.cuh"
+#include "utilities/base_fixture.hpp"
+#include "utilities/test_graphs.hpp"
+#include "utilities/test_utilities.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/edge_partition_view.hpp>
+#include <cugraph/edge_property.hpp>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <raft/random/rng_state.hpp>
+
+#include <gtest/gtest.h>
+
+#include <chrono>
+#include <iostream>
+#include <random>
+
+struct MaximalIndependentSet_Usecase {
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_SGMaximalIndependentSet
+  : public ::testing::TestWithParam<std::tuple<MaximalIndependentSet_Usecase, input_usecase_t>> {
+ public:
+  Tests_SGMaximalIndependentSet() {}
+
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+  void run_current_test(std::tuple<MaximalIndependentSet_Usecase, input_usecase_t> const& param)
+  {
+    auto [mis_usecase, input_usecase] = param;
+
+    raft::handle_t handle{};
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      hr_timer.start("Construct graph");
+    }
+
+    constexpr bool multi_gpu = false;
+
+    auto [sg_graph, sg_edge_weights, sg_renumber_map] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, multi_gpu>(
+        handle, input_usecase, false, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto sg_graph_view = sg_graph.view();
+    auto sg_edge_weight_view =
+      sg_edge_weights ? std::make_optional((*sg_edge_weights).view()) : std::nullopt;
+
+    raft::random::RngState rng_state(0);
+    auto d_mis = cugraph::maximal_independent_set<vertex_t, edge_t, multi_gpu>(
+      handle, sg_graph_view, rng_state);
+
+    // Test MIS
+    if (mis_usecase.check_correctness) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      std::vector<vertex_t> h_mis(d_mis.size());
+      raft::update_host(h_mis.data(), d_mis.data(), d_mis.size(), handle.get_stream());
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      auto vertex_first = sg_graph_view.local_vertex_partition_range_first();
+      auto vertex_last  = sg_graph_view.local_vertex_partition_range_last();
+
+      std::for_each(h_mis.begin(), h_mis.end(), [vertex_first, vertex_last](vertex_t v) {
+        ASSERT_TRUE((v >= vertex_first) && (v < vertex_last));
+      });
+
+      // If a vertex is included in MIS, then none of its neighbor should be
+
+      vertex_t local_vtx_partitoin_size = sg_graph_view.local_vertex_partition_range_size();
+      rmm::device_uvector<vertex_t> d_total_outgoing_nbrs_included_mis(local_vtx_partitoin_size,
+                                                                       handle.get_stream());
+
+      rmm::device_uvector<vertex_t> inclusiong_flags(local_vtx_partitoin_size, handle.get_stream());
+
+      thrust::uninitialized_fill(
+        handle.get_thrust_policy(), inclusiong_flags.begin(), inclusiong_flags.end(), vertex_t{0});
+
+      thrust::for_each(
+        handle.get_thrust_policy(),
+        d_mis.begin(),
+        d_mis.end(),
+        [inclusiong_flags =
+           raft::device_span<vertex_t>(inclusiong_flags.data(), inclusiong_flags.size()),
+         v_first = sg_graph_view.local_vertex_partition_range_first()] __device__(auto v) {
+          auto v_offset              = v - v_first;
+          inclusiong_flags[v_offset] = vertex_t{1};
+        });
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      per_v_transform_reduce_outgoing_e(
+        handle,
+        sg_graph_view,
+        cugraph::detail::edge_major_property_view_t<vertex_t, vertex_t const*>(
+          inclusiong_flags.data()),
+        cugraph::detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(
+          inclusiong_flags.data(), vertex_t{0}),
+        cugraph::edge_dummy_property_t{}.view(),
+        [] __device__(auto src, auto dst, auto src_included, auto dst_included, auto wt) {
+          return (src == dst) ? 0 : dst_included;
+        },
+        vertex_t{0},
+        cugraph::reduce_op::plus<vertex_t>{},
+        d_total_outgoing_nbrs_included_mis.begin());
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      std::vector<vertex_t> h_total_outgoing_nbrs_included_mis(
+        d_total_outgoing_nbrs_included_mis.size());
+      raft::update_host(h_total_outgoing_nbrs_included_mis.data(),
+                        d_total_outgoing_nbrs_included_mis.data(),
+                        d_total_outgoing_nbrs_included_mis.size(),
+                        handle.get_stream());
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      {
+        auto vertex_first = sg_graph_view.local_vertex_partition_range_first();
+        auto vertex_last  = sg_graph_view.local_vertex_partition_range_last();
+
+        std::for_each(h_mis.begin(),
+                      h_mis.end(),
+                      [vertex_first, vertex_last, &h_total_outgoing_nbrs_included_mis](vertex_t v) {
+                        ASSERT_TRUE((v >= vertex_first) && (v < vertex_last))
+                          << v << " is not within vertex parition range" << std::endl;
+
+                        ASSERT_TRUE(h_total_outgoing_nbrs_included_mis[v - vertex_first] == 0)
+                          << v << "'s neighbor is included in MIS" << std::endl;
+                      });
+      }
+    }
+  }
+};
+
+using Tests_SGMaximalIndependentSet_File =
+  Tests_SGMaximalIndependentSet<cugraph::test::File_Usecase>;
+using Tests_SGMaximalIndependentSet_Rmat =
+  Tests_SGMaximalIndependentSet<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_SGMaximalIndependentSet_File, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGMaximalIndependentSet_File, CheckInt32Int64FloatFloat)
+{
+  run_current_test<int32_t, int64_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGMaximalIndependentSet_File, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGMaximalIndependentSet_Rmat, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGMaximalIndependentSet_Rmat, CheckInt32Int64FloatFloat)
+{
+  run_current_test<int32_t, int64_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGMaximalIndependentSet_Rmat, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+bool constexpr check_correctness = false;
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_SGMaximalIndependentSet_File,
+  ::testing::Combine(::testing::Values(MaximalIndependentSet_Usecase{check_correctness},
+                                       MaximalIndependentSet_Usecase{check_correctness}),
+                     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_SGMaximalIndependentSet_Rmat,
+  ::testing::Combine(
+    ::testing::Values(MaximalIndependentSet_Usecase{check_correctness}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(3, 4, 0.57, 0.19, 0.19, 0, true, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_SGMaximalIndependentSet_Rmat,
+  ::testing::Combine(
+    ::testing::Values(MaximalIndependentSet_Usecase{check_correctness},
+                      MaximalIndependentSet_Usecase{check_correctness}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/components/vertex_coloring_test.cu b/cpp/tests/components/vertex_coloring_test.cu
new file mode 100644
index 00000000000..27a0c5013bd
--- /dev/null
+++ b/cpp/tests/components/vertex_coloring_test.cu
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governin_from_mtxg permissions and
+ * limitations under the License.
+ */
+
+#include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh"
+#include "prims/reduce_op.cuh"
+#include "prims/transform_reduce_e.cuh"
+#include "utilities/base_fixture.hpp"
+#include "utilities/test_graphs.hpp"
+#include "utilities/test_utilities.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/edge_partition_view.hpp>
+#include <cugraph/edge_property.hpp>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <raft/random/rng_state.hpp>
+
+#include <gtest/gtest.h>
+
+#include <chrono>
+#include <iostream>
+#include <random>
+
+struct GraphColoring_UseCase {
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_SGGraphColoring
+  : public ::testing::TestWithParam<std::tuple<GraphColoring_UseCase, input_usecase_t>> {
+ public:
+  Tests_SGGraphColoring() {}
+
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+  void run_current_test(std::tuple<GraphColoring_UseCase, input_usecase_t> const& param)
+  {
+    auto [coloring_usecase, input_usecase] = param;
+
+    raft::handle_t handle{};
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      hr_timer.start("Construct graph");
+    }
+
+    constexpr bool multi_gpu = false;
+
+    auto [sg_graph, sg_edge_weights, sg_renumber_map] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, multi_gpu>(
+        handle, input_usecase, false, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto sg_graph_view = sg_graph.view();
+    auto sg_edge_weight_view =
+      sg_edge_weights ? std::make_optional((*sg_edge_weights).view()) : std::nullopt;
+
+    raft::random::RngState rng_state(0);
+    auto d_colors =
+      cugraph::vertex_coloring<vertex_t, edge_t, multi_gpu>(handle, sg_graph_view, rng_state);
+
+    // Test Graph Coloring
+
+    if (coloring_usecase.check_correctness) {
+      std::vector<vertex_t> h_colors(d_colors.size());
+      raft::update_host(h_colors.data(), d_colors.data(), d_colors.size(), handle.get_stream());
+
+      std::for_each(h_colors.begin(),
+                    h_colors.end(),
+                    [num_vertices = sg_graph_view.number_of_vertices()](vertex_t color_id) {
+                      ASSERT_TRUE(color_id <= num_vertices);
+                    });
+
+      rmm::device_uvector<uint8_t> d_color_conflict_flags(
+        sg_graph_view.local_vertex_partition_range_size(), handle.get_stream());
+
+      per_v_transform_reduce_outgoing_e(
+        handle,
+        sg_graph_view,
+        cugraph::detail::edge_major_property_view_t<vertex_t, vertex_t const*>(d_colors.data()),
+        cugraph::detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(d_colors.data(),
+                                                                               vertex_t{0}),
+        cugraph::edge_dummy_property_t{}.view(),
+        [] __device__(auto src, auto dst, auto src_color, auto dst_color, thrust::nullopt_t) {
+          if ((src != dst) && (src_color == dst_color)) {
+            return uint8_t{1};
+          } else {
+            return uint8_t{0};
+          }
+        },
+        uint8_t{0},
+        cugraph::reduce_op::maximum<uint8_t>{},
+        d_color_conflict_flags.begin());
+
+      std::vector<uint8_t> h_color_conflict_flags(d_color_conflict_flags.size());
+      raft::update_host(h_color_conflict_flags.data(),
+                        d_color_conflict_flags.data(),
+                        d_color_conflict_flags.size(),
+                        handle.get_stream());
+
+      std::vector<vertex_t> h_vertices_in_this_proces((*sg_renumber_map).size());
+
+      raft::update_host(h_vertices_in_this_proces.data(),
+                        (*sg_renumber_map).data(),
+                        (*sg_renumber_map).size(),
+                        handle.get_stream());
+      handle.sync_stream();
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      edge_t nr_conflicts = cugraph::transform_reduce_e(
+        handle,
+        sg_graph_view,
+        cugraph::detail::edge_major_property_view_t<vertex_t, vertex_t const*>(d_colors.begin()),
+        cugraph::detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(d_colors.begin(),
+                                                                               vertex_t{0}),
+        cugraph::edge_dummy_property_t{}.view(),
+        [renumber_map = (*sg_renumber_map).data()] __device__(
+          auto src, auto dst, auto src_color, auto dst_color, thrust::nullopt_t) {
+          if ((src != dst) && (src_color == dst_color)) {
+            return vertex_t{1};
+          } else {
+            return vertex_t{0};
+          }
+        },
+        vertex_t{0});
+
+      ASSERT_TRUE(nr_conflicts == edge_t{0})
+        << "adjacent vertices can't have same color." << std::endl;
+
+      if (nr_conflicts >= 0) {
+        thrust::for_each(
+          thrust::host,
+          thrust::make_zip_iterator(thrust::make_tuple(
+            h_colors.begin(), h_vertices_in_this_proces.begin(), h_color_conflict_flags.begin())),
+          thrust::make_zip_iterator(thrust::make_tuple(
+            h_colors.end(), h_vertices_in_this_proces.end(), h_color_conflict_flags.end())),
+          [](auto color_vetex_and_conflict_flag) {
+            auto color         = thrust::get<0>(color_vetex_and_conflict_flag);
+            auto v             = thrust::get<1>(color_vetex_and_conflict_flag);
+            auto conflict_flag = thrust::get<2>(color_vetex_and_conflict_flag);
+            ASSERT_TRUE(conflict_flag == 0)
+              << v << " got same color as one of its neighbor" << std::endl;
+          });
+      }
+    }
+  }
+};
+
+using Tests_SGGraphColoring_File = Tests_SGGraphColoring<cugraph::test::File_Usecase>;
+using Tests_SGGraphColoring_Rmat = Tests_SGGraphColoring<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_SGGraphColoring_File, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGGraphColoring_File, CheckInt32Int64FloatFloat)
+{
+  run_current_test<int32_t, int64_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGGraphColoring_File, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGGraphColoring_Rmat, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGGraphColoring_Rmat, CheckInt32Int64FloatFloat)
+{
+  run_current_test<int32_t, int64_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGGraphColoring_Rmat, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+bool constexpr check_correctness = false;
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_SGGraphColoring_File,
+  ::testing::Combine(::testing::Values(GraphColoring_UseCase{check_correctness},
+                                       GraphColoring_UseCase{check_correctness}),
+                     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_SGGraphColoring_Rmat,
+  ::testing::Combine(
+    ::testing::Values(GraphColoring_UseCase{check_correctness}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(3, 4, 0.57, 0.19, 0.19, 0, true, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_SGGraphColoring_Rmat,
+  ::testing::Combine(
+    ::testing::Values(GraphColoring_UseCase{check_correctness},
+                      GraphColoring_UseCase{check_correctness}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_TEST_PROGRAM_MAIN()

From fdc6aa5acde918e50b6f6f365cc1ec03fcfb18ec Mon Sep 17 00:00:00 2001
From: Chuck Hastings <45364586+ChuckHastings@users.noreply.github.com>
Date: Tue, 12 Mar 2024 19:01:26 -0400
Subject: [PATCH 4/5] Add degrees to C API (#4212)

Add new method `cugraph_degrees` to the C API to compute and return the degrees of vertices.

Closes #4171

Authors:
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Joseph Nke (https://github.com/jnke2016)
  - Ralph Liu (https://github.com/nv-rliu)

Approvers:
  - Seunghwa Kang (https://github.com/seunghwak)
  - Rick Ratzel (https://github.com/rlratzel)

URL: https://github.com/rapidsai/cugraph/pull/4212
---
 cpp/CMakeLists.txt                            |   2 +
 cpp/include/cugraph_c/graph_functions.h       | 112 +++++
 cpp/src/c_api/abstract_functor.hpp            |   2 +-
 cpp/src/c_api/degrees.cu                      | 225 ++++++++++
 cpp/src/c_api/degrees_result.cpp              |  63 +++
 cpp/src/c_api/degrees_result.hpp              |  32 ++
 cpp/tests/CMakeLists.txt                      |   2 +
 cpp/tests/c_api/c_test_utils.h                |   4 +-
 cpp/tests/c_api/degrees_test.c                | 387 +++++++++++++++++
 cpp/tests/c_api/mg_degrees_test.c             | 407 ++++++++++++++++++
 cpp/tests/c_api/test_utils.cpp                |  14 +-
 .../simpleDistributedGraph.py                 | 307 +++++++------
 .../graph_implementation/simpleGraph.py       | 198 +++++----
 .../centrality/test_degree_centrality_mg.py   |   4 +-
 .../pylibcugraph/pylibcugraph/CMakeLists.txt  |   3 +-
 python/pylibcugraph/pylibcugraph/__init__.py  |   2 +
 .../_cugraph_c/graph_functions.pxd            |  55 +++
 python/pylibcugraph/pylibcugraph/degrees.pyx  | 307 +++++++++++++
 18 files changed, 1905 insertions(+), 221 deletions(-)
 create mode 100644 cpp/src/c_api/degrees.cu
 create mode 100644 cpp/src/c_api/degrees_result.cpp
 create mode 100644 cpp/src/c_api/degrees_result.hpp
 create mode 100644 cpp/tests/c_api/degrees_test.c
 create mode 100644 cpp/tests/c_api/mg_degrees_test.c
 create mode 100644 python/pylibcugraph/pylibcugraph/degrees.pyx

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index b12403710ab..88908ef70ce 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -415,6 +415,8 @@ endif()
 add_library(cugraph_c
         src/c_api/resource_handle.cpp
         src/c_api/array.cpp
+        src/c_api/degrees.cu
+        src/c_api/degrees_result.cpp
         src/c_api/error.cpp
         src/c_api/graph_sg.cpp
         src/c_api/graph_mg.cpp
diff --git a/cpp/include/cugraph_c/graph_functions.h b/cpp/include/cugraph_c/graph_functions.h
index 8fe1ea0b958..94b06189796 100644
--- a/cpp/include/cugraph_c/graph_functions.h
+++ b/cpp/include/cugraph_c/graph_functions.h
@@ -229,6 +229,118 @@ cugraph_error_code_t cugraph_allgather(const cugraph_resource_handle_t* handle,
                                        cugraph_induced_subgraph_result_t** result,
                                        cugraph_error_t** error);
 
+/**
+ * @brief       Opaque degree result type
+ */
+typedef struct {
+  int32_t align_;
+} cugraph_degrees_result_t;
+
+/**
+ * @brief      Compute in degrees
+ *
+ * Compute the in degrees for the vertices in the graph.
+ *
+ * @param [in]  handle              Handle for accessing resources.
+ * @param [in]  graph               Pointer to graph
+ * @param [in]  source_vertices     Device array of vertices we want to compute in degrees for.
+ * @param [in]  do_expensive_check  A flag to run expensive checks for input arguments (if set to
+ * true)
+ * @param [out] result              Opaque pointer to degrees result
+ * @param [out] error               Pointer to an error object storing details of any error.  Will
+ *                                  be populated if error code is not CUGRAPH_SUCCESS
+ * @return error code
+ */
+cugraph_error_code_t cugraph_in_degrees(
+  const cugraph_resource_handle_t* handle,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* source_vertices,
+  bool_t do_expensive_check,
+  cugraph_degrees_result_t** result,
+  cugraph_error_t** error);
+
+/**
+ * @brief      Compute out degrees
+ *
+ * Compute the out degrees for the vertices in the graph.
+ *
+ * @param [in]  handle              Handle for accessing resources.
+ * @param [in]  graph               Pointer to graph
+ * @param [in]  source_vertices     Device array of vertices we want to compute out degrees for.
+ * @param [in]  do_expensive_check  A flag to run expensive checks for input arguments (if set to
+ * true)
+ * @param [out] result              Opaque pointer to degrees result
+ * @param [out] error               Pointer to an error object storing details of any error.  Will
+ *                                  be populated if error code is not CUGRAPH_SUCCESS
+ * @return error code
+ */
+cugraph_error_code_t cugraph_out_degrees(
+  const cugraph_resource_handle_t* handle,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* source_vertices,
+  bool_t do_expensive_check,
+  cugraph_degrees_result_t** result,
+  cugraph_error_t** error);
+
+/**
+ * @brief      Compute degrees
+ *
+ * Compute the degrees for the vertices in the graph.
+ *
+ * @param [in]  handle              Handle for accessing resources.
+ * @param [in]  graph               Pointer to graph
+ * @param [in]  source_vertices     Device array of vertices we want to compute degrees for.
+ * @param [in]  do_expensive_check  A flag to run expensive checks for input arguments (if set to
+ * true)
+ * @param [out] result              Opaque pointer to degrees result
+ * @param [out] error               Pointer to an error object storing details of any error.  Will
+ *                                  be populated if error code is not CUGRAPH_SUCCESS
+ * @return error code
+ */
+cugraph_error_code_t cugraph_degrees(const cugraph_resource_handle_t* handle,
+                                     cugraph_graph_t* graph,
+                                     const cugraph_type_erased_device_array_view_t* source_vertices,
+                                     bool_t do_expensive_check,
+                                     cugraph_degrees_result_t** result,
+                                     cugraph_error_t** error);
+
+/**
+ * @brief       Get the vertex ids
+ *
+ * @param [in]     degrees_result   Opaque pointer to degree result
+ * @return type erased array view of vertex ids
+ */
+cugraph_type_erased_device_array_view_t* cugraph_degrees_result_get_vertices(
+  cugraph_degrees_result_t* degrees_result);
+
+/**
+ * @brief       Get the in degrees
+ *
+ * @param [in]     degrees_result   Opaque pointer to degree result
+ * @return type erased array view of vertex ids
+ */
+cugraph_type_erased_device_array_view_t* cugraph_degrees_result_get_in_degrees(
+  cugraph_degrees_result_t* degrees_result);
+
+/**
+ * @brief       Get the out degrees
+ *
+ * If the graph is symmetric, in degrees and out degrees will be equal (and
+ * will be stored in the same memory).
+ *
+ * @param [in]     degrees_result   Opaque pointer to degree result
+ * @return type erased array view of vertex ids
+ */
+cugraph_type_erased_device_array_view_t* cugraph_degrees_result_get_out_degrees(
+  cugraph_degrees_result_t* degrees_result);
+
+/**
+ * @brief     Free degree result
+ *
+ * @param [in]    degrees_result   Opaque pointer to degree result
+ */
+void cugraph_degrees_result_free(cugraph_degrees_result_t* degrees_result);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/cpp/src/c_api/abstract_functor.hpp b/cpp/src/c_api/abstract_functor.hpp
index 219b1256065..8d3ed11341f 100644
--- a/cpp/src/c_api/abstract_functor.hpp
+++ b/cpp/src/c_api/abstract_functor.hpp
@@ -27,7 +27,7 @@ namespace c_api {
 struct abstract_functor {
   // Move to abstract functor... make operator a void, add cugraph_graph_t * result to functor
   // try that with instantiation questions
-  std::unique_ptr<cugraph_error_t> error_{std::make_unique<cugraph_error_t>("")};
+  std::unique_ptr<cugraph_error_t> error_ = {std::make_unique<cugraph_error_t>("")};
   cugraph_error_code_t error_code_{CUGRAPH_SUCCESS};
 
   void unsupported()
diff --git a/cpp/src/c_api/degrees.cu b/cpp/src/c_api/degrees.cu
new file mode 100644
index 00000000000..d6481efa905
--- /dev/null
+++ b/cpp/src/c_api/degrees.cu
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "c_api/abstract_functor.hpp"
+#include "c_api/degrees_result.hpp"
+#include "c_api/graph.hpp"
+#include "c_api/resource_handle.hpp"
+#include "c_api/utils.hpp"
+
+#include <cugraph_c/algorithms.h>
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/detail/shuffle_wrappers.hpp>
+#include <cugraph/detail/utility_wrappers.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/vertex_partition_device_view.cuh>
+
+#include <thrust/gather.h>
+
+#include <optional>
+
+namespace {
+
+struct degrees_functor : public cugraph::c_api::abstract_functor {
+  raft::handle_t const& handle_;
+  cugraph::c_api::cugraph_graph_t* graph_{};
+  cugraph::c_api::cugraph_type_erased_device_array_view_t const* source_vertices_;
+  bool in_degrees_{false};
+  bool out_degrees_{false};
+  bool do_expensive_check_{false};
+  cugraph::c_api::cugraph_degrees_result_t* result_{};
+
+  degrees_functor(cugraph_resource_handle_t const* handle,
+                  cugraph_graph_t* graph,
+                  ::cugraph_type_erased_device_array_view_t const* source_vertices,
+                  bool in_degrees,
+                  bool out_degrees,
+                  bool do_expensive_check)
+    : abstract_functor(),
+      handle_(*reinterpret_cast<cugraph::c_api::cugraph_resource_handle_t const*>(handle)->handle_),
+      graph_(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)),
+      source_vertices_(
+        reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+          source_vertices)),
+      in_degrees_{in_degrees},
+      out_degrees_{out_degrees},
+      do_expensive_check_(do_expensive_check)
+  {
+  }
+
+  template <typename vertex_t,
+            typename edge_t,
+            typename weight_t,
+            typename edge_type_type_t,
+            bool store_transposed,
+            bool multi_gpu>
+  void operator()()
+  {
+    // FIXME: Think about how to handle SG vice MG
+    if constexpr (!cugraph::is_candidate<vertex_t, edge_t, weight_t>::value) {
+      unsupported();
+    } else {
+      auto graph =
+        reinterpret_cast<cugraph::graph_t<vertex_t, edge_t, store_transposed, multi_gpu>*>(
+          graph_->graph_);
+
+      auto graph_view = graph->view();
+
+      auto number_map = reinterpret_cast<rmm::device_uvector<vertex_t>*>(graph_->number_map_);
+
+      std::optional<rmm::device_uvector<edge_t>> in_degrees{std::nullopt};
+      std::optional<rmm::device_uvector<edge_t>> out_degrees{std::nullopt};
+
+      if (in_degrees_ && out_degrees_ && graph_view.is_symmetric()) {
+        in_degrees = store_transposed ? graph_view.compute_in_degrees(handle_)
+                                      : graph_view.compute_out_degrees(handle_);
+        // out_degrees will be extracted from in_degrees in the result
+      } else {
+        if (in_degrees_) in_degrees = graph_view.compute_in_degrees(handle_);
+
+        if (out_degrees_) out_degrees = graph_view.compute_out_degrees(handle_);
+      }
+
+      rmm::device_uvector<vertex_t> vertex_ids(0, handle_.get_stream());
+
+      if (source_vertices_) {
+        // FIXME: Would be more efficient if graph_view.compute_*_degrees could take a vertex
+        //  subset
+        vertex_ids.resize(source_vertices_->size_, handle_.get_stream());
+        raft::copy(vertex_ids.data(),
+                   source_vertices_->as_type<vertex_t>(),
+                   vertex_ids.size(),
+                   handle_.get_stream());
+
+        if constexpr (multi_gpu) {
+          vertex_ids = cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning(
+            handle_, std::move(vertex_ids));
+        }
+
+        cugraph::renumber_ext_vertices<vertex_t, multi_gpu>(
+          handle_,
+          vertex_ids.data(),
+          vertex_ids.size(),
+          number_map->data(),
+          graph_view.local_vertex_partition_range_first(),
+          graph_view.local_vertex_partition_range_last(),
+          do_expensive_check_);
+
+        auto vertex_partition = cugraph::vertex_partition_device_view_t<vertex_t, multi_gpu>(
+          graph_view.local_vertex_partition_view());
+
+        auto vertices_iter = thrust::make_transform_iterator(
+          vertex_ids.begin(),
+          cuda::proclaim_return_type<vertex_t>([vertex_partition] __device__(auto v) {
+            return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
+          }));
+
+        if (in_degrees && out_degrees) {
+          rmm::device_uvector<edge_t> tmp_in_degrees(vertex_ids.size(), handle_.get_stream());
+          rmm::device_uvector<edge_t> tmp_out_degrees(vertex_ids.size(), handle_.get_stream());
+          thrust::gather(
+            handle_.get_thrust_policy(),
+            vertices_iter,
+            vertices_iter + vertex_ids.size(),
+            thrust::make_zip_iterator(in_degrees->begin(), out_degrees->begin()),
+            thrust::make_zip_iterator(tmp_in_degrees.begin(), tmp_out_degrees.begin()));
+          in_degrees  = std::move(tmp_in_degrees);
+          out_degrees = std::move(tmp_out_degrees);
+        } else if (in_degrees) {
+          rmm::device_uvector<edge_t> tmp_in_degrees(vertex_ids.size(), handle_.get_stream());
+          thrust::gather(handle_.get_thrust_policy(),
+                         vertices_iter,
+                         vertices_iter + vertex_ids.size(),
+                         in_degrees->begin(),
+                         tmp_in_degrees.begin());
+          in_degrees = std::move(tmp_in_degrees);
+        } else {
+          rmm::device_uvector<edge_t> tmp_out_degrees(vertex_ids.size(), handle_.get_stream());
+          thrust::gather(handle_.get_thrust_policy(),
+                         vertices_iter,
+                         vertices_iter + vertex_ids.size(),
+                         out_degrees->begin(),
+                         tmp_out_degrees.begin());
+          out_degrees = std::move(tmp_out_degrees);
+        }
+
+        cugraph::unrenumber_local_int_vertices<vertex_t>(
+          handle_,
+          vertex_ids.data(),
+          vertex_ids.size(),
+          number_map->data(),
+          graph_view.local_vertex_partition_range_first(),
+          graph_view.local_vertex_partition_range_last(),
+          do_expensive_check_);
+      } else {
+        vertex_ids.resize(graph_view.local_vertex_partition_range_size(), handle_.get_stream());
+        raft::copy(vertex_ids.data(), number_map->data(), vertex_ids.size(), handle_.get_stream());
+      }
+
+      result_ = new cugraph::c_api::cugraph_degrees_result_t{
+        graph_view.is_symmetric(),
+        new cugraph::c_api::cugraph_type_erased_device_array_t(vertex_ids, graph_->vertex_type_),
+        in_degrees
+          ? new cugraph::c_api::cugraph_type_erased_device_array_t(*in_degrees, graph_->edge_type_)
+          : nullptr,
+        out_degrees
+          ? new cugraph::c_api::cugraph_type_erased_device_array_t(*out_degrees, graph_->edge_type_)
+          : nullptr};
+    }
+  }
+};
+
+}  // namespace
+
+extern "C" cugraph_error_code_t cugraph_in_degrees(
+  const cugraph_resource_handle_t* handle,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* source_vertices,
+  bool_t do_expensive_check,
+  cugraph_degrees_result_t** result,
+  cugraph_error_t** error)
+{
+  degrees_functor functor(handle, graph, source_vertices, true, false, do_expensive_check);
+
+  return cugraph::c_api::run_algorithm(graph, functor, result, error);
+}
+
+extern "C" cugraph_error_code_t cugraph_out_degrees(
+  const cugraph_resource_handle_t* handle,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* source_vertices,
+  bool_t do_expensive_check,
+  cugraph_degrees_result_t** result,
+  cugraph_error_t** error)
+{
+  degrees_functor functor(handle, graph, source_vertices, false, true, do_expensive_check);
+
+  return cugraph::c_api::run_algorithm(graph, functor, result, error);
+}
+
+extern "C" cugraph_error_code_t cugraph_degrees(
+  const cugraph_resource_handle_t* handle,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* source_vertices,
+  bool_t do_expensive_check,
+  cugraph_degrees_result_t** result,
+  cugraph_error_t** error)
+{
+  degrees_functor functor(handle, graph, source_vertices, true, true, do_expensive_check);
+
+  return cugraph::c_api::run_algorithm(graph, functor, result, error);
+}
diff --git a/cpp/src/c_api/degrees_result.cpp b/cpp/src/c_api/degrees_result.cpp
new file mode 100644
index 00000000000..a4649e36d05
--- /dev/null
+++ b/cpp/src/c_api/degrees_result.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "c_api/degrees_result.hpp"
+
+#include <cugraph_c/graph_functions.h>
+
+extern "C" cugraph_type_erased_device_array_view_t* cugraph_degrees_result_get_vertices(
+  cugraph_degrees_result_t* degrees_result)
+{
+  auto internal_pointer =
+    reinterpret_cast<cugraph::c_api::cugraph_degrees_result_t*>(degrees_result);
+  return reinterpret_cast<cugraph_type_erased_device_array_view_t*>(
+    internal_pointer->vertex_ids_->view());
+}
+
+extern "C" cugraph_type_erased_device_array_view_t* cugraph_degrees_result_get_in_degrees(
+  cugraph_degrees_result_t* degrees_result)
+{
+  auto internal_pointer =
+    reinterpret_cast<cugraph::c_api::cugraph_degrees_result_t*>(degrees_result);
+  return internal_pointer->in_degrees_ == nullptr
+           ? nullptr
+           : reinterpret_cast<cugraph_type_erased_device_array_view_t*>(
+               internal_pointer->in_degrees_->view());
+}
+
+extern "C" cugraph_type_erased_device_array_view_t* cugraph_degrees_result_get_out_degrees(
+  cugraph_degrees_result_t* degrees_result)
+{
+  auto internal_pointer =
+    reinterpret_cast<cugraph::c_api::cugraph_degrees_result_t*>(degrees_result);
+  return internal_pointer->out_degrees_ != nullptr
+           ? reinterpret_cast<cugraph_type_erased_device_array_view_t*>(
+               internal_pointer->out_degrees_->view())
+         : internal_pointer->is_symmetric
+           ? reinterpret_cast<cugraph_type_erased_device_array_view_t*>(
+               internal_pointer->in_degrees_->view())
+           : nullptr;
+}
+
+extern "C" void cugraph_degrees_result_free(cugraph_degrees_result_t* degrees_result)
+{
+  auto internal_pointer =
+    reinterpret_cast<cugraph::c_api::cugraph_degrees_result_t*>(degrees_result);
+  delete internal_pointer->vertex_ids_;
+  delete internal_pointer->in_degrees_;
+  delete internal_pointer->out_degrees_;
+  delete internal_pointer;
+}
diff --git a/cpp/src/c_api/degrees_result.hpp b/cpp/src/c_api/degrees_result.hpp
new file mode 100644
index 00000000000..c6e9bffa5a1
--- /dev/null
+++ b/cpp/src/c_api/degrees_result.hpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "c_api/array.hpp"
+
+namespace cugraph {
+namespace c_api {
+
+struct cugraph_degrees_result_t {
+  bool is_symmetric{false};
+  cugraph_type_erased_device_array_t* vertex_ids_{};
+  cugraph_type_erased_device_array_t* in_degrees_{};
+  cugraph_type_erased_device_array_t* out_degrees_{};
+};
+
+}  // namespace c_api
+}  // namespace cugraph
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index af0dffcbf65..c84711e1a69 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -696,6 +696,7 @@ if(BUILD_CUGRAPH_MG_TESTS)
     ConfigureCTestMG(MG_CAPI_SIMILARITY_TEST c_api/mg_similarity_test.c)
     ConfigureCTestMG(MG_CAPI_K_CORE_TEST c_api/mg_k_core_test.c)
     ConfigureCTestMG(MG_CAPI_INDUCED_SUBGRAPH_TEST c_api/mg_induced_subgraph_test.c)
+    ConfigureCTestMG(MG_CAPI_DEGREES c_api/mg_degrees_test.c)
     ConfigureCTestMG(MG_CAPI_EGONET_TEST c_api/mg_egonet_test.c)
     ConfigureCTestMG(MG_CAPI_TWO_HOP_NEIGHBORS_TEST c_api/mg_two_hop_neighbors_test.c)
 
@@ -764,6 +765,7 @@ ConfigureCTest(CAPI_CORE_NUMBER_TEST c_api/core_number_test.c)
 ConfigureCTest(CAPI_SIMILARITY_TEST c_api/similarity_test.c)
 ConfigureCTest(CAPI_K_CORE_TEST c_api/k_core_test.c)
 ConfigureCTest(CAPI_INDUCED_SUBGRAPH_TEST c_api/induced_subgraph_test.c)
+ConfigureCTest(CAPI_DEGREES c_api/degrees_test.c)
 ConfigureCTest(CAPI_EGONET_TEST c_api/egonet_test.c)
 ConfigureCTest(CAPI_TWO_HOP_NEIGHBORS_TEST c_api/two_hop_neighbors_test.c)
 ConfigureCTest(CAPI_LEGACY_K_TRUSS_TEST c_api/legacy_k_truss_test.c)
diff --git a/cpp/tests/c_api/c_test_utils.h b/cpp/tests/c_api/c_test_utils.h
index ab9fbeccd4b..fbbf6333ee3 100644
--- a/cpp/tests/c_api/c_test_utils.h
+++ b/cpp/tests/c_api/c_test_utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -101,6 +101,8 @@ int create_sg_test_graph(const cugraph_resource_handle_t* handle,
                          cugraph_graph_t** graph,
                          cugraph_error_t** ret_error);
 
+size_t cugraph_size_t_allreduce(const cugraph_resource_handle_t* handle, size_t value);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/cpp/tests/c_api/degrees_test.c b/cpp/tests/c_api/degrees_test.c
new file mode 100644
index 00000000000..10a038b323b
--- /dev/null
+++ b/cpp/tests/c_api/degrees_test.c
@@ -0,0 +1,387 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "c_test_utils.h" /* RUN_TEST */
+
+#include <cugraph_c/graph.h>
+#include <cugraph_c/graph_functions.h>
+
+#include <stdio.h>
+
+typedef int32_t vertex_t;
+typedef int32_t edge_t;
+typedef float weight_t;
+
+/*
+ * Simple check of creating a graph from a COO on device memory.
+ */
+int generic_degrees_test(vertex_t* h_src,
+                         vertex_t* h_dst,
+                         weight_t* h_wgt,
+                         size_t num_vertices,
+                         size_t num_edges,
+                         vertex_t* h_vertices,
+                         size_t num_vertices_to_compute,
+                         bool_t in_degrees,
+                         bool_t out_degrees,
+                         bool_t store_transposed,
+                         bool_t is_symmetric,
+                         edge_t *h_in_degrees,
+                         edge_t *h_out_degrees)
+{
+  int test_ret_value = 0;
+
+  cugraph_error_code_t ret_code = CUGRAPH_SUCCESS;
+  cugraph_error_t* ret_error;
+
+  cugraph_resource_handle_t* handle = NULL;
+  cugraph_graph_t* graph            = NULL;
+  cugraph_degrees_result_t* result  = NULL;
+
+  handle = cugraph_create_resource_handle(NULL);
+  TEST_ASSERT(test_ret_value, handle != NULL, "resource handle creation failed.");
+
+  ret_code = create_test_graph(
+    handle, h_src, h_dst, h_wgt, num_edges, store_transposed, FALSE, is_symmetric, &graph, &ret_error);
+
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed.");
+  TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
+
+  if (h_vertices == NULL) {
+    if (in_degrees && out_degrees) {
+      ret_code = cugraph_degrees(
+        handle, graph, NULL, FALSE, &result, &ret_error);
+    } else if (in_degrees) {
+      ret_code = cugraph_in_degrees(
+        handle, graph, NULL, FALSE, &result, &ret_error);
+    } else {
+      ret_code = cugraph_out_degrees(
+        handle, graph, NULL, FALSE, &result, &ret_error);
+    }
+
+    TEST_ASSERT(
+      test_ret_value, ret_code == CUGRAPH_SUCCESS, "cugraph_extract_degrees failed.");
+  } else {
+    cugraph_type_erased_device_array_t* vertices           = NULL;
+    cugraph_type_erased_device_array_view_t* vertices_view = NULL;
+
+    ret_code =
+      cugraph_type_erased_device_array_create(handle, num_vertices_to_compute, INT32, &vertices, &ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "seeds create failed.");
+
+    vertices_view = cugraph_type_erased_device_array_view(vertices);
+
+    ret_code = cugraph_type_erased_device_array_view_copy_from_host(
+      handle, vertices_view, (byte_t*)h_vertices, &ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "src copy_from_host failed.");
+
+    if (in_degrees && out_degrees) {
+      ret_code = cugraph_degrees(
+        handle, graph, vertices_view, FALSE, &result, &ret_error);
+    } else if (in_degrees) {
+      ret_code = cugraph_in_degrees(
+        handle, graph, vertices_view, FALSE, &result, &ret_error);
+    } else {
+      ret_code = cugraph_out_degrees(
+        handle, graph, vertices_view, FALSE, &result, &ret_error);
+    }
+
+    TEST_ASSERT(
+      test_ret_value, ret_code == CUGRAPH_SUCCESS, "cugraph_extract_degrees failed.");
+  }
+
+  cugraph_type_erased_device_array_view_t* result_vertices;
+  cugraph_type_erased_device_array_view_t* result_in_degrees;
+  cugraph_type_erased_device_array_view_t* result_out_degrees;
+
+  result_vertices    = cugraph_degrees_result_get_vertices(result);
+  result_in_degrees  = cugraph_degrees_result_get_in_degrees(result);
+  result_out_degrees = cugraph_degrees_result_get_out_degrees(result);
+
+  size_t num_result_vertices = cugraph_type_erased_device_array_view_size(result_vertices);
+
+  vertex_t h_result_vertices[num_result_vertices];
+  edge_t   h_result_in_degrees[num_result_vertices];
+  edge_t   h_result_out_degrees[num_result_vertices];
+
+  ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+    handle, (byte_t*)h_result_vertices, result_vertices, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+
+  if (result_in_degrees != NULL) {
+    ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+      handle, (byte_t*)h_result_in_degrees, result_in_degrees, &ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+  }
+
+  if (result_out_degrees != NULL) {
+    ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+      handle, (byte_t*)h_result_out_degrees, result_out_degrees, &ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+  }
+
+  if (h_vertices != NULL) {
+    TEST_ASSERT(test_ret_value, num_result_vertices == num_vertices_to_compute, "results not the same size");
+  } else {
+    TEST_ASSERT(test_ret_value, num_result_vertices == num_vertices, "results not the same size");
+  }
+
+  for (size_t i = 0; (i < num_result_vertices) && (test_ret_value == 0); ++i) {
+    if (h_in_degrees != NULL) {
+      TEST_ASSERT(test_ret_value, h_result_in_degrees[i] == h_in_degrees[h_result_vertices[i]], "in degree did not match");
+    }
+
+    if (h_out_degrees != NULL) {
+      TEST_ASSERT(test_ret_value, h_result_out_degrees[i] == h_out_degrees[h_result_vertices[i]], "out degree did not match");
+    }
+  }
+
+  cugraph_degrees_result_free(result);
+  cugraph_graph_free(graph);
+  cugraph_error_free(ret_error);
+
+  return test_ret_value;
+}
+
+int test_degrees()
+{
+  size_t num_edges         = 8;
+  size_t num_vertices      = 6;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_in_degrees[]  = {1, 2, 0, 2, 1, 2};
+  vertex_t h_out_degrees[] = {1, 2, 3, 1, 1, 0};
+
+  return generic_degrees_test(h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              NULL,
+                              0,
+                              TRUE,
+                              TRUE,
+                              FALSE,
+                              FALSE,
+                              h_in_degrees,
+                              h_out_degrees);
+}
+
+int test_degrees_symmetric()
+{
+  size_t num_edges         = 16;
+  size_t num_vertices      = 6;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f,
+                              0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_in_degrees[]  = {2, 4, 3, 3, 2, 2};
+  vertex_t h_out_degrees[] = {2, 4, 3, 3, 2, 2};
+
+  return generic_degrees_test(h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              NULL,
+                              0,
+                              TRUE,
+                              TRUE,
+                              FALSE,
+                              TRUE,
+                              h_in_degrees,
+                              h_out_degrees);
+}
+
+int test_in_degrees()
+{
+  size_t num_edges         = 8;
+  size_t num_vertices      = 6;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_in_degrees[]  = {1, 2, 0, 2, 1, 2};
+
+  return generic_degrees_test(h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              NULL,
+                              0,
+                              TRUE,
+                              FALSE,
+                              FALSE,
+                              TRUE,
+                              h_in_degrees,
+                              NULL);
+}
+
+int test_out_degrees()
+{
+  size_t num_edges         = 8;
+  size_t num_vertices      = 6;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_out_degrees[] = {1, 2, 3, 1, 1, 0};
+
+  return generic_degrees_test(h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              NULL,
+                              0,
+                              FALSE,
+                              TRUE,
+                              FALSE,
+                              TRUE,
+                              NULL,
+                              h_out_degrees);
+}
+
+int test_degrees_subset()
+{
+  size_t num_edges               = 8;
+  size_t num_vertices            = 6;
+  size_t num_vertices_to_compute = 3;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_vertices[]    = {2, 3, 5};
+  vertex_t h_in_degrees[]  = {-1, -1, 0, 2, -1, 2};
+  vertex_t h_out_degrees[] = {-1, -1, 3, 1, -1, 0};
+
+  return generic_degrees_test(h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              h_vertices,
+                              num_vertices_to_compute,
+                              TRUE,
+                              TRUE,
+                              FALSE,
+                              FALSE,
+                              h_in_degrees,
+                              h_out_degrees);
+}
+
+int test_degrees_symmetric_subset()
+{
+  size_t num_edges         = 16;
+  size_t num_vertices      = 6;
+  size_t num_vertices_to_compute = 3;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f,
+                              0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_vertices[]    = {2, 3, 5};
+  vertex_t h_in_degrees[]  = {-1, -1, 3, 3, -1, 2};
+  vertex_t h_out_degrees[] = {-1, -1, 3, 3, -1, 2};
+
+  return generic_degrees_test(h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              h_vertices,
+                              num_vertices_to_compute,
+                              TRUE,
+                              TRUE,
+                              FALSE,
+                              TRUE,
+                              h_in_degrees,
+                              h_out_degrees);
+}
+
+int test_in_degrees_subset()
+{
+  size_t num_edges         = 8;
+  size_t num_vertices      = 6;
+  size_t num_vertices_to_compute = 3;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_vertices[]    = {2, 3, 5};
+  vertex_t h_in_degrees[]  = {-1, -1, 0, 2, -1, 2};
+
+  return generic_degrees_test(h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              h_vertices,
+                              num_vertices_to_compute,
+                              TRUE,
+                              FALSE,
+                              FALSE,
+                              TRUE,
+                              h_in_degrees,
+                              NULL);
+}
+
+int test_out_degrees_subset()
+{
+  size_t num_edges         = 8;
+  size_t num_vertices      = 6;
+  size_t num_vertices_to_compute = 3;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_vertices[]    = {2, 3, 5};
+  vertex_t h_out_degrees[] = {-1, -1, 3, 1, -1, 0};
+
+  return generic_degrees_test(h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              h_vertices,
+                              num_vertices_to_compute,
+                              FALSE,
+                              TRUE,
+                              FALSE,
+                              TRUE,
+                              NULL,
+                              h_out_degrees);
+}
+
+/******************************************************************************/
+
+int main(int argc, char** argv)
+{
+  int result = 0;
+  result |= RUN_TEST(test_degrees);
+  result |= RUN_TEST(test_degrees_symmetric);
+  result |= RUN_TEST(test_in_degrees);
+  result |= RUN_TEST(test_out_degrees);
+  result |= RUN_TEST(test_degrees_subset);
+  result |= RUN_TEST(test_degrees_symmetric_subset);
+  result |= RUN_TEST(test_in_degrees_subset);
+  result |= RUN_TEST(test_out_degrees_subset);
+  return result;
+}
diff --git a/cpp/tests/c_api/mg_degrees_test.c b/cpp/tests/c_api/mg_degrees_test.c
new file mode 100644
index 00000000000..3312dd4f5bb
--- /dev/null
+++ b/cpp/tests/c_api/mg_degrees_test.c
@@ -0,0 +1,407 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "mg_test_utils.h" /* RUN_TEST */
+
+#include <cugraph_c/graph.h>
+#include <cugraph_c/graph_functions.h>
+
+#include <stdio.h>
+
+typedef int32_t vertex_t;
+typedef int32_t edge_t;
+typedef float weight_t;
+
+/*
+ * Simple check of creating a graph from a COO on device memory.
+ */
+int generic_degrees_test(const cugraph_resource_handle_t* handle,
+                         vertex_t* h_src,
+                         vertex_t* h_dst,
+                         weight_t* h_wgt,
+                         size_t num_vertices,
+                         size_t num_edges,
+                         vertex_t* h_vertices,
+                         size_t num_vertices_to_compute,
+                         bool_t in_degrees,
+                         bool_t out_degrees,
+                         bool_t store_transposed,
+                         bool_t is_symmetric,
+                         edge_t* h_in_degrees,
+                         edge_t* h_out_degrees)
+{
+  int test_ret_value = 0;
+
+  cugraph_error_code_t ret_code = CUGRAPH_SUCCESS;
+  cugraph_error_t* ret_error;
+
+  cugraph_graph_t* graph                    = NULL;
+  cugraph_degrees_result_t* result = NULL;
+
+  ret_code = create_mg_test_graph(
+    handle, h_src, h_dst, h_wgt, num_edges, store_transposed, is_symmetric, &graph, &ret_error);
+
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed.");
+  TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
+
+  if (h_vertices == NULL) {
+    if (in_degrees && out_degrees) {
+      ret_code = cugraph_degrees(
+        handle, graph, NULL, FALSE, &result, &ret_error);
+    } else if (in_degrees) {
+      ret_code = cugraph_in_degrees(
+        handle, graph, NULL, FALSE, &result, &ret_error);
+    } else {
+      ret_code = cugraph_out_degrees(
+        handle, graph, NULL, FALSE, &result, &ret_error);
+    }
+
+    TEST_ASSERT(
+      test_ret_value, ret_code == CUGRAPH_SUCCESS, "cugraph_extract_degrees failed.");
+  } else {
+    cugraph_type_erased_device_array_t* vertices           = NULL;
+    cugraph_type_erased_device_array_view_t* vertices_view = NULL;
+
+    int rank = cugraph_resource_handle_get_rank(handle);
+
+    size_t num_to_allocate = 0;
+    if (rank == 0) num_to_allocate = num_vertices_to_compute;
+
+    ret_code =
+      cugraph_type_erased_device_array_create(handle, num_to_allocate, INT32, &vertices, &ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "seeds create failed.");
+
+    vertices_view = cugraph_type_erased_device_array_view(vertices);
+
+    ret_code = cugraph_type_erased_device_array_view_copy_from_host(
+      handle, vertices_view, (byte_t*)h_vertices, &ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "src copy_from_host failed.");
+
+    if (in_degrees && out_degrees) {
+      ret_code = cugraph_degrees(
+        handle, graph, vertices_view, FALSE, &result, &ret_error);
+    } else if (in_degrees) {
+      ret_code = cugraph_in_degrees(
+        handle, graph, vertices_view, FALSE, &result, &ret_error);
+    } else {
+      ret_code = cugraph_out_degrees(
+        handle, graph, vertices_view, FALSE, &result, &ret_error);
+    }
+
+    TEST_ASSERT(
+      test_ret_value, ret_code == CUGRAPH_SUCCESS, "cugraph_extract_degrees failed.");
+  }
+
+  cugraph_type_erased_device_array_view_t* result_vertices;
+  cugraph_type_erased_device_array_view_t* result_in_degrees;
+  cugraph_type_erased_device_array_view_t* result_out_degrees;
+
+  result_vertices    = cugraph_degrees_result_get_vertices(result);
+  result_in_degrees  = cugraph_degrees_result_get_in_degrees(result);
+  result_out_degrees = cugraph_degrees_result_get_out_degrees(result);
+
+  size_t num_result_vertices = cugraph_type_erased_device_array_view_size(result_vertices);
+
+  vertex_t h_result_vertices[num_result_vertices];
+  edge_t   h_result_in_degrees[num_result_vertices];
+  edge_t   h_result_out_degrees[num_result_vertices];
+
+  ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+    handle, (byte_t*)h_result_vertices, result_vertices, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+
+  if (result_in_degrees != NULL) {
+    ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+      handle, (byte_t*)h_result_in_degrees, result_in_degrees, &ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+  }
+
+  if (result_out_degrees != NULL) {
+    ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+      handle, (byte_t*)h_result_out_degrees, result_out_degrees, &ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+  }
+
+  if (h_vertices != NULL) {
+    size_t xxx = cugraph_size_t_allreduce(handle, num_result_vertices);
+    TEST_ASSERT(test_ret_value, cugraph_size_t_allreduce(handle, num_result_vertices) == num_vertices_to_compute, "results not the same size");
+  } else {
+    size_t xxx = cugraph_size_t_allreduce(handle, num_result_vertices);
+    TEST_ASSERT(test_ret_value, cugraph_size_t_allreduce(handle, num_result_vertices) == num_vertices, "results not the same size");
+  }
+
+  for (size_t i = 0; (i < num_result_vertices) && (test_ret_value == 0); ++i) {
+    if (h_in_degrees != NULL) {
+      TEST_ASSERT(test_ret_value, h_result_in_degrees[i] == h_in_degrees[h_result_vertices[i]], "in degree did not match");
+    }
+
+    if (h_out_degrees != NULL) {
+      TEST_ASSERT(test_ret_value, h_result_out_degrees[i] == h_out_degrees[h_result_vertices[i]], "out degree did not match");
+    }
+  }
+
+  cugraph_degrees_result_free(result);
+  cugraph_graph_free(graph);
+  cugraph_error_free(ret_error);
+  return test_ret_value;
+}
+
+int test_degrees(const cugraph_resource_handle_t* handle)
+{
+  size_t num_edges            = 8;
+  size_t num_vertices         = 6;
+
+  vertex_t h_src[]               = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]               = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]               = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_in_degrees[]        = {1, 2, 0, 2, 1, 2};
+  vertex_t h_out_degrees[]       = {1, 2, 3, 1, 1, 0};
+
+  // Pagerank wants store_transposed = TRUE
+  return generic_degrees_test(handle,
+                              h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              NULL,
+                              0,
+                              TRUE,
+                              TRUE,
+                              TRUE,
+                              FALSE,
+                              h_in_degrees,
+                              h_out_degrees);
+}
+
+int test_degrees_symmetric(const cugraph_resource_handle_t* handle)
+{
+  size_t num_edges         = 16;
+  size_t num_vertices      = 6;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f,
+                              0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_in_degrees[]  = {2, 4, 3, 3, 2, 2};
+  vertex_t h_out_degrees[] = {2, 4, 3, 3, 2, 2};
+
+  // Pagerank wants store_transposed = TRUE
+  return generic_degrees_test(handle,
+                              h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              NULL,
+                              0,
+                              TRUE,
+                              TRUE,
+                              TRUE,
+                              TRUE,
+                              h_in_degrees,
+                              h_out_degrees);
+}
+
+int test_in_degrees(const cugraph_resource_handle_t *handle)
+{
+  size_t num_edges         = 8;
+  size_t num_vertices      = 6;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_in_degrees[]  = {1, 2, 0, 2, 1, 2};
+
+  return generic_degrees_test(handle,
+                              h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              NULL,
+                              0,
+                              TRUE,
+                              FALSE,
+                              FALSE,
+                              TRUE,
+                              h_in_degrees,
+                              NULL);
+}
+
+int test_out_degrees(const cugraph_resource_handle_t *handle)
+{
+  size_t num_edges         = 8;
+  size_t num_vertices      = 6;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_out_degrees[] = {1, 2, 3, 1, 1, 0};
+
+  return generic_degrees_test(handle,
+                              h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              NULL,
+                              0,
+                              FALSE,
+                              TRUE,
+                              FALSE,
+                              TRUE,
+                              NULL,
+                              h_out_degrees);
+}
+
+int test_degrees_subset(const cugraph_resource_handle_t* handle)
+{
+  size_t num_edges               = 8;
+  size_t num_vertices            = 6;
+  size_t num_vertices_to_compute = 3;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_vertices[]    = {2, 3, 5};
+  vertex_t h_in_degrees[]  = {-1, -1, 0, 2, -1, 2};
+  vertex_t h_out_degrees[] = {-1, -1, 3, 1, -1, 0};
+
+  return generic_degrees_test(handle,
+                              h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              h_vertices,
+                              num_vertices_to_compute,
+                              TRUE,
+                              TRUE,
+                              FALSE,
+                              FALSE,
+                              h_in_degrees,
+                              h_out_degrees);
+}
+
+int test_degrees_symmetric_subset(const cugraph_resource_handle_t* handle)
+{
+  size_t num_edges         = 16;
+  size_t num_vertices      = 6;
+  size_t num_vertices_to_compute = 3;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f,
+                              0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_vertices[]    = {2, 3, 5};
+  vertex_t h_in_degrees[]  = {-1, -1, 3, 3, -1, 2};
+  vertex_t h_out_degrees[] = {-1, -1, 3, 3, -1, 2};
+
+  return generic_degrees_test(handle,
+                              h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              h_vertices,
+                              num_vertices_to_compute,
+                              TRUE,
+                              TRUE,
+                              FALSE,
+                              TRUE,
+                              h_in_degrees,
+                              h_out_degrees);
+}
+
+int test_in_degrees_subset(const cugraph_resource_handle_t* handle)
+{
+  size_t num_edges         = 8;
+  size_t num_vertices      = 6;
+  size_t num_vertices_to_compute = 3;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_vertices[]    = {2, 3, 5};
+  vertex_t h_in_degrees[]  = {-1, -1, 0, 2, -1, 2};
+
+  return generic_degrees_test(handle,
+                              h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              h_vertices,
+                              num_vertices_to_compute,
+                              TRUE,
+                              FALSE,
+                              FALSE,
+                              TRUE,
+                              h_in_degrees,
+                              NULL);
+}
+
+int test_out_degrees_subset(const cugraph_resource_handle_t* handle)
+{
+  size_t num_edges         = 8;
+  size_t num_vertices      = 6;
+  size_t num_vertices_to_compute = 3;
+
+  vertex_t h_src[]         = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[]         = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[]         = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  vertex_t h_vertices[]    = {2, 3, 5};
+  vertex_t h_out_degrees[] = {-1, -1, 3, 1, -1, 0};
+
+  return generic_degrees_test(handle,
+                              h_src,
+                              h_dst,
+                              h_wgt,
+                              num_vertices,
+                              num_edges,
+                              h_vertices,
+                              num_vertices_to_compute,
+                              FALSE,
+                              TRUE,
+                              FALSE,
+                              TRUE,
+                              NULL,
+                              h_out_degrees);
+}
+
+/******************************************************************************/
+
+int main(int argc, char** argv)
+{
+  void* raft_handle                 = create_mg_raft_handle(argc, argv);
+  cugraph_resource_handle_t* handle = cugraph_create_resource_handle(raft_handle);
+
+  int result = 0;
+  result |= RUN_MG_TEST(test_degrees, handle);
+  result |= RUN_MG_TEST(test_degrees_symmetric, handle);
+  result |= RUN_MG_TEST(test_in_degrees, handle);
+  result |= RUN_MG_TEST(test_out_degrees, handle);
+  result |= RUN_MG_TEST(test_degrees_subset, handle);
+  result |= RUN_MG_TEST(test_degrees_symmetric_subset, handle);
+  result |= RUN_MG_TEST(test_in_degrees_subset, handle);
+  result |= RUN_MG_TEST(test_out_degrees_subset, handle);
+
+  cugraph_free_resource_handle(handle);
+  free_mg_raft_handle(raft_handle);
+
+  return result;
+}
diff --git a/cpp/tests/c_api/test_utils.cpp b/cpp/tests/c_api/test_utils.cpp
index e37cc4555dd..3013cbb7cc6 100644
--- a/cpp/tests/c_api/test_utils.cpp
+++ b/cpp/tests/c_api/test_utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,9 @@
  */
 
 #include "c_test_utils.h"
+#include "c_api/resource_handle.hpp"
+
+#include <cugraph/utilities/host_scalar_comm.hpp>
 
 #include <math.h>
 
@@ -388,3 +391,12 @@ int create_sg_test_graph(const cugraph_resource_handle_t* handle,
 
   return test_ret_value;
 }
+
+extern "C" size_t cugraph_size_t_allreduce(const cugraph_resource_handle_t* handle, size_t value)
+{
+  auto internal_handle = reinterpret_cast<cugraph::c_api::cugraph_resource_handle_t const *>(handle);
+  return cugraph::host_scalar_allreduce(internal_handle->handle_->get_comms(),
+                                        value,
+                                        raft::comms::op_t::SUM,
+                                        internal_handle->handle_->get_stream());
+}
diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
index cdf1e937e67..0ef5eaf1b9e 100644
--- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
+++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
@@ -12,7 +12,7 @@
 # limitations under the License.
 
 import gc
-from typing import Union
+from typing import Union, Iterable
 import warnings
 
 import cudf
@@ -28,10 +28,11 @@
     GraphProperties,
     get_two_hop_neighbors as pylibcugraph_get_two_hop_neighbors,
     select_random_vertices as pylibcugraph_select_random_vertices,
+    degrees as pylibcugraph_degrees,
+    in_degrees as pylibcugraph_in_degrees,
+    out_degrees as pylibcugraph_out_degrees,
 )
 
-from cugraph.structure import graph_primtypes_wrapper
-from cugraph.structure.graph_primtypes_wrapper import Direction
 from cugraph.structure.number_map import NumberMap
 from cugraph.structure.symmetrize import symmetrize
 from cugraph.dask.common.part_utils import (
@@ -536,7 +537,158 @@ def number_of_edges(self, directed_edges=False):
                 raise RuntimeError("Graph is Empty")
         return self.properties.edge_count
 
-    def in_degree(self, vertex_subset=None):
+    def degrees_function(
+        self,
+        vertex_subset: Union[cudf.Series, dask_cudf.Series, Iterable] = None,
+        degree_type: str = "in_degree",
+    ) -> dask_cudf.DataFrame:
+        """
+        Compute vertex in-degree, out-degree, degree and degrees.
+
+        1) Vertex in-degree is the number of edges pointing into the vertex.
+        2) Vertex out-degree is the number of edges pointing out from the vertex.
+        3) Vertex degree, is the total number of edges incident to a vertex
+            (both in and out edges)
+        4) Vertex degrees computes vertex in-degree and out-degree.
+
+        By default, this method computes vertex in-degree, out-degree, degree
+        or degrees for the entire set of vertices. If vertex_subset is provided,
+        this method optionally filters out all but those listed in
+        vertex_subset.
+
+        Parameters
+        ----------
+        vertex_subset : cudf.Series or dask_cudf.Series, iterable container, optional
+            A container of vertices for displaying corresponding in-degree.
+            If not set, degrees are computed for the entire set of vertices.
+
+        degree_type : str (default='in_degree')
+
+        Returns
+        -------
+        df : dask_cudf.DataFrame
+            GPU DataFrame of size N (the default) or the size of the given
+            vertices (vertex_subset) containing the in_degree, out_degrees,
+            degree or degrees. The ordering is relative to the adjacency list,
+            or that given by the specified vertex_subset.
+
+        Examples
+        --------
+        >>> M = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ',
+        ...                   dtype=['int32', 'int32', 'float32'], header=None)
+        >>> G = cugraph.Graph()
+        >>> G.from_cudf_edgelist(M, '0', '1')
+        >>> df = G.degrees_function([0,9,12], "in_degree")
+
+        """
+        _client = default_client()
+
+        def _call_plc_degrees_function(
+            sID: bytes, mg_graph_x, source_vertices: cudf.Series, degree_type: str
+        ) -> cp.array:
+
+            if degree_type == "in_degree":
+                results = pylibcugraph_in_degrees(
+                    resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()),
+                    graph=mg_graph_x,
+                    source_vertices=source_vertices,
+                    do_expensive_check=False,
+                )
+            elif degree_type == "out_degree":
+                results = pylibcugraph_out_degrees(
+                    resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()),
+                    graph=mg_graph_x,
+                    source_vertices=source_vertices,
+                    do_expensive_check=False,
+                )
+            elif degree_type in ["degree", "degrees"]:
+                results = pylibcugraph_degrees(
+                    resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()),
+                    graph=mg_graph_x,
+                    source_vertices=source_vertices,
+                    do_expensive_check=False,
+                )
+            else:
+                raise ValueError(
+                    "Incorrect degree type passed, valid values are ",
+                    "'in_degree', 'out_degree', 'degree' and 'degrees' ",
+                    f"got '{degree_type}'",
+                )
+
+            return results
+
+        if isinstance(vertex_subset, int):
+            vertex_subset = [vertex_subset]
+
+        if isinstance(vertex_subset, list):
+            vertex_subset = cudf.Series(vertex_subset)
+
+        if vertex_subset is not None:
+            if self.renumbered:
+                vertex_subset = self.renumber_map.to_internal_vertex_id(vertex_subset)
+                vertex_subset_type = self.edgelist.edgelist_df.dtypes.iloc[0]
+            else:
+                vertex_subset_type = self.input_df.dtypes.iloc[0]
+
+            vertex_subset = vertex_subset.astype(vertex_subset_type)
+
+        cupy_result = [
+            _client.submit(
+                _call_plc_degrees_function,
+                Comms.get_session_id(),
+                self._plc_graph[w],
+                vertex_subset,
+                degree_type,
+                workers=[w],
+                allow_other_workers=False,
+            )
+            for w in Comms.get_workers()
+        ]
+
+        wait(cupy_result)
+
+        def convert_to_cudf(cp_arrays: cp.ndarray, degree_type: bool) -> cudf.DataFrame:
+            """
+            Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper
+            """
+            df = cudf.DataFrame()
+            df["vertex"] = cp_arrays[0]
+            if degree_type in ["in_degree", "out_degree"]:
+                df["degree"] = cp_arrays[1]
+            # degree_type must be either 'degree' or 'degrees'
+            else:
+                if degree_type == "degrees":
+                    df["in_degree"] = cp_arrays[1]
+                    df["out_degree"] = cp_arrays[2]
+                else:
+                    df["degree"] = cp_arrays[1] + cp_arrays[2]
+            return df
+
+        cudf_result = [
+            _client.submit(
+                convert_to_cudf,
+                cp_arrays,
+                degree_type,
+                workers=_client.who_has(cp_arrays)[cp_arrays.key],
+            )
+            for cp_arrays in cupy_result
+        ]
+
+        wait(cudf_result)
+        ddf = dask_cudf.from_delayed(cudf_result).persist()
+        wait(ddf)
+
+        # Wait until the inactive futures are released
+        wait([(r.release(), c_r.release()) for r, c_r in zip(cupy_result, cudf_result)])
+
+        if self.properties.renumbered:
+            ddf = self.renumber_map.unrenumber(ddf, "vertex")
+
+        return ddf
+
+    def in_degree(
+        self, vertex_subset: Union[cudf.Series, dask_cudf.Series, Iterable] = None
+    ) -> dask_cudf.DataFrame:
         """
         Compute vertex in-degree. Vertex in-degree is the number of edges
         pointing into the vertex. By default, this method computes vertex
@@ -572,61 +724,11 @@ def in_degree(self, vertex_subset=None):
         >>> df = G.in_degree([0,9,12])
 
         """
-        src_col_name = self.source_columns
-        dst_col_name = self.destination_columns
-
-        # select only the vertex columns
-        if not isinstance(src_col_name, list) and not isinstance(dst_col_name, list):
-            vertex_col_names = [src_col_name] + [dst_col_name]
-
-        df = self.input_df[vertex_col_names]
-        df = df.drop(columns=src_col_name)
-
-        nodes = self.nodes()
-        if isinstance(nodes, dask_cudf.Series):
-            nodes = nodes.to_frame()
-
-        if not isinstance(dst_col_name, list):
-            df = df.rename(columns={dst_col_name: "vertex"})
-            dst_col_name = "vertex"
-
-        vertex_col_names = df.columns
-        nodes.columns = vertex_col_names
-
-        df["degree"] = 1
-
-        # FIXME: leverage the C++ in_degree for optimal performance
-        in_degree = (
-            df.groupby(dst_col_name)
-            .degree.count(split_out=df.npartitions)
-            .reset_index()
-        )
-
-        # Add vertices with zero in_degree
-        in_degree = nodes.merge(in_degree, how="outer").fillna(0)
-
-        # Convert vertex_subset to dataframe.
-        if vertex_subset is not None:
-            if not isinstance(vertex_subset, (dask_cudf.DataFrame, cudf.DataFrame)):
-                if isinstance(vertex_subset, dask_cudf.Series):
-                    vertex_subset = vertex_subset.to_frame()
-                else:
-                    df = cudf.DataFrame()
-                    if isinstance(vertex_subset, (cudf.Series, list)):
-                        df["vertex"] = vertex_subset
-                        vertex_subset = df
-            if isinstance(vertex_subset, (dask_cudf.DataFrame, cudf.DataFrame)):
-                vertex_subset.columns = vertex_col_names
-                in_degree = in_degree.merge(vertex_subset, how="inner")
-            else:
-                raise TypeError(
-                    f"Expected type are: cudf, dask_cudf objects, "
-                    f"iterable container, got "
-                    f"{type(vertex_subset)}"
-                )
-        return in_degree
+        return self.degrees_function(vertex_subset, "in_degree")
 
-    def out_degree(self, vertex_subset=None):
+    def out_degree(
+        self, vertex_subset: Union[cudf.Series, dask_cudf.Series, Iterable] = None
+    ) -> dask_cudf.DataFrame:
         """
         Compute vertex out-degree. Vertex out-degree is the number of edges
         pointing out from the vertex. By default, this method computes vertex
@@ -662,62 +764,11 @@ def out_degree(self, vertex_subset=None):
         >>> df = G.out_degree([0,9,12])
 
         """
-        src_col_name = self.source_columns
-        dst_col_name = self.destination_columns
-
-        # select only the vertex columns
-        if not isinstance(src_col_name, list) and not isinstance(dst_col_name, list):
-            vertex_col_names = [src_col_name] + [dst_col_name]
-
-        df = self.input_df[vertex_col_names]
-        df = df.drop(columns=dst_col_name)
-
-        nodes = self.nodes()
-        if isinstance(nodes, dask_cudf.Series):
-            nodes = nodes.to_frame()
-
-        if not isinstance(src_col_name, list):
-            df = df.rename(columns={src_col_name: "vertex"})
-            src_col_name = "vertex"
-
-        vertex_col_names = df.columns
-
-        nodes.columns = vertex_col_names
-
-        df["degree"] = 1
-        # leverage the C++ out_degree for optimal performance
-        out_degree = (
-            df.groupby(src_col_name)
-            .degree.count(split_out=df.npartitions)
-            .reset_index()
-        )
-
-        # Add vertices with zero out_degree
-        out_degree = nodes.merge(out_degree, how="outer").fillna(0)
-
-        # Convert vertex_subset to dataframe.
-        if vertex_subset is not None:
-            if not isinstance(vertex_subset, (dask_cudf.DataFrame, cudf.DataFrame)):
-                if isinstance(vertex_subset, dask_cudf.Series):
-                    vertex_subset = vertex_subset.to_frame()
-                else:
-                    df = cudf.DataFrame()
-                    if isinstance(vertex_subset, (cudf.Series, list)):
-                        df["vertex"] = vertex_subset
-                        vertex_subset = df
-            if isinstance(vertex_subset, (dask_cudf.DataFrame, cudf.DataFrame)):
-                vertex_subset.columns = vertex_col_names
-                out_degree = out_degree.merge(vertex_subset, how="inner")
-            else:
-                raise TypeError(
-                    f"Expected type are: cudf, dask_cudf objects, "
-                    f"iterable container, got "
-                    f"{type(vertex_subset)}"
-                )
+        return self.degrees_function(vertex_subset, "out_degree")
 
-        return out_degree
-
-    def degree(self, vertex_subset=None):
+    def degree(
+        self, vertex_subset: Union[cudf.Series, dask_cudf.Series, Iterable] = None
+    ) -> dask_cudf.DataFrame:
         """
         Compute vertex degree, which is the total number of edges incident
         to a vertex (both in and out edges). By default, this method computes
@@ -754,18 +805,12 @@ def degree(self, vertex_subset=None):
 
         """
 
-        vertex_in_degree = self.in_degree(vertex_subset)
-        vertex_out_degree = self.out_degree(vertex_subset)
-        # FIXME: leverage the C++ degree for optimal performance
-        vertex_degree = dask_cudf.concat([vertex_in_degree, vertex_out_degree])
-        vertex_degree = vertex_degree.groupby(["vertex"], as_index=False).sum(
-            split_out=self.input_df.npartitions
-        )
-
-        return vertex_degree
+        return self.degrees_function(vertex_subset, "degree")
 
     # FIXME:  vertex_subset could be a DataFrame for multi-column vertices
-    def degrees(self, vertex_subset=None):
+    def degrees(
+        self, vertex_subset: Union[cudf.Series, dask_cudf.Series, Iterable] = None
+    ) -> dask_cudf.DataFrame:
         """
         Compute vertex in-degree and out-degree. By default, this method
         computes vertex degrees for the entire set of vertices. If
@@ -802,21 +847,7 @@ def degrees(self, vertex_subset=None):
         >>> df = G.degrees([0,9,12])
 
         """
-        raise NotImplementedError("Not supported for distributed graph")
-
-    def _degree(self, vertex_subset, direction=Direction.ALL):
-        vertex_col, degree_col = graph_primtypes_wrapper._mg_degree(self, direction)
-        df = cudf.DataFrame()
-        df["vertex"] = vertex_col
-        df["degree"] = degree_col
-
-        if self.renumbered is True:
-            df = self.renumber_map.unrenumber(df, "vertex")
-
-        if vertex_subset is not None:
-            df = df[df["vertex"].isin(vertex_subset)]
-
-        return df
+        return self.degrees_function(vertex_subset, "degrees")
 
     def get_two_hop_neighbors(self, start_vertices=None):
         """
diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py
index 121a4c6245a..99934e02b10 100644
--- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py
+++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py
@@ -12,7 +12,6 @@
 # limitations under the License.
 
 from cugraph.structure import graph_primtypes_wrapper
-from cugraph.structure.graph_primtypes_wrapper import Direction
 from cugraph.structure.symmetrize import symmetrize
 from cugraph.structure.number_map import NumberMap
 import cugraph.dask.common.mg_utils as mg_utils
@@ -23,10 +22,13 @@
 import numpy as np
 import warnings
 from cugraph.dask.structure import replication
-from typing import Union, Dict
+from typing import Union, Dict, Iterable
 from pylibcugraph import (
     get_two_hop_neighbors as pylibcugraph_get_two_hop_neighbors,
     select_random_vertices as pylibcugraph_select_random_vertices,
+    degrees as pylibcugraph_degrees,
+    in_degrees as pylibcugraph_in_degrees,
+    out_degrees as pylibcugraph_out_degrees,
 )
 
 from pylibcugraph import (
@@ -854,7 +856,111 @@ def number_of_edges(self, directed_edges=False):
                 raise ValueError("Graph is Empty")
         return self.properties.edge_count
 
-    def in_degree(self, vertex_subset=None):
+    def degrees_function(
+        self,
+        vertex_subset: Union[cudf.Series, Iterable] = None,
+        degree_type: str = "in_degree",
+    ) -> cudf.DataFrame:
+        """
+        Compute vertex in-degree, out-degree, degree and degrees.
+
+        1) Vertex in-degree is the number of edges pointing into the vertex.
+        2) Vertex out-degree is the number of edges pointing out from the vertex.
+        3) Vertex degree, is the total number of edges incident to a vertex
+            (both in and out edges)
+        4) Vertex degrees computes vertex in-degree and out-degree.
+
+        By default, this method computes vertex in-degree, out-degree, degree
+        or degrees for the entire set of vertices. If vertex_subset is provided,
+        this method optionally filters out all but those listed in
+        vertex_subset.
+
+        Parameters
+        ----------
+        vertex_subset : cudf.Series or iterable container, optional
+            A container of vertices for displaying corresponding in-degree.
+            If not set, degrees are computed for the entire set of vertices.
+
+        degree_type : str (default='in_degree')
+
+        Returns
+        -------
+        df : cudf.DataFrame
+            GPU DataFrame of size N (the default) or the size of the given
+            vertices (vertex_subset) containing the in_degree, out_degrees,
+            degree or degrees. The ordering is relative to the adjacency list,
+            or that given by the specified vertex_subset.
+
+        Examples
+        --------
+        >>> M = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ',
+        ...                   dtype=['int32', 'int32', 'float32'], header=None)
+        >>> G = cugraph.Graph()
+        >>> G.from_cudf_edgelist(M, '0', '1')
+        >>> df = G.degrees_function([0,9,12], "in_degree")
+
+        """
+        if vertex_subset is not None:
+            if not isinstance(vertex_subset, cudf.Series):
+                vertex_subset = cudf.Series(vertex_subset)
+                if self.properties.renumbered is True:
+                    vertex_subset = self.renumber_map.to_internal_vertex_id(
+                        vertex_subset
+                    )
+                    vertex_subset_type = self.edgelist.edgelist_df.dtypes.iloc[0]
+                else:
+                    vertex_subset_type = self.input_df.dtypes.iloc[0]
+
+                vertex_subset = vertex_subset.astype(vertex_subset_type)
+
+        do_expensive_check = False
+        df = cudf.DataFrame()
+        vertex = None
+
+        if degree_type == "in_degree":
+            vertex, in_degrees = pylibcugraph_in_degrees(
+                resource_handle=ResourceHandle(),
+                graph=self._plc_graph,
+                source_vertices=vertex_subset,
+                do_expensive_check=do_expensive_check,
+            )
+            df["degree"] = in_degrees
+        elif degree_type == "out_degree":
+            vertex, out_degrees = pylibcugraph_out_degrees(
+                resource_handle=ResourceHandle(),
+                graph=self._plc_graph,
+                source_vertices=vertex_subset,
+                do_expensive_check=do_expensive_check,
+            )
+            df["degree"] = out_degrees
+        elif degree_type in ["degree", "degrees"]:
+            vertex, in_degrees, out_degrees = pylibcugraph_degrees(
+                resource_handle=ResourceHandle(),
+                graph=self._plc_graph,
+                source_vertices=vertex_subset,
+                do_expensive_check=do_expensive_check,
+            )
+            if degree_type == "degrees":
+                df["in_degree"] = in_degrees
+                df["out_degree"] = out_degrees
+
+            else:
+                df["degree"] = in_degrees + out_degrees
+        else:
+            raise ValueError(
+                "Incorrect degree type passed, valid values are ",
+                "'in_degree', 'out_degree', 'degree' and 'degrees' ",
+                f"got '{degree_type}'",
+            )
+        df["vertex"] = vertex
+        if self.properties.renumbered is True:
+            df = self.renumber_map.unrenumber(df, "vertex")
+
+        return df
+
+    def in_degree(
+        self, vertex_subset: Union[cudf.Series, Iterable] = None
+    ) -> cudf.DataFrame:
         """
         Compute vertex in-degree. Vertex in-degree is the number of edges
         pointing into the vertex. By default, this method computes vertex
@@ -892,11 +998,11 @@ def in_degree(self, vertex_subset=None):
         >>> df = G.in_degree([0,9,12])
 
         """
-        in_degree = self._degree(vertex_subset, direction=Direction.IN)
-
-        return in_degree
+        return self.degrees_function(vertex_subset, "in_degree")
 
-    def out_degree(self, vertex_subset=None):
+    def out_degree(
+        self, vertex_subset: Union[cudf.Series, Iterable] = None
+    ) -> cudf.DataFrame:
         """
         Compute vertex out-degree. Vertex out-degree is the number of edges
         pointing out from the vertex. By default, this method computes vertex
@@ -934,10 +1040,11 @@ def out_degree(self, vertex_subset=None):
         >>> df = G.out_degree([0,9,12])
 
         """
-        out_degree = self._degree(vertex_subset, direction=Direction.OUT)
-        return out_degree
+        return self.degrees_function(vertex_subset, "out_degree")
 
-    def degree(self, vertex_subset=None):
+    def degree(
+        self, vertex_subset: Union[cudf.Series, Iterable] = None
+    ) -> cudf.DataFrame:
         """
         Compute vertex degree, which is the total number of edges incident
         to a vertex (both in and out edges). By default, this method computes
@@ -976,10 +1083,12 @@ def degree(self, vertex_subset=None):
         >>> subset_df = G.degree([0,9,12])
 
         """
-        return self._degree(vertex_subset)
+        return self.degrees_function(vertex_subset, "degree")
 
     # FIXME:  vertex_subset could be a DataFrame for multi-column vertices
-    def degrees(self, vertex_subset=None):
+    def degrees(
+        self, vertex_subset: Union[cudf.Series, Iterable] = None
+    ) -> cudf.DataFrame:
         """
         Compute vertex in-degree and out-degree. By default, this method
         computes vertex degrees for the entire set of vertices. If
@@ -1019,70 +1128,7 @@ def degrees(self, vertex_subset=None):
         >>> df = G.degrees([0,9,12])
 
         """
-        (
-            vertex_col,
-            in_degree_col,
-            out_degree_col,
-        ) = graph_primtypes_wrapper._degrees(self)
-
-        df = cudf.DataFrame()
-        df["vertex"] = vertex_col
-        df["in_degree"] = in_degree_col
-        df["out_degree"] = out_degree_col
-
-        if self.properties.renumbered:
-            # Get the internal vertex IDs
-            nodes = self.renumber_map.df_internal_to_external["id"]
-        else:
-            nodes = self.nodes()
-        # If the vertex IDs are not contiguous, remove results for the
-        # isolated vertices
-        df = df[df["vertex"].isin(nodes.to_cupy())]
-
-        if vertex_subset is not None:
-            if not isinstance(vertex_subset, cudf.Series):
-                vertex_subset = cudf.Series(vertex_subset)
-                if self.properties.renumbered:
-                    vertex_subset = self.renumber_map.to_internal_vertex_id(
-                        vertex_subset
-                    )
-                vertex_subset = vertex_subset.to_cupy()
-            df = df[df["vertex"].isin(vertex_subset)]
-
-        if self.properties.renumbered:
-            df = self.renumber_map.unrenumber(df, "vertex")
-
-        return df
-
-    def _degree(self, vertex_subset, direction=Direction.ALL):
-        vertex_col, degree_col = graph_primtypes_wrapper._degree(self, direction)
-        df = cudf.DataFrame()
-        df["vertex"] = vertex_col
-        df["degree"] = degree_col
-
-        if self.properties.renumbered:
-            # Get the internal vertex IDs
-            nodes = self.renumber_map.df_internal_to_external["id"]
-        else:
-            nodes = self.nodes()
-        # If the vertex IDs are not contiguous, remove results for the
-        # isolated vertices
-        df = df[df["vertex"].isin(nodes.to_cupy())]
-
-        if vertex_subset is not None:
-            if not isinstance(vertex_subset, cudf.Series):
-                vertex_subset = cudf.Series(vertex_subset)
-                if self.properties.renumbered:
-                    vertex_subset = self.renumber_map.to_internal_vertex_id(
-                        vertex_subset
-                    )
-                vertex_subset = vertex_subset.to_cupy()
-            df = df[df["vertex"].isin(vertex_subset)]
-
-        if self.properties.renumbered:
-            df = self.renumber_map.unrenumber(df, "vertex")
-
-        return df
+        return self.degrees_function(vertex_subset, "degrees")
 
     def _make_plc_graph(
         self,
diff --git a/python/cugraph/cugraph/tests/centrality/test_degree_centrality_mg.py b/python/cugraph/cugraph/tests/centrality/test_degree_centrality_mg.py
index a46f4b9463b..1bef1e0872b 100644
--- a/python/cugraph/cugraph/tests/centrality/test_degree_centrality_mg.py
+++ b/python/cugraph/cugraph/tests/centrality/test_degree_centrality_mg.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -18,7 +18,6 @@
 import cudf
 import dask_cudf
 import cugraph
-from cugraph.dask.common.mg_utils import is_single_gpu
 from cugraph.testing.utils import RAPIDS_DATASET_ROOT_DIR_PATH
 from cudf.testing import assert_series_equal
 
@@ -41,7 +40,6 @@ def setup_function():
 
 
 @pytest.mark.mg
-@pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system")
 @pytest.mark.parametrize("directed", IS_DIRECTED)
 @pytest.mark.parametrize("data_file", DATA_PATH)
 def test_dask_mg_degree(dask_client, directed, data_file):
diff --git a/python/pylibcugraph/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/pylibcugraph/CMakeLists.txt
index c2e22fc1ff7..7cc90145949 100644
--- a/python/pylibcugraph/pylibcugraph/CMakeLists.txt
+++ b/python/pylibcugraph/pylibcugraph/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -57,6 +57,7 @@ set(cython_sources
     utils.pyx
     weakly_connected_components.pyx
     replicate_edgelist.pyx
+    degrees.pyx
 )
 set(linked_libraries cugraph::cugraph;cugraph::cugraph_c)
 
diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py
index ab518e24cae..dcdef05e106 100644
--- a/python/pylibcugraph/pylibcugraph/__init__.py
+++ b/python/pylibcugraph/pylibcugraph/__init__.py
@@ -95,6 +95,8 @@
 
 from pylibcugraph.sorensen_coefficients import sorensen_coefficients
 
+from pylibcugraph.degrees import in_degrees, out_degrees, degrees
+
 
 from pylibcugraph import exceptions
 
diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd
index 90bc041e5f0..6f1ac1f640b 100644
--- a/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd
+++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd
@@ -182,3 +182,58 @@ cdef extern from "cugraph_c/graph_functions.h":
             cugraph_induced_subgraph_result_t** result,
             cugraph_error_t** error
         )
+
+    ###########################################################################
+    # degrees
+    ctypedef struct cugraph_degrees_result_t:
+        pass
+
+    cdef cugraph_error_code_t \
+        cugraph_in_degrees(
+            const cugraph_resource_handle_t* handle,
+            cugraph_graph_t* graph,
+            const cugraph_type_erased_device_array_view_t* source_vertices,
+            bool_t do_expensive_check,
+            cugraph_degrees_result_t** result,
+            cugraph_error_t** error
+        )
+
+    cdef cugraph_error_code_t \
+        cugraph_out_degrees(
+            const cugraph_resource_handle_t* handle,
+            cugraph_graph_t* graph,
+            const cugraph_type_erased_device_array_view_t* source_vertices,
+            bool_t do_expensive_check,
+            cugraph_degrees_result_t** result,
+            cugraph_error_t** error
+        )
+
+    cdef cugraph_error_code_t \
+        cugraph_degrees(
+            const cugraph_resource_handle_t* handle,
+            cugraph_graph_t* graph,
+            const cugraph_type_erased_device_array_view_t* source_vertices,
+            bool_t do_expensive_check,
+            cugraph_degrees_result_t** result,
+            cugraph_error_t** error
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_degrees_result_get_vertices(
+            cugraph_degrees_result_t* degrees_result
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_degrees_result_get_in_degrees(
+            cugraph_degrees_result_t* degrees_result
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_degrees_result_get_out_degrees(
+            cugraph_degrees_result_t* degrees_result
+        )
+
+    cdef void \
+        cugraph_degrees_result_free(
+            cugraph_degrees_result_t* degrees_result
+        )
diff --git a/python/pylibcugraph/pylibcugraph/degrees.pyx b/python/pylibcugraph/pylibcugraph/degrees.pyx
new file mode 100644
index 00000000000..7818da441bd
--- /dev/null
+++ b/python/pylibcugraph/pylibcugraph/degrees.pyx
@@ -0,0 +1,307 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Have cython use python 3 syntax
+# cython: language_level = 3
+
+from libc.stdint cimport uintptr_t
+
+from pylibcugraph._cugraph_c.resource_handle cimport (
+    bool_t,
+    data_type_id_t,
+    cugraph_resource_handle_t,
+)
+from pylibcugraph._cugraph_c.error cimport (
+    cugraph_error_code_t,
+    cugraph_error_t,
+)
+from pylibcugraph._cugraph_c.array cimport (
+    cugraph_type_erased_device_array_view_t,
+)
+from pylibcugraph._cugraph_c.graph cimport (
+    cugraph_graph_t,
+)
+from pylibcugraph._cugraph_c.graph_functions cimport (
+    cugraph_degrees_result_t,
+    cugraph_degrees,
+    cugraph_in_degrees,
+    cugraph_out_degrees,
+    cugraph_degrees_result_get_vertices,
+    cugraph_degrees_result_get_in_degrees,
+    cugraph_degrees_result_get_out_degrees,
+    cugraph_degrees_result_free,
+)
+from pylibcugraph.resource_handle cimport (
+    ResourceHandle,
+)
+from pylibcugraph.graphs cimport (
+    _GPUGraph,
+)
+from pylibcugraph.utils cimport (
+    assert_success,
+    copy_to_cupy_array,
+    assert_CAI_type,
+    create_cugraph_type_erased_device_array_view_from_py_obj,
+)
+
+
+def in_degrees(ResourceHandle resource_handle,
+               _GPUGraph graph,
+               source_vertices,
+               bool_t do_expensive_check):
+    """
+    Compute the in degrees for the nodes of the graph.
+
+    Parameters
+    ----------
+    resource_handle : ResourceHandle
+        Handle to the underlying device resources needed for referencing data
+        and running algorithms.
+
+    graph : SGGraph or MGGraph
+        The input graph, for either Single or Multi-GPU operations.
+
+    source_vertices : cupy array
+        The nodes for which we will compute degrees.
+
+    do_expensive_check : bool_t
+        A flag to run expensive checks for input arguments if True.
+
+    Returns
+    -------
+    A tuple of device arrays, where the first item in the tuple is a device
+    array containing the vertices, the second item in the tuple is a device
+    array containing the in degrees for the vertices.
+
+    Examples
+    --------
+    >>> import pylibcugraph, cupy, numpy
+    >>> srcs = cupy.asarray([0, 1, 2], dtype=numpy.int32)
+    >>> dsts = cupy.asarray([1, 2, 3], dtype=numpy.int32)
+    >>> weights = cupy.asarray([1.0, 1.0, 1.0], dtype=numpy.float32)
+    >>> resource_handle = pylibcugraph.ResourceHandle()
+    >>> graph_props = pylibcugraph.GraphProperties(
+    ...     is_symmetric=False, is_multigraph=False)
+    >>> G = pylibcugraph.SGGraph(
+    ...     resource_handle, graph_props, srcs, dsts, weight_array=weights,
+    ...     store_transposed=True, renumber=False, do_expensive_check=False)
+    >>> (vertices, in_degrees) = pylibcugraph.in_degrees(
+                                   resource_handle, G, None, False)
+
+    """
+
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = \
+        resource_handle.c_resource_handle_ptr
+    cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr
+
+    cdef cugraph_degrees_result_t* result_ptr
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+
+    assert_CAI_type(source_vertices, "source_vertices", True)
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        source_vertices_ptr = \
+            create_cugraph_type_erased_device_array_view_from_py_obj(
+                source_vertices)
+
+    error_code = cugraph_in_degrees(c_resource_handle_ptr,
+                                    c_graph_ptr,
+                                    source_vertices_ptr,
+                                    do_expensive_check,
+                                    &result_ptr,
+                                    &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_in_degrees")
+
+    # Extract individual device array pointers from result and copy to cupy
+    # arrays for returning.
+    cdef cugraph_type_erased_device_array_view_t* vertices_ptr = \
+        cugraph_degrees_result_get_vertices(result_ptr)
+    cdef cugraph_type_erased_device_array_view_t* in_degrees_ptr = \
+        cugraph_degrees_result_get_in_degrees(result_ptr)
+
+    cupy_vertices = copy_to_cupy_array(c_resource_handle_ptr, vertices_ptr)
+    cupy_in_degrees = copy_to_cupy_array(c_resource_handle_ptr, in_degrees_ptr)
+
+    cugraph_degrees_result_free(result_ptr)
+
+    return (cupy_vertices, cupy_in_degrees)
+
+def out_degrees(ResourceHandle resource_handle,
+                _GPUGraph graph,
+                source_vertices,
+                bool_t do_expensive_check):
+    """
+    Compute the out degrees for the nodes of the graph.
+
+    Parameters
+    ----------
+    resource_handle : ResourceHandle
+        Handle to the underlying device resources needed for referencing data
+        and running algorithms.
+
+    graph : SGGraph or MGGraph
+        The input graph, for either Single or Multi-GPU operations.
+
+    source_vertices : cupy array
+        The nodes for which we will compute degrees.
+
+    do_expensive_check : bool_t
+        A flag to run expensive checks for input arguments if True.
+
+    Returns
+    -------
+    A tuple of device arrays, where the first item in the tuple is a device
+    array containing the vertices, the second item in the tuple is a device
+    array containing the out degrees for the vertices.
+
+    Examples
+    --------
+    >>> import pylibcugraph, cupy, numpy
+    >>> srcs = cupy.asarray([0, 1, 2], dtype=numpy.int32)
+    >>> dsts = cupy.asarray([1, 2, 3], dtype=numpy.int32)
+    >>> weights = cupy.asarray([1.0, 1.0, 1.0], dtype=numpy.float32)
+    >>> resource_handle = pylibcugraph.ResourceHandle()
+    >>> graph_props = pylibcugraph.GraphProperties(
+    ...     is_symmetric=False, is_multigraph=False)
+    >>> G = pylibcugraph.SGGraph(
+    ...     resource_handle, graph_props, srcs, dsts, weight_array=weights,
+    ...     store_transposed=True, renumber=False, do_expensive_check=False)
+    >>> (vertices, out_degrees) = pylibcugraph.out_degrees(
+                                    resource_handle, G, None, False)
+
+    """
+
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = \
+        resource_handle.c_resource_handle_ptr
+    cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr
+
+    cdef cugraph_degrees_result_t* result_ptr
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+
+    assert_CAI_type(source_vertices, "source_vertices", True)
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        source_vertices_ptr = \
+            create_cugraph_type_erased_device_array_view_from_py_obj(
+                source_vertices)
+
+    error_code = cugraph_out_degrees(c_resource_handle_ptr,
+                                     c_graph_ptr,
+                                     source_vertices_ptr,
+                                     do_expensive_check,
+                                     &result_ptr,
+                                     &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_out_degrees")
+
+    # Extract individual device array pointers from result and copy to cupy
+    # arrays for returning.
+    cdef cugraph_type_erased_device_array_view_t* vertices_ptr = \
+        cugraph_degrees_result_get_vertices(result_ptr)
+    cdef cugraph_type_erased_device_array_view_t* out_degrees_ptr = \
+        cugraph_degrees_result_get_out_degrees(result_ptr)
+
+    cupy_vertices = copy_to_cupy_array(c_resource_handle_ptr, vertices_ptr)
+    cupy_out_degrees = copy_to_cupy_array(c_resource_handle_ptr, out_degrees_ptr)
+
+    cugraph_degrees_result_free(result_ptr)
+
+    return (cupy_vertices, cupy_out_degrees)
+
+
+def degrees(ResourceHandle resource_handle,
+            _GPUGraph graph,
+            source_vertices,
+            bool_t do_expensive_check):
+    """
+    Compute the degrees for the nodes of the graph.
+
+    Parameters
+    ----------
+    resource_handle : ResourceHandle
+        Handle to the underlying device resources needed for referencing data
+        and running algorithms.
+
+    graph : SGGraph or MGGraph
+        The input graph, for either Single or Multi-GPU operations.
+
+    source_vertices : cupy array
+        The nodes for which we will compute degrees.
+
+    do_expensive_check : bool_t
+        A flag to run expensive checks for input arguments if True.
+
+    Returns
+    -------
+    A tuple of device arrays, where the first item in the tuple is a device
+    array containing the vertices, the second item in the tuple is a device
+    array containing the in degrees for the vertices, the third item in the
+    tuple is a device array containing the out degrees for the vertices.
+
+    Examples
+    --------
+    >>> import pylibcugraph, cupy, numpy
+    >>> srcs = cupy.asarray([0, 1, 2], dtype=numpy.int32)
+    >>> dsts = cupy.asarray([1, 2, 3], dtype=numpy.int32)
+    >>> weights = cupy.asarray([1.0, 1.0, 1.0], dtype=numpy.float32)
+    >>> resource_handle = pylibcugraph.ResourceHandle()
+    >>> graph_props = pylibcugraph.GraphProperties(
+    ...     is_symmetric=False, is_multigraph=False)
+    >>> G = pylibcugraph.SGGraph(
+    ...     resource_handle, graph_props, srcs, dsts, weight_array=weights,
+    ...     store_transposed=True, renumber=False, do_expensive_check=False)
+    >>> (vertices, in_degrees, out_degrees) = pylibcugraph.degrees(
+                                                resource_handle, G, None, False)
+
+    """
+
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = \
+        resource_handle.c_resource_handle_ptr
+    cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr
+
+    cdef cugraph_degrees_result_t* result_ptr
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+
+    assert_CAI_type(source_vertices, "source_vertices", True)
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        source_vertices_ptr = \
+            create_cugraph_type_erased_device_array_view_from_py_obj(
+                source_vertices)
+
+    error_code = cugraph_degrees(c_resource_handle_ptr,
+                                 c_graph_ptr,
+                                 source_vertices_ptr,
+                                 do_expensive_check,
+                                 &result_ptr,
+                                 &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_degrees")
+
+    # Extract individual device array pointers from result and copy to cupy
+    # arrays for returning.
+    cdef cugraph_type_erased_device_array_view_t* vertices_ptr = \
+        cugraph_degrees_result_get_vertices(result_ptr)
+    cdef cugraph_type_erased_device_array_view_t* in_degrees_ptr = \
+        cugraph_degrees_result_get_in_degrees(result_ptr)
+    cdef cugraph_type_erased_device_array_view_t* out_degrees_ptr = \
+        cugraph_degrees_result_get_out_degrees(result_ptr)
+
+    cupy_vertices = copy_to_cupy_array(c_resource_handle_ptr, vertices_ptr)
+    cupy_in_degrees = copy_to_cupy_array(c_resource_handle_ptr, in_degrees_ptr)
+    cupy_out_degrees = copy_to_cupy_array(c_resource_handle_ptr, out_degrees_ptr)
+
+    cugraph_degrees_result_free(result_ptr)
+
+    return (cupy_vertices, cupy_in_degrees, cupy_out_degrees)

From 120e5b892d5396f10ceb5d957de01c8d74a2e781 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com>
Date: Tue, 12 Mar 2024 20:10:13 -0700
Subject: [PATCH 5/5] Enable edge masking in the remaining primitives (#4186)

This PR will update all the remaining primitives to support edge masking.

This PR pulls updates from https://github.com/rapidsai/cugraph/pull/4126 and better be reviewed/merged after PR https://github.com/rapidsai/cugraph/pull/4126.

Authors:
  - Seunghwa Kang (https://github.com/seunghwak)

Approvers:
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Naim (https://github.com/naimnv)
  - Joseph Nke (https://github.com/jnke2016)

URL: https://github.com/rapidsai/cugraph/pull/4186
---
 cpp/include/cugraph/graph_view.hpp            |   4 +-
 cpp/include/cugraph/utilities/misc_utils.cuh  |   8 +
 cpp/src/prims/detail/nbr_intersection.cuh     |  11 +-
 ...m_reduce_dst_key_aggregated_outgoing_e.cuh | 529 +++++++++++-----
 ...t_nbr_intersection_of_e_endpoints_by_v.cuh |  24 +-
 .../transform_reduce_e_by_src_dst_key.cuh     | 292 +++++++--
 cpp/tests/CMakeLists.txt                      |  33 +-
 ...rm_reduce_dst_key_aggregated_outgoing_e.cu | 599 ++++++++++++++++++
 ..._v_transform_reduce_incoming_outgoing_e.cu |  84 +--
 cpp/tests/prims/mg_reduce_v.cu                |  47 +-
 ...st_nbr_intersection_of_e_endpoints_by_v.cu | 289 +++++++++
 cpp/tests/prims/mg_transform_reduce_e.cu      |  41 +-
 .../mg_transform_reduce_e_by_src_dst_key.cu   | 495 +++++++++++++++
 cpp/tests/prims/mg_transform_reduce_v.cu      |  47 +-
 cpp/tests/prims/result_compare.cuh            | 143 +++++
 15 files changed, 2189 insertions(+), 457 deletions(-)
 create mode 100644 cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu
 create mode 100644 cpp/tests/prims/mg_transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cu
 create mode 100644 cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu
 create mode 100644 cpp/tests/prims/result_compare.cuh

diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp
index 3f3514179bf..cbb52ef3b1e 100644
--- a/cpp/include/cugraph/graph_view.hpp
+++ b/cpp/include/cugraph/graph_view.hpp
@@ -613,7 +613,7 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if
       major_value_range_start_offset);
   }
 
-  // FIXME: deprecated, replaced with copmute_number_of_edges (which works with or without edge
+  // FIXME: deprecated, replaced with compute_number_of_edges (which works with or without edge
   // masking)
   edge_t number_of_edges() const
   {
@@ -923,7 +923,7 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if
       offsets_, indices_, this->number_of_vertices());
   }
 
-  // FIXME: deprecated, replaced with copmute_number_of_edges (which works with or without edge
+  // FIXME: deprecated, replaced with compute_number_of_edges (which works with or without edge
   // masking)
   edge_t number_of_edges() const
   {
diff --git a/cpp/include/cugraph/utilities/misc_utils.cuh b/cpp/include/cugraph/utilities/misc_utils.cuh
index d3917a3e851..633dabe5b40 100644
--- a/cpp/include/cugraph/utilities/misc_utils.cuh
+++ b/cpp/include/cugraph/utilities/misc_utils.cuh
@@ -94,6 +94,14 @@ thrust::optional<T> to_thrust_optional(std::optional<T> val)
   return ret;
 }
 
+template <typename T>
+std::optional<T> to_std_optional(thrust::optional<T> val)
+{
+  std::optional<T> ret{std::nullopt};
+  if (val) { ret = *val; }
+  return ret;
+}
+
 template <typename idx_t, typename offset_t>
 rmm::device_uvector<idx_t> expand_sparse_offsets(raft::device_span<offset_t const> offsets,
                                                  idx_t base_idx,
diff --git a/cpp/src/prims/detail/nbr_intersection.cuh b/cpp/src/prims/detail/nbr_intersection.cuh
index e0a04eb59da..847c1db6937 100644
--- a/cpp/src/prims/detail/nbr_intersection.cuh
+++ b/cpp/src/prims/detail/nbr_intersection.cuh
@@ -50,6 +50,7 @@
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/optional.h>
+#include <thrust/reduce.h>
 #include <thrust/remove.h>
 #include <thrust/scan.h>
 #include <thrust/set_operations.h>
@@ -1232,9 +1233,11 @@ nbr_intersection(raft::handle_t const& handle,
           rx_v_pair_nbr_intersection_sizes.size() + 1, handle.get_stream());
         rx_v_pair_nbr_intersection_offsets.set_element_to_zero_async(size_t{0},
                                                                      handle.get_stream());
+        auto size_first = thrust::make_transform_iterator(
+          rx_v_pair_nbr_intersection_sizes.begin(), cugraph::detail::typecast_t<edge_t, size_t>{});
         thrust::inclusive_scan(handle.get_thrust_policy(),
-                               rx_v_pair_nbr_intersection_sizes.begin(),
-                               rx_v_pair_nbr_intersection_sizes.end(),
+                               size_first,
+                               size_first + rx_v_pair_nbr_intersection_sizes.size(),
                                rx_v_pair_nbr_intersection_offsets.begin() + 1);
 
         rx_v_pair_nbr_intersection_indices.resize(
@@ -1344,8 +1347,8 @@ nbr_intersection(raft::handle_t const& handle,
         }
 
         thrust::inclusive_scan(handle.get_thrust_policy(),
-                               rx_v_pair_nbr_intersection_sizes.begin(),
-                               rx_v_pair_nbr_intersection_sizes.end(),
+                               size_first,
+                               size_first + rx_v_pair_nbr_intersection_sizes.size(),
                                rx_v_pair_nbr_intersection_offsets.begin() + 1);
 
         std::vector<size_t> h_rx_v_pair_lasts(rx_v_pair_counts.size());
diff --git a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh
index 3b25ae50773..5e4cd81513e 100644
--- a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh
@@ -16,6 +16,7 @@
 #pragma once
 
 #include "detail/graph_partition_utils.cuh"
+#include "prims/detail/optional_dataframe_buffer.hpp"
 #include "prims/kv_store.cuh"
 #include "utilities/collect_comm.cuh"
 
@@ -83,15 +84,23 @@ struct rebase_offset_t {
 
 // a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
 template <typename vertex_t, typename edge_value_t>
-struct triplet_to_minor_comm_rank_t {
+struct tuple_to_minor_comm_rank_t {
   compute_vertex_partition_id_from_ext_vertex_t<vertex_t> key_func{};
   int minor_comm_size{};
 
-  __device__ int operator()(
+  template <typename edge_value_type = edge_value_t>
+  __device__ std::enable_if_t<!std::is_same_v<edge_value_type, thrust::nullopt_t>, int> operator()(
     thrust::tuple<vertex_t, vertex_t, edge_value_t> val /* major, minor key, edge value */) const
   {
     return key_func(thrust::get<1>(val)) % minor_comm_size;
   }
+
+  template <typename edge_value_type = edge_value_t>
+  __device__ std::enable_if_t<std::is_same_v<edge_value_type, thrust::nullopt_t>, int> operator()(
+    thrust::tuple<vertex_t, vertex_t> val /* major, minor key */) const
+  {
+    return key_func(thrust::get<1>(val)) % minor_comm_size;
+  }
 };
 
 // a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
@@ -106,6 +115,7 @@ struct pair_to_binary_partition_id_t {
 // a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
 template <typename vertex_t,
           typename edge_value_t,
+          typename e_op_result_t,
           typename EdgePartitionDeviceView,
           typename EdgeMajorValueMap,
           typename EdgePartitionMajorValueInputWrapper,
@@ -118,8 +128,10 @@ struct call_key_aggregated_e_op_t {
   EdgeMinorKeyValueMap edge_minor_key_value_map{};
   KeyAggregatedEdgeOp key_aggregated_e_op{};
 
-  __device__ auto operator()(thrust::tuple<vertex_t, vertex_t, edge_value_t>
-                               val /* major, minor key, aggregated edge value */) const
+  template <typename edge_value_type = edge_value_t>
+  __device__ std::enable_if_t<!std::is_same_v<edge_value_type, thrust::nullopt_t>, e_op_result_t>
+  operator()(thrust::tuple<vertex_t, vertex_t, edge_value_t>
+               val /* major, minor key, aggregated edge value */) const
   {
     auto major                 = thrust::get<0>(val);
     auto minor_key             = thrust::get<1>(val);
@@ -131,6 +143,20 @@ struct call_key_aggregated_e_op_t {
     return key_aggregated_e_op(
       major, minor_key, major_val, edge_minor_key_value_map.find(minor_key), aggregated_edge_value);
   }
+
+  template <typename edge_value_type = edge_value_t>
+  __device__ std::enable_if_t<std::is_same_v<edge_value_type, thrust::nullopt_t>, e_op_result_t>
+  operator()(thrust::tuple<vertex_t, vertex_t> val /* major, minor key */) const
+  {
+    auto major     = thrust::get<0>(val);
+    auto minor_key = thrust::get<1>(val);
+    auto major_val = edge_major_value_map
+                       ? (*edge_major_value_map).find(major)
+                       : edge_partition_major_value_input.get(
+                           edge_partition.major_offset_from_major_nocheck(major));
+    return key_aggregated_e_op(
+      major, minor_key, major_val, edge_minor_key_value_map.find(minor_key), thrust::nullopt);
+  }
 };
 
 // a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
@@ -182,9 +208,8 @@ struct reduce_with_init_t {
  * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values.
  * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values.
  * @tparam EdgeDstKeyInputWrapper Type of the wrapper for edge destination key values.
- * @tparam VertexIterator Type of the iterator for keys in (key, value) pairs (key type should
- * coincide with vertex type).
- * @tparam ValueIterator Type of the iterator for values in (key, value) pairs.
+ * @tparam KVStoreViewType Type of the (key, value) store. Key type should coincide with vertex
+ * type.
  * @tparam KeyAggregatedEdgeOp Type of the quinary key-aggregated edge operator.
  * @tparam ReduceOp Type of the binary reduction operator.
  * @tparam T Type of the initial value for per-vertex reduction.
@@ -204,15 +229,10 @@ struct reduce_with_init_t {
  * @param edge_dst_key_input Wrapper used to access destination input key values (for the edge
  * destinations assigned to this process in multi-GPU). Use  cugraph::edge_dst_property_t::view().
  * Use update_edge_dst_property to fill the wrapper.
- * @param map_unique_key_first Iterator pointing to the first (inclusive) key in (key, value) pairs
- * (assigned to this process in multi-GPU, `cugraph::detail::compute_gpu_id_from_ext_vertex_t` is
- * used to map keys to processes). (Key, value) pairs may be provided by
- * transform_reduce_by_src_key_e() or transform_reduce_by_dst_key_e().
- * @param map_unique_key_last Iterator pointing to the last (exclusive) key in (key, value) pairs
- * (assigned to this process in multi-GPU).
- * @param map_value_first Iterator pointing to the first (inclusive) value in (key, value) pairs
- * (assigned to this process in multi-GPU). `map_value_last` (exclusive) is deduced as @p
- * map_value_first + thrust::distance(@p map_unique_key_first, @p map_unique_key_last).
+ * @param kv_store_view view object of the (key, value) store (for the keys assigned to this process
+ * in multi-GPU). `cugraph::detail::compute_gpu_id_from_ext_vertex_t` is used to map keys to
+ * processes). (Key, value) pairs may be provided by transform_reduce_e_by_src_key() or
+ * transform_reduce_e_by_dst_key().
  * @param key_aggregated_e_op Quinary operator takes 1) edge source, 2) key, 3) *(@p
  * edge_partition_src_value_input_first + i), 4) value for the key stored in the input (key, value)
  * pairs provided by @p map_unique_key_first, @p map_unique_key_last, and @p map_value_first
@@ -263,8 +283,11 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
   using edge_src_value_t = typename EdgeSrcValueInputWrapper::value_type;
   using edge_value_t     = typename EdgeValueInputWrapper::value_type;
   using kv_pair_value_t  = typename KVStoreViewType::value_type;
+  using optional_edge_value_buffer_value_type =
+    std::conditional_t<!std::is_same_v<edge_value_t, thrust::nullopt_t>, edge_value_t, void>;
+
   static_assert(
-    std::is_arithmetic_v<edge_value_t>,
+    std::is_same_v<edge_value_t, thrust::nullopt_t> || std::is_arithmetic_v<edge_value_t>,
     "Currently only scalar values are supported, should be extended to support thrust::tuple of "
     "arithmetic types and void (for dummy property values) to be consistent with other "
     "primitives.");  // this will also require a custom edge value aggregation op.
@@ -284,16 +307,15 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
     detail::edge_partition_edge_dummy_property_device_view_t<vertex_t>,
     detail::edge_partition_edge_property_device_view_t<
       edge_t,
-      typename EdgeValueInputWrapper::value_iterator>>;
-
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+      typename EdgeValueInputWrapper::value_iterator,
+      typename EdgeValueInputWrapper::value_type>>;
 
   if (do_expensive_check) { /* currently, nothing to do */
   }
 
   auto total_global_mem = handle.get_device_properties().totalGlobalMem;
   size_t element_size   = sizeof(vertex_t) * 2;  // major + minor keys
-  if constexpr (!std::is_same_v<edge_value_t, void>) {
+  if constexpr (!std::is_same_v<edge_value_t, thrust::nullopt_t>) {
     static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic<edge_value_t>::value);
     if constexpr (is_thrust_tuple_of_arithmetic<edge_value_t>::value) {
       element_size += sum_thrust_tuple_element_sizes<edge_value_t>();
@@ -317,24 +339,78 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
 
   // 1. aggregate each vertex out-going edges based on keys and transform-reduce.
 
+  auto edge_mask_view = graph_view.edge_mask_view();
+
   rmm::device_uvector<vertex_t> majors(0, handle.get_stream());
   auto e_op_result_buffer = allocate_dataframe_buffer<T>(0, handle.get_stream());
   for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
     auto edge_partition =
       edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
         graph_view.local_edge_partition_view(i));
+    auto edge_partition_e_mask =
+      edge_mask_view
+        ? thrust::make_optional<
+            detail::edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool>>(
+            *edge_mask_view, i)
+        : thrust::nullopt;
 
     auto edge_partition_src_value_input =
       edge_partition_src_input_device_view_t(edge_src_value_input, i);
     auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, i);
 
-    rmm::device_uvector<vertex_t> tmp_majors(edge_partition.number_of_edges(), handle.get_stream());
+    std::optional<rmm::device_uvector<edge_t>> offsets_with_mask{std::nullopt};
+    if (edge_partition_e_mask) {
+      rmm::device_uvector<edge_t> degrees_with_mask(0, handle.get_stream());
+      if (edge_partition.dcs_nzd_vertices()) {
+        auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i);
+
+        auto major_sparse_range_size =
+          (*segment_offsets)[detail::num_sparse_segments_per_vertex_partition];
+        degrees_with_mask = rmm::device_uvector<edge_t>(
+          major_sparse_range_size + *(edge_partition.dcs_nzd_vertex_count()), handle.get_stream());
+        auto major_first = thrust::make_transform_iterator(
+          thrust::make_counting_iterator(vertex_t{0}),
+          cuda::proclaim_return_type<vertex_t>(
+            [major_sparse_range_size,
+             major_range_first = edge_partition.major_range_first(),
+             dcs_nzd_vertices  = *(edge_partition.dcs_nzd_vertices())] __device__(vertex_t i) {
+              if (i < major_sparse_range_size) {  // sparse
+                return major_range_first + i;
+              } else {  // hypersparse
+                return *(dcs_nzd_vertices + (i - major_sparse_range_size));
+              }
+            }));
+        degrees_with_mask =
+          edge_partition.compute_local_degrees_with_mask((*edge_partition_e_mask).value_first(),
+                                                         major_first,
+                                                         major_first + degrees_with_mask.size(),
+                                                         handle.get_stream());
+      } else {
+        degrees_with_mask = edge_partition.compute_local_degrees_with_mask(
+          (*edge_partition_e_mask).value_first(),
+          thrust::make_counting_iterator(edge_partition.major_range_first()),
+          thrust::make_counting_iterator(edge_partition.major_range_last()),
+          handle.get_stream());
+      }
+      offsets_with_mask =
+        rmm::device_uvector<edge_t>(degrees_with_mask.size() + 1, handle.get_stream());
+      (*offsets_with_mask).set_element_to_zero_async(0, handle.get_stream());
+      thrust::inclusive_scan(handle.get_thrust_policy(),
+                             degrees_with_mask.begin(),
+                             degrees_with_mask.end(),
+                             (*offsets_with_mask).begin() + 1);
+    }
+
+    rmm::device_uvector<vertex_t> tmp_majors(
+      edge_partition_e_mask ? (*offsets_with_mask).back_element(handle.get_stream())
+                            : edge_partition.number_of_edges(),
+      handle.get_stream());
     rmm::device_uvector<vertex_t> tmp_minor_keys(tmp_majors.size(), handle.get_stream());
-    // FIXME: this doesn't work if edge_value_t is thrust::tuple or void
-    rmm::device_uvector<edge_value_t> tmp_key_aggregated_edge_values(tmp_majors.size(),
-                                                                     handle.get_stream());
+    auto tmp_key_aggregated_edge_values =
+      detail::allocate_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+        tmp_majors.size(), handle.get_stream());
 
-    if (edge_partition.number_of_edges() > 0) {
+    if (tmp_majors.size() > 0) {
       auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i);
 
       detail::decompress_edge_partition_to_fill_edgelist_majors<vertex_t,
@@ -342,7 +418,7 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
                                                                 GraphViewType::is_multi_gpu>(
         handle,
         edge_partition,
-        std::nullopt,
+        detail::to_std_optional(edge_partition_e_mask),
         raft::device_span<vertex_t>(tmp_majors.data(), tmp_majors.size()),
         segment_offsets);
 
@@ -357,14 +433,14 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
         static_cast<size_t>(handle.get_device_properties().multiProcessorCount) * (1 << 20);
       auto [h_vertex_offsets, h_edge_offsets] = detail::compute_offset_aligned_element_chunks(
         handle,
-        raft::device_span<edge_t const>{
-          edge_partition.offsets(),
-          1 + static_cast<size_t>(
-                edge_partition.dcs_nzd_vertices()
-                  ? (*segment_offsets)[detail::num_sparse_segments_per_vertex_partition] +
-                      *(edge_partition.dcs_nzd_vertex_count())
-                  : edge_partition.major_range_size())},
-        edge_partition.number_of_edges(),
+        raft::device_span<edge_t const>(
+          offsets_with_mask ? (*offsets_with_mask).data() : edge_partition.offsets(),
+          (edge_partition.dcs_nzd_vertices()
+             ? (*segment_offsets)[detail::num_sparse_segments_per_vertex_partition] +
+                 *(edge_partition.dcs_nzd_vertex_count())
+             : edge_partition.major_range_size()) +
+            1),
+        static_cast<edge_t>(tmp_majors.size()),
         approx_edges_to_sort_per_iteration);
       auto num_chunks = h_vertex_offsets.size() - 1;
 
@@ -376,30 +452,69 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
       rmm::device_uvector<vertex_t> unreduced_majors(max_chunk_size, handle.get_stream());
       rmm::device_uvector<vertex_t> unreduced_minor_keys(unreduced_majors.size(),
                                                          handle.get_stream());
-      // FIXME: this doesn't work if edge_value_t is thrust::tuple or void
-      rmm::device_uvector<edge_value_t> unreduced_key_aggregated_edge_values(
-        unreduced_majors.size(), handle.get_stream());
+      auto unreduced_key_aggregated_edge_values =
+        detail::allocate_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+          unreduced_majors.size(), handle.get_stream());
       rmm::device_uvector<std::byte> d_tmp_storage(0, handle.get_stream());
 
       size_t reduced_size{0};
       for (size_t j = 0; j < num_chunks; ++j) {
-        thrust::copy(handle.get_thrust_policy(),
-                     minor_key_first + h_edge_offsets[j],
-                     minor_key_first + h_edge_offsets[j + 1],
-                     tmp_minor_keys.begin() + h_edge_offsets[j]);
+        if (edge_partition_e_mask) {
+          std::array<edge_t, 2> unmasked_ranges{};
+          raft::update_host(unmasked_ranges.data(),
+                            edge_partition.offsets() + h_vertex_offsets[j],
+                            1,
+                            handle.get_stream());
+          raft::update_host(unmasked_ranges.data() + 1,
+                            edge_partition.offsets() + h_vertex_offsets[j + 1],
+                            1,
+                            handle.get_stream());
+          handle.sync_stream();
+          if constexpr (!std::is_same_v<edge_value_t, thrust::nullopt_t>) {
+            detail::copy_if_mask_set(
+              handle,
+              thrust::make_zip_iterator(minor_key_first,
+                                        edge_partition_e_value_input.value_first()) +
+                unmasked_ranges[0],
+              thrust::make_zip_iterator(minor_key_first,
+                                        edge_partition_e_value_input.value_first()) +
+                unmasked_ranges[1],
+              (*edge_partition_e_mask).value_first() + unmasked_ranges[0],
+              thrust::make_zip_iterator(tmp_minor_keys.begin(),
+                                        detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                                          tmp_key_aggregated_edge_values)) +
+                h_edge_offsets[j]);
+          } else {
+            detail::copy_if_mask_set(handle,
+                                     minor_key_first + unmasked_ranges[0],
+                                     minor_key_first + unmasked_ranges[1],
+                                     (*edge_partition_e_mask).value_first() + unmasked_ranges[0],
+                                     tmp_minor_keys.begin() + h_edge_offsets[j]);
+          }
+        } else {
+          thrust::copy(handle.get_thrust_policy(),
+                       minor_key_first + h_edge_offsets[j],
+                       minor_key_first + h_edge_offsets[j + 1],
+                       tmp_minor_keys.begin() + h_edge_offsets[j]);
+        }
 
         size_t tmp_storage_bytes{0};
-        auto offset_first =
-          thrust::make_transform_iterator(edge_partition.offsets() + h_vertex_offsets[j],
-                                          detail::rebase_offset_t<edge_t>{h_edge_offsets[j]});
-        if constexpr (!std::is_same_v<edge_value_t, void>) {
+        auto offset_first = thrust::make_transform_iterator(
+          (offsets_with_mask ? (*offsets_with_mask).data() : edge_partition.offsets()) +
+            h_vertex_offsets[j],
+          detail::rebase_offset_t<edge_t>{h_edge_offsets[j]});
+        if constexpr (!std::is_same_v<edge_value_t, thrust::nullopt_t>) {
           cub::DeviceSegmentedSort::SortPairs(
             static_cast<void*>(nullptr),
             tmp_storage_bytes,
             tmp_minor_keys.begin() + h_edge_offsets[j],
             unreduced_minor_keys.begin(),
-            edge_partition_e_value_input.value_first() + h_edge_offsets[j],
-            unreduced_key_aggregated_edge_values.begin(),
+            (edge_partition_e_mask ? detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                                       tmp_key_aggregated_edge_values)
+                                   : edge_partition_e_value_input.value_first()) +
+              h_edge_offsets[j],
+            detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+              unreduced_key_aggregated_edge_values),
             h_edge_offsets[j + 1] - h_edge_offsets[j],
             h_vertex_offsets[j + 1] - h_vertex_offsets[j],
             offset_first,
@@ -419,14 +534,18 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
         if (tmp_storage_bytes > d_tmp_storage.size()) {
           d_tmp_storage = rmm::device_uvector<std::byte>(tmp_storage_bytes, handle.get_stream());
         }
-        if constexpr (!std::is_same_v<edge_value_t, void>) {
+        if constexpr (!std::is_same_v<edge_value_t, thrust::nullopt_t>) {
           cub::DeviceSegmentedSort::SortPairs(
             d_tmp_storage.data(),
             tmp_storage_bytes,
             tmp_minor_keys.begin() + h_edge_offsets[j],
             unreduced_minor_keys.begin(),
-            edge_partition_e_value_input.value_first() + h_edge_offsets[j],
-            unreduced_key_aggregated_edge_values.begin(),
+            (edge_partition_e_mask ? detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                                       tmp_key_aggregated_edge_values)
+                                   : edge_partition_e_value_input.value_first()) +
+              h_edge_offsets[j],
+            detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+              unreduced_key_aggregated_edge_values),
             h_edge_offsets[j + 1] - h_edge_offsets[j],
             h_vertex_offsets[j + 1] - h_vertex_offsets[j],
             offset_first,
@@ -448,39 +567,44 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
                      tmp_majors.begin() + h_edge_offsets[j],
                      tmp_majors.begin() + h_edge_offsets[j + 1],
                      unreduced_majors.begin());
-        auto input_key_first = thrust::make_zip_iterator(
-          thrust::make_tuple(unreduced_majors.begin(), unreduced_minor_keys.begin()));
+        auto input_key_first =
+          thrust::make_zip_iterator(unreduced_majors.begin(), unreduced_minor_keys.begin());
         auto output_key_first =
-          thrust::make_zip_iterator(thrust::make_tuple(tmp_majors.begin(), tmp_minor_keys.begin()));
-        if constexpr (!std::is_same_v<edge_value_t, void>) {
+          thrust::make_zip_iterator(tmp_majors.begin(), tmp_minor_keys.begin());
+        if constexpr (!std::is_same_v<edge_value_t, thrust::nullopt_t>) {
           reduced_size +=
             thrust::distance(output_key_first + reduced_size,
                              thrust::get<0>(thrust::reduce_by_key(
                                handle.get_thrust_policy(),
                                input_key_first,
                                input_key_first + (h_edge_offsets[j + 1] - h_edge_offsets[j]),
-                               unreduced_key_aggregated_edge_values.begin(),
+                               detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                                 unreduced_key_aggregated_edge_values),
                                output_key_first + reduced_size,
-                               tmp_key_aggregated_edge_values.begin() + reduced_size)));
+                               detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                                 tmp_key_aggregated_edge_values) +
+                                 reduced_size)));
         } else {
-          reduced_size +=
-            thrust::distance(output_key_first + reduced_size,
-                             thrust::get<0>(thrust::unique(
-                               handle.get_thrust_policy(),
-                               input_key_first,
-                               input_key_first + (h_edge_offsets[j + 1] - h_edge_offsets[j]),
-                               output_key_first + reduced_size)));
+          reduced_size += thrust::distance(
+            output_key_first + reduced_size,
+            thrust::copy_if(
+              handle.get_thrust_policy(),
+              input_key_first,
+              input_key_first + (h_edge_offsets[j + 1] - h_edge_offsets[j]),
+              thrust::make_counting_iterator(size_t{0}),
+              output_key_first + reduced_size,
+              cugraph::detail::is_first_in_run_t<decltype(input_key_first)>{input_key_first}));
         }
       }
       tmp_majors.resize(reduced_size, handle.get_stream());
       tmp_minor_keys.resize(tmp_majors.size(), handle.get_stream());
-      // FIXME: this doesn't work if edge_value_t is thrust::tuple or void
-      tmp_key_aggregated_edge_values.resize(tmp_majors.size(), handle.get_stream());
+      detail::resize_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+        tmp_key_aggregated_edge_values, tmp_majors.size(), handle.get_stream());
     }
     tmp_majors.shrink_to_fit(handle.get_stream());
     tmp_minor_keys.shrink_to_fit(handle.get_stream());
-    // FIXME: this doesn't work if edge_value_t is thrust::tuple or void
-    tmp_key_aggregated_edge_values.shrink_to_fit(handle.get_stream());
+    detail::shrink_to_fit_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+      tmp_key_aggregated_edge_values, handle.get_stream());
 
     std::unique_ptr<
       kv_store_t<vertex_t,
@@ -499,18 +623,34 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
       auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
       auto const minor_comm_size = minor_comm.get_size();
 
-      // FIXME: this doesn't work if edge_value_t is thrust::tuple or void
-      auto triplet_first     = thrust::make_zip_iterator(thrust::make_tuple(
-        tmp_majors.begin(), tmp_minor_keys.begin(), tmp_key_aggregated_edge_values.begin()));
-      auto d_tx_value_counts = cugraph::groupby_and_count(
-        triplet_first,
-        triplet_first + tmp_majors.size(),
-        detail::triplet_to_minor_comm_rank_t<vertex_t, edge_value_t>{
-          detail::compute_vertex_partition_id_from_ext_vertex_t<vertex_t>{comm_size},
-          minor_comm_size},
-        minor_comm_size,
-        mem_frugal_threshold,
-        handle.get_stream());
+      rmm::device_uvector<size_t> d_tx_value_counts(0, handle.get_stream());
+      if constexpr (!std::is_same_v<edge_value_t, thrust::nullopt_t>) {
+        auto triplet_first =
+          thrust::make_zip_iterator(tmp_majors.begin(),
+                                    tmp_minor_keys.begin(),
+                                    detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                                      tmp_key_aggregated_edge_values));
+        d_tx_value_counts = cugraph::groupby_and_count(
+          triplet_first,
+          triplet_first + tmp_majors.size(),
+          detail::tuple_to_minor_comm_rank_t<vertex_t, edge_value_t>{
+            detail::compute_vertex_partition_id_from_ext_vertex_t<vertex_t>{comm_size},
+            minor_comm_size},
+          minor_comm_size,
+          mem_frugal_threshold,
+          handle.get_stream());
+      } else {
+        auto pair_first   = thrust::make_zip_iterator(tmp_majors.begin(), tmp_minor_keys.begin());
+        d_tx_value_counts = cugraph::groupby_and_count(
+          pair_first,
+          pair_first + tmp_majors.size(),
+          detail::tuple_to_minor_comm_rank_t<vertex_t, edge_value_t>{
+            detail::compute_vertex_partition_id_from_ext_vertex_t<vertex_t>{comm_size},
+            minor_comm_size},
+          minor_comm_size,
+          mem_frugal_threshold,
+          handle.get_stream());
+      }
 
       std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
       raft::update_host(h_tx_value_counts.data(),
@@ -544,8 +684,7 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
           thrust::copy(
             handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end(), majors.begin());
 
-          auto pair_first =
-            thrust::make_zip_iterator(thrust::make_tuple(minor_comm_ranks.begin(), majors.begin()));
+          auto pair_first = thrust::make_zip_iterator(minor_comm_ranks.begin(), majors.begin());
           thrust::sort(
             handle.get_thrust_policy(), pair_first, pair_first + minor_comm_ranks.size());
           auto unique_pair_last = thrust::unique(
@@ -622,7 +761,9 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
 
       rmm::device_uvector<vertex_t> rx_majors(0, handle.get_stream());
       rmm::device_uvector<vertex_t> rx_minor_keys(0, handle.get_stream());
-      rmm::device_uvector<edge_value_t> rx_key_aggregated_edge_values(0, handle.get_stream());
+      auto rx_key_aggregated_edge_values =
+        detail::allocate_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+          0, handle.get_stream());
       auto mem_frugal_flag =
         host_scalar_allreduce(minor_comm,
                               tmp_majors.size() > mem_frugal_threshold ? int{1} : int{0},
@@ -639,66 +780,120 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
         tmp_minor_keys.resize(0, handle.get_stream());
         tmp_minor_keys.shrink_to_fit(handle.get_stream());
 
-        std::tie(rx_key_aggregated_edge_values, std::ignore) =
-          shuffle_values(minor_comm,
-                         tmp_key_aggregated_edge_values.begin(),
-                         h_tx_value_counts,
-                         handle.get_stream());
-        tmp_key_aggregated_edge_values.resize(0, handle.get_stream());
-        tmp_key_aggregated_edge_values.shrink_to_fit(handle.get_stream());
+        if constexpr (!std::is_same_v<edge_value_t, thrust::nullopt_t>) {
+          std::tie(rx_key_aggregated_edge_values, std::ignore) =
+            shuffle_values(minor_comm,
+                           detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                             tmp_key_aggregated_edge_values),
+                           h_tx_value_counts,
+                           handle.get_stream());
+        }
+        detail::resize_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+          tmp_key_aggregated_edge_values, 0, handle.get_stream());
+        detail::shrink_to_fit_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+          tmp_key_aggregated_edge_values, handle.get_stream());
       } else {
-        std::forward_as_tuple(std::tie(rx_majors, rx_minor_keys, rx_key_aggregated_edge_values),
-                              std::ignore) =
-          shuffle_values(minor_comm, triplet_first, h_tx_value_counts, handle.get_stream());
+        if constexpr (!std::is_same_v<edge_value_t, thrust::nullopt_t>) {
+          auto triplet_first =
+            thrust::make_zip_iterator(tmp_majors.begin(),
+                                      tmp_minor_keys.begin(),
+                                      detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                                        tmp_key_aggregated_edge_values));
+          std::forward_as_tuple(std::tie(rx_majors, rx_minor_keys, rx_key_aggregated_edge_values),
+                                std::ignore) =
+            shuffle_values(minor_comm, triplet_first, h_tx_value_counts, handle.get_stream());
+        } else {
+          auto pair_first = thrust::make_zip_iterator(tmp_majors.begin(), tmp_minor_keys.begin());
+          std::forward_as_tuple(std::tie(rx_majors, rx_minor_keys), std::ignore) =
+            shuffle_values(minor_comm, pair_first, h_tx_value_counts, handle.get_stream());
+        }
         tmp_majors.resize(0, handle.get_stream());
         tmp_majors.shrink_to_fit(handle.get_stream());
         tmp_minor_keys.resize(0, handle.get_stream());
         tmp_minor_keys.shrink_to_fit(handle.get_stream());
-        tmp_key_aggregated_edge_values.resize(0, handle.get_stream());
-        tmp_key_aggregated_edge_values.shrink_to_fit(handle.get_stream());
+        detail::resize_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+          tmp_key_aggregated_edge_values, 0, handle.get_stream());
+        detail::shrink_to_fit_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+          tmp_key_aggregated_edge_values, handle.get_stream());
       }
 
-      auto key_pair_first =
-        thrust::make_zip_iterator(thrust::make_tuple(rx_majors.begin(), rx_minor_keys.begin()));
-      if (rx_majors.size() > mem_frugal_threshold) {  // trade-off parallelism to lower peak memory
-        auto second_first =
-          detail::mem_frugal_partition(key_pair_first,
-                                       key_pair_first + rx_majors.size(),
-                                       rx_key_aggregated_edge_values.begin(),
-                                       detail::pair_to_binary_partition_id_t<vertex_t>{},
-                                       int{1},
-                                       handle.get_stream());
-
-        thrust::sort_by_key(handle.get_thrust_policy(),
-                            key_pair_first,
-                            std::get<0>(second_first),
-                            rx_key_aggregated_edge_values.begin());
-
-        thrust::sort_by_key(handle.get_thrust_policy(),
-                            std::get<0>(second_first),
-                            key_pair_first + rx_majors.size(),
-                            std::get<1>(second_first));
+      auto key_pair_first = thrust::make_zip_iterator(rx_majors.begin(), rx_minor_keys.begin());
+      if constexpr (!std::is_same_v<edge_value_t, thrust::nullopt_t>) {
+        if (rx_majors.size() >
+            mem_frugal_threshold) {  // trade-off parallelism to lower peak memory
+          auto second_first =
+            detail::mem_frugal_partition(key_pair_first,
+                                         key_pair_first + rx_majors.size(),
+                                         detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                                           rx_key_aggregated_edge_values),
+                                         detail::pair_to_binary_partition_id_t<vertex_t>{},
+                                         int{1},
+                                         handle.get_stream());
+
+          thrust::sort_by_key(handle.get_thrust_policy(),
+                              key_pair_first,
+                              std::get<0>(second_first),
+                              detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                                rx_key_aggregated_edge_values));
+
+          thrust::sort_by_key(handle.get_thrust_policy(),
+                              std::get<0>(second_first),
+                              key_pair_first + rx_majors.size(),
+                              std::get<1>(second_first));
+        } else {
+          thrust::sort_by_key(handle.get_thrust_policy(),
+                              key_pair_first,
+                              key_pair_first + rx_majors.size(),
+                              detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+                                rx_key_aggregated_edge_values));
+        }
+
+        auto num_uniques =
+          thrust::count_if(handle.get_thrust_policy(),
+                           thrust::make_counting_iterator(size_t{0}),
+                           thrust::make_counting_iterator(rx_majors.size()),
+                           detail::is_first_in_run_t<decltype(key_pair_first)>{key_pair_first});
+        tmp_majors.resize(num_uniques, handle.get_stream());
+        tmp_minor_keys.resize(tmp_majors.size(), handle.get_stream());
+        detail::resize_optional_dataframe_buffer<edge_value_t>(
+          tmp_key_aggregated_edge_values, tmp_majors.size(), handle.get_stream());
+        thrust::reduce_by_key(
+          handle.get_thrust_policy(),
+          key_pair_first,
+          key_pair_first + rx_majors.size(),
+          detail::get_optional_dataframe_buffer_begin<edge_value_t>(rx_key_aggregated_edge_values),
+          thrust::make_zip_iterator(tmp_majors.begin(), tmp_minor_keys.begin()),
+          detail::get_optional_dataframe_buffer_begin<edge_value_t>(
+            tmp_key_aggregated_edge_values));
       } else {
-        thrust::sort_by_key(handle.get_thrust_policy(),
-                            key_pair_first,
-                            key_pair_first + rx_majors.size(),
-                            rx_key_aggregated_edge_values.begin());
+        if (rx_majors.size() >
+            mem_frugal_threshold) {  // trade-off parallelism to lower peak memory
+          auto second_first =
+            detail::mem_frugal_partition(key_pair_first,
+                                         key_pair_first + rx_majors.size(),
+                                         detail::pair_to_binary_partition_id_t<vertex_t>{},
+                                         int{1},
+                                         handle.get_stream());
+
+          thrust::sort(handle.get_thrust_policy(), key_pair_first, second_first);
+
+          thrust::sort(handle.get_thrust_policy(), second_first, key_pair_first + rx_majors.size());
+        } else {
+          thrust::sort(
+            handle.get_thrust_policy(), key_pair_first, key_pair_first + rx_majors.size());
+        }
+
+        auto num_uniques = thrust::distance(
+          key_pair_first,
+          thrust::unique(
+            handle.get_thrust_policy(), key_pair_first, key_pair_first + rx_majors.size()));
+        tmp_majors.resize(num_uniques, handle.get_stream());
+        tmp_minor_keys.resize(tmp_majors.size(), handle.get_stream());
+        thrust::copy(handle.get_thrust_policy(),
+                     key_pair_first,
+                     key_pair_first + num_uniques,
+                     thrust::make_zip_iterator(tmp_majors.begin(), tmp_minor_keys.begin()));
       }
-      auto num_uniques =
-        thrust::count_if(handle.get_thrust_policy(),
-                         thrust::make_counting_iterator(size_t{0}),
-                         thrust::make_counting_iterator(rx_majors.size()),
-                         detail::is_first_in_run_t<decltype(key_pair_first)>{key_pair_first});
-      tmp_majors.resize(num_uniques, handle.get_stream());
-      tmp_minor_keys.resize(tmp_majors.size(), handle.get_stream());
-      tmp_key_aggregated_edge_values.resize(tmp_majors.size(), handle.get_stream());
-      thrust::reduce_by_key(
-        handle.get_thrust_policy(),
-        key_pair_first,
-        key_pair_first + rx_majors.size(),
-        rx_key_aggregated_edge_values.begin(),
-        thrust::make_zip_iterator(thrust::make_tuple(tmp_majors.begin(), tmp_minor_keys.begin())),
-        tmp_key_aggregated_edge_values.begin());
     }
 
     std::unique_ptr<kv_store_t<vertex_t, kv_pair_value_t, KVStoreViewType::binary_search>>
@@ -756,8 +951,6 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
     auto tmp_e_op_result_buffer =
       allocate_dataframe_buffer<T>(tmp_majors.size(), handle.get_stream());
 
-    auto triplet_first = thrust::make_zip_iterator(thrust::make_tuple(
-      tmp_majors.begin(), tmp_minor_keys.begin(), tmp_key_aggregated_edge_values.begin()));
     auto major_value_map_device_view =
       (GraphViewType::is_multi_gpu && edge_src_value_input.keys())
         ? thrust::make_optional<detail::kv_binary_search_store_device_view_t<
@@ -768,28 +961,56 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
                        detail::kv_cuco_store_find_device_view_t<KVStoreViewType>>
       dst_key_value_map_device_view(
         GraphViewType::is_multi_gpu ? multi_gpu_minor_key_value_map_ptr->view() : kv_store_view);
-    thrust::transform(handle.get_thrust_policy(),
-                      triplet_first,
-                      triplet_first + tmp_majors.size(),
-                      get_dataframe_buffer_begin(tmp_e_op_result_buffer),
-                      detail::call_key_aggregated_e_op_t<
-                        vertex_t,
-                        edge_value_t,
-                        decltype(edge_partition),
-                        std::remove_reference_t<decltype(*major_value_map_device_view)>,
-                        edge_partition_src_input_device_view_t,
-                        decltype(dst_key_value_map_device_view),
-                        KeyAggregatedEdgeOp>{edge_partition,
-                                             major_value_map_device_view,
-                                             edge_partition_src_value_input,
-                                             dst_key_value_map_device_view,
-                                             key_aggregated_e_op});
+    if constexpr (!std::is_same_v<edge_value_t, thrust::nullopt_t>) {
+      auto triplet_first = thrust::make_zip_iterator(
+        tmp_majors.begin(),
+        tmp_minor_keys.begin(),
+        detail::get_optional_dataframe_buffer_begin<edge_value_t>(tmp_key_aggregated_edge_values));
+      thrust::transform(handle.get_thrust_policy(),
+                        triplet_first,
+                        triplet_first + tmp_majors.size(),
+                        get_dataframe_buffer_begin(tmp_e_op_result_buffer),
+                        detail::call_key_aggregated_e_op_t<
+                          vertex_t,
+                          edge_value_t,
+                          T,
+                          decltype(edge_partition),
+                          std::remove_reference_t<decltype(*major_value_map_device_view)>,
+                          edge_partition_src_input_device_view_t,
+                          decltype(dst_key_value_map_device_view),
+                          KeyAggregatedEdgeOp>{edge_partition,
+                                               major_value_map_device_view,
+                                               edge_partition_src_value_input,
+                                               dst_key_value_map_device_view,
+                                               key_aggregated_e_op});
+    } else {
+      auto pair_first = thrust::make_zip_iterator(tmp_majors.begin(), tmp_minor_keys.begin());
+      thrust::transform(handle.get_thrust_policy(),
+                        pair_first,
+                        pair_first + tmp_majors.size(),
+                        get_dataframe_buffer_begin(tmp_e_op_result_buffer),
+                        detail::call_key_aggregated_e_op_t<
+                          vertex_t,
+                          edge_value_t,
+                          T,
+                          decltype(edge_partition),
+                          std::remove_reference_t<decltype(*major_value_map_device_view)>,
+                          edge_partition_src_input_device_view_t,
+                          decltype(dst_key_value_map_device_view),
+                          KeyAggregatedEdgeOp>{edge_partition,
+                                               major_value_map_device_view,
+                                               edge_partition_src_value_input,
+                                               dst_key_value_map_device_view,
+                                               key_aggregated_e_op});
+    }
 
     if constexpr (GraphViewType::is_multi_gpu) { multi_gpu_minor_key_value_map_ptr.reset(); }
     tmp_minor_keys.resize(0, handle.get_stream());
     tmp_minor_keys.shrink_to_fit(handle.get_stream());
-    tmp_key_aggregated_edge_values.resize(0, handle.get_stream());
-    tmp_key_aggregated_edge_values.shrink_to_fit(handle.get_stream());
+    detail::resize_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+      tmp_key_aggregated_edge_values, 0, handle.get_stream());
+    detail::shrink_to_fit_optional_dataframe_buffer<optional_edge_value_buffer_value_type>(
+      tmp_key_aggregated_edge_values, handle.get_stream());
 
     {
       auto num_uniques =
diff --git a/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh b/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh
index b63b014ed05..244586e6d9e 100644
--- a/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh
+++ b/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh
@@ -19,6 +19,7 @@
 #include "prims/detail/nbr_intersection.cuh"
 #include "prims/property_op_utils.cuh"
 
+#include <cugraph/detail/decompress_edge_partition.cuh>
 #include <cugraph/edge_partition_device_view.cuh>
 #include <cugraph/edge_partition_endpoint_property_device_view.cuh>
 #include <cugraph/edge_src_dst_property.hpp>
@@ -130,7 +131,9 @@ std::tuple<rmm::device_uvector<vertex_t>, ValueBuffer> sort_and_reduce_by_vertic
                         vertices.end(),
                         get_dataframe_buffer_begin(value_buffer),
                         reduced_vertices.begin(),
-                        get_dataframe_buffer_begin(reduced_value_buffer));
+                        get_dataframe_buffer_begin(reduced_value_buffer),
+                        thrust::equal_to<vertex_t>{},
+                        property_op<value_t, thrust::plus>{});
 
   vertices.resize(size_t{0}, handle.get_stream());
   resize_dataframe_buffer(value_buffer, size_t{0}, handle.get_stream());
@@ -201,14 +204,14 @@ struct accumulate_vertex_property_t {
  * @param graph_view Non-owning graph object.
  * @param edge_src_value_input Wrapper used to access source input property values (for the edge
  * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view()
- * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view()
- * (if @p e_op does not access source property values). Use update_edge_src_property to fill the
- * wrapper.
+ * (if @p intersection_op needs to access source property values) or
+ * cugraph::edge_src_dummy_property_t::view() (if @p intersection_op does not access source property
+ * values). Use update_edge_src_property to fill the wrapper.
  * @param edge_dst_value_input Wrapper used to access destination input property values (for the
  * edge destinations assigned to this process in multi-GPU). Use either
- * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or
- * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property
- * values). Use update_edge_dst_property to fill the wrapper.
+ * cugraph::edge_dst_property_t::view() (if @p intersection_op needs to access destination property
+ * values) or cugraph::edge_dst_dummy_property_t::view() (if @p intersection_op does not access
+ * destination property values). Use update_edge_dst_property to fill the wrapper.
  * @param intersection_op quinary operator takes edge source, edge destination, property values for
  * the source, property values for the destination, and a list of vertices in the intersection of
  * edge source & destination vertices' destination neighbors and returns a thrust::tuple of three
@@ -260,8 +263,6 @@ void transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v(
       typename EdgeDstValueInputWrapper::value_iterator,
       typename EdgeDstValueInputWrapper::value_type>>;
 
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   if (do_expensive_check) {
     // currently, nothing to do.
   }
@@ -272,6 +273,7 @@ void transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v(
                init);
 
   auto edge_mask_view = graph_view.edge_mask_view();
+
   for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
     auto edge_partition =
       edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
@@ -484,7 +486,9 @@ void transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v(
                             merged_vertices.end(),
                             get_dataframe_buffer_begin(merged_value_buffer),
                             reduced_vertices.begin(),
-                            get_dataframe_buffer_begin(reduced_value_buffer));
+                            get_dataframe_buffer_begin(reduced_value_buffer),
+                            thrust::equal_to<vertex_t>{},
+                            property_op<T, thrust::plus>{});
       merged_vertices.resize(size_t{0}, handle.get_stream());
       merged_vertices.shrink_to_fit(handle.get_stream());
       resize_dataframe_buffer(merged_value_buffer, size_t{0}, handle.get_stream());
diff --git a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh
index eee0ed03d1c..00876012906 100644
--- a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh
+++ b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh
@@ -95,6 +95,7 @@ template <bool edge_partition_src_key,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
           typename EdgePartitionSrcDstKeyInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
           typename EdgeOp,
           typename ValueIterator>
 __global__ static void transform_reduce_by_src_dst_key_hypersparse(
@@ -105,6 +106,9 @@ __global__ static void transform_reduce_by_src_dst_key_hypersparse(
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
   EdgePartitionSrcDstKeyInputWrapper edge_partition_src_dst_key_input,
+  EdgePartitionEdgeMaskWrapper edge_partition_e_mask,
+  thrust::optional<raft::device_span<typename GraphViewType::edge_type const>>
+    edge_offsets_with_mask,
   EdgeOp e_op,
   typename GraphViewType::vertex_type* keys,
   ValueIterator value_iter)
@@ -129,19 +133,42 @@ __global__ static void transform_reduce_by_src_dst_key_hypersparse(
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) =
       edge_partition.local_edges(static_cast<vertex_t>(major_idx));
-    auto local_offset = edge_partition.local_offset(major_idx);
-    for (edge_t i = 0; i < local_degree; ++i) {
-      update_buffer_element<edge_partition_src_key, GraphViewType>(edge_partition,
-                                                                   major,
-                                                                   indices[i],
-                                                                   edge_offset + i,
-                                                                   edge_partition_src_value_input,
-                                                                   edge_partition_dst_value_input,
-                                                                   edge_partition_e_value_input,
-                                                                   edge_partition_src_dst_key_input,
-                                                                   e_op,
-                                                                   keys + local_offset + i,
-                                                                   value_iter + local_offset + i);
+    if (edge_partition_e_mask) {
+      auto major_offset          = edge_partition.major_offset_from_major_nocheck(major);
+      auto edge_offset_with_mask = (*edge_offsets_with_mask)[major_offset];
+      edge_t counter{0};
+      for (edge_t i = 0; i < local_degree; ++i) {
+        if ((*edge_partition_e_mask).get(edge_offset + i)) {
+          update_buffer_element<edge_partition_src_key, GraphViewType>(
+            edge_partition,
+            major,
+            indices[i],
+            edge_offset + i,
+            edge_partition_src_value_input,
+            edge_partition_dst_value_input,
+            edge_partition_e_value_input,
+            edge_partition_src_dst_key_input,
+            e_op,
+            keys + edge_offset_with_mask + counter,
+            value_iter + edge_offset_with_mask + counter);
+          ++counter;
+        }
+      }
+    } else {
+      for (edge_t i = 0; i < local_degree; ++i) {
+        update_buffer_element<edge_partition_src_key, GraphViewType>(
+          edge_partition,
+          major,
+          indices[i],
+          edge_offset + i,
+          edge_partition_src_value_input,
+          edge_partition_dst_value_input,
+          edge_partition_e_value_input,
+          edge_partition_src_dst_key_input,
+          e_op,
+          keys + edge_offset + i,
+          value_iter + edge_offset + i);
+      }
     }
 
     idx += gridDim.x * blockDim.x;
@@ -154,6 +181,7 @@ template <bool edge_partition_src_key,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
           typename EdgePartitionSrcDstKeyInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
           typename EdgeOp,
           typename ValueIterator>
 __global__ static void transform_reduce_by_src_dst_key_low_degree(
@@ -166,6 +194,9 @@ __global__ static void transform_reduce_by_src_dst_key_low_degree(
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
   EdgePartitionSrcDstKeyInputWrapper edge_partition_src_dst_key_input,
+  EdgePartitionEdgeMaskWrapper edge_partition_e_mask,
+  thrust::optional<raft::device_span<typename GraphViewType::edge_type const>>
+    edge_offsets_with_mask,
   EdgeOp e_op,
   typename GraphViewType::vertex_type* keys,
   ValueIterator value_iter)
@@ -187,19 +218,41 @@ __global__ static void transform_reduce_by_src_dst_key_low_degree(
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) =
       edge_partition.local_edges(static_cast<vertex_t>(major_offset));
-    auto local_offset = edge_partition.local_offset(major_offset);
-    for (edge_t i = 0; i < local_degree; ++i) {
-      update_buffer_element<edge_partition_src_key, GraphViewType>(edge_partition,
-                                                                   major,
-                                                                   indices[i],
-                                                                   edge_offset + i,
-                                                                   edge_partition_src_value_input,
-                                                                   edge_partition_dst_value_input,
-                                                                   edge_partition_e_value_input,
-                                                                   edge_partition_src_dst_key_input,
-                                                                   e_op,
-                                                                   keys + local_offset + i,
-                                                                   value_iter + local_offset + i);
+    if (edge_partition_e_mask) {
+      auto edge_offset_with_mask = (*edge_offsets_with_mask)[major_offset];
+      edge_t counter{0};
+      for (edge_t i = 0; i < local_degree; ++i) {
+        if ((*edge_partition_e_mask).get(edge_offset + i)) {
+          update_buffer_element<edge_partition_src_key, GraphViewType>(
+            edge_partition,
+            major,
+            indices[i],
+            edge_offset + i,
+            edge_partition_src_value_input,
+            edge_partition_dst_value_input,
+            edge_partition_e_value_input,
+            edge_partition_src_dst_key_input,
+            e_op,
+            keys + edge_offset_with_mask + counter,
+            value_iter + edge_offset_with_mask + counter);
+          ++counter;
+        }
+      }
+    } else {
+      for (edge_t i = 0; i < local_degree; ++i) {
+        update_buffer_element<edge_partition_src_key, GraphViewType>(
+          edge_partition,
+          major,
+          indices[i],
+          edge_offset + i,
+          edge_partition_src_value_input,
+          edge_partition_dst_value_input,
+          edge_partition_e_value_input,
+          edge_partition_src_dst_key_input,
+          e_op,
+          keys + edge_offset + i,
+          value_iter + edge_offset + i);
+      }
     }
 
     idx += gridDim.x * blockDim.x;
@@ -212,6 +265,7 @@ template <bool edge_partition_src_key,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
           typename EdgePartitionSrcDstKeyInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
           typename EdgeOp,
           typename ValueIterator>
 __global__ static void transform_reduce_by_src_dst_key_mid_degree(
@@ -224,6 +278,9 @@ __global__ static void transform_reduce_by_src_dst_key_mid_degree(
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
   EdgePartitionSrcDstKeyInputWrapper edge_partition_src_dst_key_input,
+  EdgePartitionEdgeMaskWrapper edge_partition_e_mask,
+  thrust::optional<raft::device_span<typename GraphViewType::edge_type const>>
+    edge_offsets_with_mask,
   EdgeOp e_op,
   typename GraphViewType::vertex_type* keys,
   ValueIterator value_iter)
@@ -238,6 +295,9 @@ __global__ static void transform_reduce_by_src_dst_key_mid_degree(
     static_cast<size_t>(major_range_first - edge_partition.major_range_first());
   size_t idx = static_cast<size_t>(tid / raft::warp_size());
 
+  using WarpScan = cub::WarpScan<edge_t, raft::warp_size()>;
+  __shared__ typename WarpScan::TempStorage temp_storage;
+
   while (idx < static_cast<size_t>(major_range_last - major_range_first)) {
     auto major_offset = major_start_offset + idx;
     auto major =
@@ -247,19 +307,49 @@ __global__ static void transform_reduce_by_src_dst_key_mid_degree(
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) =
       edge_partition.local_edges(static_cast<vertex_t>(major_offset));
-    auto local_offset = edge_partition.local_offset(major_offset);
-    for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
-      update_buffer_element<edge_partition_src_key, GraphViewType>(edge_partition,
-                                                                   major,
-                                                                   indices[i],
-                                                                   edge_offset + i,
-                                                                   edge_partition_src_value_input,
-                                                                   edge_partition_dst_value_input,
-                                                                   edge_partition_e_value_input,
-                                                                   edge_partition_src_dst_key_input,
-                                                                   e_op,
-                                                                   keys + local_offset + i,
-                                                                   value_iter + local_offset + i);
+    if (edge_partition_e_mask) {
+      // FIXME: it might be faster to update in warp-sync way
+      auto edge_offset_with_mask = (*edge_offsets_with_mask)[major_offset];
+      edge_t counter{0};
+      for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
+        if ((*edge_partition_e_mask).get(edge_offset + i)) { ++counter; }
+      }
+      edge_t offset_within_warp{};
+      WarpScan(temp_storage).ExclusiveSum(counter, offset_within_warp);
+      edge_offset_with_mask += offset_within_warp;
+      counter = 0;
+      for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
+        if ((*edge_partition_e_mask).get(edge_offset + i)) {
+          update_buffer_element<edge_partition_src_key, GraphViewType>(
+            edge_partition,
+            major,
+            indices[i],
+            edge_offset + i,
+            edge_partition_src_value_input,
+            edge_partition_dst_value_input,
+            edge_partition_e_value_input,
+            edge_partition_src_dst_key_input,
+            e_op,
+            keys + edge_offset_with_mask + counter,
+            value_iter + edge_offset_with_mask + counter);
+          ++counter;
+        }
+      }
+    } else {
+      for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
+        update_buffer_element<edge_partition_src_key, GraphViewType>(
+          edge_partition,
+          major,
+          indices[i],
+          edge_offset + i,
+          edge_partition_src_value_input,
+          edge_partition_dst_value_input,
+          edge_partition_e_value_input,
+          edge_partition_src_dst_key_input,
+          e_op,
+          keys + edge_offset + i,
+          value_iter + edge_offset + i);
+      }
     }
 
     idx += gridDim.x * (blockDim.x / raft::warp_size());
@@ -272,6 +362,7 @@ template <bool edge_partition_src_key,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
           typename EdgePartitionSrcDstKeyInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
           typename EdgeOp,
           typename ValueIterator>
 __global__ static void transform_reduce_by_src_dst_key_high_degree(
@@ -284,6 +375,9 @@ __global__ static void transform_reduce_by_src_dst_key_high_degree(
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
   EdgePartitionSrcDstKeyInputWrapper edge_partition_src_dst_key_input,
+  EdgePartitionEdgeMaskWrapper edge_partition_e_mask,
+  thrust::optional<raft::device_span<typename GraphViewType::edge_type const>>
+    edge_offsets_with_mask,
   EdgeOp e_op,
   typename GraphViewType::vertex_type* keys,
   ValueIterator value_iter)
@@ -295,6 +389,9 @@ __global__ static void transform_reduce_by_src_dst_key_high_degree(
     static_cast<size_t>(major_range_first - edge_partition.major_range_first());
   auto idx = static_cast<size_t>(blockIdx.x);
 
+  using BlockScan = cub::BlockScan<edge_t, transform_reduce_e_by_src_dst_key_kernel_block_size>;
+  __shared__ typename BlockScan::TempStorage temp_storage;
+
   while (idx < static_cast<size_t>(major_range_last - major_range_first)) {
     auto major_offset = major_start_offset + idx;
     auto major =
@@ -304,19 +401,49 @@ __global__ static void transform_reduce_by_src_dst_key_high_degree(
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) =
       edge_partition.local_edges(static_cast<vertex_t>(major_offset));
-    auto local_offset = edge_partition.local_offset(major_offset);
-    for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
-      update_buffer_element<edge_partition_src_key, GraphViewType>(edge_partition,
-                                                                   major,
-                                                                   indices[i],
-                                                                   edge_offset + i,
-                                                                   edge_partition_src_value_input,
-                                                                   edge_partition_dst_value_input,
-                                                                   edge_partition_e_value_input,
-                                                                   edge_partition_src_dst_key_input,
-                                                                   e_op,
-                                                                   keys + local_offset + i,
-                                                                   value_iter + local_offset + i);
+    if (edge_partition_e_mask) {
+      // FIXME: it might be faster to update in block-sync way
+      auto edge_offset_with_mask = (*edge_offsets_with_mask)[major_offset];
+      edge_t counter{0};
+      for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
+        if ((*edge_partition_e_mask).get(edge_offset + i)) { ++counter; }
+      }
+      edge_t offset_within_block{};
+      BlockScan(temp_storage).ExclusiveSum(counter, offset_within_block);
+      edge_offset_with_mask += offset_within_block;
+      counter = 0;
+      for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
+        if ((*edge_partition_e_mask).get(edge_offset + i)) {
+          update_buffer_element<edge_partition_src_key, GraphViewType>(
+            edge_partition,
+            major,
+            indices[i],
+            edge_offset + i,
+            edge_partition_src_value_input,
+            edge_partition_dst_value_input,
+            edge_partition_e_value_input,
+            edge_partition_src_dst_key_input,
+            e_op,
+            keys + edge_offset_with_mask + counter,
+            value_iter + edge_offset_with_mask + counter);
+          ++counter;
+        }
+      }
+    } else {
+      for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
+        update_buffer_element<edge_partition_src_key, GraphViewType>(
+          edge_partition,
+          major,
+          indices[i],
+          edge_offset + i,
+          edge_partition_src_value_input,
+          edge_partition_dst_value_input,
+          edge_partition_e_value_input,
+          edge_partition_src_dst_key_input,
+          e_op,
+          keys + edge_offset + i,
+          value_iter + edge_offset + i);
+      }
     }
 
     idx += gridDim.x;
@@ -410,19 +537,41 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle,
       typename EdgeSrcDstKeyInputWrapper::value_iterator,
       typename EdgeSrcDstKeyInputWrapper::value_type>;
 
+  auto edge_mask_view = graph_view.edge_mask_view();
+
   rmm::device_uvector<vertex_t> keys(0, handle.get_stream());
   auto value_buffer = allocate_dataframe_buffer<T>(0, handle.get_stream());
   for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
     auto edge_partition =
       edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
         graph_view.local_edge_partition_view(i));
-
-    auto num_edges = edge_partition.number_of_edges();
-
-    rmm::device_uvector<vertex_t> tmp_keys(num_edges, handle.get_stream());
+    auto edge_partition_e_mask =
+      edge_mask_view
+        ? thrust::make_optional<
+            detail::edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool>>(
+            *edge_mask_view, i)
+        : thrust::nullopt;
+
+    rmm::device_uvector<vertex_t> tmp_keys(0, handle.get_stream());
+    std::optional<rmm::device_uvector<edge_t>> edge_offsets_with_mask{std::nullopt};
+    if (edge_partition_e_mask) {
+      auto local_degrees = edge_partition.compute_local_degrees_with_mask(
+        (*edge_partition_e_mask).value_first(), handle.get_stream());
+      edge_offsets_with_mask =
+        rmm::device_uvector<edge_t>(edge_partition.major_range_size() + 1, handle.get_stream());
+      (*edge_offsets_with_mask).set_element_to_zero_async(0, handle.get_stream());
+      thrust::inclusive_scan(handle.get_thrust_policy(),
+                             local_degrees.begin(),
+                             local_degrees.end(),
+                             (*edge_offsets_with_mask).begin() + 1);
+      tmp_keys.resize((*edge_offsets_with_mask).back_element(handle.get_stream()),
+                      handle.get_stream());
+    } else {
+      tmp_keys.resize(edge_partition.number_of_edges(), handle.get_stream());
+    }
     auto tmp_value_buffer = allocate_dataframe_buffer<T>(tmp_keys.size(), handle.get_stream());
 
-    if (num_edges > 0) {
+    if (tmp_keys.size() > 0) {
       edge_partition_src_input_device_view_t edge_partition_src_value_input{};
       edge_partition_dst_input_device_view_t edge_partition_dst_value_input{};
       if constexpr (GraphViewType::is_storage_transposed) {
@@ -467,6 +616,11 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle,
               edge_partition_dst_value_input,
               edge_partition_e_value_input,
               edge_partition_src_dst_key_input,
+              edge_partition_e_mask,
+              edge_offsets_with_mask
+                ? thrust::make_optional<raft::device_span<edge_t const>>(
+                    (*edge_offsets_with_mask).data(), (*edge_offsets_with_mask).size())
+                : thrust::nullopt,
               e_op,
               tmp_keys.data(),
               get_dataframe_buffer_begin(tmp_value_buffer));
@@ -485,6 +639,11 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle,
               edge_partition_dst_value_input,
               edge_partition_e_value_input,
               edge_partition_src_dst_key_input,
+              edge_partition_e_mask,
+              edge_offsets_with_mask
+                ? thrust::make_optional<raft::device_span<edge_t const>>(
+                    (*edge_offsets_with_mask).data(), (*edge_offsets_with_mask).size())
+                : thrust::nullopt,
               e_op,
               tmp_keys.data(),
               get_dataframe_buffer_begin(tmp_value_buffer));
@@ -503,6 +662,11 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle,
               edge_partition_dst_value_input,
               edge_partition_e_value_input,
               edge_partition_src_dst_key_input,
+              edge_partition_e_mask,
+              edge_offsets_with_mask
+                ? thrust::make_optional<raft::device_span<edge_t const>>(
+                    (*edge_offsets_with_mask).data(), (*edge_offsets_with_mask).size())
+                : thrust::nullopt,
               e_op,
               tmp_keys.data(),
               get_dataframe_buffer_begin(tmp_value_buffer));
@@ -520,6 +684,11 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle,
               edge_partition_dst_value_input,
               edge_partition_e_value_input,
               edge_partition_src_dst_key_input,
+              edge_partition_e_mask,
+              edge_offsets_with_mask
+                ? thrust::make_optional<raft::device_span<edge_t const>>(
+                    (*edge_offsets_with_mask).data(), (*edge_offsets_with_mask).size())
+                : thrust::nullopt,
               e_op,
               tmp_keys.data(),
               get_dataframe_buffer_begin(tmp_value_buffer));
@@ -539,6 +708,11 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle,
             edge_partition_dst_value_input,
             edge_partition_e_value_input,
             edge_partition_src_dst_key_input,
+            edge_partition_e_mask,
+            edge_offsets_with_mask
+              ? thrust::make_optional<raft::device_span<edge_t const>>(
+                  (*edge_offsets_with_mask).data(), (*edge_offsets_with_mask).size())
+              : thrust::nullopt,
             e_op,
             tmp_keys.data(),
             get_dataframe_buffer_begin(tmp_value_buffer));
@@ -682,8 +856,6 @@ auto transform_reduce_e_by_src_key(raft::handle_t const& handle,
                              typename GraphViewType::vertex_type>::value);
   static_assert(ReduceOp::pure_function, "ReduceOp should be a pure function.");
 
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   if (do_expensive_check) {
     // currently, nothing to do
   }
@@ -772,8 +944,6 @@ auto transform_reduce_e_by_dst_key(raft::handle_t const& handle,
                              typename GraphViewType::vertex_type>::value);
   static_assert(ReduceOp::pure_function, "ReduceOp should be a pure function.");
 
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   if (do_expensive_check) {
     // currently, nothing to do
   }
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index c84711e1a69..4d37c93326d 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -396,12 +396,10 @@ ConfigureTest(RANDOM_WALKS_TEST sampling/sg_random_walks_test.cpp)
 ###################################################################################################
 # - NBR SAMPLING tests ----------------------------------------------------------------------------
 ConfigureTest(UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/sg_uniform_neighbor_sampling.cu)
-target_link_libraries(UNIFORM_NEIGHBOR_SAMPLING_TEST PRIVATE cuco::cuco)
 
 ###################################################################################################
 # - SAMPLING_POST_PROCESSING tests ----------------------------------------------------------------
 ConfigureTest(SAMPLING_POST_PROCESSING_TEST sampling/sampling_post_processing_test.cu)
-target_link_libraries(SAMPLING_POST_PROCESSING_TEST PRIVATE cuco::cuco)
 
 ###################################################################################################
 # - Renumber tests --------------------------------------------------------------------------------
@@ -583,78 +581,79 @@ if(BUILD_CUGRAPH_MG_TESTS)
     ###############################################################################################
     # - MG PRIMS COUNT_IF_V tests -----------------------------------------------------------------
     ConfigureTestMG(MG_COUNT_IF_V_TEST prims/mg_count_if_v.cu)
-    target_link_libraries(MG_COUNT_IF_V_TEST PRIVATE cuco::cuco)
 
     ###############################################################################################
     # - MG PRIMS TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_DST tests ------------------------------
     ConfigureTestMG(MG_TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_DST_TEST
                     prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu)
-    target_link_libraries(MG_TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_DST_TEST PRIVATE cuco::cuco)
 
     ###############################################################################################
     # - MG PRIMS REDUCE_V tests -------------------------------------------------------------------
     ConfigureTestMG(MG_REDUCE_V_TEST prims/mg_reduce_v.cu)
-    target_link_libraries(MG_REDUCE_V_TEST PRIVATE cuco::cuco)
 
     ###############################################################################################
     # - MG PRIMS TRANSFORM_REDUCE_V tests ---------------------------------------------------------
     ConfigureTestMG(MG_TRANSFORM_REDUCE_V_TEST prims/mg_transform_reduce_v.cu)
-    target_link_libraries(MG_TRANSFORM_REDUCE_V_TEST PRIVATE cuco::cuco)
 
     ###############################################################################################
     # - MG PRIMS TRANSFORM_REDUCE_E tests ---------------------------------------------------------
     ConfigureTestMG(MG_TRANSFORM_REDUCE_E_TEST prims/mg_transform_reduce_e.cu)
-    target_link_libraries(MG_TRANSFORM_REDUCE_E_TEST PRIVATE cuco::cuco)
+
+    ###############################################################################################
+    # - MG PRIMS TRANSFORM_REDUCE_E _BY_SRC_DST_KEY tests -----------------------------------------
+    ConfigureTestMG(MG_TRANSFORM_REDUCE_E_BY_SRC_DST_KEY_TEST
+                    prims/mg_transform_reduce_e_by_src_dst_key.cu)
 
     ###############################################################################################
     # - MG PRIMS TRANSFORM_E tests ----------------------------------------------------------------
     ConfigureTestMG(MG_TRANSFORM_E_TEST prims/mg_transform_e.cu)
-    target_link_libraries(MG_TRANSFORM_E_TEST PRIVATE cuco::cuco)
 
     ###############################################################################################
     # - MG PRIMS COUNT_IF_E tests -----------------------------------------------------------------
     ConfigureTestMG(MG_COUNT_IF_E_TEST prims/mg_count_if_e.cu)
-    target_link_libraries(MG_COUNT_IF_E_TEST PRIVATE cuco::cuco)
 
     ###############################################################################################
     # - MG PRIMS PER_V_TRANSFORM_REDUCE_INCOMING_OUTGOING_E tests ---------------------------------
     ConfigureTestMG(MG_PER_V_TRANSFORM_REDUCE_INCOMING_OUTGOING_E_TEST
       prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu)
-    target_link_libraries(MG_PER_V_TRANSFORM_REDUCE_INCOMING_OUTGOING_E_TEST PRIVATE cuco::cuco)
+
+    ###############################################################################################
+    # - MG PRIMS PER_V_TRANSFORM_REDUCE_DST_KEY_AGGREGATED_OUTGOING_E tests -----------------------
+    ConfigureTestMG(MG_PER_V_TRANSFORM_REDUCE_DST_KEY_AGGREGATED_OUTGOING_E_TEST
+      prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu)
 
     ###############################################################################################
     # - MG PRIMS EXTRACT_TRANSFORM_E tests --------------------------------------------------------
     ConfigureTestMG(MG_EXTRACT_TRANSFORM_E_TEST prims/mg_extract_transform_e.cu)
-    target_link_libraries(MG_EXTRACT_TRANSFORM_E_TEST PRIVATE cuco::cuco)
 
     ###############################################################################################
     # - MG PRIMS EXTRACT_TRANSFORM_V_FRONTIER_OUTGOING_E tests ------------------------------------
     ConfigureTestMG(MG_EXTRACT_TRANSFORM_V_FRONTIER_OUTGOING_E_TEST
                     prims/mg_extract_transform_v_frontier_outgoing_e.cu)
-    target_link_libraries(MG_EXTRACT_TRANSFORM_V_FRONTIER_OUTGOING_E_TEST PRIVATE cuco::cuco)
 
     ###############################################################################################
     # - MG PRIMS PER_V_RANDOM_SELECT_TRANSFORM_OUTGOING_E tests -----------------------------------
     ConfigureTestMG(MG_PER_V_RANDOM_SELECT_TRANSFORM_OUTGOING_E_TEST
                     prims/mg_per_v_random_select_transform_outgoing_e.cu)
-    target_link_libraries(MG_PER_V_RANDOM_SELECT_TRANSFORM_OUTGOING_E_TEST PRIVATE cuco::cuco)
 
     ###############################################################################################
     # - MG PRIMS PER_V_PAIR_TRANSFORM_DST_NBR_INTERSECTION tests ----------------------------------
     ConfigureTestMG(MG_PER_V_PAIR_TRANSFORM_DST_NBR_INTERSECTION_TEST
                     prims/mg_per_v_pair_transform_dst_nbr_intersection.cu)
-    target_link_libraries(MG_PER_V_PAIR_TRANSFORM_DST_NBR_INTERSECTION_TEST PRIVATE cuco::cuco)
+
+    ###############################################################################################
+    # - MG PRIMS TRANSFORM_REDUCE_DST_NBR_INTERSECTION OF_E_ENDPOINTS_BY_V tests ------------------
+    ConfigureTestMG(MG_TRANSFORM_REDUCE_DST_NBR_INTERSECTION_BY_E_ENDPOINTS_BY_V_TEST
+                    prims/mg_transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cu)
 
     ###############################################################################################
     # - MG PRIMS PER_V_PAIR_TRANSFORM_DST_NBR_WEIGHTED_INTERSECTION tests -------------------------
     ConfigureTestMG(MG_PER_V_PAIR_TRANSFORM_DST_NBR_WEIGHTED_INTERSECTION_TEST
-    prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu)
-    target_link_libraries(MG_PER_V_PAIR_TRANSFORM_DST_NBR_WEIGHTED_INTERSECTION_TEST PRIVATE cuco::cuco)
+                    prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu)
 
     ###############################################################################################
     # - MG NBR SAMPLING tests ---------------------------------------------------------------------
     ConfigureTestMG(MG_UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/mg_uniform_neighbor_sampling.cu)
-    target_link_libraries(MG_UNIFORM_NEIGHBOR_SAMPLING_TEST PRIVATE cuco::cuco)
 
     ###############################################################################################
     # - MG RANDOM_WALKS tests ---------------------------------------------------------------------
diff --git a/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu b/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu
new file mode 100644
index 00000000000..af56807746a
--- /dev/null
+++ b/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu
@@ -0,0 +1,599 @@
+/*
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh"
+#include "prims/reduce_op.cuh"
+#include "prims/update_edge_src_dst_property.cuh"
+#include "property_generator.cuh"
+#include "result_compare.cuh"
+#include "utilities/base_fixture.hpp"
+#include "utilities/device_comm_wrapper.hpp"
+#include "utilities/mg_utilities.hpp"
+#include "utilities/test_graphs.hpp"
+#include "utilities/test_utilities.hpp"
+#include "utilities/thrust_wrapper.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/edge_partition_view.hpp>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/dataframe_buffer.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+#include <cugraph/utilities/thrust_tuple_utils.hpp>
+
+#include <raft/comms/mpi_comms.hpp>
+#include <raft/core/comms.hpp>
+#include <raft/core/handle.hpp>
+
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/count.h>
+#include <thrust/distance.h>
+#include <thrust/equal.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/optional.h>
+#include <thrust/transform.h>
+#include <thrust/tuple.h>
+
+#include <cuco/hash_functions.cuh>
+
+#include <gtest/gtest.h>
+
+#include <random>
+#include <sstream>
+
+template <typename vertex_t, typename edge_value_t, typename result_t>
+struct key_aggregated_e_op_t {
+  __device__ result_t operator()(vertex_t src,
+                                 vertex_t key,
+                                 result_t src_property,
+                                 result_t key_property,
+                                 edge_value_t edge_property) const
+  {
+    if (src_property < key_property) {
+      return src_property;
+    } else {
+      return key_property;
+    }
+  }
+};
+
+struct Prims_Usecase {
+  bool test_weighted{false};
+  bool edge_masking{false};
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE
+  : public ::testing::TestWithParam<std::tuple<Prims_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Compare the results of per_v_transform_reduce_incoming|outgoing_e primitive
+  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+  void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase)
+  {
+    HighResTimer hr_timer{};
+
+    auto const comm_rank = handle_->get_comms().get_rank();
+
+    // 1. create MG graph
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG Construct graph");
+    }
+
+    auto [mg_graph, mg_edge_weights, mg_renumber_map] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        *handle_, input_usecase, prims_usecase.test_weighted, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+    auto mg_edge_weight_view =
+      mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt;
+
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
+    if (prims_usecase.edge_masking) {
+      edge_mask =
+        cugraph::test::generate<vertex_t, bool>::edge_property(*handle_, mg_graph_view, 2);
+      mg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
+    // 2. run MG per_v_transform_reduce_dst_key_aggregated_outgoing_e
+
+    const int vertex_prop_hash_bin_count = 5;
+    const int key_hash_bin_count         = 10;
+    const int key_prop_hash_bin_count    = 20;
+    const int initial_value              = 4;
+
+    auto property_initial_value =
+      cugraph::test::generate<vertex_t, result_t>::initial_value(initial_value);
+
+    auto mg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+      *handle_, *mg_renumber_map, vertex_prop_hash_bin_count);
+    auto mg_src_prop = cugraph::test::generate<vertex_t, result_t>::src_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
+
+    auto mg_vertex_key = cugraph::test::generate<vertex_t, vertex_t>::vertex_property(
+      *handle_, *mg_renumber_map, key_hash_bin_count);
+    auto mg_dst_key = cugraph::test::generate<vertex_t, vertex_t>::dst_property(
+      *handle_, mg_graph_view, mg_vertex_key);
+
+    rmm::device_uvector<vertex_t> mg_kv_store_keys(comm_rank == 0 ? key_hash_bin_count : int{0},
+                                                   handle_->get_stream());
+    thrust::sequence(
+      handle_->get_thrust_policy(), mg_kv_store_keys.begin(), mg_kv_store_keys.end(), vertex_t{0});
+    mg_kv_store_keys = cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning(
+      *handle_, std::move(mg_kv_store_keys));
+    auto mg_kv_store_values = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+      *handle_, mg_kv_store_keys, key_prop_hash_bin_count);
+
+    static_assert(std::is_same_v<result_t, int> ||
+                  std::is_same_v<result_t, thrust::tuple<int, float>>);
+    result_t invalid_value{};
+    if constexpr (std::is_same_v<result_t, int>) {
+      invalid_value = std::numeric_limits<int>::max();
+    } else {
+      invalid_value =
+        thrust::make_tuple(std::numeric_limits<int>::max(), std::numeric_limits<float>::max());
+    }
+    cugraph::kv_store_t<vertex_t, result_t, false> mg_kv_store(
+      mg_kv_store_keys.begin(),
+      mg_kv_store_keys.end(),
+      cugraph::get_dataframe_buffer_begin(mg_kv_store_values),
+      cugraph::invalid_vertex_id<vertex_t>::value,
+      invalid_value,
+      handle_->get_stream());
+
+    enum class reduction_type_t { PLUS, ELEMWISE_MIN, ELEMWISE_MAX };
+    std::array<reduction_type_t, 3> reduction_types = {
+      reduction_type_t::PLUS, reduction_type_t::ELEMWISE_MIN, reduction_type_t::ELEMWISE_MAX};
+
+    std::vector<decltype(cugraph::allocate_dataframe_buffer<result_t>(0, rmm::cuda_stream_view{}))>
+      mg_results{};
+    mg_results.reserve(reduction_types.size());
+
+    for (size_t i = 0; i < reduction_types.size(); ++i) {
+      mg_results.push_back(cugraph::allocate_dataframe_buffer<result_t>(
+        mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream()));
+
+      if (cugraph::test::g_perf) {
+        RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+        handle_->get_comms().barrier();
+        hr_timer.start("MG per_v_transform_reduce_outgoing_e");
+      }
+
+      switch (reduction_types[i]) {
+        case reduction_type_t::PLUS:
+          if (mg_edge_weight_view) {
+            per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+              *handle_,
+              mg_graph_view,
+              mg_src_prop.view(),
+              *mg_edge_weight_view,
+              mg_dst_key.view(),
+              mg_kv_store.view(),
+              key_aggregated_e_op_t<vertex_t, weight_t, result_t>{},
+              property_initial_value,
+              cugraph::reduce_op::plus<result_t>{},
+              cugraph::get_dataframe_buffer_begin(mg_results[i]));
+          } else {
+            per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+              *handle_,
+              mg_graph_view,
+              mg_src_prop.view(),
+              cugraph::edge_dummy_property_t{}.view(),
+              mg_dst_key.view(),
+              mg_kv_store.view(),
+              key_aggregated_e_op_t<vertex_t, thrust::nullopt_t, result_t>{},
+              property_initial_value,
+              cugraph::reduce_op::plus<result_t>{},
+              cugraph::get_dataframe_buffer_begin(mg_results[i]));
+          }
+          break;
+        case reduction_type_t::ELEMWISE_MIN:
+          if (mg_edge_weight_view) {
+            per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+              *handle_,
+              mg_graph_view,
+              mg_src_prop.view(),
+              *mg_edge_weight_view,
+              mg_dst_key.view(),
+              mg_kv_store.view(),
+              key_aggregated_e_op_t<vertex_t, weight_t, result_t>{},
+              property_initial_value,
+              cugraph::reduce_op::elementwise_minimum<result_t>{},
+              cugraph::get_dataframe_buffer_begin(mg_results[i]));
+          } else {
+            per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+              *handle_,
+              mg_graph_view,
+              mg_src_prop.view(),
+              cugraph::edge_dummy_property_t{}.view(),
+              mg_dst_key.view(),
+              mg_kv_store.view(),
+              key_aggregated_e_op_t<vertex_t, thrust::nullopt_t, result_t>{},
+              property_initial_value,
+              cugraph::reduce_op::elementwise_minimum<result_t>{},
+              cugraph::get_dataframe_buffer_begin(mg_results[i]));
+          }
+          break;
+        case reduction_type_t::ELEMWISE_MAX:
+          if (mg_edge_weight_view) {
+            per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+              *handle_,
+              mg_graph_view,
+              mg_src_prop.view(),
+              *mg_edge_weight_view,
+              mg_dst_key.view(),
+              mg_kv_store.view(),
+              key_aggregated_e_op_t<vertex_t, weight_t, result_t>{},
+              property_initial_value,
+              cugraph::reduce_op::elementwise_maximum<result_t>{},
+              cugraph::get_dataframe_buffer_begin(mg_results[i]));
+          } else {
+            per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+              *handle_,
+              mg_graph_view,
+              mg_src_prop.view(),
+              cugraph::edge_dummy_property_t{}.view(),
+              mg_dst_key.view(),
+              mg_kv_store.view(),
+              key_aggregated_e_op_t<vertex_t, thrust::nullopt_t, result_t>{},
+              property_initial_value,
+              cugraph::reduce_op::elementwise_maximum<result_t>{},
+              cugraph::get_dataframe_buffer_begin(mg_results[i]));
+          }
+          break;
+        default: FAIL() << "should not be reached.";
+      }
+
+      if (cugraph::test::g_perf) {
+        RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+        handle_->get_comms().barrier();
+        hr_timer.stop();
+        hr_timer.display_and_clear(std::cout);
+      }
+    }
+
+    // 3. compare SG & MG results
+
+    if (prims_usecase.check_correctness) {
+      cugraph::graph_t<vertex_t, edge_t, false, false> sg_graph(*handle_);
+      std::optional<
+        cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, false, false>, weight_t>>
+        sg_edge_weights{std::nullopt};
+      std::tie(sg_graph, sg_edge_weights, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
+        *handle_,
+        mg_graph_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                              (*mg_renumber_map).size()),
+        false);
+
+      for (size_t i = 0; i < reduction_types.size(); ++i) {
+        auto mg_aggregate_results =
+          cugraph::allocate_dataframe_buffer<result_t>(0, handle_->get_stream());
+
+        static_assert(cugraph::is_arithmetic_or_thrust_tuple_of_arithmetic<result_t>::value);
+        if constexpr (std::is_arithmetic_v<result_t>) {
+          std::tie(std::ignore, mg_aggregate_results) =
+            cugraph::test::mg_vertex_property_values_to_sg_vertex_property_values(
+              *handle_,
+              std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                    (*mg_renumber_map).size()),
+              mg_graph_view.local_vertex_partition_range(),
+              std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+              std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+              raft::device_span<result_t const>(mg_results[i].data(), mg_results[i].size()));
+        } else {
+          std::tie(std::ignore, std::get<0>(mg_aggregate_results)) =
+            cugraph::test::mg_vertex_property_values_to_sg_vertex_property_values(
+              *handle_,
+              std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                    (*mg_renumber_map).size()),
+              mg_graph_view.local_vertex_partition_range(),
+              std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+              std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+              raft::device_span<typename thrust::tuple_element<0, result_t>::type const>(
+                std::get<0>(mg_results[i]).data(), std::get<0>(mg_results[i]).size()));
+
+          std::tie(std::ignore, std::get<1>(mg_aggregate_results)) =
+            cugraph::test::mg_vertex_property_values_to_sg_vertex_property_values(
+              *handle_,
+              std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                    (*mg_renumber_map).size()),
+              mg_graph_view.local_vertex_partition_range(),
+              std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+              std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+              raft::device_span<typename thrust::tuple_element<1, result_t>::type const>(
+                std::get<1>(mg_results[i]).data(), std::get<1>(mg_results[i]).size()));
+        }
+
+        if (handle_->get_comms().get_rank() == int{0}) {
+          auto sg_graph_view = sg_graph.view();
+          auto sg_edge_weight_view =
+            sg_edge_weights ? std::make_optional((*sg_edge_weights).view()) : std::nullopt;
+
+          auto sg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+            *handle_,
+            thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
+            thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()),
+            vertex_prop_hash_bin_count);
+          auto sg_src_prop = cugraph::test::generate<vertex_t, result_t>::src_property(
+            *handle_, sg_graph_view, sg_vertex_prop);
+
+          auto sg_vertex_key = cugraph::test::generate<vertex_t, vertex_t>::vertex_property(
+            *handle_,
+            thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
+            thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()),
+            key_hash_bin_count);
+          auto sg_dst_key = cugraph::test::generate<vertex_t, vertex_t>::dst_property(
+            *handle_, sg_graph_view, sg_vertex_key);
+
+          rmm::device_uvector<vertex_t> sg_kv_store_keys(key_hash_bin_count, handle_->get_stream());
+          thrust::sequence(handle_->get_thrust_policy(),
+                           sg_kv_store_keys.begin(),
+                           sg_kv_store_keys.end(),
+                           vertex_t{0});
+          auto sg_kv_store_values = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+            *handle_, sg_kv_store_keys, key_prop_hash_bin_count);
+
+          cugraph::kv_store_t<vertex_t, result_t, false> sg_kv_store(
+            sg_kv_store_keys.begin(),
+            sg_kv_store_keys.end(),
+            cugraph::get_dataframe_buffer_begin(sg_kv_store_values),
+            cugraph::invalid_vertex_id<vertex_t>::value,
+            invalid_value,
+            handle_->get_stream());
+
+          cugraph::test::vector_result_compare compare{*handle_};
+
+          auto global_result = cugraph::allocate_dataframe_buffer<result_t>(
+            sg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
+
+          switch (reduction_types[i]) {
+            case reduction_type_t::PLUS:
+              if (sg_edge_weight_view) {
+                per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+                  *handle_,
+                  sg_graph_view,
+                  sg_src_prop.view(),
+                  *sg_edge_weight_view,
+                  sg_dst_key.view(),
+                  sg_kv_store.view(),
+                  key_aggregated_e_op_t<vertex_t, weight_t, result_t>{},
+                  property_initial_value,
+                  cugraph::reduce_op::plus<result_t>{},
+                  cugraph::get_dataframe_buffer_begin(global_result));
+              } else {
+                per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+                  *handle_,
+                  sg_graph_view,
+                  sg_src_prop.view(),
+                  cugraph::edge_dummy_property_t{}.view(),
+                  sg_dst_key.view(),
+                  sg_kv_store.view(),
+                  key_aggregated_e_op_t<vertex_t, thrust::nullopt_t, result_t>{},
+                  property_initial_value,
+                  cugraph::reduce_op::plus<result_t>{},
+                  cugraph::get_dataframe_buffer_begin(global_result));
+              }
+              break;
+            case reduction_type_t::ELEMWISE_MIN:
+              if (sg_edge_weight_view) {
+                per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+                  *handle_,
+                  sg_graph_view,
+                  sg_src_prop.view(),
+                  *sg_edge_weight_view,
+                  sg_dst_key.view(),
+                  sg_kv_store.view(),
+                  key_aggregated_e_op_t<vertex_t, weight_t, result_t>{},
+                  property_initial_value,
+                  cugraph::reduce_op::elementwise_minimum<result_t>{},
+                  cugraph::get_dataframe_buffer_begin(global_result));
+              } else {
+                per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+                  *handle_,
+                  sg_graph_view,
+                  sg_src_prop.view(),
+                  cugraph::edge_dummy_property_t{}.view(),
+                  sg_dst_key.view(),
+                  sg_kv_store.view(),
+                  key_aggregated_e_op_t<vertex_t, thrust::nullopt_t, result_t>{},
+                  property_initial_value,
+                  cugraph::reduce_op::elementwise_minimum<result_t>{},
+                  cugraph::get_dataframe_buffer_begin(global_result));
+              }
+              break;
+            case reduction_type_t::ELEMWISE_MAX:
+              if (sg_edge_weight_view) {
+                per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+                  *handle_,
+                  sg_graph_view,
+                  sg_src_prop.view(),
+                  *sg_edge_weight_view,
+                  sg_dst_key.view(),
+                  sg_kv_store.view(),
+                  key_aggregated_e_op_t<vertex_t, weight_t, result_t>{},
+                  property_initial_value,
+                  cugraph::reduce_op::elementwise_maximum<result_t>{},
+                  cugraph::get_dataframe_buffer_begin(global_result));
+              } else {
+                per_v_transform_reduce_dst_key_aggregated_outgoing_e(
+                  *handle_,
+                  sg_graph_view,
+                  sg_src_prop.view(),
+                  cugraph::edge_dummy_property_t{}.view(),
+                  sg_dst_key.view(),
+                  sg_kv_store.view(),
+                  key_aggregated_e_op_t<vertex_t, thrust::nullopt_t, result_t>{},
+                  property_initial_value,
+                  cugraph::reduce_op::elementwise_maximum<result_t>{},
+                  cugraph::get_dataframe_buffer_begin(global_result));
+              }
+              break;
+            default: FAIL() << "should not be reached.";
+          }
+
+          ASSERT_TRUE(compare(mg_aggregate_results, global_result));
+        }
+      }
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t>
+  Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_File =
+  Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE<cugraph::test::File_Usecase>;
+using Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_Rmat =
+  Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE<cugraph::test::Rmat_Usecase>;
+
+// FIXME: this tests do not build as cugrpah::kv_store_t has a build error when use_binary_search =
+// false and value_t is thrust::tuple, this will be fixed in a separate PR
+#if 0
+TEST_P(Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_File,
+       CheckInt32Int32FloatTupleIntFloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>>(std::get<0>(param),
+                                                                              std::get<1>(param));
+}
+
+TEST_P(Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_Rmat,
+       CheckInt32Int32FloatTupleIntFloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_Rmat,
+       CheckInt32Int64FloatTupleIntFloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int64_t, float, thrust::tuple<int, float>>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_Rmat,
+       CheckInt64Int64FloatTupleIntFloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int64_t, int64_t, float, thrust::tuple<int, float>>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+#endif
+
+TEST_P(Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_File,
+       CheckInt32Int32FloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, int>(std::get<0>(param), std::get<1>(param));
+}
+
+TEST_P(Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_Rmat,
+       CheckInt32Int32FloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, int>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_Rmat,
+       CheckInt32Int64FloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int64_t, float, int>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_Rmat,
+       CheckInt64Int64FloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int64_t, int64_t, float, int>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_File,
+  ::testing::Combine(
+    ::testing::Values(Prims_Usecase{false, false, true},
+                      Prims_Usecase{false, true, true},
+                      Prims_Usecase{true, false, true},
+                      Prims_Usecase{true, true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(rmat_small_test,
+                         Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_Rmat,
+                         ::testing::Combine(::testing::Values(Prims_Usecase{false, false, true},
+                                                              Prims_Usecase{false, true, true},
+                                                              Prims_Usecase{true, false, true},
+                                                              Prims_Usecase{true, true, true}),
+                                            ::testing::Values(cugraph::test::Rmat_Usecase(
+                                              10, 16, 0.57, 0.19, 0.19, 0, false, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Prims_Usecase{false, false, false},
+                      Prims_Usecase{false, true, false},
+                      Prims_Usecase{true, false, false},
+                      Prims_Usecase{true, true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu
index efab53f89e6..a459a677569 100644
--- a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu
+++ b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu
@@ -18,6 +18,7 @@
 #include "prims/reduce_op.cuh"
 #include "prims/update_edge_src_dst_property.cuh"
 #include "property_generator.cuh"
+#include "result_compare.cuh"
 #include "utilities/base_fixture.hpp"
 #include "utilities/device_comm_wrapper.hpp"
 #include "utilities/mg_utilities.hpp"
@@ -72,83 +73,6 @@ struct e_op_t {
   }
 };
 
-template <typename T>
-__host__ __device__ bool compare_scalar(T val0, T val1, thrust::optional<T> threshold_ratio)
-{
-  if (threshold_ratio) {
-    return std::abs(val0 - val1) <= (std::max(std::abs(val0), std::abs(val1)) * *threshold_ratio);
-  } else {
-    return val0 == val1;
-  }
-}
-
-template <typename T>
-struct comparator {
-  static constexpr double threshold_ratio{1e-2};
-
-  __host__ __device__ bool operator()(T t0, T t1) const
-  {
-    static_assert(cugraph::is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
-    if constexpr (std::is_arithmetic_v<T>) {
-      return compare_scalar(
-        t0,
-        t1,
-        std::is_floating_point_v<T> ? thrust::optional<T>{threshold_ratio} : thrust::nullopt);
-    } else {
-      auto val0   = thrust::get<0>(t0);
-      auto val1   = thrust::get<0>(t1);
-      auto passed = compare_scalar(val0,
-                                   val1,
-                                   std::is_floating_point_v<decltype(val0)>
-                                     ? thrust::optional<decltype(val0)>{threshold_ratio}
-                                     : thrust::nullopt);
-      if (!passed) return false;
-
-      if constexpr (thrust::tuple_size<T>::value >= 2) {
-        auto val0   = thrust::get<1>(t0);
-        auto val1   = thrust::get<1>(t1);
-        auto passed = compare_scalar(val0,
-                                     val1,
-                                     std::is_floating_point_v<decltype(val1)>
-                                       ? thrust::optional<decltype(val1)>{threshold_ratio}
-                                       : thrust::nullopt);
-        if (!passed) return false;
-      }
-      if constexpr (thrust::tuple_size<T>::value >= 3) {
-        assert(false);  // should not be reached.
-      }
-      return true;
-    }
-  }
-};
-
-struct result_compare {
-  const raft::handle_t& handle_;
-  result_compare(raft::handle_t const& handle) : handle_(handle) {}
-
-  template <typename... Args>
-  auto operator()(const std::tuple<rmm::device_uvector<Args>...>& t1,
-                  const std::tuple<rmm::device_uvector<Args>...>& t2)
-  {
-    using type = thrust::tuple<Args...>;
-    return equality_impl(t1, t2, std::make_index_sequence<thrust::tuple_size<type>::value>());
-  }
-
-  template <typename T>
-  auto operator()(const rmm::device_uvector<T>& t1, const rmm::device_uvector<T>& t2)
-  {
-    return thrust::equal(
-      handle_.get_thrust_policy(), t1.begin(), t1.end(), t2.begin(), comparator<T>());
-  }
-
- private:
-  template <typename T, std::size_t... I>
-  auto equality_impl(T& t1, T& t2, std::index_sequence<I...>)
-  {
-    return (... && (result_compare::operator()(std::get<I>(t1), std::get<I>(t2))));
-  }
-};
-
 struct Prims_Usecase {
   bool test_weighted{false};
   bool edge_masking{false};
@@ -440,7 +364,7 @@ class Tests_MGPerVTransformReduceIncomingOutgoingE
             *handle_, sg_graph_view, sg_vertex_prop);
           auto sg_dst_prop = cugraph::test::generate<vertex_t, result_t>::dst_property(
             *handle_, sg_graph_view, sg_vertex_prop);
-          result_compare comp{*handle_};
+          cugraph::test::vector_result_compare compare{*handle_};
 
           auto global_in_result = cugraph::allocate_dataframe_buffer<result_t>(
             sg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
@@ -528,8 +452,8 @@ class Tests_MGPerVTransformReduceIncomingOutgoingE
             default: FAIL() << "should not be reached.";
           }
 
-          ASSERT_TRUE(comp(mg_aggregate_in_results, global_in_result));
-          ASSERT_TRUE(comp(mg_aggregate_out_results, global_out_result));
+          ASSERT_TRUE(compare(mg_aggregate_in_results, global_in_result));
+          ASSERT_TRUE(compare(mg_aggregate_out_results, global_out_result));
         }
       }
     }
diff --git a/cpp/tests/prims/mg_reduce_v.cu b/cpp/tests/prims/mg_reduce_v.cu
index da3354b77d9..783e17b6d8f 100644
--- a/cpp/tests/prims/mg_reduce_v.cu
+++ b/cpp/tests/prims/mg_reduce_v.cu
@@ -17,6 +17,7 @@
 #include "prims/property_op_utils.cuh"
 #include "prims/reduce_v.cuh"
 #include "property_generator.cuh"
+#include "result_compare.cuh"
 #include "utilities/base_fixture.hpp"
 #include "utilities/device_comm_wrapper.hpp"
 #include "utilities/mg_utilities.hpp"
@@ -49,50 +50,6 @@
 
 #include <random>
 
-template <typename T>
-struct result_compare {
-  static constexpr double threshold_ratio{1e-2};
-  constexpr auto operator()(const T& t1, const T& t2)
-  {
-    if constexpr (std::is_floating_point_v<T>) {
-      bool passed = (t1 == t2)  // when t1 == t2 == 0
-                    ||
-                    (std::abs(t1 - t2) < (std::max(std::abs(t1), std::abs(t2)) * threshold_ratio));
-      return passed;
-    }
-    return t1 == t2;
-  }
-};
-
-template <typename... Args>
-struct result_compare<thrust::tuple<Args...>> {
-  static constexpr double threshold_ratio{1e-3};
-
-  using Type = thrust::tuple<Args...>;
-  constexpr auto operator()(const Type& t1, const Type& t2)
-  {
-    return equality_impl(t1, t2, std::make_index_sequence<thrust::tuple_size<Type>::value>());
-  }
-
- private:
-  template <typename T>
-  constexpr bool equal(T t1, T t2)
-  {
-    if constexpr (std::is_floating_point_v<T>) {
-      bool passed = (t1 == t2)  // when t1 == t2 == 0
-                    ||
-                    (std::abs(t1 - t2) < (std::max(std::abs(t1), std::abs(t2)) * threshold_ratio));
-      return passed;
-    }
-    return t1 == t2;
-  }
-  template <typename T, std::size_t... I>
-  constexpr auto equality_impl(T& t1, T& t2, std::index_sequence<I...>)
-  {
-    return (... && (equal(thrust::get<I>(t1), thrust::get<I>(t2))));
-  }
-};
-
 struct Prims_Usecase {
   bool check_correctness{true};
 };
@@ -249,7 +206,7 @@ class Tests_MGReduceV
               break;
             default: FAIL() << "should not be reached.";
           }
-          result_compare<result_t> compare{};
+          cugraph::test::scalar_result_compare compare{};
           ASSERT_TRUE(compare(expected_result, results[reduction_type]));
         }
       }
diff --git a/cpp/tests/prims/mg_transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cu b/cpp/tests/prims/mg_transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cu
new file mode 100644
index 00000000000..5fa37250e21
--- /dev/null
+++ b/cpp/tests/prims/mg_transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cu
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "property_generator.cuh"
+
+#include <cugraph/detail/shuffle_wrappers.hpp>
+#include <cugraph/edge_property.hpp>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/dataframe_buffer.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+#include <cugraph/utilities/host_scalar_comm.hpp>
+#include <cugraph/utilities/thrust_tuple_utils.hpp>
+
+#include <raft/comms/mpi_comms.hpp>
+#include <raft/core/comms.hpp>
+#include <raft/core/device_span.hpp>
+#include <raft/core/handle.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/tuple.h>
+
+#include <gtest/gtest.h>
+#include <prims/transform_e.cuh>
+#include <prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh>
+#include <prims/update_edge_src_dst_property.cuh>
+#include <utilities/base_fixture.hpp>
+#include <utilities/device_comm_wrapper.hpp>
+#include <utilities/mg_utilities.hpp>
+#include <utilities/test_graphs.hpp>
+#include <utilities/test_utilities.hpp>
+
+#include <random>
+
+template <typename vertex_t, typename edge_t>
+struct intersection_op_t {
+  __device__ thrust::tuple<edge_t, edge_t, edge_t> operator()(
+    vertex_t v0,
+    vertex_t v1,
+    edge_t v0_prop,
+    edge_t v1_prop,
+    raft::device_span<vertex_t const> intersection) const
+  {
+    return thrust::make_tuple(
+      v0_prop + v1_prop, v0_prop + v1_prop, static_cast<edge_t>(intersection.size()));
+  }
+};
+
+struct Prims_Usecase {
+  bool edge_masking{false};
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV
+  : public ::testing::TestWithParam<std::tuple<Prims_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Verify the results of transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v primitive
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase)
+  {
+    HighResTimer hr_timer{};
+
+    auto const comm_rank = handle_->get_comms().get_rank();
+    auto const comm_size = handle_->get_comms().get_size();
+
+    // 1. create MG graph
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG Construct graph");
+    }
+
+    cugraph::graph_t<vertex_t, edge_t, false, true> mg_graph(*handle_);
+    std::optional<rmm::device_uvector<vertex_t>> mg_renumber_map{std::nullopt};
+    std::tie(mg_graph, std::ignore, mg_renumber_map) =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        *handle_, input_usecase, false, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
+    if (prims_usecase.edge_masking) {
+      edge_mask =
+        cugraph::test::generate<vertex_t, bool>::edge_property(*handle_, mg_graph_view, 2);
+      mg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
+    // 2. run MG transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v primitive
+
+    const int hash_bin_count = 5;
+    const int initial_value  = 4;
+
+    auto property_initial_value =
+      cugraph::test::generate<vertex_t, edge_t>::initial_value(initial_value);
+
+    auto mg_vertex_prop = cugraph::test::generate<vertex_t, edge_t>::vertex_property(
+      *handle_, *mg_renumber_map, hash_bin_count);
+    auto mg_src_prop = cugraph::test::generate<vertex_t, edge_t>::src_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
+    auto mg_dst_prop = cugraph::test::generate<vertex_t, edge_t>::dst_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
+
+    auto mg_result_buffer = rmm::device_uvector<edge_t>(
+      mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v");
+    }
+
+    cugraph::transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v(
+      *handle_,
+      mg_graph_view,
+      mg_src_prop.view(),
+      mg_dst_prop.view(),
+      intersection_op_t<vertex_t, edge_t>{},
+      property_initial_value,
+      mg_result_buffer.begin());
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    // 3. validate MG results
+
+    if (prims_usecase.check_correctness) {
+      rmm::device_uvector<edge_t> mg_aggregate_result_buffer(0, handle_->get_stream());
+      std::tie(std::ignore, mg_aggregate_result_buffer) =
+        cugraph::test::mg_vertex_property_values_to_sg_vertex_property_values(
+          *handle_,
+          std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                                (*mg_renumber_map).size()),
+          mg_graph_view.local_vertex_partition_range(),
+          std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+          std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+          raft::device_span<edge_t const>(mg_result_buffer.data(), mg_result_buffer.size()));
+
+      cugraph::graph_t<vertex_t, edge_t, false, false> sg_graph(*handle_);
+      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
+        *handle_,
+        mg_graph_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                              (*mg_renumber_map).size()),
+        false);
+
+      if (handle_->get_comms().get_rank() == 0) {
+        auto sg_graph_view = sg_graph.view();
+
+        auto sg_vertex_prop = cugraph::test::generate<vertex_t, edge_t>::vertex_property(
+          *handle_,
+          thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
+          thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()),
+          hash_bin_count);
+        auto sg_src_prop = cugraph::test::generate<vertex_t, edge_t>::src_property(
+          *handle_, sg_graph_view, sg_vertex_prop);
+        auto sg_dst_prop = cugraph::test::generate<vertex_t, edge_t>::dst_property(
+          *handle_, sg_graph_view, sg_vertex_prop);
+
+        auto sg_result_buffer = cugraph::allocate_dataframe_buffer<edge_t>(
+          sg_graph_view.number_of_vertices(), handle_->get_stream());
+
+        cugraph::transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v(
+          *handle_,
+          sg_graph_view,
+          sg_src_prop.view(),
+          sg_dst_prop.view(),
+          intersection_op_t<vertex_t, edge_t>{},
+          property_initial_value,
+          sg_result_buffer.begin());
+
+        bool valid = thrust::equal(handle_->get_thrust_policy(),
+                                   mg_aggregate_result_buffer.begin(),
+                                   mg_aggregate_result_buffer.end(),
+                                   sg_result_buffer.begin());
+
+        ASSERT_TRUE(valid);
+      }
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t>
+  Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV_File =
+  Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV<cugraph::test::File_Usecase>;
+using Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV_Rmat =
+  Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV_File, CheckInt32Int32Float)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float>(std::get<0>(param), std::get<1>(param));
+}
+
+TEST_P(Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV_Rmat, CheckInt32Int32Float)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV_Rmat, CheckInt32Int64Float)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int64_t, float>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV_Rmat, CheckInt64Int64Float)
+{
+  auto param = GetParam();
+  run_current_test<int64_t, int64_t, float>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV_File,
+  ::testing::Combine(
+    ::testing::Values(Prims_Usecase{false, true}, Prims_Usecase{true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/netscience.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(rmat_small_test,
+                         Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV_Rmat,
+                         ::testing::Combine(::testing::Values(Prims_Usecase{false, true},
+                                                              Prims_Usecase{true, true}),
+                                            ::testing::Values(cugraph::test::Rmat_Usecase(
+                                              10, 16, 0.57, 0.19, 0.19, 0, false, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Prims_Usecase{false, false}, Prims_Usecase{true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/prims/mg_transform_reduce_e.cu b/cpp/tests/prims/mg_transform_reduce_e.cu
index c8ce9fc3a47..53f37e83b30 100644
--- a/cpp/tests/prims/mg_transform_reduce_e.cu
+++ b/cpp/tests/prims/mg_transform_reduce_e.cu
@@ -17,6 +17,7 @@
 #include "prims/transform_reduce_e.cuh"
 #include "prims/update_edge_src_dst_property.cuh"
 #include "property_generator.cuh"
+#include "result_compare.cuh"
 #include "utilities/base_fixture.hpp"
 #include "utilities/device_comm_wrapper.hpp"
 #include "utilities/mg_utilities.hpp"
@@ -52,44 +53,6 @@
 
 #include <random>
 
-template <typename T>
-struct result_compare {
-  static constexpr double threshold_ratio{1e-3};
-  constexpr auto operator()(const T& t1, const T& t2)
-  {
-    if constexpr (std::is_floating_point_v<T>) {
-      return std::abs(t1 - t2) < (std::max(t1, t2) * threshold_ratio);
-    }
-    return t1 == t2;
-  }
-};
-
-template <typename... Args>
-struct result_compare<thrust::tuple<Args...>> {
-  static constexpr double threshold_ratio{1e-3};
-
-  using type = thrust::tuple<Args...>;
-  constexpr auto operator()(const type& t1, const type& t2)
-  {
-    return equality_impl(t1, t2, std::make_index_sequence<thrust::tuple_size<type>::value>());
-  }
-
- private:
-  template <typename T>
-  constexpr bool equal(T t1, T t2)
-  {
-    if constexpr (std::is_floating_point_v<T>) {
-      return std::abs(t1 - t2) < (std::max(t1, t2) * threshold_ratio);
-    }
-    return t1 == t2;
-  }
-  template <typename T, std::size_t... I>
-  constexpr auto equality_impl(T& t1, T& t2, std::index_sequence<I...>)
-  {
-    return (... && (equal(thrust::get<I>(t1), thrust::get<I>(t2))));
-  }
-};
-
 struct Prims_Usecase {
   bool test_weighted{false};
   bool edge_masking{false};
@@ -231,7 +194,7 @@ class Tests_MGTransformReduceE
             }
           },
           property_initial_value);
-        result_compare<result_t> compare{};
+        cugraph::test::scalar_result_compare compare{};
         ASSERT_TRUE(compare(expected_result, result));
       }
     }
diff --git a/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu b/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu
new file mode 100644
index 00000000000..457e6b5ab93
--- /dev/null
+++ b/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu
@@ -0,0 +1,495 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "property_generator.cuh"
+#include "result_compare.cuh"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/edge_partition_view.hpp>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/dataframe_buffer.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <raft/comms/mpi_comms.hpp>
+#include <raft/core/comms.hpp>
+#include <raft/core/handle.hpp>
+
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/count.h>
+#include <thrust/distance.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/optional.h>
+#include <thrust/transform.h>
+#include <thrust/tuple.h>
+
+#include <cuco/hash_functions.cuh>
+
+#include <gtest/gtest.h>
+#include <prims/reduce_op.cuh>
+#include <prims/transform_reduce_e_by_src_dst_key.cuh>
+#include <prims/update_edge_src_dst_property.cuh>
+#include <utilities/base_fixture.hpp>
+#include <utilities/device_comm_wrapper.hpp>
+#include <utilities/mg_utilities.hpp>
+#include <utilities/test_graphs.hpp>
+#include <utilities/test_utilities.hpp>
+#include <utilities/thrust_wrapper.hpp>
+
+#include <random>
+
+struct Prims_Usecase {
+  bool test_weighted{false};
+  bool edge_masking{false};
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_MGTransformReduceEBySrcDstKey
+  : public ::testing::TestWithParam<std::tuple<Prims_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGTransformReduceEBySrcDstKey() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Compare the results of transform_reduce_e_by_src|dst_key primitive
+  template <typename vertex_t,
+            typename edge_t,
+            typename weight_t,
+            typename result_t,
+            bool store_transposed>
+  void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase)
+  {
+    HighResTimer hr_timer{};
+
+    // 1. create MG graph
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG Construct graph");
+    }
+
+    cugraph::graph_t<vertex_t, edge_t, store_transposed, true> mg_graph(*handle_);
+    std::optional<rmm::device_uvector<vertex_t>> mg_renumber_map{std::nullopt};
+    std::tie(mg_graph, std::ignore, mg_renumber_map) =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, store_transposed, true>(
+        *handle_, input_usecase, prims_usecase.test_weighted, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
+    if (prims_usecase.edge_masking) {
+      edge_mask =
+        cugraph::test::generate<vertex_t, bool>::edge_property(*handle_, mg_graph_view, 2);
+      mg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
+    // 2. run MG transform reduce
+
+    const int hash_bin_count = 5;
+    const int initial_value  = 4;
+
+    auto property_initial_value =
+      cugraph::test::generate<vertex_t, result_t>::initial_value(initial_value);
+
+    auto mg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+      *handle_, *mg_renumber_map, hash_bin_count);
+    auto mg_src_prop = cugraph::test::generate<vertex_t, result_t>::src_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
+    auto mg_dst_prop = cugraph::test::generate<vertex_t, result_t>::dst_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
+
+    auto mg_vertex_key = cugraph::test::generate<vertex_t, vertex_t>::vertex_property(
+      *handle_, *mg_renumber_map, hash_bin_count);
+    auto mg_src_key = cugraph::test::generate<vertex_t, vertex_t>::src_property(
+      *handle_, mg_graph_view, mg_vertex_key);
+    auto mg_dst_key = cugraph::test::generate<vertex_t, vertex_t>::dst_property(
+      *handle_, mg_graph_view, mg_vertex_key);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG transform_reduce_e_by_src_key");
+    }
+
+    auto [by_src_keys, by_src_values] = transform_reduce_e_by_src_key(
+      *handle_,
+      mg_graph_view,
+      mg_src_prop.view(),
+      mg_dst_prop.view(),
+      cugraph::edge_dummy_property_t{}.view(),
+      mg_src_key.view(),
+      [] __device__(auto src, auto dst, auto src_property, auto dst_property, thrust::nullopt_t) {
+        if (src_property < dst_property) {
+          return src_property;
+        } else {
+          return dst_property;
+        }
+      },
+      property_initial_value,
+      cugraph::reduce_op::plus<result_t>{});
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG transform_reduce_e_by_dst_key");
+    }
+
+    auto [by_dst_keys, by_dst_values] = transform_reduce_e_by_dst_key(
+      *handle_,
+      mg_graph_view,
+      mg_src_prop.view(),
+      mg_dst_prop.view(),
+      cugraph::edge_dummy_property_t{}.view(),
+      mg_dst_key.view(),
+      [] __device__(auto src, auto dst, auto src_property, auto dst_property, thrust::nullopt_t) {
+        if (src_property < dst_property) {
+          return src_property;
+        } else {
+          return dst_property;
+        }
+      },
+      property_initial_value,
+      cugraph::reduce_op::plus<result_t>{});
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    // 3. compare SG & MG results
+
+    if (prims_usecase.check_correctness) {
+      auto mg_aggregate_by_src_keys =
+        cugraph::test::device_gatherv(*handle_, by_src_keys.data(), by_src_keys.size());
+      auto mg_aggregate_by_src_values =
+        cugraph::allocate_dataframe_buffer<result_t>(0, handle_->get_stream());
+      if constexpr (std::is_arithmetic_v<result_t>) {
+        mg_aggregate_by_src_values =
+          cugraph::test::device_gatherv(*handle_, by_src_values.data(), by_src_values.size());
+      } else {
+        std::get<0>(mg_aggregate_by_src_values) = cugraph::test::device_gatherv(
+          *handle_, std::get<0>(by_src_values).data(), std::get<0>(by_src_values).size());
+        std::get<1>(mg_aggregate_by_src_values) = cugraph::test::device_gatherv(
+          *handle_, std::get<1>(by_src_values).data(), std::get<1>(by_src_values).size());
+      }
+      thrust::sort_by_key(handle_->get_thrust_policy(),
+                          mg_aggregate_by_src_keys.begin(),
+                          mg_aggregate_by_src_keys.end(),
+                          cugraph::get_dataframe_buffer_begin(mg_aggregate_by_src_values));
+
+      auto mg_aggregate_by_dst_keys =
+        cugraph::test::device_gatherv(*handle_, by_dst_keys.data(), by_dst_keys.size());
+      auto mg_aggregate_by_dst_values =
+        cugraph::allocate_dataframe_buffer<result_t>(0, handle_->get_stream());
+      if constexpr (std::is_arithmetic_v<result_t>) {
+        mg_aggregate_by_dst_values =
+          cugraph::test::device_gatherv(*handle_, by_dst_values.data(), by_dst_values.size());
+      } else {
+        std::get<0>(mg_aggregate_by_dst_values) = cugraph::test::device_gatherv(
+          *handle_, std::get<0>(by_dst_values).data(), std::get<0>(by_dst_values).size());
+        std::get<1>(mg_aggregate_by_dst_values) = cugraph::test::device_gatherv(
+          *handle_, std::get<1>(by_dst_values).data(), std::get<1>(by_dst_values).size());
+      }
+      thrust::sort_by_key(handle_->get_thrust_policy(),
+                          mg_aggregate_by_dst_keys.begin(),
+                          mg_aggregate_by_dst_keys.end(),
+                          cugraph::get_dataframe_buffer_begin(mg_aggregate_by_dst_values));
+
+      cugraph::graph_t<vertex_t, edge_t, store_transposed, false> sg_graph(*handle_);
+      std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
+        *handle_,
+        mg_graph_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+        std::make_optional<raft::device_span<vertex_t const>>((*mg_renumber_map).data(),
+                                                              (*mg_renumber_map).size()),
+        false);
+
+      if (handle_->get_comms().get_rank() == 0) {
+        auto sg_graph_view = sg_graph.view();
+
+        auto sg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+          *handle_,
+          thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
+          thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()),
+          hash_bin_count);
+        auto sg_src_prop = cugraph::test::generate<vertex_t, result_t>::src_property(
+          *handle_, sg_graph_view, sg_vertex_prop);
+        auto sg_dst_prop = cugraph::test::generate<vertex_t, result_t>::dst_property(
+          *handle_, sg_graph_view, sg_vertex_prop);
+
+        auto sg_vertex_key = cugraph::test::generate<vertex_t, vertex_t>::vertex_property(
+          *handle_,
+          thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
+          thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()),
+          hash_bin_count);
+        auto sg_src_key = cugraph::test::generate<vertex_t, vertex_t>::src_property(
+          *handle_, sg_graph_view, sg_vertex_key);
+        auto sg_dst_key = cugraph::test::generate<vertex_t, vertex_t>::dst_property(
+          *handle_, sg_graph_view, sg_vertex_key);
+
+        auto [sg_by_src_keys, sg_by_src_values] = transform_reduce_e_by_src_key(
+          *handle_,
+          sg_graph_view,
+          sg_src_prop.view(),
+          sg_dst_prop.view(),
+          cugraph::edge_dummy_property_t{}.view(),
+          sg_src_key.view(),
+          [] __device__(
+            auto src, auto dst, auto src_property, auto dst_property, thrust::nullopt_t) {
+            if (src_property < dst_property) {
+              return src_property;
+            } else {
+              return dst_property;
+            }
+          },
+          property_initial_value,
+          cugraph::reduce_op::plus<result_t>{});
+        thrust::sort_by_key(handle_->get_thrust_policy(),
+                            sg_by_src_keys.begin(),
+                            sg_by_src_keys.end(),
+                            cugraph::get_dataframe_buffer_begin(sg_by_src_values));
+
+        auto [sg_by_dst_keys, sg_by_dst_values] = transform_reduce_e_by_dst_key(
+          *handle_,
+          sg_graph_view,
+          sg_src_prop.view(),
+          sg_dst_prop.view(),
+          cugraph::edge_dummy_property_t{}.view(),
+          sg_dst_key.view(),
+          [] __device__(
+            auto src, auto dst, auto src_property, auto dst_property, thrust::nullopt_t) {
+            if (src_property < dst_property) {
+              return src_property;
+            } else {
+              return dst_property;
+            }
+          },
+          property_initial_value,
+          cugraph::reduce_op::plus<result_t>{});
+        thrust::sort_by_key(handle_->get_thrust_policy(),
+                            sg_by_dst_keys.begin(),
+                            sg_by_dst_keys.end(),
+                            cugraph::get_dataframe_buffer_begin(sg_by_dst_values));
+
+        cugraph::test::vector_result_compare compare{*handle_};
+
+        ASSERT_TRUE(compare(sg_by_src_keys, mg_aggregate_by_src_keys));
+        ASSERT_TRUE(compare(sg_by_src_values, mg_aggregate_by_src_values));
+
+        ASSERT_TRUE(compare(sg_by_dst_keys, mg_aggregate_by_dst_keys));
+        ASSERT_TRUE(compare(sg_by_dst_values, mg_aggregate_by_dst_values));
+      }
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t> Tests_MGTransformReduceEBySrcDstKey<input_usecase_t>::handle_ =
+  nullptr;
+
+using Tests_MGTransformReduceEBySrcDstKey_File =
+  Tests_MGTransformReduceEBySrcDstKey<cugraph::test::File_Usecase>;
+using Tests_MGTransformReduceEBySrcDstKey_Rmat =
+  Tests_MGTransformReduceEBySrcDstKey<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_File, CheckInt32Int32FloatTupleIntFloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, false>(std::get<0>(param),
+                                                                              std::get<1>(param));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt32Int32FloatTupleIntFloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, false>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt32Int64FloatTupleIntFloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int64_t, float, thrust::tuple<int, float>, false>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt64Int64FloatTupleIntFloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int64_t, int64_t, float, thrust::tuple<int, float>, false>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_File, CheckInt32Int32FloatTupleIntFloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, true>(std::get<0>(param),
+                                                                             std::get<1>(param));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt32Int32FloatTupleIntFloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, true>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt32Int64FloatTupleIntFloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int64_t, float, thrust::tuple<int, float>, true>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt64Int64FloatTupleIntFloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int64_t, int64_t, float, thrust::tuple<int, float>, true>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_File, CheckInt32Int32FloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, int, false>(std::get<0>(param), std::get<1>(param));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt32Int32FloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, int, false>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt32Int64FloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int64_t, float, int, false>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt64Int64FloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int64_t, int64_t, float, int, false>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_File, CheckInt32Int32FloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, int, true>(std::get<0>(param), std::get<1>(param));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt32Int32FloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, int, true>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt32Int64FloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int64_t, float, int, true>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformReduceEBySrcDstKey_Rmat, CheckInt64Int64FloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int64_t, int64_t, float, int, true>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_MGTransformReduceEBySrcDstKey_File,
+  ::testing::Combine(
+    ::testing::Values(Prims_Usecase{false, false, true},
+                      Prims_Usecase{false, true, true},
+                      Prims_Usecase{true, false, true},
+                      Prims_Usecase{true, true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(rmat_small_test,
+                         Tests_MGTransformReduceEBySrcDstKey_Rmat,
+                         ::testing::Combine(::testing::Values(Prims_Usecase{false, false, true},
+                                                              Prims_Usecase{false, true, true},
+                                                              Prims_Usecase{true, false, true},
+                                                              Prims_Usecase{true, true, true}),
+                                            ::testing::Values(cugraph::test::Rmat_Usecase(
+                                              10, 16, 0.57, 0.19, 0.19, 0, false, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_MGTransformReduceEBySrcDstKey_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Prims_Usecase{false, false, false},
+                      Prims_Usecase{false, true, false},
+                      Prims_Usecase{true, false, false},
+                      Prims_Usecase{true, true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/prims/mg_transform_reduce_v.cu b/cpp/tests/prims/mg_transform_reduce_v.cu
index c0d44bc94f1..c954f31d0f9 100644
--- a/cpp/tests/prims/mg_transform_reduce_v.cu
+++ b/cpp/tests/prims/mg_transform_reduce_v.cu
@@ -16,6 +16,7 @@
 
 #include "prims/transform_reduce_v.cuh"
 #include "property_generator.cuh"
+#include "result_compare.cuh"
 #include "utilities/base_fixture.hpp"
 #include "utilities/device_comm_wrapper.hpp"
 #include "utilities/mg_utilities.hpp"
@@ -56,50 +57,6 @@ struct v_op_t {
   }
 };
 
-template <typename T>
-struct result_compare {
-  static constexpr double threshold_ratio{1e-3};
-  constexpr auto operator()(const T& t1, const T& t2)
-  {
-    if constexpr (std::is_floating_point_v<T>) {
-      bool passed = (t1 == t2)  // when t1 == t2 == 0
-                    ||
-                    (std::abs(t1 - t2) < (std::max(std::abs(t1), std::abs(t2)) * threshold_ratio));
-      return passed;
-    }
-    return t1 == t2;
-  }
-};
-
-template <typename... Args>
-struct result_compare<thrust::tuple<Args...>> {
-  static constexpr double threshold_ratio{1e-3};
-
-  using Type = thrust::tuple<Args...>;
-  constexpr auto operator()(const Type& t1, const Type& t2)
-  {
-    return equality_impl(t1, t2, std::make_index_sequence<thrust::tuple_size<Type>::value>());
-  }
-
- private:
-  template <typename T>
-  constexpr bool equal(T t1, T t2)
-  {
-    if constexpr (std::is_floating_point_v<T>) {
-      bool passed = (t1 == t2)  // when t1 == t2 == 0
-                    ||
-                    (std::abs(t1 - t2) < (std::max(std::abs(t1), std::abs(t2)) * threshold_ratio));
-      return passed;
-    }
-    return t1 == t2;
-  }
-  template <typename T, std::size_t... I>
-  constexpr auto equality_impl(T& t1, T& t2, std::index_sequence<I...>)
-  {
-    return (... && (equal(thrust::get<I>(t1), thrust::get<I>(t2))));
-  }
-};
-
 struct Prims_Usecase {
   bool check_correctness{true};
 };
@@ -254,7 +211,7 @@ class Tests_MGTransformReduceV
               break;
             default: FAIL() << "should not be reached.";
           }
-          result_compare<result_t> compare{};
+          cugraph::test::scalar_result_compare compare{};
           ASSERT_TRUE(compare(expected_result, results[reduction_type]));
         }
       }
diff --git a/cpp/tests/prims/result_compare.cuh b/cpp/tests/prims/result_compare.cuh
new file mode 100644
index 00000000000..5a1abb90e3c
--- /dev/null
+++ b/cpp/tests/prims/result_compare.cuh
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <cugraph/utilities/thrust_tuple_utils.hpp>
+
+#include <raft/core/handle.hpp>
+
+#include <thrust/equal.h>
+#include <thrust/optional.h>
+#include <thrust/tuple.h>
+
+#include <algorithm>
+#include <cmath>
+#include <type_traits>
+#include <utility>
+
+namespace cugraph {
+namespace test {
+
+namespace detail {
+
+template <typename T>
+__host__ __device__ bool compare_arithmetic_scalar(T val0,
+                                                   T val1,
+                                                   thrust::optional<T> threshold_ratio)
+{
+  if (threshold_ratio) {
+    return std::abs(val0 - val1) <= (std::max(std::abs(val0), std::abs(val1)) * *threshold_ratio);
+  } else {
+    return val0 == val1;
+  }
+}
+
+}  // namespace detail
+
+template <typename T>
+struct comparator {
+  static constexpr double threshold_ratio{1e-2};
+
+  __host__ __device__ bool operator()(T t0, T t1) const
+  {
+    static_assert(cugraph::is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
+    if constexpr (std::is_arithmetic_v<T>) {
+      return detail::compare_arithmetic_scalar(
+        t0,
+        t1,
+        std::is_floating_point_v<T> ? thrust::optional<T>{threshold_ratio} : thrust::nullopt);
+    } else {
+      auto val0   = thrust::get<0>(t0);
+      auto val1   = thrust::get<0>(t1);
+      auto passed = detail::compare_arithmetic_scalar(
+        val0,
+        val1,
+        std::is_floating_point_v<decltype(val0)> ? thrust::optional<decltype(val0)>{threshold_ratio}
+                                                 : thrust::nullopt);
+      if (!passed) return false;
+
+      if constexpr (thrust::tuple_size<T>::value >= 2) {
+        auto val0 = thrust::get<1>(t0);
+        auto val1 = thrust::get<1>(t1);
+        auto passed =
+          detail::compare_arithmetic_scalar(val0,
+                                            val1,
+                                            std::is_floating_point_v<decltype(val1)>
+                                              ? thrust::optional<decltype(val1)>{threshold_ratio}
+                                              : thrust::nullopt);
+        if (!passed) return false;
+      }
+      if constexpr (thrust::tuple_size<T>::value >= 3) {
+        assert(false);  // should not be reached.
+      }
+      return true;
+    }
+  }
+};
+
+struct scalar_result_compare {
+  template <typename... Args>
+  auto operator()(thrust::tuple<Args...> t1, thrust::tuple<Args...> t2)
+  {
+    using type = thrust::tuple<Args...>;
+    return equality_impl(t1, t2, std::make_index_sequence<thrust::tuple_size<type>::value>());
+  }
+
+  template <typename T>
+  auto operator()(T t1, T t2)
+  {
+    comparator<T> comp{};
+    return comp(t1, t2);
+  }
+
+ private:
+  template <typename T, std::size_t... I>
+  auto equality_impl(T t1, T t2, std::index_sequence<I...>)
+  {
+    return (... && (scalar_result_compare::operator()(thrust::get<I>(t1), thrust::get<I>(t2))));
+  }
+};
+
+struct vector_result_compare {
+  const raft::handle_t& handle_;
+
+  vector_result_compare(raft::handle_t const& handle) : handle_(handle) {}
+
+  template <typename... Args>
+  auto operator()(std::tuple<rmm::device_uvector<Args>...> const& t1,
+                  std::tuple<rmm::device_uvector<Args>...> const& t2)
+  {
+    using type = thrust::tuple<Args...>;
+    return equality_impl(t1, t2, std::make_index_sequence<thrust::tuple_size<type>::value>());
+  }
+
+  template <typename T>
+  auto operator()(rmm::device_uvector<T> const& t1, rmm::device_uvector<T> const& t2)
+  {
+    return thrust::equal(
+      handle_.get_thrust_policy(), t1.begin(), t1.end(), t2.begin(), comparator<T>());
+  }
+
+ private:
+  template <typename T, std::size_t... I>
+  auto equality_impl(T& t1, T& t2, std::index_sequence<I...>)
+  {
+    return (... && (vector_result_compare::operator()(std::get<I>(t1), std::get<I>(t2))));
+  }
+};
+
+}  // namespace test
+}  // namespace cugraph