diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index 213f9b9497a..d1c2cf3df52 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -298,6 +298,20 @@ class edge_partition_device_view_t major_idx_from_major_nocheck(vertex_t major) const noexcept + { + if (major_hypersparse_first_ && (major >= *major_hypersparse_first_)) { + auto major_hypersparse_idx = + detail::major_hypersparse_idx_from_major_nocheck_impl(*dcs_nzd_vertices_, major); + return major_hypersparse_idx + ? thrust::make_optional((*major_hypersparse_first_ - major_range_first_) + + *major_hypersparse_idx) + : thrust::nullopt; + } else { + return major - major_range_first_; + } + } + __device__ vertex_t major_from_major_idx_nocheck(vertex_t major_idx) const noexcept { if (major_hypersparse_first_) { @@ -339,6 +353,7 @@ class edge_partition_device_view_t{(*dcs_nzd_vertices_).data()} : thrust::nullopt; } + __host__ __device__ thrust::optional dcs_nzd_vertex_count() const { return dcs_nzd_vertices_ @@ -460,6 +475,11 @@ class edge_partition_device_view_t major_idx_from_major_nocheck(vertex_t major) const noexcept + { + return major_offset_from_major_nocheck(major); + } + __device__ vertex_t major_from_major_idx_nocheck(vertex_t major_idx) const noexcept { return major_from_major_offset_nocheck(major_idx); diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index 53c66c6483e..93d884a56d9 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -631,6 +631,19 @@ class graph_view_t has_edge(raft::handle_t const& handle, + /* (edge_srcs, edge_dsts) should be pre-shuffled */ + raft::device_span edge_srcs, + raft::device_span edge_dsts, + bool do_expensive_check = false); + + rmm::device_uvector compute_multiplicity( + raft::handle_t const& handle, + /* (edge_srcs, edge_dsts) should be pre-shuffled */ + raft::device_span edge_srcs, + raft::device_span edge_dsts, + bool do_expensive_check = false); + template std::enable_if_t>> local_sorted_unique_edge_srcs() const @@ -928,6 +941,16 @@ class graph_view_t has_edge(raft::handle_t const& handle, + raft::device_span edge_srcs, + raft::device_span edge_dsts, + bool do_expensive_check = false); + + rmm::device_uvector compute_multiplicity(raft::handle_t const& handle, + raft::device_span edge_srcs, + raft::device_span edge_dsts, + bool do_expensive_check = false); + template std::enable_if_t>> local_sorted_unique_edge_srcs() const diff --git a/cpp/src/link_prediction/similarity_impl.cuh b/cpp/src/link_prediction/similarity_impl.cuh index 55e8f5c88d7..7ac294d7719 100644 --- a/cpp/src/link_prediction/similarity_impl.cuh +++ b/cpp/src/link_prediction/similarity_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/prims/detail/nbr_intersection.cuh b/cpp/src/prims/detail/nbr_intersection.cuh index cefc1836fa6..8261ec747f9 100644 --- a/cpp/src/prims/detail/nbr_intersection.cuh +++ b/cpp/src/prims/detail/nbr_intersection.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -63,35 +64,6 @@ namespace cugraph { namespace detail { -// check vertices in the pair are valid and first element of the pair is within the local vertex -// partition range -template -struct is_invalid_input_vertex_pair_t { - vertex_t num_vertices{}; - raft::device_span edge_partition_major_range_firsts{}; - raft::device_span edge_partition_major_range_lasts{}; - vertex_t edge_partition_minor_range_first{}; - vertex_t edge_partition_minor_range_last{}; - - __device__ bool operator()(thrust::tuple pair) const - { - auto major = thrust::get<0>(pair); - auto minor = thrust::get<1>(pair); - if (!is_valid_vertex(num_vertices, major) || !is_valid_vertex(num_vertices, minor)) { - return true; - } - auto it = thrust::upper_bound(thrust::seq, - edge_partition_major_range_lasts.begin(), - edge_partition_major_range_lasts.end(), - major); - if (it == edge_partition_major_range_lasts.end()) { return true; } - auto edge_partition_idx = - static_cast(thrust::distance(edge_partition_major_range_lasts.begin(), it)); - if (major < edge_partition_major_range_firsts[edge_partition_idx]) { return true; } - return (minor < edge_partition_minor_range_first) || (minor >= edge_partition_minor_range_last); - } -}; - // group index determined by major_comm_rank (primary key) and local edge partition index (secondary // key) template @@ -154,24 +126,11 @@ struct update_rx_major_local_degree_t { auto major = rx_majors[rx_group_firsts[major_comm_rank * minor_comm_size + local_edge_partition_idx] + offset_in_local_edge_partition]; - vertex_t major_idx{0}; - edge_t local_degree{0}; - if (multi_gpu && (edge_partition.major_hypersparse_first() && - (major >= *(edge_partition.major_hypersparse_first())))) { - auto major_hypersparse_idx = edge_partition.major_hypersparse_idx_from_major_nocheck(major); 
- if (major_hypersparse_idx) { - major_idx = - (*(edge_partition.major_hypersparse_first()) - edge_partition.major_range_first()) + - *major_hypersparse_idx; - local_degree = edge_partition.local_degree(major_idx); - } - } else { - major_idx = edge_partition.major_offset_from_major_nocheck(major); - local_degree = edge_partition.local_degree(major_idx); - } + auto major_idx = edge_partition.major_idx_from_major_nocheck(major); + auto local_degree = major_idx ? edge_partition.local_degree(*major_idx) : edge_t{0}; if (edge_partition_e_mask && (local_degree > edge_t{0})) { - auto local_offset = edge_partition.local_offset(major_idx); + auto local_offset = edge_partition.local_offset(*major_idx); local_degree = static_cast( count_set_bits((*edge_partition_e_mask).value_first(), local_offset, local_degree)); } @@ -325,29 +284,11 @@ struct pick_min_degree_t { edge_t local_degree0{0}; vertex_t major0 = thrust::get<0>(pair); if constexpr (std::is_same_v) { - vertex_t major_idx{0}; - if constexpr (multi_gpu) { - if (edge_partition.major_hypersparse_first() && - (major0 >= *(edge_partition.major_hypersparse_first()))) { - auto major_hypersparse_idx = - edge_partition.major_hypersparse_idx_from_major_nocheck(major0); - if (major_hypersparse_idx) { - major_idx = - (*(edge_partition.major_hypersparse_first()) - edge_partition.major_range_first()) + - *major_hypersparse_idx; - local_degree0 = edge_partition.local_degree(major_idx); - } - } else { - major_idx = edge_partition.major_offset_from_major_nocheck(major0); - local_degree0 = edge_partition.local_degree(major_idx); - } - } else { - major_idx = edge_partition.major_offset_from_major_nocheck(major0); - local_degree0 = edge_partition.local_degree(major_idx); - } + auto major_idx = edge_partition.major_idx_from_major_nocheck(major0); + local_degree0 = major_idx ? 
edge_partition.local_degree(*major_idx) : edge_t{0}; if (edge_partition_e_mask && (local_degree0 > edge_t{0})) { - auto local_offset = edge_partition.local_offset(major_idx); + auto local_offset = edge_partition.local_offset(*major_idx); local_degree0 = count_set_bits((*edge_partition_e_mask).value_first(), local_offset, local_degree0); } @@ -360,29 +301,11 @@ struct pick_min_degree_t { edge_t local_degree1{0}; vertex_t major1 = thrust::get<1>(pair); if constexpr (std::is_same_v) { - vertex_t major_idx{0}; - if constexpr (multi_gpu) { - if (edge_partition.major_hypersparse_first() && - (major1 >= *(edge_partition.major_hypersparse_first()))) { - auto major_hypersparse_idx = - edge_partition.major_hypersparse_idx_from_major_nocheck(major1); - if (major_hypersparse_idx) { - major_idx = - (*(edge_partition.major_hypersparse_first()) - edge_partition.major_range_first()) + - *major_hypersparse_idx; - local_degree1 = edge_partition.local_degree(major_idx); - } - } else { - major_idx = edge_partition.major_offset_from_major_nocheck(major1); - local_degree1 = edge_partition.local_degree(major_idx); - } - } else { - major_idx = edge_partition.major_offset_from_major_nocheck(major1); - local_degree1 = edge_partition.local_degree(major_idx); - } + auto major_idx = edge_partition.major_idx_from_major_nocheck(major1); + local_degree1 = major_idx ? 
edge_partition.local_degree(*major_idx) : edge_t{0}; if (edge_partition_e_mask && (local_degree1 > edge_t{0})) { - auto local_offset = edge_partition.local_offset(major_idx); + auto local_offset = edge_partition.local_offset(*major_idx); local_degree1 = count_set_bits((*edge_partition_e_mask).value_first(), local_offset, local_degree1); } @@ -699,77 +622,6 @@ struct gatherv_indices_t { } }; -template -size_t count_invalid_vertex_pairs(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexPairIterator vertex_pair_first, - VertexPairIterator vertex_pair_last) -{ - using vertex_t = typename GraphViewType::vertex_type; - - std::vector h_edge_partition_major_range_firsts( - graph_view.number_of_local_edge_partitions()); - std::vector h_edge_partition_major_range_lasts( - h_edge_partition_major_range_firsts.size()); - vertex_t edge_partition_minor_range_first{}; - vertex_t edge_partition_minor_range_last{}; - if constexpr (GraphViewType::is_multi_gpu) { - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i++) { - if constexpr (GraphViewType::is_storage_transposed) { - h_edge_partition_major_range_firsts[i] = graph_view.local_edge_partition_dst_range_first(i); - h_edge_partition_major_range_lasts[i] = graph_view.local_edge_partition_dst_range_last(i); - } else { - h_edge_partition_major_range_firsts[i] = graph_view.local_edge_partition_src_range_first(i); - h_edge_partition_major_range_lasts[i] = graph_view.local_edge_partition_src_range_last(i); - } - } - if constexpr (GraphViewType::is_storage_transposed) { - edge_partition_minor_range_first = graph_view.local_edge_partition_src_range_first(); - edge_partition_minor_range_last = graph_view.local_edge_partition_src_range_last(); - } else { - edge_partition_minor_range_first = graph_view.local_edge_partition_dst_range_first(); - edge_partition_minor_range_last = graph_view.local_edge_partition_dst_range_last(); - } - } else { - h_edge_partition_major_range_firsts[0] = vertex_t{0}; - 
h_edge_partition_major_range_lasts[0] = graph_view.number_of_vertices(); - edge_partition_minor_range_first = vertex_t{0}; - edge_partition_minor_range_last = graph_view.number_of_vertices(); - } - rmm::device_uvector d_edge_partition_major_range_firsts( - h_edge_partition_major_range_firsts.size(), handle.get_stream()); - rmm::device_uvector d_edge_partition_major_range_lasts( - h_edge_partition_major_range_lasts.size(), handle.get_stream()); - raft::update_device(d_edge_partition_major_range_firsts.data(), - h_edge_partition_major_range_firsts.data(), - h_edge_partition_major_range_firsts.size(), - handle.get_stream()); - raft::update_device(d_edge_partition_major_range_lasts.data(), - h_edge_partition_major_range_lasts.data(), - h_edge_partition_major_range_lasts.size(), - handle.get_stream()); - - auto num_invalid_pairs = thrust::count_if( - handle.get_thrust_policy(), - vertex_pair_first, - vertex_pair_last, - is_invalid_input_vertex_pair_t{ - graph_view.number_of_vertices(), - raft::device_span(d_edge_partition_major_range_firsts.begin(), - d_edge_partition_major_range_firsts.end()), - raft::device_span(d_edge_partition_major_range_lasts.begin(), - d_edge_partition_major_range_lasts.end()), - edge_partition_minor_range_first, - edge_partition_minor_range_last}); - if constexpr (GraphViewType::is_multi_gpu) { - auto& comm = handle.get_comms(); - num_invalid_pairs = - host_scalar_allreduce(comm, num_invalid_pairs, raft::comms::op_t::SUM, handle.get_stream()); - } - - return num_invalid_pairs; -} - // In multi-GPU, the first element of every vertex pair in [vertex_pair_first, vertex_pair) should // be within the valid edge partition major range assigned to this process and the second element // should be within the valid edge partition minor range assigned to this process. 
diff --git a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh index 201c08325d7..469bfcb4e47 100644 --- a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh +++ b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/prims/transform_e.cuh b/cpp/src/prims/transform_e.cuh index c6623621d24..93a2d040b60 100644 --- a/cpp/src/prims/transform_e.cuh +++ b/cpp/src/prims/transform_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -426,28 +426,15 @@ void transform_e(raft::handle_t const& handle, edge_first + edge_partition_offsets[i + 1], [edge_partition, edge_partition_e_mask] __device__(thrust::tuple edge) { - auto major = thrust::get<0>(edge); - auto minor = thrust::get<1>(edge); - vertex_t major_idx{}; - auto major_hypersparse_first = edge_partition.major_hypersparse_first(); - if (major_hypersparse_first) { - if (major < *major_hypersparse_first) { - major_idx = edge_partition.major_offset_from_major_nocheck(major); - } else { - auto major_hypersparse_idx = - edge_partition.major_hypersparse_idx_from_major_nocheck(major); - if (!major_hypersparse_idx) { return true; } - major_idx = - edge_partition.major_offset_from_major_nocheck(*major_hypersparse_first) + - *major_hypersparse_idx; - } - } else { - major_idx = edge_partition.major_offset_from_major_nocheck(major); - } + auto major = thrust::get<0>(edge); + auto minor = thrust::get<1>(edge); + auto major_idx = edge_partition.major_idx_from_major_nocheck(major); + if (!major_idx) { return true; } vertex_t const* indices{nullptr}; edge_t edge_offset{}; edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_idx); + thrust::tie(indices, edge_offset, local_degree) = + edge_partition.local_edges(*major_idx); auto lower_it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor); if (*lower_it != minor) { return true; } @@ -494,24 +481,16 @@ void transform_e(raft::handle_t const& handle, auto major = thrust::get<0>(edge); auto minor = thrust::get<1>(edge); - auto major_hypersparse_first = edge_partition.major_hypersparse_first(); - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - vertex_t major_idx{major_offset}; - - if ((major_hypersparse_first) && (major >= *major_hypersparse_first)) { - auto major_hypersparse_idx = - edge_partition.major_hypersparse_idx_from_major_nocheck(major); - assert(major_hypersparse_idx); - major_idx = 
edge_partition.major_offset_from_major_nocheck(*major_hypersparse_first) + - *major_hypersparse_idx; - } + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + auto major_idx = edge_partition.major_idx_from_major_nocheck(major); + assert(major_idx); auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); vertex_t const* indices{nullptr}; edge_t edge_offset{}; edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_idx); + thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(*major_idx); auto lower_it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor); auto upper_it = thrust::upper_bound(thrust::seq, lower_it, indices + local_degree, minor); diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index da0ecc991df..7928c61cf7b 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -414,6 +415,59 @@ edge_t count_edge_partition_multi_edges( } } +template +std::tuple, std::vector> +compute_edge_indices_and_edge_partition_offsets( + raft::handle_t const& handle, + graph_view_t const& graph_view, + raft::device_span edge_majors, + raft::device_span edge_minors) +{ + auto edge_first = thrust::make_zip_iterator(edge_majors.begin(), edge_minors.begin()); + + rmm::device_uvector edge_indices(edge_majors.size(), handle.get_stream()); + thrust::sequence(handle.get_thrust_policy(), edge_indices.begin(), edge_indices.end(), size_t{0}); + thrust::sort(handle.get_thrust_policy(), + edge_indices.begin(), + edge_indices.end(), + [edge_first] __device__(size_t lhs, size_t rhs) { + return *(edge_first + lhs) < *(edge_first + rhs); + }); + + std::vector h_major_range_lasts(graph_view.number_of_local_edge_partitions()); + for (size_t i = 0; i < h_major_range_lasts.size(); ++i) { + if constexpr (store_transposed) { + h_major_range_lasts[i] = graph_view.local_edge_partition_dst_range_last(i); + } else { + h_major_range_lasts[i] = graph_view.local_edge_partition_src_range_last(i); + } + } + rmm::device_uvector d_major_range_lasts(h_major_range_lasts.size(), handle.get_stream()); + raft::update_device(d_major_range_lasts.data(), + h_major_range_lasts.data(), + h_major_range_lasts.size(), + handle.get_stream()); + rmm::device_uvector d_lower_bounds(d_major_range_lasts.size(), handle.get_stream()); + auto major_first = edge_majors.begin(); + auto sorted_major_first = thrust::make_transform_iterator( + edge_indices.begin(), + cugraph::detail::indirection_t{major_first}); + thrust::lower_bound(handle.get_thrust_policy(), + sorted_major_first, + sorted_major_first + edge_indices.size(), + d_major_range_lasts.begin(), + d_major_range_lasts.end(), + d_lower_bounds.begin()); + std::vector edge_partition_offsets(d_lower_bounds.size() + 1, 0); + raft::update_host(edge_partition_offsets.data() + 1, + 
d_lower_bounds.data(), + d_lower_bounds.size(), + handle.get_stream()); + handle.sync_stream(); + + return std::make_tuple(std::move(edge_indices), edge_partition_offsets); +} + } // namespace template @@ -751,4 +805,293 @@ edge_t graph_view_tlocal_edge_partition_segment_offsets()); } +template +rmm::device_uvector +graph_view_t>::has_edge( + raft::handle_t const& handle, + raft::device_span edge_srcs, + raft::device_span edge_dsts, + bool do_expensive_check) +{ + CUGRAPH_EXPECTS( + edge_srcs.size() == edge_dsts.size(), + "Invalid input arguments: edge_srcs.size() does not coincide with edge_dsts.size()."); + + auto edge_first = + thrust::make_zip_iterator(store_transposed ? edge_dsts.begin() : edge_srcs.begin(), + store_transposed ? edge_srcs.begin() : edge_dsts.begin()); + + if (do_expensive_check) { + auto num_invalids = + detail::count_invalid_vertex_pairs(handle, *this, edge_first, edge_first + edge_srcs.size()); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: there are invalid edge (src, dst) pairs."); + } + + auto [edge_indices, edge_partition_offsets] = + compute_edge_indices_and_edge_partition_offsets(handle, + *this, + store_transposed ? edge_dsts : edge_srcs, + store_transposed ? edge_srcs : edge_dsts); + + auto edge_mask_view = this->edge_mask_view(); + + auto sorted_edge_first = thrust::make_transform_iterator( + edge_indices.begin(), cugraph::detail::indirection_t{edge_first}); + rmm::device_uvector ret(edge_srcs.size(), handle.get_stream()); + + for (size_t i = 0; i < this->number_of_local_edge_partitions(); ++i) { + auto edge_partition = + edge_partition_device_view_t(this->local_edge_partition_view(i)); + auto edge_partition_e_mask = + edge_mask_view + ? 
thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, i) + : thrust::nullopt; + thrust::transform(handle.get_thrust_policy(), + sorted_edge_first + edge_partition_offsets[i], + sorted_edge_first + edge_partition_offsets[i + 1], + thrust::make_permutation_iterator( + ret.begin(), edge_indices.begin() + edge_partition_offsets[i]), + [edge_partition, edge_partition_e_mask] __device__(auto e) { + auto major = thrust::get<0>(e); + auto minor = thrust::get<1>(e); + auto major_idx = edge_partition.major_idx_from_major_nocheck(major); + if (major_idx) { + vertex_t const* indices{nullptr}; + edge_t local_edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = + edge_partition.local_edges(*major_idx); + auto it = thrust::lower_bound( + thrust::seq, indices, indices + local_degree, minor); + if ((it != indices + local_degree) && *it == minor) { + if (edge_partition_e_mask) { + return (*edge_partition_e_mask) + .get(local_edge_offset + thrust::distance(indices, it)); + } else { + return true; + } + } else { + return false; + } + } else { + return false; + } + }); + } + + return ret; +} + +template +rmm::device_uvector +graph_view_t>::has_edge( + raft::handle_t const& handle, + raft::device_span edge_srcs, + raft::device_span edge_dsts, + bool do_expensive_check) +{ + CUGRAPH_EXPECTS( + edge_srcs.size() == edge_dsts.size(), + "Invalid input arguments: edge_srcs.size() does not coincide with edge_dsts.size()."); + + auto edge_first = + thrust::make_zip_iterator(store_transposed ? edge_dsts.begin() : edge_srcs.begin(), + store_transposed ? 
edge_srcs.begin() : edge_dsts.begin()); + + if (do_expensive_check) { + auto num_invalids = + detail::count_invalid_vertex_pairs(handle, *this, edge_first, edge_first + edge_srcs.size()); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: there are invalid edge (src, dst) pairs."); + } + + auto edge_mask_view = this->edge_mask_view(); + + rmm::device_uvector ret(edge_srcs.size(), handle.get_stream()); + + auto edge_partition = + edge_partition_device_view_t(this->local_edge_partition_view()); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, 0) + : thrust::nullopt; + thrust::transform( + handle.get_thrust_policy(), + edge_first, + edge_first + edge_srcs.size(), + ret.begin(), + [edge_partition, edge_partition_e_mask] __device__(auto e) { + auto major = thrust::get<0>(e); + auto minor = thrust::get<1>(e); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t local_edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = + edge_partition.local_edges(major_offset); + auto it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor); + if ((it != indices + local_degree) && *it == minor) { + if (edge_partition_e_mask) { + return (*edge_partition_e_mask).get(local_edge_offset + thrust::distance(indices, it)); + } else { + return true; + } + } else { + return false; + } + }); + + return ret; +} + +template +rmm::device_uvector +graph_view_t>:: + compute_multiplicity(raft::handle_t const& handle, + raft::device_span edge_srcs, + raft::device_span edge_dsts, + bool do_expensive_check) +{ + CUGRAPH_EXPECTS(this->is_multigraph(), "Use has_edge() instead for non-multigraphs."); + CUGRAPH_EXPECTS( + edge_srcs.size() == edge_dsts.size(), + "Invalid input arguments: edge_srcs.size() does not coincide with edge_dsts.size()."); + + auto edge_first 
= + thrust::make_zip_iterator(store_transposed ? edge_dsts.begin() : edge_srcs.begin(), + store_transposed ? edge_srcs.begin() : edge_dsts.begin()); + + if (do_expensive_check) { + auto num_invalids = + detail::count_invalid_vertex_pairs(handle, *this, edge_first, edge_first + edge_srcs.size()); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: there are invalid edge (src, dst) pairs."); + } + + auto [edge_indices, edge_partition_offsets] = + compute_edge_indices_and_edge_partition_offsets(handle, + *this, + store_transposed ? edge_dsts : edge_srcs, + store_transposed ? edge_srcs : edge_dsts); + + auto edge_mask_view = this->edge_mask_view(); + + auto sorted_edge_first = thrust::make_transform_iterator( + edge_indices.begin(), cugraph::detail::indirection_t{edge_first}); + rmm::device_uvector ret(edge_srcs.size(), handle.get_stream()); + + for (size_t i = 0; i < this->number_of_local_edge_partitions(); ++i) { + auto edge_partition = + edge_partition_device_view_t(this->local_edge_partition_view(i)); + auto edge_partition_e_mask = + edge_mask_view + ? 
thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, i) + : thrust::nullopt; + thrust::transform( + handle.get_thrust_policy(), + sorted_edge_first + edge_partition_offsets[i], + sorted_edge_first + edge_partition_offsets[i + 1], + thrust::make_permutation_iterator(ret.begin(), + edge_indices.begin() + edge_partition_offsets[i]), + [edge_partition, edge_partition_e_mask] __device__(auto e) { + auto major = thrust::get<0>(e); + auto minor = thrust::get<1>(e); + auto major_idx = edge_partition.major_idx_from_major_nocheck(major); + if (major_idx) { + vertex_t const* indices{nullptr}; + edge_t local_edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = + edge_partition.local_edges(*major_idx); + auto lower_it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor); + auto upper_it = thrust::upper_bound(thrust::seq, indices, indices + local_degree, minor); + auto multiplicity = static_cast(thrust::distance(lower_it, upper_it)); + if (edge_partition_e_mask && (multiplicity > 0)) { + multiplicity = static_cast(detail::count_set_bits( + (*edge_partition_e_mask).value_first(), + static_cast(local_edge_offset + thrust::distance(indices, lower_it)), + static_cast(multiplicity))); + } + return multiplicity; + } else { + return edge_t{0}; + } + }); + } + + return ret; +} + +template +rmm::device_uvector +graph_view_t>:: + compute_multiplicity(raft::handle_t const& handle, + raft::device_span edge_srcs, + raft::device_span edge_dsts, + bool do_expensive_check) +{ + CUGRAPH_EXPECTS(this->is_multigraph(), "Use has_edge() instead for non-multigraphs."); + CUGRAPH_EXPECTS( + edge_srcs.size() == edge_dsts.size(), + "Invalid input arguments: edge_srcs.size() does not coincide with edge_dsts.size()."); + + auto edge_first = + thrust::make_zip_iterator(store_transposed ? edge_dsts.begin() : edge_srcs.begin(), + store_transposed ? 
edge_srcs.begin() : edge_dsts.begin()); + + if (do_expensive_check) { + auto num_invalids = + detail::count_invalid_vertex_pairs(handle, *this, edge_first, edge_first + edge_srcs.size()); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: there are invalid edge (src, dst) pairs."); + } + + auto edge_mask_view = this->edge_mask_view(); + + rmm::device_uvector ret(edge_srcs.size(), handle.get_stream()); + + auto edge_partition = + edge_partition_device_view_t(this->local_edge_partition_view()); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, 0) + : thrust::nullopt; + thrust::transform( + handle.get_thrust_policy(), + edge_first, + edge_first + edge_srcs.size(), + ret.begin(), + [edge_partition, edge_partition_e_mask] __device__(auto e) { + auto major = thrust::get<0>(e); + auto minor = thrust::get<1>(e); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t local_edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = + edge_partition.local_edges(major_offset); + auto lower_it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor); + auto upper_it = thrust::upper_bound(thrust::seq, indices, indices + local_degree, minor); + auto multiplicity = static_cast(thrust::distance(lower_it, upper_it)); + if (edge_partition_e_mask && (multiplicity > 0)) { + multiplicity = static_cast(detail::count_set_bits( + (*edge_partition_e_mask).value_first(), + static_cast(local_edge_offset + thrust::distance(indices, lower_it)), + static_cast(multiplicity))); + } + return multiplicity; + }); + + return ret; +} + } // namespace cugraph diff --git a/cpp/src/utilities/error_check_utils.cuh b/cpp/src/utilities/error_check_utils.cuh new file mode 100644 index 00000000000..baaf513d93d --- /dev/null +++ b/cpp/src/utilities/error_check_utils.cuh @@ -0,0 
+1,137 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include + +namespace cugraph { +namespace detail { + +// check vertices in the pair are in [0, num_vertices) and belongs to one of the local edge +// partitions. +template +struct is_invalid_input_vertex_pair_t { + vertex_t num_vertices{}; + raft::device_span edge_partition_major_range_firsts{}; + raft::device_span edge_partition_major_range_lasts{}; + vertex_t edge_partition_minor_range_first{}; + vertex_t edge_partition_minor_range_last{}; + + __device__ bool operator()(thrust::tuple pair) const + { + auto major = thrust::get<0>(pair); + auto minor = thrust::get<1>(pair); + if (!is_valid_vertex(num_vertices, major) || !is_valid_vertex(num_vertices, minor)) { + return true; + } + auto it = thrust::upper_bound(thrust::seq, + edge_partition_major_range_lasts.begin(), + edge_partition_major_range_lasts.end(), + major); + if (it == edge_partition_major_range_lasts.end()) { return true; } + auto edge_partition_idx = + static_cast(thrust::distance(edge_partition_major_range_lasts.begin(), it)); + if (major < edge_partition_major_range_firsts[edge_partition_idx]) { return true; } + return (minor < edge_partition_minor_range_first) || (minor >= edge_partition_minor_range_last); + } +}; + +template +size_t 
count_invalid_vertex_pairs(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexPairIterator vertex_pair_first, + VertexPairIterator vertex_pair_last) +{ + using vertex_t = typename GraphViewType::vertex_type; + + std::vector h_edge_partition_major_range_firsts( + graph_view.number_of_local_edge_partitions()); + std::vector h_edge_partition_major_range_lasts( + h_edge_partition_major_range_firsts.size()); + vertex_t edge_partition_minor_range_first{}; + vertex_t edge_partition_minor_range_last{}; + if constexpr (GraphViewType::is_multi_gpu) { + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i++) { + if constexpr (GraphViewType::is_storage_transposed) { + h_edge_partition_major_range_firsts[i] = graph_view.local_edge_partition_dst_range_first(i); + h_edge_partition_major_range_lasts[i] = graph_view.local_edge_partition_dst_range_last(i); + } else { + h_edge_partition_major_range_firsts[i] = graph_view.local_edge_partition_src_range_first(i); + h_edge_partition_major_range_lasts[i] = graph_view.local_edge_partition_src_range_last(i); + } + } + if constexpr (GraphViewType::is_storage_transposed) { + edge_partition_minor_range_first = graph_view.local_edge_partition_src_range_first(); + edge_partition_minor_range_last = graph_view.local_edge_partition_src_range_last(); + } else { + edge_partition_minor_range_first = graph_view.local_edge_partition_dst_range_first(); + edge_partition_minor_range_last = graph_view.local_edge_partition_dst_range_last(); + } + } else { + h_edge_partition_major_range_firsts[0] = vertex_t{0}; + h_edge_partition_major_range_lasts[0] = graph_view.number_of_vertices(); + edge_partition_minor_range_first = vertex_t{0}; + edge_partition_minor_range_last = graph_view.number_of_vertices(); + } + rmm::device_uvector d_edge_partition_major_range_firsts( + h_edge_partition_major_range_firsts.size(), handle.get_stream()); + rmm::device_uvector d_edge_partition_major_range_lasts( + 
h_edge_partition_major_range_lasts.size(), handle.get_stream()); + raft::update_device(d_edge_partition_major_range_firsts.data(), + h_edge_partition_major_range_firsts.data(), + h_edge_partition_major_range_firsts.size(), + handle.get_stream()); + raft::update_device(d_edge_partition_major_range_lasts.data(), + h_edge_partition_major_range_lasts.data(), + h_edge_partition_major_range_lasts.size(), + handle.get_stream()); + + auto num_invalid_pairs = thrust::count_if( + handle.get_thrust_policy(), + vertex_pair_first, + vertex_pair_last, + is_invalid_input_vertex_pair_t{ + graph_view.number_of_vertices(), + raft::device_span(d_edge_partition_major_range_firsts.begin(), + d_edge_partition_major_range_firsts.end()), + raft::device_span(d_edge_partition_major_range_lasts.begin(), + d_edge_partition_major_range_lasts.end()), + edge_partition_minor_range_first, + edge_partition_minor_range_last}); + if constexpr (GraphViewType::is_multi_gpu) { + auto& comm = handle.get_comms(); + num_invalid_pairs = + host_scalar_allreduce(comm, num_invalid_pairs, raft::comms::op_t::SUM, handle.get_stream()); + } + + return num_invalid_pairs; +} + +} // namespace detail +} // namespace cugraph diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d9d2f677abc..3df979fe5c2 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -313,6 +313,11 @@ ConfigureTest(DEGREE_TEST structure/degree_test.cpp) ConfigureTest(COUNT_SELF_LOOPS_AND_MULTI_EDGES_TEST "structure/count_self_loops_and_multi_edges_test.cpp") +################################################################################################### +# - Query edge existence and multiplicity tests --------------------------------------------------- +ConfigureTest(HAS_EDGE_AND_COMPUTE_MULTIPLICITY_TEST + "structure/has_edge_and_compute_multiplicity_test.cpp") + ################################################################################################### # - Coarsening tests 
------------------------------------------------------------------------------ ConfigureTest(COARSEN_GRAPH_TEST structure/coarsen_graph_test.cpp) @@ -479,6 +484,11 @@ if(BUILD_CUGRAPH_MG_TESTS) ConfigureTestMG(MG_COUNT_SELF_LOOPS_AND_MULTI_EDGES_TEST "structure/mg_count_self_loops_and_multi_edges_test.cpp") + ############################################################################################### + # - MG Query edge existence and multiplicity tests -------------------------------------------- + ConfigureTestMG(MG_HAS_EDGE_AND_COMPUTE_MULTIPLICITY_TEST + "structure/mg_has_edge_and_compute_multiplicity_test.cpp") + ############################################################################################### # - MG PAGERANK tests ------------------------------------------------------------------------- ConfigureTestMG(MG_PAGERANK_TEST link_analysis/mg_pagerank_test.cpp) diff --git a/cpp/tests/structure/count_self_loops_and_multi_edges_test.cpp b/cpp/tests/structure/count_self_loops_and_multi_edges_test.cpp index 68828d5eee1..b7f1dce2023 100644 --- a/cpp/tests/structure/count_self_loops_and_multi_edges_test.cpp +++ b/cpp/tests/structure/count_self_loops_and_multi_edges_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -208,10 +208,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_CountSelfLoopsAndMultiEdges_File, ::testing::Combine( // enable correctness checks - ::testing::Values(CountSelfLoopsAndMultiEdges_Usecase{}, - CountSelfLoopsAndMultiEdges_Usecase{}, - CountSelfLoopsAndMultiEdges_Usecase{}, - CountSelfLoopsAndMultiEdges_Usecase{}), + ::testing::Values(CountSelfLoopsAndMultiEdges_Usecase{}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); @@ -220,10 +217,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_CountSelfLoopsAndMultiEdges_Rmat, ::testing::Combine( // enable correctness checks - ::testing::Values(CountSelfLoopsAndMultiEdges_Usecase{}, - CountSelfLoopsAndMultiEdges_Usecase{}, - CountSelfLoopsAndMultiEdges_Usecase{}, - CountSelfLoopsAndMultiEdges_Usecase{}), + ::testing::Values(CountSelfLoopsAndMultiEdges_Usecase{}), ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); INSTANTIATE_TEST_SUITE_P( @@ -235,10 +229,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_CountSelfLoopsAndMultiEdges_Rmat, ::testing::Combine( // disable correctness checks for large graphs - ::testing::Values(CountSelfLoopsAndMultiEdges_Usecase{false}, - CountSelfLoopsAndMultiEdges_Usecase{false}, - CountSelfLoopsAndMultiEdges_Usecase{false}, - CountSelfLoopsAndMultiEdges_Usecase{false}), + ::testing::Values(CountSelfLoopsAndMultiEdges_Usecase{false}), ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/structure/has_edge_and_compute_multiplicity_test.cpp b/cpp/tests/structure/has_edge_and_compute_multiplicity_test.cpp new file mode 100644 index 00000000000..3ad6953ca03 --- /dev/null +++ b/cpp/tests/structure/has_edge_and_compute_multiplicity_test.cpp @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +struct HasEdgeAndComputeMultiplicity_Usecase { + size_t num_vertex_pairs{}; + bool check_correctness{true}; +}; + +template +class Tests_HasEdgeAndComputeMultiplicity + : public ::testing::TestWithParam< + std::tuple> { + public: + Tests_HasEdgeAndComputeMultiplicity() {} + + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test( + HasEdgeAndComputeMultiplicity_Usecase const& has_edge_and_compute_multiplicity_usecase, + input_usecase_t const& input_usecase) + { + using weight_t = float; + + constexpr bool renumber = true; + + raft::handle_t handle{}; + HighResTimer hr_timer{}; + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Construct graph"); + } + + cugraph::graph_t graph(handle); + std::optional> d_renumber_map_labels{std::nullopt}; + std::tie(graph, std::ignore, d_renumber_map_labels) = + cugraph::test::construct_graph( + handle, input_usecase, false, renumber); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + 
hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + auto graph_view = graph.view(); + + raft::random::RngState rng_state(0); + rmm::device_uvector edge_srcs( + has_edge_and_compute_multiplicity_usecase.num_vertex_pairs, handle.get_stream()); + rmm::device_uvector edge_dsts(edge_srcs.size(), handle.get_stream()); + cugraph::detail::uniform_random_fill(handle.get_stream(), + edge_srcs.data(), + edge_srcs.size(), + vertex_t{0}, + graph_view.number_of_vertices(), + rng_state); + cugraph::detail::uniform_random_fill(handle.get_stream(), + edge_dsts.data(), + edge_dsts.size(), + vertex_t{0}, + graph_view.number_of_vertices(), + rng_state); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Querying edge existence"); + } + + auto edge_exists = + graph_view.has_edge(handle, + raft::device_span(edge_srcs.data(), edge_srcs.size()), + raft::device_span(edge_dsts.data(), edge_dsts.size())); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Computing multiplicity"); + } + + auto edge_multiplicities = graph_view.compute_multiplicity( + handle, + raft::device_span(edge_srcs.data(), edge_srcs.size()), + raft::device_span(edge_dsts.data(), edge_dsts.size())); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (has_edge_and_compute_multiplicity_usecase.check_correctness) { + cugraph::graph_t unrenumbered_graph(handle); + if (renumber) { + std::tie(unrenumbered_graph, std::ignore, std::ignore) = + cugraph::test::construct_graph( + handle, input_usecase, false, false); + } + auto 
unrenumbered_graph_view = renumber ? unrenumbered_graph.view() : graph_view; + + std::vector h_offsets = cugraph::test::to_host( + handle, unrenumbered_graph_view.local_edge_partition_view().offsets()); + std::vector h_indices = cugraph::test::to_host( + handle, unrenumbered_graph_view.local_edge_partition_view().indices()); + + rmm::device_uvector d_unrenumbered_edge_srcs(edge_srcs.size(), handle.get_stream()); + rmm::device_uvector d_unrenumbered_edge_dsts(edge_dsts.size(), handle.get_stream()); + raft::copy_async( + d_unrenumbered_edge_srcs.data(), edge_srcs.data(), edge_srcs.size(), handle.get_stream()); + raft::copy_async( + d_unrenumbered_edge_dsts.data(), edge_dsts.data(), edge_dsts.size(), handle.get_stream()); + if (renumber) { + cugraph::unrenumber_local_int_vertices(handle, + d_unrenumbered_edge_srcs.data(), + d_unrenumbered_edge_srcs.size(), + (*d_renumber_map_labels).data(), + vertex_t{0}, + graph_view.number_of_vertices()); + cugraph::unrenumber_local_int_vertices(handle, + d_unrenumbered_edge_dsts.data(), + d_unrenumbered_edge_dsts.size(), + (*d_renumber_map_labels).data(), + vertex_t{0}, + graph_view.number_of_vertices()); + } + auto h_unrenumbered_edge_srcs = cugraph::test::to_host(handle, d_unrenumbered_edge_srcs); + auto h_unrenumbered_edge_dsts = cugraph::test::to_host(handle, d_unrenumbered_edge_dsts); + + auto h_cugraph_edge_exists = cugraph::test::to_host(handle, edge_exists); + auto h_cugraph_edge_multiplicities = cugraph::test::to_host(handle, edge_multiplicities); + std::vector h_reference_edge_exists(edge_srcs.size()); + std::vector h_reference_edge_multiplicities(edge_srcs.size()); + for (size_t i = 0; i < edge_srcs.size(); ++i) { + auto src = h_unrenumbered_edge_srcs[i]; + auto dst = h_unrenumbered_edge_dsts[i]; + auto major = store_transposed ? dst : src; + auto minor = store_transposed ? 
src : dst; + auto lower_it = std::lower_bound( + h_indices.begin() + h_offsets[major], h_indices.begin() + h_offsets[major + 1], minor); + auto upper_it = std::upper_bound( + h_indices.begin() + h_offsets[major], h_indices.begin() + h_offsets[major + 1], minor); + auto multiplicity = static_cast(std::distance(lower_it, upper_it)); + h_reference_edge_exists[i] = multiplicity > 0 ? true : false; + h_reference_edge_multiplicities[i] = multiplicity; + } + + ASSERT_TRUE(std::equal(h_reference_edge_exists.begin(), + h_reference_edge_exists.end(), + h_cugraph_edge_exists.begin())) + << "has_edge() return values do not match with the reference values."; + ASSERT_TRUE(std::equal(h_reference_edge_multiplicities.begin(), + h_reference_edge_multiplicities.end(), + h_cugraph_edge_multiplicities.begin())) + << "compute_multiplicity() return values do not match with the reference values."; + } + } +}; + +using Tests_HasEdgeAndComputeMultiplicity_File = + Tests_HasEdgeAndComputeMultiplicity; +using Tests_HasEdgeAndComputeMultiplicity_Rmat = + Tests_HasEdgeAndComputeMultiplicity; + +TEST_P(Tests_HasEdgeAndComputeMultiplicity_File, CheckInt32Int32FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_HasEdgeAndComputeMultiplicity_Rmat, CheckInt32Int32FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +TEST_P(Tests_HasEdgeAndComputeMultiplicity_Rmat, CheckInt32Int64FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +TEST_P(Tests_HasEdgeAndComputeMultiplicity_Rmat, CheckInt64Int64FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + 
+TEST_P(Tests_HasEdgeAndComputeMultiplicity_File, CheckInt32Int32FloatTransposeTrue) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_HasEdgeAndComputeMultiplicity_Rmat, CheckInt32Int32FloatTransposeTrue) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_HasEdgeAndComputeMultiplicity_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values(HasEdgeAndComputeMultiplicity_Usecase{1024 * 128}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_HasEdgeAndComputeMultiplicity_Rmat, + ::testing::Combine( + // enable correctness checks + ::testing::Values(HasEdgeAndComputeMultiplicity_Usecase{1024 * 128}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_HasEdgeAndComputeMultiplicity_Rmat, + ::testing::Combine( + // disable correctness checks for large graphs + ::testing::Values(HasEdgeAndComputeMultiplicity_Usecase{1024 * 1024 * 128, false}), + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp b/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp new file mode 100644 index 
00000000000..8079de7ebfe --- /dev/null +++ b/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +struct HasEdgeAndComputeMultiplicity_Usecase { + size_t num_vertex_pairs{}; + bool check_correctness{true}; +}; + +template +class Tests_MGHasEdgeAndComputeMultiplicity + : public ::testing::TestWithParam< + std::tuple> { + public: + Tests_MGHasEdgeAndComputeMultiplicity() {} + + static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); } + + static void TearDownTestCase() { handle_.reset(); } + + virtual void SetUp() {} + virtual void TearDown() {} + + // Compare the results of running has_edge & compute_multiplicity on multiple GPUs to that of + // a single-GPU run + template + void run_current_test( + HasEdgeAndComputeMultiplicity_Usecase const& has_edge_and_compute_multiplicity_usecase, + input_usecase_t const& input_usecase) + { + using weight_t = float; + using edge_type_id_t = int32_t; + + HighResTimer hr_timer{}; + + auto const comm_rank = handle_->get_comms().get_rank(); + auto const comm_size = handle_->get_comms().get_size(); + + // 1. 
create MG graph + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.start("MG Construct graph"); + } + + cugraph::graph_t mg_graph(*handle_); + std::optional> mg_renumber_map{std::nullopt}; + std::tie(mg_graph, std::ignore, mg_renumber_map) = + cugraph::test::construct_graph( + *handle_, input_usecase, false, true); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + auto mg_graph_view = mg_graph.view(); + + // 2. create an edge list to query + + raft::random::RngState rng_state(comm_rank); + size_t num_vertex_pairs_this_gpu = + (has_edge_and_compute_multiplicity_usecase.num_vertex_pairs / comm_size) + + ((comm_rank < has_edge_and_compute_multiplicity_usecase.num_vertex_pairs % comm_size) + ? size_t{1} + : size_t{0}); + rmm::device_uvector d_mg_edge_srcs(num_vertex_pairs_this_gpu, handle_->get_stream()); + rmm::device_uvector d_mg_edge_dsts(d_mg_edge_srcs.size(), handle_->get_stream()); + cugraph::detail::uniform_random_fill(handle_->get_stream(), + d_mg_edge_srcs.data(), + d_mg_edge_srcs.size(), + vertex_t{0}, + mg_graph_view.number_of_vertices(), + rng_state); + cugraph::detail::uniform_random_fill(handle_->get_stream(), + d_mg_edge_dsts.data(), + d_mg_edge_dsts.size(), + vertex_t{0}, + mg_graph_view.number_of_vertices(), + rng_state); + + std::tie(store_transposed ? d_mg_edge_dsts : d_mg_edge_srcs, + store_transposed ? d_mg_edge_srcs : d_mg_edge_dsts, + std::ignore, + std::ignore, + std::ignore) = + cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< + vertex_t, + edge_t, + weight_t, + edge_type_id_t>(*handle_, + std::move(store_transposed ? d_mg_edge_dsts : d_mg_edge_srcs), + std::move(store_transposed ? 
d_mg_edge_srcs : d_mg_edge_dsts), + std::nullopt, + std::nullopt, + std::nullopt, + mg_graph_view.vertex_partition_range_lasts()); + + // 3. run MG has_edge & compute_multiplicity + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.start("MG Querying edge existence"); + } + + auto d_mg_edge_exists = mg_graph_view.has_edge( + *handle_, + raft::device_span(d_mg_edge_srcs.data(), d_mg_edge_srcs.size()), + raft::device_span(d_mg_edge_dsts.data(), d_mg_edge_dsts.size())); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.start("MG Computing multiplicity"); + } + + auto d_mg_edge_multiplicities = mg_graph_view.compute_multiplicity( + *handle_, + raft::device_span(d_mg_edge_srcs.data(), d_mg_edge_srcs.size()), + raft::device_span(d_mg_edge_dsts.data(), d_mg_edge_dsts.size())); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + // 4. compare SG & MG results + + if (has_edge_and_compute_multiplicity_usecase.check_correctness) { + // 4-1. 
aggregate MG results + + cugraph::unrenumber_int_vertices( + *handle_, + d_mg_edge_srcs.data(), + d_mg_edge_srcs.size(), + (*mg_renumber_map).data(), + mg_graph_view.vertex_partition_range_lasts()); + cugraph::unrenumber_int_vertices( + *handle_, + d_mg_edge_dsts.data(), + d_mg_edge_dsts.size(), + (*mg_renumber_map).data(), + mg_graph_view.vertex_partition_range_lasts()); + + auto d_mg_aggregate_edge_srcs = cugraph::test::device_gatherv( + *handle_, raft::device_span(d_mg_edge_srcs.data(), d_mg_edge_srcs.size())); + auto d_mg_aggregate_edge_dsts = cugraph::test::device_gatherv( + *handle_, raft::device_span(d_mg_edge_dsts.data(), d_mg_edge_dsts.size())); + auto d_mg_aggregate_edge_exists = cugraph::test::device_gatherv( + *handle_, raft::device_span(d_mg_edge_exists.data(), d_mg_edge_exists.size())); + auto d_mg_aggregate_edge_multiplicities = cugraph::test::device_gatherv( + *handle_, + raft::device_span(d_mg_edge_multiplicities.data(), + d_mg_edge_multiplicities.size())); + + cugraph::graph_t sg_graph(*handle_); + std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( + *handle_, + mg_graph_view, + std::optional>{std::nullopt}, + std::make_optional>((*mg_renumber_map).data(), + (*mg_renumber_map).size()), + false); + + if (handle_->get_comms().get_rank() == 0) { + auto sg_graph_view = sg_graph.view(); + + // 4-2. run SG has_edge & compute_multiplicity + + auto d_sg_edge_exists = sg_graph_view.has_edge( + *handle_, + raft::device_span(d_mg_aggregate_edge_srcs.data(), + d_mg_aggregate_edge_srcs.size()), + raft::device_span(d_mg_aggregate_edge_dsts.data(), + d_mg_aggregate_edge_dsts.size())); + auto d_sg_edge_multiplicities = sg_graph_view.compute_multiplicity( + *handle_, + raft::device_span(d_mg_aggregate_edge_srcs.data(), + d_mg_aggregate_edge_srcs.size()), + raft::device_span(d_mg_aggregate_edge_dsts.data(), + d_mg_aggregate_edge_dsts.size())); + + // 4-3. 
compare + + auto h_mg_aggregate_edge_exists = + cugraph::test::to_host(*handle_, d_mg_aggregate_edge_exists); + auto h_mg_aggregate_edge_multiplicities = + cugraph::test::to_host(*handle_, d_mg_aggregate_edge_multiplicities); + auto h_sg_edge_exists = cugraph::test::to_host(*handle_, d_sg_edge_exists); + auto h_sg_edge_multiplicities = cugraph::test::to_host(*handle_, d_sg_edge_multiplicities); + + ASSERT_TRUE(std::equal(h_mg_aggregate_edge_exists.begin(), + h_mg_aggregate_edge_exists.end(), + h_sg_edge_exists.begin())); + ASSERT_TRUE(std::equal(h_mg_aggregate_edge_multiplicities.begin(), + h_mg_aggregate_edge_multiplicities.end(), + h_sg_edge_multiplicities.begin())); + } + } + } + + private: + static std::unique_ptr handle_; +}; + +template +std::unique_ptr Tests_MGHasEdgeAndComputeMultiplicity::handle_ = + nullptr; + +using Tests_MGHasEdgeAndComputeMultiplicity_File = + Tests_MGHasEdgeAndComputeMultiplicity; +using Tests_MGHasEdgeAndComputeMultiplicity_Rmat = + Tests_MGHasEdgeAndComputeMultiplicity; + +TEST_P(Tests_MGHasEdgeAndComputeMultiplicity_File, CheckInt32Int32FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_MGHasEdgeAndComputeMultiplicity_Rmat, CheckInt32Int32FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +TEST_P(Tests_MGHasEdgeAndComputeMultiplicity_Rmat, CheckInt32Int64FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +TEST_P(Tests_MGHasEdgeAndComputeMultiplicity_Rmat, CheckInt64Int64FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +TEST_P(Tests_MGHasEdgeAndComputeMultiplicity_File, CheckInt32Int32FloatTransposeTrue) +{ + auto 
param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_MGHasEdgeAndComputeMultiplicity_Rmat, CheckInt32Int32FloatTransposeTrue) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +INSTANTIATE_TEST_SUITE_P( + file_tests, + Tests_MGHasEdgeAndComputeMultiplicity_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values(HasEdgeAndComputeMultiplicity_Usecase{1024 * 128}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_tests, + Tests_MGHasEdgeAndComputeMultiplicity_Rmat, + ::testing::Combine( + ::testing::Values(HasEdgeAndComputeMultiplicity_Usecase{1024 * 128}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_MGHasEdgeAndComputeMultiplicity_Rmat, + ::testing::Combine( + ::testing::Values(HasEdgeAndComputeMultiplicity_Usecase{1024 * 1024 * 128, false}), + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/utilities/device_comm_wrapper.cu b/cpp/tests/utilities/device_comm_wrapper.cu index cfc65b5d741..50727394ad7 100644 --- a/cpp/tests/utilities/device_comm_wrapper.cu +++ b/cpp/tests/utilities/device_comm_wrapper.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,9 +40,10 @@ rmm::device_uvector device_gatherv(raft::handle_t const& handle, rmm::device_uvector gathered_v( is_root ? std::reduce(rx_sizes.begin(), rx_sizes.end()) : size_t{0}, handle.get_stream()); + using comm_datatype_t = std::conditional_t, uint8_t, T>; cugraph::device_gatherv(handle.get_comms(), - d_input.data(), - gathered_v.data(), + reinterpret_cast(d_input.data()), + reinterpret_cast(gathered_v.data()), d_input.size(), rx_sizes, rx_displs, @@ -64,9 +65,10 @@ rmm::device_uvector device_allgatherv(raft::handle_t const& handle, rmm::device_uvector gathered_v(std::reduce(rx_sizes.begin(), rx_sizes.end()), handle.get_stream()); + using comm_datatype_t = std::conditional_t, uint8_t, T>; cugraph::device_allgatherv(handle.get_comms(), - d_input.data(), - gathered_v.data(), + reinterpret_cast(d_input.data()), + reinterpret_cast(gathered_v.data()), rx_sizes, rx_displs, handle.get_stream()); @@ -76,6 +78,9 @@ rmm::device_uvector device_allgatherv(raft::handle_t const& handle, // explicit instantiation +template rmm::device_uvector device_gatherv(raft::handle_t const& handle, + raft::device_span d_input); + template rmm::device_uvector device_gatherv(raft::handle_t const& handle, raft::device_span d_input); @@ -91,6 +96,9 @@ template rmm::device_uvector device_gatherv(raft::handle_t const& handle, template rmm::device_uvector device_gatherv(raft::handle_t const& handle, raft::device_span d_input); +template rmm::device_uvector device_allgatherv(raft::handle_t const& handle, + raft::device_span d_input); + template rmm::device_uvector device_allgatherv(raft::handle_t const& handle, raft::device_span d_input); diff --git a/cpp/tests/utilities/test_utilities.hpp b/cpp/tests/utilities/test_utilities.hpp index 321a0536e02..3fa6ae089d3 100644 --- a/cpp/tests/utilities/test_utilities.hpp +++ b/cpp/tests/utilities/test_utilities.hpp @@ 
-1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -377,18 +377,24 @@ template std::vector to_host(raft::handle_t const& handle, raft::device_span data) { std::vector h_data(data.size()); - raft::update_host(h_data.data(), data.data(), data.size(), handle.get_stream()); - handle.sync_stream(); + if constexpr (std::is_same_v) { // std::vector stores values in a packed format + auto h_tmp = new bool[data.size()]; + raft::update_host(h_tmp, data.data(), data.size(), handle.get_stream()); + handle.sync_stream(); + std::transform( + h_tmp, h_tmp + data.size(), h_data.begin(), [](uint8_t v) { return static_cast(v); }); + delete[] h_tmp; + } else { + raft::update_host(h_data.data(), data.data(), data.size(), handle.get_stream()); + handle.sync_stream(); + } return h_data; } template std::vector to_host(raft::handle_t const& handle, rmm::device_uvector const& data) { - std::vector h_data(data.size()); - raft::update_host(h_data.data(), data.data(), data.size(), handle.get_stream()); - handle.sync_stream(); - return h_data; + return to_host(handle, raft::device_span(data.data(), data.size())); } template @@ -396,11 +402,7 @@ std::optional> to_host(raft::handle_t const& handle, std::optional> data) { std::optional> h_data{std::nullopt}; - if (data) { - h_data = std::vector((*data).size()); - raft::update_host((*h_data).data(), (*data).data(), (*data).size(), handle.get_stream()); - handle.sync_stream(); - } + if (data) { h_data = to_host(handle, *data); } return h_data; } @@ -410,9 +412,7 @@ std::optional> to_host(raft::handle_t const& handle, { std::optional> h_data{std::nullopt}; if (data) { - h_data = std::vector((*data).size()); - raft::update_host((*h_data).data(), (*data).data(), (*data).size(), handle.get_stream()); - handle.sync_stream(); + h_data = to_host(handle, 
raft::device_span((*data).data(), (*data).size())); + } + return h_data; + } @@ -430,8 +430,16 @@ template rmm::device_uvector to_device(raft::handle_t const& handle, std::vector const& data) { rmm::device_uvector d_data(data.size(), handle.get_stream()); - raft::update_device(d_data.data(), data.data(), data.size(), handle.get_stream()); - handle.sync_stream(); + if constexpr (std::is_same_v) { // std::vector stores values in a packed format + auto h_tmp = new bool[data.size()]; + std::copy(data.begin(), data.end(), h_tmp); + raft::update_device(d_data.data(), h_tmp, data.size(), handle.get_stream()); + handle.sync_stream(); + delete[] h_tmp; + } else { + raft::update_device(d_data.data(), data.data(), data.size(), handle.get_stream()); + handle.sync_stream(); + } return d_data; } @@ -453,11 +461,7 @@ std::optional> to_device(raft::handle_t const& handle, std::optional> const& data) { std::optional> d_data{std::nullopt}; - if (data) { - d_data = rmm::device_uvector(data->size(), handle.get_stream()); - raft::update_host(d_data->data(), data->data(), data->size(), handle.get_stream()); - handle.sync_stream(); - } + if (data) { d_data = to_device(handle, *data); } return d_data; } diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py index 8843e61ad89..cc4ce474f2d 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional, Tuple, Union +from typing import Optional, Union from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional @@ -29,7 +29,7 @@ class GATConv(BaseConv): Parameters ---------- - in_feats : int or tuple + in_feats : int or (int, int) Input feature size. A pair denotes feature sizes of source and destination nodes. out_feats : int @@ -92,7 +92,7 @@ class GATConv(BaseConv): def __init__( self, - in_feats: Union[int, Tuple[int, int]], + in_feats: Union[int, tuple[int, int]], out_feats: int, num_heads: int, feat_drop: float = 0.0, @@ -104,14 +104,19 @@ def __init__( bias: bool = True, ): super().__init__() + + if isinstance(in_feats, int): + self.in_feats_src = self.in_feats_dst = in_feats + else: + self.in_feats_src, self.in_feats_dst = in_feats self.in_feats = in_feats self.out_feats = out_feats - self.in_feats_src, self.in_feats_dst = dgl.utils.expand_as_pair(in_feats) self.num_heads = num_heads self.feat_drop = nn.Dropout(feat_drop) self.concat = concat self.edge_feats = edge_feats self.negative_slope = negative_slope + self.residual = residual self.allow_zero_in_degree = allow_zero_in_degree if isinstance(in_feats, int): @@ -126,28 +131,34 @@ def __init__( if edge_feats is not None: self.lin_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False) - self.attn_weights = nn.Parameter(torch.Tensor(3 * num_heads * out_feats)) + self.attn_weights = nn.Parameter(torch.empty(3 * num_heads * out_feats)) else: self.register_parameter("lin_edge", None) - self.attn_weights = nn.Parameter(torch.Tensor(2 * num_heads * out_feats)) + self.attn_weights = nn.Parameter(torch.empty(2 * num_heads * out_feats)) - if bias and concat: - self.bias = nn.Parameter(torch.Tensor(num_heads, out_feats)) - elif bias and not concat: - self.bias = nn.Parameter(torch.Tensor(out_feats)) + out_dim = num_heads * out_feats if concat else out_feats + if residual: + if self.in_feats_dst != out_dim: + self.lin_res = 
nn.Linear(self.in_feats_dst, out_dim, bias=bias) + else: + self.lin_res = nn.Identity() else: - self.register_buffer("bias", None) + self.register_buffer("lin_res", None) - self.residual = residual and self.in_feats_dst != out_feats * num_heads - if self.residual: - self.lin_res = nn.Linear( - self.in_feats_dst, num_heads * out_feats, bias=bias - ) + if bias and not isinstance(self.lin_res, nn.Linear): + if concat: + self.bias = nn.Parameter(torch.empty(num_heads, out_feats)) + else: + self.bias = nn.Parameter(torch.empty(out_feats)) else: - self.register_buffer("lin_res", None) + self.register_buffer("bias", None) self.reset_parameters() + def set_allow_zero_in_degree(self, set_value): + r"""Set allow_zero_in_degree flag.""" + self.allow_zero_in_degree = set_value + def reset_parameters(self): r"""Reinitialize learnable parameters.""" gain = nn.init.calculate_gain("relu") @@ -172,7 +183,7 @@ def reset_parameters(self): def forward( self, g: Union[SparseGraph, dgl.DGLHeteroGraph], - nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], efeat: Optional[torch.Tensor] = None, max_in_degree: Optional[int] = None, ) -> torch.Tensor: @@ -182,8 +193,10 @@ def forward( ---------- graph : DGLGraph or SparseGraph The graph. - nfeat : torch.Tensor - Input features of shape :math:`(N, D_{in})`. + nfeat : torch.Tensor or (torch.Tensor, torch.Tensor) + Node features. If given as a tuple, the two elements correspond to + the source and destination node features, respectively, in a + bipartite graph. efeat: torch.Tensor, optional Optional edge features. max_in_degree : int @@ -237,18 +250,17 @@ def forward( if bipartite: if not hasattr(self, "lin_src"): - raise RuntimeError( - f"{self.__class__.__name__}.in_feats must be a pair of " - f"integers to allow bipartite node features, but got " - f"{self.in_feats}." 
- ) - nfeat_src = self.lin_src(nfeat[0]) - nfeat_dst = self.lin_dst(nfeat[1]) + nfeat_src = self.lin(nfeat[0]) + nfeat_dst = self.lin(nfeat[1]) + else: + nfeat_src = self.lin_src(nfeat[0]) + nfeat_dst = self.lin_dst(nfeat[1]) else: if not hasattr(self, "lin"): raise RuntimeError( f"{self.__class__.__name__}.in_feats is expected to be an " - f"integer, but got {self.in_feats}." + f"integer when the graph is not bipartite, " + f"but got {self.in_feats}." ) nfeat = self.lin(nfeat) diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py index 209a5fe1a8d..6c78b4df0b8 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple, Union +from typing import Optional, Union from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional @@ -29,14 +29,11 @@ class GATv2Conv(BaseConv): Parameters ---------- - in_feats : int, or pair of ints - Input feature size; i.e, the number of dimensions of :math:`h_i^{(l)}`. - If the layer is to be applied to a unidirectional bipartite graph, `in_feats` - specifies the input feature size on both the source and destination nodes. - If a scalar is given, the source and destination node feature size - would take the same value. + in_feats : int or (int, int) + Input feature size. A pair denotes feature sizes of source and + destination nodes. out_feats : int - Output feature size; i.e, the number of dimensions of :math:`h_i^{(l+1)}`. 
+ Output feature size. num_heads : int Number of heads in Multi-Head Attention. feat_drop : float, optional @@ -58,17 +55,15 @@ class GATv2Conv(BaseConv): input graph. By setting ``True``, it will suppress the check and let the users handle it by themselves. Defaults: ``False``. bias : bool, optional - If set to :obj:`False`, the layer will not learn - an additive bias. (default: :obj:`True`) + If True, learns a bias term. Defaults: ``True``. share_weights : bool, optional - If set to :obj:`True`, the same matrix for :math:`W_{left}` and - :math:`W_{right}` in the above equations, will be applied to the source - and the target node of every edge. (default: :obj:`False`) + If ``True``, the same matrix will be applied to the source and the + destination node features. Defaults: ``False``. """ def __init__( self, - in_feats: Union[int, Tuple[int, int]], + in_feats: Union[int, tuple[int, int]], out_feats: int, num_heads: int, feat_drop: float = 0.0, @@ -81,16 +76,22 @@ def __init__( share_weights: bool = False, ): super().__init__() + + if isinstance(in_feats, int): + self.in_feats_src = self.in_feats_dst = in_feats + else: + self.in_feats_src, self.in_feats_dst = in_feats self.in_feats = in_feats self.out_feats = out_feats - self.in_feats_src, self.in_feats_dst = dgl.utils.expand_as_pair(in_feats) self.num_heads = num_heads self.feat_drop = nn.Dropout(feat_drop) self.concat = concat self.edge_feats = edge_feats self.negative_slope = negative_slope + self.residual = residual self.allow_zero_in_degree = allow_zero_in_degree self.share_weights = share_weights + self.bias = bias self.lin_src = nn.Linear(self.in_feats_src, num_heads * out_feats, bias=bias) if share_weights: @@ -106,30 +107,28 @@ def __init__( self.in_feats_dst, num_heads * out_feats, bias=bias ) - self.attn = nn.Parameter(torch.Tensor(num_heads * out_feats)) + self.attn_weights = nn.Parameter(torch.empty(num_heads * out_feats)) if edge_feats is not None: self.lin_edge = nn.Linear(edge_feats, num_heads * 
out_feats, bias=False) else: self.register_parameter("lin_edge", None) - if bias and concat: - self.bias = nn.Parameter(torch.Tensor(num_heads, out_feats)) - elif bias and not concat: - self.bias = nn.Parameter(torch.Tensor(out_feats)) - else: - self.register_buffer("bias", None) - - self.residual = residual and self.in_feats_dst != out_feats * num_heads - if self.residual: - self.lin_res = nn.Linear( - self.in_feats_dst, num_heads * out_feats, bias=bias - ) + out_dim = num_heads * out_feats if concat else out_feats + if residual: + if self.in_feats_dst != out_dim: + self.lin_res = nn.Linear(self.in_feats_dst, out_dim, bias=bias) + else: + self.lin_res = nn.Identity() else: self.register_buffer("lin_res", None) self.reset_parameters() + def set_allow_zero_in_degree(self, set_value): + r"""Set allow_zero_in_degree flag.""" + self.allow_zero_in_degree = set_value + def reset_parameters(self): r"""Reinitialize learnable parameters.""" gain = nn.init.calculate_gain("relu") @@ -137,7 +136,7 @@ def reset_parameters(self): nn.init.xavier_normal_(self.lin_dst.weight, gain=gain) nn.init.xavier_normal_( - self.attn.view(-1, self.num_heads, self.out_feats), gain=gain + self.attn_weights.view(-1, self.num_heads, self.out_feats), gain=gain ) if self.lin_edge is not None: self.lin_edge.reset_parameters() @@ -145,13 +144,10 @@ def reset_parameters(self): if self.lin_res is not None: self.lin_res.reset_parameters() - if self.bias is not None: - nn.init.zeros_(self.bias) - def forward( self, g: Union[SparseGraph, dgl.DGLHeteroGraph], - nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], efeat: Optional[torch.Tensor] = None, max_in_degree: Optional[int] = None, ) -> torch.Tensor: @@ -225,7 +221,7 @@ def forward( out = ops_torch.operators.mha_gat_v2_n2n( nfeat, - self.attn, + self.attn_weights, _graph, num_heads=self.num_heads, activation="LeakyReLU", @@ -243,7 +239,4 @@ def forward( res = res.mean(dim=1) out 
= out + res - if self.bias is not None: - out = out + self.bias - return out diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py index 54916674210..5c4b5dea441 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -100,16 +100,16 @@ def __init__( self.self_loop = self_loop if regularizer is None: self.W = nn.Parameter( - torch.Tensor(num_rels + dim_self_loop, in_feats, out_feats) + torch.empty(num_rels + dim_self_loop, in_feats, out_feats) ) self.coeff = None elif regularizer == "basis": if num_bases is None: raise ValueError('Missing "num_bases" for basis regularization.') self.W = nn.Parameter( - torch.Tensor(num_bases + dim_self_loop, in_feats, out_feats) + torch.empty(num_bases + dim_self_loop, in_feats, out_feats) ) - self.coeff = nn.Parameter(torch.Tensor(num_rels, num_bases)) + self.coeff = nn.Parameter(torch.empty(num_rels, num_bases)) self.num_bases = num_bases else: raise ValueError( @@ -119,7 +119,7 @@ def __init__( self.regularizer = regularizer if bias: - self.bias = nn.Parameter(torch.Tensor(out_feats)) + self.bias = nn.Parameter(torch.empty(out_feats)) else: self.register_parameter("bias", None) diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py index a3f946d7cb4..b6198903766 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple, Union +from typing import Optional, Union from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional @@ -65,7 +65,7 @@ class SAGEConv(BaseConv): def __init__( self, - in_feats: Union[int, Tuple[int, int]], + in_feats: Union[int, tuple[int, int]], out_feats: int, aggregator_type: str = "mean", feat_drop: float = 0.0, @@ -111,7 +111,7 @@ def reset_parameters(self): def forward( self, g: Union[SparseGraph, dgl.DGLHeteroGraph], - feat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + feat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], max_in_degree: Optional[int] = None, ) -> torch.Tensor: r"""Forward computation. diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py index 8481b9ee265..e77556fb76f 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional, Tuple, Union +from typing import Optional, Union from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional @@ -51,7 +51,7 @@ class TransformerConv(BaseConv): def __init__( self, - in_node_feats: Union[int, Tuple[int, int]], + in_node_feats: Union[int, tuple[int, int]], out_node_feats: int, num_heads: int, concat: bool = True, @@ -116,7 +116,7 @@ def reset_parameters(self): def forward( self, g: Union[SparseGraph, dgl.DGLHeteroGraph], - nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], efeat: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward computation. diff --git a/python/cugraph-dgl/tests/conftest.py b/python/cugraph-dgl/tests/conftest.py index a3863ed81fa..ee1183f5cd1 100644 --- a/python/cugraph-dgl/tests/conftest.py +++ b/python/cugraph-dgl/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,6 +13,7 @@ import pytest +import dgl import torch from cugraph.testing.mg_utils import ( @@ -58,3 +59,10 @@ class SparseGraphData1: @pytest.fixture def sparse_graph_1(): return SparseGraphData1() + + +@pytest.fixture +def dgl_graph_1(): + src = torch.tensor([0, 1, 0, 2, 3, 0, 4, 0, 5, 0, 6, 7, 0, 8, 9]) + dst = torch.tensor([1, 9, 2, 9, 9, 4, 9, 5, 9, 6, 9, 9, 8, 9, 0]) + return dgl.graph((src, dst)) diff --git a/python/cugraph-dgl/tests/nn/__init__.py b/python/cugraph-dgl/tests/nn/__init__.py deleted file mode 100644 index a1dd01f33d4..00000000000 --- a/python/cugraph-dgl/tests/nn/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. 
-# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/python/cugraph-dgl/tests/nn/common.py b/python/cugraph-dgl/tests/nn/common.py deleted file mode 100644 index 34787d20c9a..00000000000 --- a/python/cugraph-dgl/tests/nn/common.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from cugraph.utilities.utils import import_optional - -th = import_optional("torch") -dgl = import_optional("dgl") - - -def create_graph1(): - u = th.tensor([0, 1, 0, 2, 3, 0, 4, 0, 5, 0, 6, 7, 0, 8, 9]) - v = th.tensor([1, 9, 2, 9, 9, 4, 9, 5, 9, 6, 9, 9, 8, 9, 0]) - g = dgl.graph((u, v)) - return g diff --git a/python/cugraph-dgl/tests/nn/test_gatconv.py b/python/cugraph-dgl/tests/nn/test_gatconv.py index ce145b2bc87..de27efc6329 100644 --- a/python/cugraph-dgl/tests/nn/test_gatconv.py +++ b/python/cugraph-dgl/tests/nn/test_gatconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. 
+# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -15,7 +15,6 @@ from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import GATConv as CuGraphGATConv -from .common import create_graph1 dgl = pytest.importorskip("dgl", reason="DGL not available") torch = pytest.importorskip("torch", reason="PyTorch not available") @@ -23,37 +22,49 @@ ATOL = 1e-6 -@pytest.mark.parametrize("bipartite", [False, True]) -@pytest.mark.parametrize("idtype_int", [False, True]) +@pytest.mark.parametrize("mode", ["bipartite", "share_weights", "regular"]) +@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_heads", [1, 2, 7]) @pytest.mark.parametrize("residual", [False, True]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_gatconv_equality( - bipartite, idtype_int, max_in_degree, num_heads, residual, to_block, sparse_format + dgl_graph_1, + mode, + idx_type, + max_in_degree, + num_heads, + residual, + to_block, + sparse_format, ): from dgl.nn.pytorch import GATConv torch.manual_seed(12345) - g = create_graph1().to("cuda") + device = torch.device("cuda") + g = dgl_graph_1.to(device).astype(idx_type) - if idtype_int: - g = g.int() if to_block: g = dgl.to_block(g) size = (g.num_src_nodes(), g.num_dst_nodes()) - if bipartite: + if mode == "bipartite": in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), - torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), + torch.randn(size[0], in_feats[0]).to(device), + torch.randn(size[1], in_feats[1]).to(device), + ) + elif mode == "share_weights": + in_feats = 5 + nfeat = ( + torch.randn(size[0], in_feats).to(device), + torch.randn(size[1], in_feats).to(device), ) else: - 
in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() + in_feats = 7 + nfeat = torch.randn(size[0], in_feats).to(device) out_feats = 2 if sparse_format == "coo": @@ -65,24 +76,24 @@ def test_gatconv_equality( sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") args = (in_feats, out_feats, num_heads) - kwargs = {"bias": False, "allow_zero_in_degree": True} + kwargs = {"bias": False, "allow_zero_in_degree": True, "residual": residual} - conv1 = GATConv(*args, **kwargs).cuda() - out1 = conv1(g, nfeat) + conv1 = GATConv(*args, **kwargs).to(device) + conv2 = CuGraphGATConv(*args, **kwargs).to(device) - conv2 = CuGraphGATConv(*args, **kwargs).cuda() dim = num_heads * out_feats with torch.no_grad(): - conv2.attn_weights.data[:dim] = conv1.attn_l.data.flatten() - conv2.attn_weights.data[dim:] = conv1.attn_r.data.flatten() - if bipartite: - conv2.lin_src.weight.data = conv1.fc_src.weight.data.detach().clone() - conv2.lin_dst.weight.data = conv1.fc_dst.weight.data.detach().clone() + conv2.attn_weights[:dim].copy_(conv1.attn_l.flatten()) + conv2.attn_weights[dim:].copy_(conv1.attn_r.flatten()) + if mode == "bipartite": + conv2.lin_src.weight.copy_(conv1.fc_src.weight) + conv2.lin_dst.weight.copy_(conv1.fc_dst.weight) else: - conv2.lin.weight.data = conv1.fc.weight.data.detach().clone() - if residual and conv2.residual: - conv2.lin_res.weight.data = conv1.fc_res.weight.data.detach().clone() + conv2.lin.weight.copy_(conv1.fc.weight) + if residual and conv1.has_linear_res: + conv2.lin_res.weight.copy_(conv1.res_fc.weight) + out1 = conv1(g, nfeat) if sparse_format is not None: out2 = conv2(sg, nfeat, max_in_degree=max_in_degree) else: @@ -90,12 +101,12 @@ def test_gatconv_equality( assert torch.allclose(out1, out2, atol=ATOL) - grad_out1 = torch.rand_like(out1) - grad_out2 = grad_out1.clone().detach() + grad_out1 = torch.randn_like(out1) + grad_out2 = grad_out1.detach().clone() out1.backward(grad_out1) out2.backward(grad_out2) - if 
bipartite: + if mode == "bipartite": assert torch.allclose( conv1.fc_src.weight.grad, conv2.lin_src.weight.grad, atol=ATOL ) @@ -105,25 +116,38 @@ def test_gatconv_equality( else: assert torch.allclose(conv1.fc.weight.grad, conv2.lin.weight.grad, atol=ATOL) + if residual and conv1.has_linear_res: + assert torch.allclose( + conv1.res_fc.weight.grad, conv2.lin_res.weight.grad, atol=ATOL + ) + assert torch.allclose( torch.cat((conv1.attn_l.grad, conv1.attn_r.grad), dim=0), conv2.attn_weights.grad.view(2, num_heads, out_feats), - atol=ATOL, + atol=1e-5, # Note: using a loosened tolerance here due to numerical error ) @pytest.mark.parametrize("bias", [False, True]) @pytest.mark.parametrize("bipartite", [False, True]) @pytest.mark.parametrize("concat", [False, True]) -@pytest.mark.parametrize("max_in_degree", [None, 8, 800]) +@pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_heads", [1, 2, 7]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("use_edge_feats", [False, True]) def test_gatconv_edge_feats( - bias, bipartite, concat, max_in_degree, num_heads, to_block, use_edge_feats + dgl_graph_1, + bias, + bipartite, + concat, + max_in_degree, + num_heads, + to_block, + use_edge_feats, ): torch.manual_seed(12345) - g = create_graph1().to("cuda") + device = torch.device("cuda") + g = dgl_graph_1.to(device) if to_block: g = dgl.to_block(g) @@ -131,17 +155,17 @@ def test_gatconv_edge_feats( if bipartite: in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), - torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), + torch.rand(g.num_src_nodes(), in_feats[0]).to(device), + torch.rand(g.num_dst_nodes(), in_feats[1]).to(device), ) else: in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() + nfeat = torch.rand(g.num_src_nodes(), in_feats).to(device) out_feats = 2 if use_edge_feats: edge_feats = 3 - efeat = torch.rand(g.num_edges(), edge_feats).cuda() + efeat = 
torch.rand(g.num_edges(), edge_feats).to(device) else: edge_feats = None efeat = None @@ -154,8 +178,8 @@ def test_gatconv_edge_feats( edge_feats=edge_feats, bias=bias, allow_zero_in_degree=True, - ).cuda() + ).to(device) out = conv(g, nfeat, efeat=efeat, max_in_degree=max_in_degree) - grad_out = torch.rand_like(out) + grad_out = torch.randn_like(out) out.backward(grad_out) diff --git a/python/cugraph-dgl/tests/nn/test_gatv2conv.py b/python/cugraph-dgl/tests/nn/test_gatv2conv.py index 52003edacca..2d26b7fdc28 100644 --- a/python/cugraph-dgl/tests/nn/test_gatv2conv.py +++ b/python/cugraph-dgl/tests/nn/test_gatv2conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -15,45 +15,56 @@ from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import GATv2Conv as CuGraphGATv2Conv -from .common import create_graph1 dgl = pytest.importorskip("dgl", reason="DGL not available") torch = pytest.importorskip("torch", reason="PyTorch not available") -ATOL = 1e-6 +ATOL = 1e-5 -@pytest.mark.parametrize("bipartite", [False, True]) -@pytest.mark.parametrize("idtype_int", [False, True]) +@pytest.mark.parametrize("mode", ["bipartite", "share_weights", "regular"]) +@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_heads", [1, 2, 7]) @pytest.mark.parametrize("residual", [False, True]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_gatv2conv_equality( - bipartite, idtype_int, max_in_degree, num_heads, residual, to_block, sparse_format + dgl_graph_1, + mode, + idx_type, + max_in_degree, + num_heads, + residual, + to_block, + sparse_format, ): from dgl.nn.pytorch import 
GATv2Conv torch.manual_seed(12345) - g = create_graph1().to("cuda") + device = torch.device("cuda") + g = dgl_graph_1.to(device).astype(idx_type) - if idtype_int: - g = g.int() if to_block: g = dgl.to_block(g) size = (g.num_src_nodes(), g.num_dst_nodes()) - if bipartite: + if mode == "bipartite": in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), - torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), + torch.randn(size[0], in_feats[0]).to(device), + torch.randn(size[1], in_feats[1]).to(device), + ) + elif mode == "share_weights": + in_feats = 5 + nfeat = ( + torch.randn(size[0], in_feats).to(device), + torch.randn(size[1], in_feats).to(device), ) else: - in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() + in_feats = 7 + nfeat = torch.randn(size[0], in_feats).to(device) out_feats = 2 if sparse_format == "coo": @@ -65,19 +76,24 @@ def test_gatv2conv_equality( sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") args = (in_feats, out_feats, num_heads) - kwargs = {"bias": False, "allow_zero_in_degree": True} + kwargs = { + "bias": False, + "allow_zero_in_degree": True, + "residual": residual, + "share_weights": mode == "share_weights", + } - conv1 = GATv2Conv(*args, **kwargs).cuda() - out1 = conv1(g, nfeat) + conv1 = GATv2Conv(*args, **kwargs).to(device) + conv2 = CuGraphGATv2Conv(*args, **kwargs).to(device) - conv2 = CuGraphGATv2Conv(*args, **kwargs).cuda() with torch.no_grad(): - conv2.attn.data = conv1.attn.data.flatten() - conv2.lin_src.weight.data = conv1.fc_src.weight.data.detach().clone() - conv2.lin_dst.weight.data = conv1.fc_dst.weight.data.detach().clone() - if residual and conv2.residual: - conv2.lin_res.weight.data = conv1.fc_res.weight.data.detach().clone() + conv2.attn_weights.copy_(conv1.attn.flatten()) + conv2.lin_src.weight.copy_(conv1.fc_src.weight) + conv2.lin_dst.weight.copy_(conv1.fc_dst.weight) + if residual: + conv2.lin_res.weight.copy_(conv1.res_fc.weight) + out1 = 
conv1(g, nfeat) if sparse_format is not None: out2 = conv2(sg, nfeat, max_in_degree=max_in_degree) else: @@ -85,8 +101,8 @@ def test_gatv2conv_equality( assert torch.allclose(out1, out2, atol=ATOL) - grad_out1 = torch.rand_like(out1) - grad_out2 = grad_out1.clone().detach() + grad_out1 = torch.randn_like(out1) + grad_out2 = grad_out1.detach().clone() out1.backward(grad_out1) out2.backward(grad_out2) @@ -97,21 +113,38 @@ def test_gatv2conv_equality( conv1.fc_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL ) - assert torch.allclose(conv1.attn.grad, conv1.attn.grad, atol=ATOL) + if residual: + assert torch.allclose( + conv1.res_fc.weight.grad, conv2.lin_res.weight.grad, atol=ATOL + ) + + assert torch.allclose( + conv1.attn.grad, + conv2.attn_weights.grad.view(1, num_heads, out_feats), + atol=ATOL, + ) @pytest.mark.parametrize("bias", [False, True]) @pytest.mark.parametrize("bipartite", [False, True]) @pytest.mark.parametrize("concat", [False, True]) -@pytest.mark.parametrize("max_in_degree", [None, 8, 800]) +@pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_heads", [1, 2, 7]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("use_edge_feats", [False, True]) def test_gatv2conv_edge_feats( - bias, bipartite, concat, max_in_degree, num_heads, to_block, use_edge_feats + dgl_graph_1, + bias, + bipartite, + concat, + max_in_degree, + num_heads, + to_block, + use_edge_feats, ): torch.manual_seed(12345) - g = create_graph1().to("cuda") + device = torch.device("cuda") + g = dgl_graph_1.to(device) if to_block: g = dgl.to_block(g) @@ -119,17 +152,17 @@ def test_gatv2conv_edge_feats( if bipartite: in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), - torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), + torch.rand(g.num_src_nodes(), in_feats[0]).to(device), + torch.rand(g.num_dst_nodes(), in_feats[1]).to(device), ) else: in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), 
in_feats).cuda() + nfeat = torch.rand(g.num_src_nodes(), in_feats).to(device) out_feats = 2 if use_edge_feats: edge_feats = 3 - efeat = torch.rand(g.num_edges(), edge_feats).cuda() + efeat = torch.rand(g.num_edges(), edge_feats).to(device) else: edge_feats = None efeat = None @@ -142,8 +175,8 @@ def test_gatv2conv_edge_feats( edge_feats=edge_feats, bias=bias, allow_zero_in_degree=True, - ).cuda() + ).to(device) out = conv(g, nfeat, efeat=efeat, max_in_degree=max_in_degree) - grad_out = torch.rand_like(out) + grad_out = torch.randn_like(out) out.backward(grad_out) diff --git a/python/cugraph-dgl/tests/nn/test_relgraphconv.py b/python/cugraph-dgl/tests/nn/test_relgraphconv.py index bdaa89e57f2..b5d3686c609 100644 --- a/python/cugraph-dgl/tests/nn/test_relgraphconv.py +++ b/python/cugraph-dgl/tests/nn/test_relgraphconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -15,7 +15,6 @@ from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import RelGraphConv as CuGraphRelGraphConv -from .common import create_graph1 dgl = pytest.importorskip("dgl", reason="DGL not available") torch = pytest.importorskip("torch", reason="PyTorch not available") @@ -23,7 +22,7 @@ ATOL = 1e-6 -@pytest.mark.parametrize("idtype_int", [False, True]) +@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_bases", [1, 2, 5]) @pytest.mark.parametrize("regularizer", [None, "basis"]) @@ -31,7 +30,8 @@ @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_relgraphconv_equality( - idtype_int, + dgl_graph_1, + idx_type, max_in_degree, num_bases, regularizer, @@ -42,6 +42,12 @@ def test_relgraphconv_equality( from dgl.nn.pytorch import RelGraphConv torch.manual_seed(12345) + device = torch.device("cuda") + g = dgl_graph_1.to(device).astype(idx_type) + + if to_block: + g = dgl.to_block(g) + in_feat, out_feat, num_rels = 10, 2, 3 args = (in_feat, out_feat, num_rels) kwargs = { @@ -50,16 +56,10 @@ def test_relgraphconv_equality( "bias": False, "self_loop": self_loop, } - g = create_graph1().to("cuda") - g.edata[dgl.ETYPE] = torch.randint(num_rels, (g.num_edges(),)).cuda() - - if idtype_int: - g = g.int() - if to_block: - g = dgl.to_block(g) + g.edata[dgl.ETYPE] = torch.randint(num_rels, (g.num_edges(),)).to(device) size = (g.num_src_nodes(), g.num_dst_nodes()) - feat = torch.rand(g.num_src_nodes(), in_feat).cuda() + feat = torch.rand(g.num_src_nodes(), in_feat).to(device) if sparse_format == "coo": sg = SparseGraph( @@ -76,18 +76,18 @@ def test_relgraphconv_equality( size=size, src_ids=indices, cdst_ids=offsets, values=etypes, formats="csc" ) - conv1 = RelGraphConv(*args, **kwargs).cuda() - conv2 = CuGraphRelGraphConv(*args, **kwargs, 
apply_norm=False).cuda() + conv1 = RelGraphConv(*args, **kwargs).to(device) + conv2 = CuGraphRelGraphConv(*args, **kwargs, apply_norm=False).to(device) with torch.no_grad(): if self_loop: - conv2.W.data[:-1] = conv1.linear_r.W.data - conv2.W.data[-1] = conv1.loop_weight.data + conv2.W[:-1].copy_(conv1.linear_r.W) + conv2.W[-1].copy_(conv1.loop_weight) else: - conv2.W.data = conv1.linear_r.W.data.detach().clone() + conv2.W.copy_(conv1.linear_r.W) if regularizer is not None: - conv2.coeff.data = conv1.linear_r.coeff.data.detach().clone() + conv2.coeff.copy_(conv1.linear_r.coeff) out1 = conv1(g, feat, g.edata[dgl.ETYPE]) @@ -98,7 +98,7 @@ def test_relgraphconv_equality( assert torch.allclose(out1, out2, atol=ATOL) - grad_out = torch.rand_like(out1) + grad_out = torch.randn_like(out1) out1.backward(grad_out) out2.backward(grad_out) diff --git a/python/cugraph-dgl/tests/nn/test_sageconv.py b/python/cugraph-dgl/tests/nn/test_sageconv.py index b5d0a44b868..3f1c2b1b3fe 100644 --- a/python/cugraph-dgl/tests/nn/test_sageconv.py +++ b/python/cugraph-dgl/tests/nn/test_sageconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -15,7 +15,6 @@ from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import SAGEConv as CuGraphSAGEConv -from .common import create_graph1 dgl = pytest.importorskip("dgl", reason="DGL not available") torch = pytest.importorskip("torch", reason="PyTorch not available") @@ -26,21 +25,19 @@ @pytest.mark.parametrize("aggr", ["mean", "pool"]) @pytest.mark.parametrize("bias", [False, True]) @pytest.mark.parametrize("bipartite", [False, True]) -@pytest.mark.parametrize("idtype_int", [False, True]) +@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_sageconv_equality( - aggr, bias, bipartite, idtype_int, max_in_degree, to_block, sparse_format + dgl_graph_1, aggr, bias, bipartite, idx_type, max_in_degree, to_block, sparse_format ): from dgl.nn.pytorch import SAGEConv torch.manual_seed(12345) - kwargs = {"aggregator_type": aggr, "bias": bias} - g = create_graph1().to("cuda") + device = torch.device("cuda") + g = dgl_graph_1.to(device).astype(idx_type) - if idtype_int: - g = g.int() if to_block: g = dgl.to_block(g) @@ -49,12 +46,12 @@ def test_sageconv_equality( if bipartite: in_feats = (5, 3) feat = ( - torch.rand(size[0], in_feats[0], requires_grad=True).cuda(), - torch.rand(size[1], in_feats[1], requires_grad=True).cuda(), + torch.rand(size[0], in_feats[0], requires_grad=True).to(device), + torch.rand(size[1], in_feats[1], requires_grad=True).to(device), ) else: in_feats = 5 - feat = torch.rand(size[0], in_feats).cuda() + feat = torch.rand(size[0], in_feats).to(device) out_feats = 2 if sparse_format == "coo": @@ -65,18 +62,19 @@ def test_sageconv_equality( offsets, indices, _ = g.adj_tensors("csc") sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") - conv1 = SAGEConv(in_feats, out_feats, 
**kwargs).cuda() - conv2 = CuGraphSAGEConv(in_feats, out_feats, **kwargs).cuda() + kwargs = {"aggregator_type": aggr, "bias": bias} + conv1 = SAGEConv(in_feats, out_feats, **kwargs).to(device) + conv2 = CuGraphSAGEConv(in_feats, out_feats, **kwargs).to(device) in_feats_src = conv2.in_feats_src with torch.no_grad(): - conv2.lin.weight.data[:, :in_feats_src] = conv1.fc_neigh.weight.data - conv2.lin.weight.data[:, in_feats_src:] = conv1.fc_self.weight.data + conv2.lin.weight[:, :in_feats_src].copy_(conv1.fc_neigh.weight) + conv2.lin.weight[:, in_feats_src:].copy_(conv1.fc_self.weight) if bias: - conv2.lin.bias.data[:] = conv1.fc_self.bias.data + conv2.lin.bias.copy_(conv1.fc_self.bias) if aggr == "pool": - conv2.pre_lin.weight.data[:] = conv1.fc_pool.weight.data - conv2.pre_lin.bias.data[:] = conv1.fc_pool.bias.data + conv2.pre_lin.weight.copy_(conv1.fc_pool.weight) + conv2.pre_lin.bias.copy_(conv1.fc_pool.bias) out1 = conv1(g, feat) if sparse_format is not None: @@ -85,7 +83,7 @@ def test_sageconv_equality( out2 = conv2(g, feat, max_in_degree=max_in_degree) assert torch.allclose(out1, out2, atol=ATOL) - grad_out = torch.rand_like(out1) + grad_out = torch.randn_like(out1) out1.backward(grad_out) out2.backward(grad_out) assert torch.allclose( diff --git a/python/cugraph-dgl/tests/nn/test_transformerconv.py b/python/cugraph-dgl/tests/nn/test_transformerconv.py index 5ac4fd7bea7..28d13dedec8 100644 --- a/python/cugraph-dgl/tests/nn/test_transformerconv.py +++ b/python/cugraph-dgl/tests/nn/test_transformerconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -15,7 +15,6 @@ from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import TransformerConv -from .common import create_graph1 dgl = pytest.importorskip("dgl", reason="DGL not available") torch = pytest.importorskip("torch", reason="PyTorch not available") @@ -26,27 +25,25 @@ @pytest.mark.parametrize("beta", [False, True]) @pytest.mark.parametrize("bipartite_node_feats", [False, True]) @pytest.mark.parametrize("concat", [False, True]) -@pytest.mark.parametrize("idtype_int", [False, True]) -@pytest.mark.parametrize("num_heads", [1, 2, 3, 4]) +@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) +@pytest.mark.parametrize("num_heads", [1, 3, 4]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("use_edge_feats", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_transformerconv( + dgl_graph_1, beta, bipartite_node_feats, concat, - idtype_int, + idx_type, num_heads, to_block, use_edge_feats, sparse_format, ): torch.manual_seed(12345) - device = "cuda" - g = create_graph1().to(device) - - if idtype_int: - g = g.int() + device = torch.device("cuda") + g = dgl_graph_1.to(device).astype(idx_type) if to_block: g = dgl.to_block(g) @@ -92,5 +89,5 @@ def test_transformerconv( else: out = conv(g, nfeat, efeat) - grad_out = torch.rand_like(out) + grad_out = torch.randn_like(out) out.backward(grad_out) diff --git a/python/nx-cugraph/lint.yaml b/python/nx-cugraph/lint.yaml index 0d4f0b59413..5a4773168b6 100644 --- a/python/nx-cugraph/lint.yaml +++ b/python/nx-cugraph/lint.yaml @@ -50,7 +50,7 @@ repos: - id: black # - id: black-jupyter - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.13 + rev: v0.1.14 hooks: - id: ruff args: [--fix-only, --show-fixes] # --unsafe-fixes] @@ -62,7 +62,7 @@ repos: additional_dependencies: &flake8_dependencies # These versions need updated manually - flake8==7.0.0 - - flake8-bugbear==23.12.2 + - 
flake8-bugbear==24.1.17 - flake8-simplify==0.21.0 - repo: https://github.com/asottile/yesqa rev: v1.5.0 @@ -77,7 +77,7 @@ repos: additional_dependencies: [tomli] files: ^(nx_cugraph|docs)/ - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.13 + rev: v0.1.14 hooks: - id: ruff - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/basic.py b/python/nx-cugraph/nx_cugraph/algorithms/bipartite/basic.py index d0e9a5c7f1b..46c6b54075b 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/basic.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/bipartite/basic.py @@ -21,7 +21,7 @@ ] -@networkx_algorithm(plc="triangle_count", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="triangle_count") def is_bipartite(G): G = _to_graph(G) # Counting triangles may not be the fastest way to do this, but it is simple. diff --git a/python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py b/python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py index ba2b3d9c895..f6bb142cded 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py @@ -21,8 +21,8 @@ @networkx_algorithm( is_incomplete=True, # weight not supported is_different=True, # RNG with seed is different - plc="betweenness_centrality", version_added="23.10", + _plc="betweenness_centrality", ) def betweenness_centrality( G, k=None, normalized=True, weight=None, endpoints=False, seed=None @@ -54,8 +54,8 @@ def _(G, k=None, normalized=True, weight=None, endpoints=False, seed=None): @networkx_algorithm( is_incomplete=True, # weight not supported is_different=True, # RNG with seed is different - plc="edge_betweenness_centrality", version_added="23.10", + _plc="edge_betweenness_centrality", ) def edge_betweenness_centrality(G, k=None, normalized=True, weight=None, seed=None): """`weight` parameter is not yet supported, and 
RNG with seed may be different.""" diff --git a/python/nx-cugraph/nx_cugraph/algorithms/centrality/eigenvector.py b/python/nx-cugraph/nx_cugraph/algorithms/centrality/eigenvector.py index 9e615955a8b..65a8633667a 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/centrality/eigenvector.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/centrality/eigenvector.py @@ -29,8 +29,8 @@ @networkx_algorithm( extra_params=_dtype_param, is_incomplete=True, # nstart not supported - plc="eigenvector_centrality", version_added="23.12", + _plc="eigenvector_centrality", ) def eigenvector_centrality( G, max_iter=100, tol=1.0e-6, nstart=None, weight=None, *, dtype=None diff --git a/python/nx-cugraph/nx_cugraph/algorithms/centrality/katz.py b/python/nx-cugraph/nx_cugraph/algorithms/centrality/katz.py index a2fb950c1aa..4a0684f72ee 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/centrality/katz.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/centrality/katz.py @@ -29,8 +29,8 @@ @networkx_algorithm( extra_params=_dtype_param, is_incomplete=True, # nstart and normalized=False not supported - plc="katz_centrality", version_added="23.12", + _plc="katz_centrality", ) def katz_centrality( G, diff --git a/python/nx-cugraph/nx_cugraph/algorithms/cluster.py b/python/nx-cugraph/nx_cugraph/algorithms/cluster.py index 951c358ff26..a458e6c04db 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/cluster.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/cluster.py @@ -45,7 +45,7 @@ def _triangles(G, nodes, symmetrize=None): @not_implemented_for("directed") -@networkx_algorithm(plc="triangle_count", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="triangle_count") def triangles(G, nodes=None): G = _to_undirected_graph(G) node_ids, triangles, is_single_node = _triangles(G, nodes) @@ -57,9 +57,13 @@ def triangles(G, nodes=None): @not_implemented_for("directed") -@networkx_algorithm(is_incomplete=True, plc="triangle_count", version_added="24.02") 
+@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="triangle_count") def clustering(G, nodes=None, weight=None): """Directed graphs and `weight` parameter are not yet supported.""" + if weight is not None: + raise NotImplementedError( + "Weighted implementation of clustering not currently supported" + ) G = _to_undirected_graph(G) node_ids, triangles, is_single_node = _triangles(G, nodes) if len(G) == 0: @@ -83,9 +87,13 @@ def _(G, nodes=None, weight=None): @not_implemented_for("directed") -@networkx_algorithm(is_incomplete=True, plc="triangle_count", version_added="24.02") +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="triangle_count") def average_clustering(G, nodes=None, weight=None, count_zeros=True): """Directed graphs and `weight` parameter are not yet supported.""" + if weight is not None: + raise NotImplementedError( + "Weighted implementation of average_clustering not currently supported" + ) G = _to_undirected_graph(G) node_ids, triangles, is_single_node = _triangles(G, nodes) if len(G) == 0: @@ -110,7 +118,7 @@ def _(G, nodes=None, weight=None, count_zeros=True): @not_implemented_for("directed") -@networkx_algorithm(is_incomplete=True, plc="triangle_count", version_added="24.02") +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="triangle_count") def transitivity(G): """Directed graphs are not yet supported.""" G = _to_undirected_graph(G) diff --git a/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py b/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py index 413ff9ca5e3..f58f1000fc4 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py @@ -36,8 +36,8 @@ }, is_incomplete=True, # seed not supported; self-loops not supported is_different=True, # RNG different - plc="louvain", version_added="23.10", + _plc="louvain", ) def louvain_communities( G, diff --git 
a/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py b/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py index cdb9f54f6c4..24955e3eac8 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py @@ -26,7 +26,7 @@ @not_implemented_for("directed") -@networkx_algorithm(plc="weakly_connected_components", version_added="23.12") +@networkx_algorithm(version_added="23.12", _plc="weakly_connected_components") def number_connected_components(G): G = _to_undirected_graph(G) return _number_connected_components(G) @@ -50,14 +50,11 @@ def _number_connected_components(G, symmetrize=None): @number_connected_components._can_run def _(G): # NetworkX <= 3.2.1 does not check directedness for us - try: - return not G.is_directed() - except Exception: - return False + return not G.is_directed() @not_implemented_for("directed") -@networkx_algorithm(plc="weakly_connected_components", version_added="23.12") +@networkx_algorithm(version_added="23.12", _plc="weakly_connected_components") def connected_components(G): G = _to_undirected_graph(G) return _connected_components(G) @@ -80,7 +77,7 @@ def _connected_components(G, symmetrize=None): @not_implemented_for("directed") -@networkx_algorithm(plc="weakly_connected_components", version_added="23.12") +@networkx_algorithm(version_added="23.12", _plc="weakly_connected_components") def is_connected(G): G = _to_undirected_graph(G) return _is_connected(G) @@ -106,7 +103,7 @@ def _is_connected(G, symmetrize=None): @not_implemented_for("directed") -@networkx_algorithm(plc="weakly_connected_components", version_added="23.12") +@networkx_algorithm(version_added="23.12", _plc="weakly_connected_components") def node_connected_component(G, n): # We could also do plain BFS from n G = _to_undirected_graph(G) diff --git a/python/nx-cugraph/nx_cugraph/algorithms/components/strongly_connected.py 
b/python/nx-cugraph/nx_cugraph/algorithms/components/strongly_connected.py index 8fdf99ed5ea..d1713129703 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/components/strongly_connected.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/components/strongly_connected.py @@ -51,7 +51,7 @@ def _strongly_connected_components(G): @not_implemented_for("undirected") -@networkx_algorithm(version_added="24.02", plc="strongly_connected_components") +@networkx_algorithm(version_added="24.02", _plc="strongly_connected_components") def strongly_connected_components(G): G = _to_directed_graph(G) if G.src_indices.size == 0: @@ -62,7 +62,7 @@ def strongly_connected_components(G): @not_implemented_for("undirected") -@networkx_algorithm(version_added="24.02", plc="strongly_connected_components") +@networkx_algorithm(version_added="24.02", _plc="strongly_connected_components") def number_strongly_connected_components(G): G = _to_directed_graph(G) if G.src_indices.size == 0: @@ -72,7 +72,7 @@ def number_strongly_connected_components(G): @not_implemented_for("undirected") -@networkx_algorithm(version_added="24.02", plc="strongly_connected_components") +@networkx_algorithm(version_added="24.02", _plc="strongly_connected_components") def is_strongly_connected(G): G = _to_directed_graph(G) if len(G) == 0: diff --git a/python/nx-cugraph/nx_cugraph/algorithms/components/weakly_connected.py b/python/nx-cugraph/nx_cugraph/algorithms/components/weakly_connected.py index 5b797b39118..e42acdd3d84 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/components/weakly_connected.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/components/weakly_connected.py @@ -27,21 +27,21 @@ @not_implemented_for("undirected") -@networkx_algorithm(plc="weakly_connected_components", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components") def weakly_connected_components(G): G = _to_directed_graph(G) return _connected_components(G, symmetrize="union") 
@not_implemented_for("undirected") -@networkx_algorithm(plc="weakly_connected_components", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components") def number_weakly_connected_components(G): G = _to_directed_graph(G) return _number_connected_components(G, symmetrize="union") @not_implemented_for("undirected") -@networkx_algorithm(plc="weakly_connected_components", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components") def is_weakly_connected(G): G = _to_directed_graph(G) return _is_connected(G, symmetrize="union") diff --git a/python/nx-cugraph/nx_cugraph/algorithms/core.py b/python/nx-cugraph/nx_cugraph/algorithms/core.py index f323cdf6004..71f61abf45b 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/core.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/core.py @@ -28,7 +28,7 @@ @not_implemented_for("directed") @not_implemented_for("multigraph") -@networkx_algorithm(is_incomplete=True, plc="core_number", version_added="24.02") +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="core_number") def core_number(G): """Directed graphs are not yet supported.""" G = _to_undirected_graph(G) @@ -55,7 +55,7 @@ def _(G): @not_implemented_for("directed") @not_implemented_for("multigraph") -@networkx_algorithm(is_incomplete=True, plc="k_truss_subgraph", version_added="23.12") +@networkx_algorithm(is_incomplete=True, version_added="23.12", _plc="k_truss_subgraph") def k_truss(G, k): """ Currently raises `NotImplementedError` for graphs with more than one connected diff --git a/python/nx-cugraph/nx_cugraph/algorithms/dag.py b/python/nx-cugraph/nx_cugraph/algorithms/dag.py index ad5b7594aa1..64be0a58105 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/dag.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/dag.py @@ -45,11 +45,11 @@ def _ancestors_and_descendants(G, source, *, is_ancestors): return G._nodearray_to_set(node_ids[mask]) -@networkx_algorithm(plc="bfs", 
version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="bfs") def descendants(G, source): return _ancestors_and_descendants(G, source, is_ancestors=False) -@networkx_algorithm(plc="bfs", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="bfs") def ancestors(G, source): return _ancestors_and_descendants(G, source, is_ancestors=True) diff --git a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py index caa01327a56..9e723624a3b 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py @@ -33,8 +33,8 @@ ), **_dtype_param, }, - plc="hits", version_added="23.12", + _plc="hits", ) def hits( G, diff --git a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py index d45d019c1b7..55fcc3e520a 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py @@ -29,8 +29,8 @@ @networkx_algorithm( extra_params=_dtype_param, is_incomplete=True, # dangling not supported - plc={"pagerank", "personalized_pagerank"}, version_added="23.12", + _plc={"pagerank", "personalized_pagerank"}, ) def pagerank( G, diff --git a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py index b1032a8236b..2012495953e 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py @@ -21,12 +21,12 @@ __all__ = ["single_source_shortest_path_length", "single_target_shortest_path_length"] -@networkx_algorithm(plc="bfs", version_added="23.12") +@networkx_algorithm(version_added="23.12", _plc="bfs") def single_source_shortest_path_length(G, 
source, cutoff=None): return _single_shortest_path_length(G, source, cutoff, "Source") -@networkx_algorithm(plc="bfs", version_added="23.12") +@networkx_algorithm(version_added="23.12", _plc="bfs") def single_target_shortest_path_length(G, target, cutoff=None): return _single_shortest_path_length(G, target, cutoff, "Target") diff --git a/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py b/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py index aa671bbb7d4..ef1c011363a 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py @@ -57,9 +57,17 @@ def _bfs(G, source, *, depth_limit=None, reverse=False): return distances[mask], predecessors[mask], node_ids[mask] -@networkx_algorithm(is_incomplete=True, plc="bfs", version_added="24.02") +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs") def generic_bfs_edges(G, source, neighbors=None, depth_limit=None, sort_neighbors=None): """`neighbors` and `sort_neighbors` parameters are not yet supported.""" + if neighbors is not None: + raise NotImplementedError( + "neighbors argument in generic_bfs_edges is not currently supported" + ) + if sort_neighbors is not None: + raise NotImplementedError( + "sort_neighbors argument in generic_bfs_edges is not currently supported" + ) return bfs_edges(source, depth_limit=depth_limit) @@ -68,9 +76,13 @@ def _(G, source, neighbors=None, depth_limit=None, sort_neighbors=None): return neighbors is None and sort_neighbors is None -@networkx_algorithm(is_incomplete=True, plc="bfs", version_added="24.02") +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs") def bfs_edges(G, source, reverse=False, depth_limit=None, sort_neighbors=None): """`sort_neighbors` parameter is not yet supported.""" + if sort_neighbors is not None: + raise NotImplementedError( + "sort_neighbors argument in bfs_edges is not 
currently supported" + ) G = _check_G_and_source(G, source) if depth_limit is not None and depth_limit < 1: return @@ -95,9 +107,13 @@ def _(G, source, reverse=False, depth_limit=None, sort_neighbors=None): return sort_neighbors is None -@networkx_algorithm(is_incomplete=True, plc="bfs", version_added="24.02") +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs") def bfs_tree(G, source, reverse=False, depth_limit=None, sort_neighbors=None): """`sort_neighbors` parameter is not yet supported.""" + if sort_neighbors is not None: + raise NotImplementedError( + "sort_neighbors argument in bfs_tree is not currently supported" + ) G = _check_G_and_source(G, source) if depth_limit is not None and depth_limit < 1: return nxcg.DiGraph.from_coo( @@ -149,9 +165,13 @@ def _(G, source, reverse=False, depth_limit=None, sort_neighbors=None): return sort_neighbors is None -@networkx_algorithm(is_incomplete=True, plc="bfs", version_added="24.02") +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs") def bfs_successors(G, source, depth_limit=None, sort_neighbors=None): """`sort_neighbors` parameter is not yet supported.""" + if sort_neighbors is not None: + raise NotImplementedError( + "sort_neighbors argument in bfs_successors is not currently supported" + ) G = _check_G_and_source(G, source) if depth_limit is not None and depth_limit < 1: yield (source, []) @@ -173,7 +193,7 @@ def _(G, source, depth_limit=None, sort_neighbors=None): return sort_neighbors is None -@networkx_algorithm(plc="bfs", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="bfs") def bfs_layers(G, sources): G = _to_graph(G) if sources in G: @@ -201,9 +221,13 @@ def bfs_layers(G, sources): return (G._nodearray_to_list(groups[key]) for key in range(len(groups))) -@networkx_algorithm(is_incomplete=True, plc="bfs", version_added="24.02") +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs") def bfs_predecessors(G, source, 
depth_limit=None, sort_neighbors=None): """`sort_neighbors` parameter is not yet supported.""" + if sort_neighbors is not None: + raise NotImplementedError( + "sort_neighbors argument in bfs_predecessors is not currently supported" + ) G = _check_G_and_source(G, source) if depth_limit is not None and depth_limit < 1: return @@ -227,7 +251,7 @@ def _(G, source, depth_limit=None, sort_neighbors=None): return sort_neighbors is None -@networkx_algorithm(plc="bfs", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="bfs") def descendants_at_distance(G, source, distance): G = _check_G_and_source(G, source) if distance is None or distance < 0: diff --git a/python/nx-cugraph/nx_cugraph/algorithms/tree/recognition.py b/python/nx-cugraph/nx_cugraph/algorithms/tree/recognition.py index 0b82f079d43..74f57b5ea5a 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/tree/recognition.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/tree/recognition.py @@ -21,20 +21,20 @@ @not_implemented_for("undirected") -@networkx_algorithm(plc="weakly_connected_components", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components") def is_arborescence(G): G = _to_directed_graph(G) return is_tree(G) and int(G._in_degrees_array().max()) <= 1 @not_implemented_for("undirected") -@networkx_algorithm(plc="weakly_connected_components", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components") def is_branching(G): G = _to_directed_graph(G) return is_forest(G) and int(G._in_degrees_array().max()) <= 1 -@networkx_algorithm(plc="weakly_connected_components", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components") def is_forest(G): G = _to_graph(G) if len(G) == 0: @@ -60,7 +60,7 @@ def is_forest(G): return True -@networkx_algorithm(plc="weakly_connected_components", version_added="24.02") +@networkx_algorithm(version_added="24.02", 
_plc="weakly_connected_components") def is_tree(G): G = _to_graph(G) if len(G) == 0: diff --git a/python/nx-cugraph/nx_cugraph/utils/decorators.py b/python/nx-cugraph/nx_cugraph/utils/decorators.py index d09a9e9617a..011ebfd6ef7 100644 --- a/python/nx-cugraph/nx_cugraph/utils/decorators.py +++ b/python/nx-cugraph/nx_cugraph/utils/decorators.py @@ -59,7 +59,7 @@ def __new__( version_added: str, # Required is_incomplete: bool = False, # See self.extra_doc for details if True is_different: bool = False, # See self.extra_doc for details if True - plc: str | set[str] | None = None, # Hidden from user, may be removed someday + _plc: str | set[str] | None = None, # Hidden from user, may be removed someday ): if func is None: return partial( @@ -67,10 +67,10 @@ def __new__( name=name, extra_params=extra_params, nodes_or_number=nodes_or_number, - plc=plc, version_added=version_added, is_incomplete=is_incomplete, is_different=is_different, + _plc=_plc, ) instance = object.__new__(cls) if nodes_or_number is not None and nx.__version__[:3] > "3.2": @@ -89,12 +89,14 @@ def __new__( f"extra_params must be dict, str, or None; got {type(extra_params)}" ) instance.extra_params = extra_params - if plc is None or isinstance(plc, set): - instance._plc_names = plc - elif isinstance(plc, str): - instance._plc_names = {plc} + if _plc is None or isinstance(_plc, set): + instance._plc_names = _plc + elif isinstance(_plc, str): + instance._plc_names = {_plc} else: - raise TypeError(f"plc argument must be str, set, or None; got {type(plc)}") + raise TypeError( + f"_plc argument must be str, set, or None; got {type(_plc)}" + ) instance.version_added = version_added instance.is_incomplete = is_incomplete instance.is_different = is_different