From 38a8cdbb369a398a9688acb84c41015b92b7fd0e Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com>
Date: Fri, 9 Feb 2024 13:34:23 -0800
Subject: [PATCH 1/6] Address primitives performance regression with the added
 edge masking support (when edge masking is not in use) (#4119)

This addresses the performance regression introduced by the added edge masking support, which slowed down primitives even when edge masking is not in use.

We addressed the issue for `per_v_transform_reduce_incoming|outgoing_e` in https://github.com/rapidsai/cugraph/pull/4085

Using a similar approach, we address the issue for `transform_e`, `transform_reduce_e`, and `detail::nbr_intersection` in this PR.

Authors:
  - Seunghwa Kang (https://github.com/seunghwak)

Approvers:
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Naim (https://github.com/naimnv)

URL: https://github.com/rapidsai/cugraph/pull/4119
---
 cpp/src/prims/detail/nbr_intersection.cuh |  73 +++--
 cpp/src/prims/transform_e.cuh             | 334 +++++++++++++++-------
 cpp/src/prims/transform_reduce_e.cuh      | 252 ++++++++--------
 3 files changed, 408 insertions(+), 251 deletions(-)
diff --git a/cpp/src/prims/detail/nbr_intersection.cuh b/cpp/src/prims/detail/nbr_intersection.cuh
index 8261ec747f9..26b87f21dbb 100644
--- a/cpp/src/prims/detail/nbr_intersection.cuh
+++ b/cpp/src/prims/detail/nbr_intersection.cuh
@@ -319,7 +319,8 @@ struct pick_min_degree_t {
   }
 };
 
-template <typename InputKeyIterator0,
+template <bool check_edge_mask,
+          typename InputKeyIterator0,
           typename InputKeyIterator1,
           typename InputValueIterator0,  // should be void* if invalid
           typename InputValueIterator1,  // should be void* if invalid
@@ -356,10 +357,14 @@ __device__ edge_t set_intersection_by_key_with_mask(InputKeyIterator0 input_key_
   auto output_idx = output_start_offset;
   while ((idx0 < (input_start_offset0 + input_size0)) &&
          (idx1 < (input_start_offset1 + input_size1))) {
-    bool valid0 = apply_mask0 ? check_bit_set(idx0) : true;
-    bool valid1 = apply_mask1 ? check_bit_set(idx1) : true;
-    if (!valid0) { ++idx0; }
-    if (!valid1) { ++idx1; }
+    bool valid0 = true;
+    bool valid1 = true;
+    if constexpr (check_edge_mask) {
+      valid0 = apply_mask0 ? check_bit_set(idx0) : true;
+      valid1 = apply_mask1 ? check_bit_set(idx1) : true;
+      if (!valid0) { ++idx0; }
+      if (!valid1) { ++idx1; }
+    }
 
     if (valid0 && valid1) {
       auto key0 = *(input_key_first0 + idx0);
@@ -522,24 +527,42 @@ struct copy_intersecting_nbrs_and_update_intersection_size_t {
     // vertices in a single warp (better optimize if this becomes a performance
     // bottleneck)
 
-    auto mask_first        = edge_partition_e_mask ? (*edge_partition_e_mask).value_first()
-                                                   : static_cast<uint32_t const*>(nullptr);
-    auto intersection_size = set_intersection_by_key_with_mask(
-      indices0,
-      indices1,
-      edge_property_values0,
-      edge_property_values1,
-      mask_first,
-      nbr_intersection_indices.begin(),
-      nbr_intersection_e_property_values0,
-      nbr_intersection_e_property_values1,
-      local_edge_offset0,
-      local_degree0,
-      (std::is_same_v<FirstElementToIdxMap, void*> && edge_partition_e_mask),
-      local_edge_offset1,
-      local_degree1,
-      (std::is_same_v<SecondElementToIdxMap, void*> && edge_partition_e_mask),
-      nbr_intersection_offsets[i]);
+    edge_t intersection_size{};
+    if (edge_partition_e_mask) {
+      intersection_size =
+        set_intersection_by_key_with_mask<true>(indices0,
+                                                indices1,
+                                                edge_property_values0,
+                                                edge_property_values1,
+                                                (*edge_partition_e_mask).value_first(),
+                                                nbr_intersection_indices.begin(),
+                                                nbr_intersection_e_property_values0,
+                                                nbr_intersection_e_property_values1,
+                                                local_edge_offset0,
+                                                local_degree0,
+                                                std::is_same_v<FirstElementToIdxMap, void*>,
+                                                local_edge_offset1,
+                                                local_degree1,
+                                                std::is_same_v<SecondElementToIdxMap, void*>,
+                                                nbr_intersection_offsets[i]);
+    } else {
+      intersection_size =
+        set_intersection_by_key_with_mask<false>(indices0,
+                                                 indices1,
+                                                 edge_property_values0,
+                                                 edge_property_values1,
+                                                 static_cast<uint32_t const*>(nullptr),
+                                                 nbr_intersection_indices.begin(),
+                                                 nbr_intersection_e_property_values0,
+                                                 nbr_intersection_e_property_values1,
+                                                 local_edge_offset0,
+                                                 local_degree0,
+                                                 false,
+                                                 local_edge_offset1,
+                                                 local_degree1,
+                                                 false,
+                                                 nbr_intersection_offsets[i]);
+    }
 
     thrust::fill(
       thrust::seq,
@@ -714,7 +737,7 @@ nbr_intersection(raft::handle_t const& handle,
   auto edge_mask_view = graph_view.edge_mask_view();
 
   std::optional<std::unique_ptr<kv_store_t<vertex_t, vertex_t, false>>> major_to_idx_map_ptr{
-    std::nullopt};
+    std::nullopt};  // idx to major_nbr_offsets
   std::optional<rmm::device_uvector<edge_t>> major_nbr_offsets{std::nullopt};
   std::optional<rmm::device_uvector<vertex_t>> major_nbr_indices{std::nullopt};
 
@@ -1041,7 +1064,7 @@ nbr_intersection(raft::handle_t const& handle,
   // 3. Collect neighbor list for minors (for the neighbors within the minor range for this GPU)
 
   std::optional<std::unique_ptr<kv_store_t<vertex_t, vertex_t, false>>> minor_to_idx_map_ptr{
-    std::nullopt};
+    std::nullopt};  // idx to minor_nbr_offsets
   std::optional<rmm::device_uvector<size_t>> minor_nbr_offsets{std::nullopt};
   std::optional<rmm::device_uvector<vertex_t>> minor_nbr_indices{std::nullopt};
 
diff --git a/cpp/src/prims/transform_e.cuh b/cpp/src/prims/transform_e.cuh
index 93a2d040b60..5c83e0f1b71 100644
--- a/cpp/src/prims/transform_e.cuh
+++ b/cpp/src/prims/transform_e.cuh
@@ -42,7 +42,8 @@ namespace detail {
 
 int32_t constexpr transform_e_kernel_block_size = 512;
 
-template <typename GraphViewType,
+template <bool check_edge_mask,
+          typename GraphViewType,
           typename EdgePartitionSrcValueInputWrapper,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
@@ -56,7 +57,7 @@ __global__ void transform_e_packed_bool(
   EdgePartitionSrcValueInputWrapper edge_partition_src_value_input,
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
-  thrust::optional<EdgePartitionEdgeMaskWrapper> edge_partition_e_mask,
+  EdgePartitionEdgeMaskWrapper edge_partition_e_mask,
   EdgePartitionEdgeValueOutputWrapper edge_partition_e_value_output,
   EdgeOp e_op)
 {
@@ -72,35 +73,44 @@ __global__ void transform_e_packed_bool(
 
   auto num_edges = edge_partition.number_of_edges();
   while (idx < static_cast<edge_t>(packed_bool_size(num_edges))) {
-    auto edge_mask = packed_bool_full_mask();
-    if (edge_partition_e_mask) { edge_mask = *((*edge_partition_e_mask).value_first() + idx); }
+    [[maybe_unused]] auto edge_mask =
+      packed_bool_full_mask();  // relevant only when check_edge_mask is true
+    if constexpr (check_edge_mask) { edge_mask = *(edge_partition_e_mask.value_first() + idx); }
 
     auto local_edge_idx =
       idx * static_cast<edge_t>(packed_bools_per_word()) + static_cast<edge_t>(lane_id);
     int predicate{0};
 
-    if ((local_edge_idx < num_edges) && (edge_mask & packed_bool_mask(lane_id))) {
-      auto major_idx    = edge_partition.major_idx_from_local_edge_idx_nocheck(local_edge_idx);
-      auto major        = edge_partition.major_from_major_idx_nocheck(major_idx);
-      auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
-      auto minor        = *(edge_partition.indices() + local_edge_idx);
-      auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-
-      auto src        = GraphViewType::is_storage_transposed ? minor : major;
-      auto dst        = GraphViewType::is_storage_transposed ? major : minor;
-      auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
-      auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
-      predicate       = e_op(src,
-                       dst,
-                       edge_partition_src_value_input.get(src_offset),
-                       edge_partition_dst_value_input.get(dst_offset),
-                       edge_partition_e_value_input.get(local_edge_idx))
-                          ? int{1}
-                          : int{0};
+    if (local_edge_idx < num_edges) {
+      // `!=` binds tighter than `&`; the bit test below must be parenthesized explicitly.
+      bool compute_predicate = true;
+      if constexpr (check_edge_mask) {
+        compute_predicate = ((edge_mask & packed_bool_mask(lane_id)) != packed_bool_empty_mask());
+      }
+      if (compute_predicate) {
+        auto major_idx    = edge_partition.major_idx_from_local_edge_idx_nocheck(local_edge_idx);
+        auto major        = edge_partition.major_from_major_idx_nocheck(major_idx);
+        auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
+        auto minor        = *(edge_partition.indices() + local_edge_idx);
+        auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
+
+        auto src        = GraphViewType::is_storage_transposed ? minor : major;
+        auto dst        = GraphViewType::is_storage_transposed ? major : minor;
+        auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
+        auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
+        predicate       = e_op(src,
+                         dst,
+                         edge_partition_src_value_input.get(src_offset),
+                         edge_partition_dst_value_input.get(dst_offset),
+                         edge_partition_e_value_input.get(local_edge_idx))
+                            ? int{1}
+                            : int{0};
+      }
+    }
+
     uint32_t new_val = __ballot_sync(uint32_t{0xffffffff}, predicate);
     if (lane_id == 0) {
-      if (edge_mask == packed_bool_full_mask()) {
+      if constexpr (!check_edge_mask) {  // unmasked: overwrite the word; masked: old_val path
         *(edge_partition_e_value_output.value_first() + idx) = new_val;
       } else {
         auto old_val = *(edge_partition_e_value_output.value_first() + idx);
@@ -112,6 +122,99 @@ __global__ void transform_e_packed_bool(
   }
 }
 
+// Functor applying e_op to edges and storing the result in the edge value output. When
+// check_edge_mask is true, edges whose mask bit is unset are skipped by both overloads;
+// when false, EdgePartitionEdgeMaskWrapper may be a dummy type since it is never accessed.
+template <bool check_edge_mask,
+          typename GraphViewType,
+          typename EdgePartitionSrcValueInputWrapper,
+          typename EdgePartitionDstValueInputWrapper,
+          typename EdgePartitionEdgeValueInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
+          typename EdgeOp,
+          typename EdgeValueOutputWrapper>
+struct update_e_value_t {
+  edge_partition_device_view_t<typename GraphViewType::vertex_type,
+                               typename GraphViewType::edge_type,
+                               GraphViewType::is_multi_gpu>
+    edge_partition{};
+  EdgePartitionSrcValueInputWrapper edge_partition_src_value_input{};
+  EdgePartitionDstValueInputWrapper edge_partition_dst_value_input{};
+  EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input{};
+  EdgePartitionEdgeMaskWrapper edge_partition_e_mask{};
+  EdgeOp e_op{};
+  EdgeValueOutputWrapper edge_partition_e_value_output{};
+
+  // Update every local edge whose (major, minor) matches the given pair (all duplicates).
+  __device__ void operator()(thrust::tuple<typename GraphViewType::vertex_type,
+                                           typename GraphViewType::vertex_type> edge) const
+  {
+    using vertex_t = typename GraphViewType::vertex_type;
+    using edge_t   = typename GraphViewType::edge_type;
+
+    auto major = thrust::get<0>(edge);
+    auto minor = thrust::get<1>(edge);
+
+    auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
+    auto major_idx    = edge_partition.major_idx_from_major_nocheck(major);
+    assert(major_idx);
+
+    auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
+
+    vertex_t const* indices{nullptr};
+    edge_t edge_offset{};
+    edge_t local_degree{};
+    thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(*major_idx);
+    auto lower_it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor);
+    auto upper_it = thrust::upper_bound(thrust::seq, lower_it, indices + local_degree, minor);
+
+    auto src        = GraphViewType::is_storage_transposed ? minor : major;
+    auto dst        = GraphViewType::is_storage_transposed ? major : minor;
+    auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
+    auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
+
+    for (auto it = lower_it; it != upper_it; ++it) {
+      assert(*it == minor);
+      auto local_edge_idx = edge_offset + thrust::distance(indices, it);
+      if constexpr (check_edge_mask) {
+        if (!edge_partition_e_mask.get(local_edge_idx)) { continue; }  // masked-out edge
+      }
+      auto e_op_result = e_op(src,
+                              dst,
+                              edge_partition_src_value_input.get(src_offset),
+                              edge_partition_dst_value_input.get(dst_offset),
+                              edge_partition_e_value_input.get(local_edge_idx));
+      edge_partition_e_value_output.set(local_edge_idx, e_op_result);
+    }
+  }
+
+  // Update the edge at local edge index i. The mask has to be honored here as well, since
+  // i may refer to a masked-out edge when the functor is applied to a range of edge indices.
+  __device__ void operator()(typename GraphViewType::edge_type i) const
+  {
+    if constexpr (check_edge_mask) {
+      if (!edge_partition_e_mask.get(i)) { return; }
+    }
+
+    auto major_idx    = edge_partition.major_idx_from_local_edge_idx_nocheck(i);
+    auto major        = edge_partition.major_from_major_idx_nocheck(major_idx);
+    auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
+    auto minor        = *(edge_partition.indices() + i);
+    auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
+
+    auto src         = GraphViewType::is_storage_transposed ? minor : major;
+    auto dst         = GraphViewType::is_storage_transposed ? major : minor;
+    auto src_offset  = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
+    auto dst_offset  = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
+    auto e_op_result = e_op(src,
+                            dst,
+                            edge_partition_src_value_input.get(src_offset),
+                            edge_partition_dst_value_input.get(dst_offset),
+                            edge_partition_e_value_input.get(i));
+    edge_partition_e_value_output.set(i, e_op_result);
+  }
+};
+
 }  // namespace detail
 
 /**
@@ -228,47 +331,68 @@ void transform_e(raft::handle_t const& handle,
         raft::grid_1d_thread_t update_grid(num_edges,
                                            detail::transform_e_kernel_block_size,
                                            handle.get_device_properties().maxGridSize[0]);
-        detail::transform_e_packed_bool<GraphViewType>
-          <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
-            edge_partition,
-            edge_partition_src_value_input,
-            edge_partition_dst_value_input,
-            edge_partition_e_value_input,
-            edge_partition_e_mask,
-            edge_partition_e_value_output,
-            e_op);
+        if (edge_partition_e_mask) {
+          detail::transform_e_packed_bool<true, GraphViewType>
+            <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
+              edge_partition,
+              edge_partition_src_value_input,
+              edge_partition_dst_value_input,
+              edge_partition_e_value_input,
+              *edge_partition_e_mask,
+              edge_partition_e_value_output,
+              e_op);
+        } else {
+          detail::transform_e_packed_bool<false, GraphViewType>
+            <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
+              edge_partition,
+              edge_partition_src_value_input,
+              edge_partition_dst_value_input,
+              edge_partition_e_value_input,
+              std::byte{},  // dummy
+              edge_partition_e_value_output,
+              e_op);
+        }
       }
     } else {
-      thrust::for_each(
-        handle.get_thrust_policy(),
-        thrust::make_counting_iterator(edge_t{0}),
-        thrust::make_counting_iterator(num_edges),
-        [e_op,
-         edge_partition,
-         edge_partition_src_value_input,
-         edge_partition_dst_value_input,
-         edge_partition_e_value_input,
-         edge_partition_e_mask,
-         edge_partition_e_value_output] __device__(edge_t i) {
-          if (!edge_partition_e_mask || (*edge_partition_e_mask).get(i)) {
-            auto major_idx    = edge_partition.major_idx_from_local_edge_idx_nocheck(i);
-            auto major        = edge_partition.major_from_major_idx_nocheck(major_idx);
-            auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
-            auto minor        = *(edge_partition.indices() + i);
-            auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-
-            auto src         = GraphViewType::is_storage_transposed ? minor : major;
-            auto dst         = GraphViewType::is_storage_transposed ? major : minor;
-            auto src_offset  = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
-            auto dst_offset  = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
-            auto e_op_result = e_op(src,
-                                    dst,
-                                    edge_partition_src_value_input.get(src_offset),
-                                    edge_partition_dst_value_input.get(dst_offset),
-                                    edge_partition_e_value_input.get(i));
-            edge_partition_e_value_output.set(i, e_op_result);
-          }
-        });
+      if (edge_partition_e_mask) {  // functor must hold the mask view by value (device copy)
+        thrust::for_each(handle.get_thrust_policy(),
+                         thrust::make_counting_iterator(edge_t{0}),
+                         thrust::make_counting_iterator(num_edges),
+                         detail::update_e_value_t<true,
+                                                  GraphViewType,
+                                                  edge_partition_src_input_device_view_t,
+                                                  edge_partition_dst_input_device_view_t,
+                                                  edge_partition_e_input_device_view_t,
+                                                  std::remove_reference_t<decltype(*edge_partition_e_mask)>,
+                                                  EdgeOp,
+                                                  edge_partition_e_output_device_view_t>{
+                           edge_partition,
+                           edge_partition_src_value_input,
+                           edge_partition_dst_value_input,
+                           edge_partition_e_value_input,
+                           *edge_partition_e_mask,
+                           e_op,
+                           edge_partition_e_value_output});
+      } else {
+        thrust::for_each(handle.get_thrust_policy(),
+                         thrust::make_counting_iterator(edge_t{0}),
+                         thrust::make_counting_iterator(num_edges),
+                         detail::update_e_value_t<false,
+                                                  GraphViewType,
+                                                  edge_partition_src_input_device_view_t,
+                                                  edge_partition_dst_input_device_view_t,
+                                                  edge_partition_e_input_device_view_t,
+                                                  std::byte,  // dummy
+                                                  EdgeOp,
+                                                  edge_partition_e_output_device_view_t>{
+                           edge_partition,
+                           edge_partition_src_value_input,
+                           edge_partition_dst_value_input,
+                           edge_partition_e_value_input,
+                           std::byte{},  // dummy
+                           e_op,
+                           edge_partition_e_value_output});
+      }
     }
   }
 }
@@ -467,53 +591,45 @@ void transform_e(raft::handle_t const& handle,
     auto edge_partition_e_value_output =
       edge_partition_e_output_device_view_t(edge_value_output, i);
 
-    thrust::for_each(
-      handle.get_thrust_policy(),
-      edge_first + edge_partition_offsets[i],
-      edge_first + edge_partition_offsets[i + 1],
-      [e_op,
-       edge_partition,
-       edge_partition_src_value_input,
-       edge_partition_dst_value_input,
-       edge_partition_e_value_input,
-       edge_partition_e_mask,
-       edge_partition_e_value_output] __device__(thrust::tuple<vertex_t, vertex_t> edge) {
-        auto major = thrust::get<0>(edge);
-        auto minor = thrust::get<1>(edge);
-
-        auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
-        auto major_idx    = edge_partition.major_idx_from_major_nocheck(major);
-        assert(major_idx);
-
-        auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-
-        vertex_t const* indices{nullptr};
-        edge_t edge_offset{};
-        edge_t local_degree{};
-        thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(*major_idx);
-        auto lower_it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor);
-        auto upper_it = thrust::upper_bound(thrust::seq, lower_it, indices + local_degree, minor);
-
-        auto src        = GraphViewType::is_storage_transposed ? minor : major;
-        auto dst        = GraphViewType::is_storage_transposed ? major : minor;
-        auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
-        auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
-
-        for (auto it = lower_it; it != upper_it; ++it) {
-          assert(*it == minor);
-          if (!edge_partition_e_mask ||
-              ((*edge_partition_e_mask).get(edge_offset + thrust::distance(indices, it)))) {
-            auto e_op_result =
-              e_op(src,
-                   dst,
-                   edge_partition_src_value_input.get(src_offset),
-                   edge_partition_dst_value_input.get(dst_offset),
-                   edge_partition_e_value_input.get(edge_offset + thrust::distance(indices, it)));
-            edge_partition_e_value_output.set(edge_offset + thrust::distance(indices, it),
-                                              e_op_result);
-          }
-        }
-      });
+    if (edge_partition_e_mask) {  // functor must hold the mask view by value (device copy)
+      thrust::for_each(handle.get_thrust_policy(),
+                       edge_first + edge_partition_offsets[i],
+                       edge_first + edge_partition_offsets[i + 1],
+                       detail::update_e_value_t<true,
+                                                GraphViewType,
+                                                edge_partition_src_input_device_view_t,
+                                                edge_partition_dst_input_device_view_t,
+                                                edge_partition_e_input_device_view_t,
+                                                std::remove_reference_t<decltype(*edge_partition_e_mask)>,
+                                                EdgeOp,
+                                                edge_partition_e_output_device_view_t>{
+                         edge_partition,
+                         edge_partition_src_value_input,
+                         edge_partition_dst_value_input,
+                         edge_partition_e_value_input,
+                         *edge_partition_e_mask,
+                         e_op,
+                         edge_partition_e_value_output});
+    } else {
+      thrust::for_each(handle.get_thrust_policy(),
+                       edge_first + edge_partition_offsets[i],
+                       edge_first + edge_partition_offsets[i + 1],
+                       detail::update_e_value_t<false,
+                                                GraphViewType,
+                                                edge_partition_src_input_device_view_t,
+                                                edge_partition_dst_input_device_view_t,
+                                                edge_partition_e_input_device_view_t,
+                                                std::byte,  // dummy
+                                                EdgeOp,
+                                                edge_partition_e_output_device_view_t>{
+                         edge_partition,
+                         edge_partition_src_value_input,
+                         edge_partition_dst_value_input,
+                         edge_partition_e_value_input,
+                         std::byte{},  // dummy
+                         e_op,
+                         edge_partition_e_value_output});
+    }
   }
 }
 
diff --git a/cpp/src/prims/transform_reduce_e.cuh b/cpp/src/prims/transform_reduce_e.cuh
index 483ab64dcd9..7acc7461268 100644
--- a/cpp/src/prims/transform_reduce_e.cuh
+++ b/cpp/src/prims/transform_reduce_e.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
  */
 #pragma once
 
+#include <prims/detail/prim_functors.cuh>
 #include <prims/property_op_utils.cuh>
 
 #include <cugraph/edge_partition_device_view.cuh>
@@ -89,48 +90,51 @@ __global__ void transform_reduce_e_hypersparse(
   while (idx < static_cast<size_t>(dcs_nzd_vertex_count)) {
     auto major =
       *(edge_partition.major_from_major_hypersparse_idx_nocheck(static_cast<vertex_t>(idx)));
+    auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
     auto major_idx =
       major_start_offset + idx;  // major_offset != major_idx in the hypersparse region
     vertex_t const* indices{nullptr};
     edge_t edge_offset{};
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_idx);
-    auto sum                                        = thrust::transform_reduce(
-      thrust::seq,
-      thrust::make_counting_iterator(edge_t{0}),
-      thrust::make_counting_iterator(local_degree),
-      [&edge_partition,
-       &edge_partition_src_value_input,
-       &edge_partition_dst_value_input,
-       &edge_partition_e_value_input,
-       &edge_partition_e_mask,
-       &e_op,
-       major,
-       indices,
-       edge_offset] __device__(auto i) {
-        if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) {
-          auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
-          auto minor        = indices[i];
-          auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-          auto src          = GraphViewType::is_storage_transposed ? minor : major;
-          auto dst          = GraphViewType::is_storage_transposed ? major : minor;
-          auto src_offset   = GraphViewType::is_storage_transposed
-                                                                       ? minor_offset
-                                                                       : static_cast<vertex_t>(major_offset);
-          auto dst_offset   = GraphViewType::is_storage_transposed
-                                                                       ? static_cast<vertex_t>(major_offset)
-                                                                       : minor_offset;
-          return e_op(src,
-                      dst,
-                      edge_partition_src_value_input.get(src_offset),
-                      edge_partition_dst_value_input.get(dst_offset),
-                      edge_partition_e_value_input.get(edge_offset + i));
-        } else {
-          return e_op_result_t{};
-        }
-      },
-      e_op_result_t{},
-      edge_property_add);
+
+    auto call_e_op = call_e_op_t<GraphViewType,
+                                 EdgePartitionSrcValueInputWrapper,
+                                 EdgePartitionDstValueInputWrapper,
+                                 EdgePartitionEdgeValueInputWrapper,
+                                 EdgeOp>{edge_partition,
+                                         edge_partition_src_value_input,
+                                         edge_partition_dst_value_input,
+                                         edge_partition_e_value_input,
+                                         e_op,
+                                         major,
+                                         major_offset,
+                                         indices,
+                                         edge_offset};
+
+    e_op_result_t sum{};
+    if (edge_partition_e_mask) {
+      sum = thrust::transform_reduce(
+        thrust::seq,
+        thrust::make_counting_iterator(edge_t{0}),
+        thrust::make_counting_iterator(local_degree),
+        [&edge_partition_e_mask, &call_e_op, edge_offset] __device__(auto i) {
+          if ((*edge_partition_e_mask).get(edge_offset + i)) {
+            return call_e_op(i);
+          } else {
+            return e_op_result_t{};
+          }
+        },
+        e_op_result_t{},
+        edge_property_add);
+    } else {
+      sum = thrust::transform_reduce(thrust::seq,
+                                     thrust::make_counting_iterator(edge_t{0}),
+                                     thrust::make_counting_iterator(local_degree),
+                                     call_e_op,
+                                     e_op_result_t{},
+                                     edge_property_add);
+    }
 
     e_op_result_sum = edge_property_add(e_op_result_sum, sum);
     idx += gridDim.x * blockDim.x;
@@ -175,50 +179,50 @@ __global__ void transform_reduce_e_low_degree(
   property_op<e_op_result_t, thrust::plus> edge_property_add{};
   e_op_result_t e_op_result_sum{};
   while (idx < static_cast<size_t>(major_range_last - major_range_first)) {
-    auto major_offset = major_start_offset + idx;
+    auto major_offset = static_cast<vertex_t>(major_start_offset + idx);
+    auto major        = edge_partition.major_from_major_offset_nocheck(major_offset);
     vertex_t const* indices{nullptr};
     edge_t edge_offset{};
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset);
-    auto sum                                        = thrust::transform_reduce(
-      thrust::seq,
-      thrust::make_counting_iterator(edge_t{0}),
-      thrust::make_counting_iterator(local_degree),
-      [&edge_partition,
-       &edge_partition_src_value_input,
-       &edge_partition_dst_value_input,
-       &edge_partition_e_value_input,
-       &edge_partition_e_mask,
-       &e_op,
-       major_offset,
-       indices,
-       edge_offset] __device__(auto i) {
-        if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) {
-          auto minor        = indices[i];
-          auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-          auto src          = GraphViewType::is_storage_transposed
-                                                                       ? minor
-                                                                       : edge_partition.major_from_major_offset_nocheck(major_offset);
-          auto dst          = GraphViewType::is_storage_transposed
-                                                                       ? edge_partition.major_from_major_offset_nocheck(major_offset)
-                                                                       : minor;
-          auto src_offset   = GraphViewType::is_storage_transposed
-                                                                       ? minor_offset
-                                                                       : static_cast<vertex_t>(major_offset);
-          auto dst_offset   = GraphViewType::is_storage_transposed
-                                                                       ? static_cast<vertex_t>(major_offset)
-                                                                       : minor_offset;
-          return e_op(src,
-                      dst,
-                      edge_partition_src_value_input.get(src_offset),
-                      edge_partition_dst_value_input.get(dst_offset),
-                      edge_partition_e_value_input.get(edge_offset + i));
-        } else {
-          return e_op_result_t{};
-        }
-      },
-      e_op_result_t{},
-      edge_property_add);
+
+    auto call_e_op = call_e_op_t<GraphViewType,
+                                 EdgePartitionSrcValueInputWrapper,
+                                 EdgePartitionDstValueInputWrapper,
+                                 EdgePartitionEdgeValueInputWrapper,
+                                 EdgeOp>{edge_partition,
+                                         edge_partition_src_value_input,
+                                         edge_partition_dst_value_input,
+                                         edge_partition_e_value_input,
+                                         e_op,
+                                         major,
+                                         major_offset,
+                                         indices,
+                                         edge_offset};
+
+    e_op_result_t sum{};
+    if (edge_partition_e_mask) {
+      sum = thrust::transform_reduce(
+        thrust::seq,
+        thrust::make_counting_iterator(edge_t{0}),
+        thrust::make_counting_iterator(local_degree),
+        [&edge_partition_e_mask, &call_e_op, edge_offset] __device__(auto i) {
+          if ((*edge_partition_e_mask).get(edge_offset + i)) {
+            return call_e_op(i);
+          } else {
+            return e_op_result_t{};
+          }
+        },
+        e_op_result_t{},
+        edge_property_add);
+    } else {
+      sum = thrust::transform_reduce(thrust::seq,
+                                     thrust::make_counting_iterator(edge_t{0}),
+                                     thrust::make_counting_iterator(local_degree),
+                                     call_e_op,
+                                     e_op_result_t{},
+                                     edge_property_add);
+    }
 
     e_op_result_sum = edge_property_add(e_op_result_sum, sum);
     idx += gridDim.x * blockDim.x;
@@ -264,30 +268,37 @@ __global__ void transform_reduce_e_mid_degree(
   property_op<e_op_result_t, thrust::plus> edge_property_add{};
   e_op_result_t e_op_result_sum{};
   while (idx < static_cast<size_t>(major_range_last - major_range_first)) {
-    auto major_offset = major_start_offset + idx;
+    auto major_offset = static_cast<vertex_t>(major_start_offset + idx);
+    auto major        = edge_partition.major_from_major_offset_nocheck(major_offset);
     vertex_t const* indices{nullptr};
     edge_t edge_offset{};
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset);
-    for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
-      if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) {
-        auto minor        = indices[i];
-        auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-        auto src          = GraphViewType::is_storage_transposed
-                              ? minor
-                              : edge_partition.major_from_major_offset_nocheck(major_offset);
-        auto dst          = GraphViewType::is_storage_transposed
-                              ? edge_partition.major_from_major_offset_nocheck(major_offset)
-                              : minor;
-        auto src_offset =
-          GraphViewType::is_storage_transposed ? minor_offset : static_cast<vertex_t>(major_offset);
-        auto dst_offset =
-          GraphViewType::is_storage_transposed ? static_cast<vertex_t>(major_offset) : minor_offset;
-        auto e_op_result = e_op(src,
-                                dst,
-                                edge_partition_src_value_input.get(src_offset),
-                                edge_partition_dst_value_input.get(dst_offset),
-                                edge_partition_e_value_input.get(edge_offset + i));
+
+    auto call_e_op = call_e_op_t<GraphViewType,
+                                 EdgePartitionSrcValueInputWrapper,
+                                 EdgePartitionDstValueInputWrapper,
+                                 EdgePartitionEdgeValueInputWrapper,
+                                 EdgeOp>{edge_partition,
+                                         edge_partition_src_value_input,
+                                         edge_partition_dst_value_input,
+                                         edge_partition_e_value_input,
+                                         e_op,
+                                         major,
+                                         major_offset,
+                                         indices,
+                                         edge_offset};
+
+    if (edge_partition_e_mask) {
+      for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
+        if ((*edge_partition_e_mask).get(edge_offset + i)) {
+          auto e_op_result = call_e_op(i);
+          e_op_result_sum  = edge_property_add(e_op_result_sum, e_op_result);
+        }
+      }
+    } else {
+      for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
+        auto e_op_result = call_e_op(i);
         e_op_result_sum  = edge_property_add(e_op_result_sum, e_op_result);
       }
     }
@@ -331,30 +342,37 @@ __global__ void transform_reduce_e_high_degree(
   property_op<e_op_result_t, thrust::plus> edge_property_add{};
   e_op_result_t e_op_result_sum{};
   while (idx < static_cast<size_t>(major_range_last - major_range_first)) {
-    auto major_offset = major_start_offset + idx;
+    auto major_offset = static_cast<vertex_t>(major_start_offset + idx);
+    auto major        = edge_partition.major_from_major_offset_nocheck(major_offset);
     vertex_t const* indices{nullptr};
     edge_t edge_offset{};
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset);
-    for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
-      if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) {
-        auto minor        = indices[i];
-        auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-        auto src          = GraphViewType::is_storage_transposed
-                              ? minor
-                              : edge_partition.major_from_major_offset_nocheck(major_offset);
-        auto dst          = GraphViewType::is_storage_transposed
-                              ? edge_partition.major_from_major_offset_nocheck(major_offset)
-                              : minor;
-        auto src_offset =
-          GraphViewType::is_storage_transposed ? minor_offset : static_cast<vertex_t>(major_offset);
-        auto dst_offset =
-          GraphViewType::is_storage_transposed ? static_cast<vertex_t>(major_offset) : minor_offset;
-        auto e_op_result = e_op(src,
-                                dst,
-                                edge_partition_src_value_input.get(src_offset),
-                                edge_partition_dst_value_input.get(dst_offset),
-                                edge_partition_e_value_input.get(edge_offset + i));
+
+    auto call_e_op = call_e_op_t<GraphViewType,
+                                 EdgePartitionSrcValueInputWrapper,
+                                 EdgePartitionDstValueInputWrapper,
+                                 EdgePartitionEdgeValueInputWrapper,
+                                 EdgeOp>{edge_partition,
+                                         edge_partition_src_value_input,
+                                         edge_partition_dst_value_input,
+                                         edge_partition_e_value_input,
+                                         e_op,
+                                         major,
+                                         major_offset,
+                                         indices,
+                                         edge_offset};
+
+    if (edge_partition_e_mask) {
+      for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
+        if ((*edge_partition_e_mask).get(edge_offset + i)) {
+          auto e_op_result = call_e_op(i);
+          e_op_result_sum  = edge_property_add(e_op_result_sum, e_op_result);
+        }
+      }
+    } else {
+      for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
+        auto e_op_result = call_e_op(i);
         e_op_result_sum  = edge_property_add(e_op_result_sum, e_op_result);
       }
     }

From f63310c2f4d538c2872de1b355c1735f6de87406 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 12 Feb 2024 11:46:38 -0600
Subject: [PATCH 2/6] Add cuda-nvtx-dev, add missing CUDA library dependencies.
 (#4162)

This PR fixes issues with devcontainer builds where `cuda-nvtx-dev` was missing when building `libcugraph_etl`.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/cugraph/pull/4162
---
 conda/environments/all_cuda-118_arch-x86_64.yaml |  1 +
 conda/environments/all_cuda-120_arch-x86_64.yaml |  7 +++++++
 dependencies.yaml                                | 10 +++++++++-
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 43bc60d91fb..de8db94df1c 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -14,6 +14,7 @@ dependencies:
 - breathe
 - c-compiler
 - cmake>=3.26.4
+- cuda-nvtx
 - cuda-version=11.8
 - cudatoolkit
 - cudf==24.4.*
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index 1829f3dd860..03dade0ed1f 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -14,7 +14,10 @@ dependencies:
 - breathe
 - c-compiler
 - cmake>=3.26.4
+- cuda-cudart-dev
 - cuda-nvcc
+- cuda-nvtx-dev
+- cuda-profiler-api
 - cuda-version=12.0
 - cudf==24.4.*
 - cupy>=12.0.0
@@ -29,8 +32,12 @@ dependencies:
 - graphviz
 - gtest>=1.13.0
 - ipython
+- libcublas-dev
 - libcudf==24.4.*
 - libcugraphops==24.4.*
+- libcurand-dev
+- libcusolver-dev
+- libcusparse-dev
 - libraft-headers==24.4.*
 - libraft==24.4.*
 - librmm==24.4.*
diff --git a/dependencies.yaml b/dependencies.yaml
index cfefe3b9ff9..58354407bbc 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -327,10 +327,18 @@ dependencies:
           - matrix:
               cuda: "12.*"
             packages:
+              - cuda-cudart-dev
+              - cuda-nvtx-dev
+              - cuda-profiler-api
+              - libcublas-dev
+              - libcurand-dev
+              - libcusolver-dev
+              - libcusparse-dev
           - matrix:
               cuda: "11.*"
             packages:
               - cudatoolkit
+              - cuda-nvtx
   common_build:
     common:
       - output_types: [conda, pyproject]
@@ -345,6 +353,7 @@ dependencies:
           - cxx-compiler
           - gmock>=1.13.0
           - gtest>=1.13.0
+          - libcudf==24.4.*
           - libcugraphops==24.4.*
           - libraft-headers==24.4.*
           - libraft==24.4.*
@@ -438,7 +447,6 @@ dependencies:
         packages:
           - aiohttp
           - fsspec>=0.6.0
-          - libcudf==24.4.*
           - requests
           - nccl>=2.9.9
           - ucx-proc=*=gpu

From eb096ac5ba7831112d6dd39a75be7c4ce9328b21 Mon Sep 17 00:00:00 2001
From: Ray Douglass <ray@raydouglass.com>
Date: Mon, 12 Feb 2024 15:44:12 -0500
Subject: [PATCH 3/6] Update Changelog [skip ci]

---
 CHANGELOG.md | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d165cd7efc4..fe08c8aeb03 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,84 @@
+# cuGraph 24.02.00 (12 Feb 2024)
+
+## 🚨 Breaking Changes
+
+- Remove Experimental Wrappers from GNN Code ([#4070](https://github.com/rapidsai/cugraph/pull/4070)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+- Switch to scikit-build-core ([#4053](https://github.com/rapidsai/cugraph/pull/4053)) [@vyasr](https://github.com/vyasr)
+- Update to CCCL 2.2.0. ([#4052](https://github.com/rapidsai/cugraph/pull/4052)) [@bdice](https://github.com/bdice)
+
+## 🐛 Bug Fixes
+
+- Revert "Exclude tests from builds (#4147)" ([#4157](https://github.com/rapidsai/cugraph/pull/4157)) [@raydouglass](https://github.com/raydouglass)
+- Exclude tests from builds ([#4147](https://github.com/rapidsai/cugraph/pull/4147)) [@vyasr](https://github.com/vyasr)
+- Constraint pytorch-dependent wheel test to only run on amd64 ([#4133](https://github.com/rapidsai/cugraph/pull/4133)) [@tingyu66](https://github.com/tingyu66)
+- Removes the `networkx_algorithm` decorator to all SCC functions to disable dispatching to them ([#4120](https://github.com/rapidsai/cugraph/pull/4120)) [@rlratzel](https://github.com/rlratzel)
+- Correct `cugraph-pyg` package name used in wheels and fix test script ([#4083](https://github.com/rapidsai/cugraph/pull/4083)) [@tingyu66](https://github.com/tingyu66)
+- Fix Jaccard hang ([#4080](https://github.com/rapidsai/cugraph/pull/4080)) [@jnke2016](https://github.com/jnke2016)
+- Fix OOB error, BFS C API should validate that the source vertex is a valid vertex ([#4077](https://github.com/rapidsai/cugraph/pull/4077)) [@ChuckHastings](https://github.com/ChuckHastings)
+- [BUG]Fix non-type template parameter to cugraph::relabel ([#4064](https://github.com/rapidsai/cugraph/pull/4064)) [@naimnv](https://github.com/naimnv)
+- Fix MG weighted similarity test failure ([#4054](https://github.com/rapidsai/cugraph/pull/4054)) [@seunghwak](https://github.com/seunghwak)
+- MG C-API test failure fixes ([#4047](https://github.com/rapidsai/cugraph/pull/4047)) [@seunghwak](https://github.com/seunghwak)
+- Add a barrier before cugraph Graph creation ([#4046](https://github.com/rapidsai/cugraph/pull/4046)) [@VibhuJawa](https://github.com/VibhuJawa)
+- Fix % 0 bug in MG_SELECT_RANDOM_VERTICES test ([#4034](https://github.com/rapidsai/cugraph/pull/4034)) [@seunghwak](https://github.com/seunghwak)
+- Branch 24.02 merge branch 23.12 ([#4012](https://github.com/rapidsai/cugraph/pull/4012)) [@vyasr](https://github.com/vyasr)
+
+## 📖 Documentation
+
+- Updates nx-cugraph README.md with latest algos ([#4135](https://github.com/rapidsai/cugraph/pull/4135)) [@rlratzel](https://github.com/rlratzel)
+- corrected links in C API and added groups for support functions ([#4131](https://github.com/rapidsai/cugraph/pull/4131)) [@acostadon](https://github.com/acostadon)
+- Forward-merge branch-23.12 to branch-24.02 ([#4049](https://github.com/rapidsai/cugraph/pull/4049)) [@GPUtester](https://github.com/GPUtester)
+
+## 🚀 New Features
+
+- Implement has_edge() &amp; compute_multiplicity() ([#4096](https://github.com/rapidsai/cugraph/pull/4096)) [@seunghwak](https://github.com/seunghwak)
+- Update per_v_transform_reduce_incoming|outgoing_e to support edge masking ([#4085](https://github.com/rapidsai/cugraph/pull/4085)) [@seunghwak](https://github.com/seunghwak)
+- Remove Experimental Wrappers from GNN Code ([#4070](https://github.com/rapidsai/cugraph/pull/4070)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+- MNMG ECG ([#4030](https://github.com/rapidsai/cugraph/pull/4030)) [@naimnv](https://github.com/naimnv)
+- Replace graph_view.hpp::number_of_edges with compute_number_of_edges ([#4026](https://github.com/rapidsai/cugraph/pull/4026)) [@seunghwak](https://github.com/seunghwak)
+- Update count_if_e, transform_reduce_e, and transform_e to support edge masking ([#4001](https://github.com/rapidsai/cugraph/pull/4001)) [@seunghwak](https://github.com/seunghwak)
+- Sampling Performance Testing ([#3584](https://github.com/rapidsai/cugraph/pull/3584)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+
+## 🛠️ Improvements
+
+- Adds option to rapids_cpm_find for raft to disable hnswlib feature, adds updates for pytest 8 compat, temporarily skips IO intensive test in CI ([#4121](https://github.com/rapidsai/cugraph/pull/4121)) [@rlratzel](https://github.com/rlratzel)
+- Adds benchmarks for additional nx-cugraph 24.02 algos ([#4112](https://github.com/rapidsai/cugraph/pull/4112)) [@rlratzel](https://github.com/rlratzel)
+- nx-cugraph: use coverage to ensure all algorithms were run ([#4108](https://github.com/rapidsai/cugraph/pull/4108)) [@eriknw](https://github.com/eriknw)
+- nx-cugraph: rename `plc=` to `_plc=` ([#4106](https://github.com/rapidsai/cugraph/pull/4106)) [@eriknw](https://github.com/eriknw)
+- nx-cugraph: add `complement` and `reverse` ([#4103](https://github.com/rapidsai/cugraph/pull/4103)) [@eriknw](https://github.com/eriknw)
+- nx-cugraph: add `core_number` (undirected graphs only) ([#4100](https://github.com/rapidsai/cugraph/pull/4100)) [@eriknw](https://github.com/eriknw)
+- nx-cugraph: add `is_tree`, etc. ([#4097](https://github.com/rapidsai/cugraph/pull/4097)) [@eriknw](https://github.com/eriknw)
+- Optimize the drop-duplicate functionality ([#4095](https://github.com/rapidsai/cugraph/pull/4095)) [@jnke2016](https://github.com/jnke2016)
+- nx-cugraph: add triangles and clustering algorithms ([#4093](https://github.com/rapidsai/cugraph/pull/4093)) [@eriknw](https://github.com/eriknw)
+- nx-cugraph: PLC now handles isolated nodes; clean up our workarounds ([#4092](https://github.com/rapidsai/cugraph/pull/4092)) [@eriknw](https://github.com/eriknw)
+- Remove usages of rapids-env-update ([#4090](https://github.com/rapidsai/cugraph/pull/4090)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Provide explicit pool sizes and avoid RMM detail APIs ([#4086](https://github.com/rapidsai/cugraph/pull/4086)) [@harrism](https://github.com/harrism)
+- refactor CUDA versions in dependencies.yaml ([#4084](https://github.com/rapidsai/cugraph/pull/4084)) [@jameslamb](https://github.com/jameslamb)
+- build wheels for `cugraph-dgl` and `cugraph-pyg` ([#4075](https://github.com/rapidsai/cugraph/pull/4075)) [@tingyu66](https://github.com/tingyu66)
+- Match weight-sharing option of GATConv in DGL ([#4074](https://github.com/rapidsai/cugraph/pull/4074)) [@tingyu66](https://github.com/tingyu66)
+- nx-cugraph: add weakly connected components ([#4071](https://github.com/rapidsai/cugraph/pull/4071)) [@eriknw](https://github.com/eriknw)
+- nx-cugraph: indicate which plc algorithms are used and version_added ([#4069](https://github.com/rapidsai/cugraph/pull/4069)) [@eriknw](https://github.com/eriknw)
+- Adds `nx-cugraph` benchmarks for 23.12 algos (SSSP, pagerank, hits, katz_centrality, degree_centrality, eigenvector_centrality) ([#4065](https://github.com/rapidsai/cugraph/pull/4065)) [@rlratzel](https://github.com/rlratzel)
+- `nx-cugraph`: add `to_undirected` method; add reciprocity algorithms ([#4063](https://github.com/rapidsai/cugraph/pull/4063)) [@eriknw](https://github.com/eriknw)
+- Switch to scikit-build-core ([#4053](https://github.com/rapidsai/cugraph/pull/4053)) [@vyasr](https://github.com/vyasr)
+- Update to CCCL 2.2.0. ([#4052](https://github.com/rapidsai/cugraph/pull/4052)) [@bdice](https://github.com/bdice)
+- Prevent `actions/labeler` from adding `Label Checker` labels ([#4048](https://github.com/rapidsai/cugraph/pull/4048)) [@ajschmidt8](https://github.com/ajschmidt8)
+- Update dependencies.yaml to new pip index ([#4045](https://github.com/rapidsai/cugraph/pull/4045)) [@vyasr](https://github.com/vyasr)
+- Remove checks for Pascal, no longer supported ([#4044](https://github.com/rapidsai/cugraph/pull/4044)) [@ChuckHastings](https://github.com/ChuckHastings)
+- Fix HITS convergence error. ([#4043](https://github.com/rapidsai/cugraph/pull/4043)) [@seunghwak](https://github.com/seunghwak)
+- Test select_random_vertices for all possible values of flags ([#4042](https://github.com/rapidsai/cugraph/pull/4042)) [@naimnv](https://github.com/naimnv)
+- Remove CUGRAPH_BUILD_WHEELS and standardize Python builds ([#4041](https://github.com/rapidsai/cugraph/pull/4041)) [@vyasr](https://github.com/vyasr)
+- Create `cugraph-equivariant` package ([#4036](https://github.com/rapidsai/cugraph/pull/4036)) [@tingyu66](https://github.com/tingyu66)
+- [FEA]: Add DASK edgelist and graph support to the Dataset API ([#4035](https://github.com/rapidsai/cugraph/pull/4035)) [@huiyuxie](https://github.com/huiyuxie)
+- Add support for Louvain to MTMG ([#4033](https://github.com/rapidsai/cugraph/pull/4033)) [@ChuckHastings](https://github.com/ChuckHastings)
+- Clean up self-loop and multi-edge removal logic ([#4032](https://github.com/rapidsai/cugraph/pull/4032)) [@ChuckHastings](https://github.com/ChuckHastings)
+- Mtmg updates for rmm ([#4031](https://github.com/rapidsai/cugraph/pull/4031)) [@ChuckHastings](https://github.com/ChuckHastings)
+- nx-cugraph: adds `ancestors`, `descendants`, and BFS algos ([#4029](https://github.com/rapidsai/cugraph/pull/4029)) [@eriknw](https://github.com/eriknw)
+- nx-cugraph: update usage of `nodes_or_number` for nx compat ([#4028](https://github.com/rapidsai/cugraph/pull/4028)) [@eriknw](https://github.com/eriknw)
+- Removes unsupported `setup.py` calls, cleans up text ([#4024](https://github.com/rapidsai/cugraph/pull/4024)) [@rlratzel](https://github.com/rlratzel)
+- Resolves conflicts from forward-merging branch-23.12 into branch-24.02 ([#4020](https://github.com/rapidsai/cugraph/pull/4020)) [@rlratzel](https://github.com/rlratzel)
+- Add `HeteroGATConv` to `cugraph-pyg` ([#3914](https://github.com/rapidsai/cugraph/pull/3914)) [@tingyu66](https://github.com/tingyu66)
+- Update for CCCL 2.x ([#3862](https://github.com/rapidsai/cugraph/pull/3862)) [@seunghwak](https://github.com/seunghwak)
+
 # cuGraph 23.12.00 (6 Dec 2023)
 
 ## 🚨 Breaking Changes

From fe69e7fbf24617a21a5e26d95ca9db9f95ca9f2e Mon Sep 17 00:00:00 2001
From: Ralph Liu <137829296+nv-rliu@users.noreply.github.com>
Date: Tue, 13 Feb 2024 11:24:55 -0500
Subject: [PATCH 4/6] Fix `FutureWarnings` in Graph Creation (#4167)

Addresses #4163

This PR uses `iloc` when referencing the index in a Series object, which removes the `FutureWarning` that was seen at graph creation.

Authors:
  - Ralph Liu (https://github.com/nv-rliu)

Approvers:
  - Rick Ratzel (https://github.com/rlratzel)

URL: https://github.com/rapidsai/cugraph/pull/4167
---
 python/cugraph/cugraph/structure/number_map.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/cugraph/cugraph/structure/number_map.py b/python/cugraph/cugraph/structure/number_map.py
index d7da20f9d84..b0118fee960 100644
--- a/python/cugraph/cugraph/structure/number_map.py
+++ b/python/cugraph/cugraph/structure/number_map.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -497,7 +497,7 @@ def renumber_and_segment(
             # can't determine the edgelist input type
             unrenumbered_id_type = None
         else:
-            unrenumbered_id_type = df.dtypes[0]
+            unrenumbered_id_type = df.dtypes.iloc[0]
 
         if np.int64 in list(df.dtypes):
             renumber_id_type = np.int64

From b93e1148d3c2149fb1422492c570676eeb473b97 Mon Sep 17 00:00:00 2001
From: Naim <110031745+naimnv@users.noreply.github.com>
Date: Tue, 13 Feb 2024 18:24:54 +0100
Subject: [PATCH 5/6] Add a new notebook for SNMG benchmark runs (#4091)

This PR adds a new notebook for SNMG benchmark runs. It
- [x]  simplifies bookkeeping and data presentation.
- [x]  updates each cugraph function wrapper to handle both SG and MG calls depending on a flag. The objective is to have a common set of function wrappers for SG, SNMG and MNMG and put them in a common module to avoid duplication.

Authors:
  - Naim (https://github.com/naimnv)
  - Ralph Liu (https://github.com/nv-rliu)

Approvers:
  - Don Acosta (https://github.com/acostadon)
  - Brad Rees (https://github.com/BradReesWork)
  - Joseph Nke (https://github.com/jnke2016)

URL: https://github.com/rapidsai/cugraph/pull/4091
---
 .../synth_release_single_node_multi_gpu.ipynb | 950 ++++++++++++++++++
 1 file changed, 950 insertions(+)
 create mode 100644 notebooks/cugraph_benchmarks/synth_release_single_node_multi_gpu.ipynb

diff --git a/notebooks/cugraph_benchmarks/synth_release_single_node_multi_gpu.ipynb b/notebooks/cugraph_benchmarks/synth_release_single_node_multi_gpu.ipynb
new file mode 100644
index 00000000000..c44f475c441
--- /dev/null
+++ b/notebooks/cugraph_benchmarks/synth_release_single_node_multi_gpu.ipynb
@@ -0,0 +1,950 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# Skip notebook test\n",
+    "-----\n",
+    "\n",
+    "#### NOTE:  This notebook will take hours to run.\n",
+    "-----\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# Comparing NetworkX vs cuGraph using synthetic data on various algorithms on single node multi GPU (SNMG) cluster\n",
+    "\n",
+    "\n",
+    "This notebook compares the execution times of many of the cuGraph and NetworkX algorithms when run against identical synthetic data at multiple scales.\n",
+    "\n",
+    "This notebook uses the RMAT data generator which allows the creation of graphs at various scales.  The notebook, by default, runs on a set of selected sizes but users are free to change or add to that list."
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Notebook Credits\n",
+    "\n",
+    "    \n",
+    "| Author        |    Date    |  Update             | cuGraph Version |  Test Hardware         |\n",
+    "| --------------|------------|---------------------|-----------------|------------------------|\n",
+    "| Don Acosta    | 1/12/2023  | Created             | 23.02 nightly   | RTX A6000, CUDA 11.7   |\n",
+    "| Brad Rees     | 1/27/2023  | Modified            | 23.02 nightly   | RTX A6000, CUDA 11.7   |\n",
+    "| Naim, Md      | 2/08/2024  | Modified for SNMG   | 24.04 nightly   | RTX A6000, CUDA 12.0   |\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "### Timing "
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "When looking at the overall workflow, NetworkX and cuGraph do things differently.  For example, NetworkX spends a lot of time creating the graph data structure.  cuGraph on the other hand does a lazy creation of the data structure when an algorithm is called.  To further complicate the comparison problem, NetworkX does not always return the answer.  In some cases, it returns a generator that is then called to produce the data.  \n",
+    "\n",
+    "This benchmark produces two performance metrics:\n",
+    " - (1)\tJust the algorithm run time \n",
+    " - (2)\tThe algorithm plus graph creation time\n",
+    "\n",
+    "Since GPU memory is a precious resource, having a lot of temporary data lying around is avoided.  So once a graph is created, the raw data is dropped.  \n",
+    " \n",
+    "__What is not timed__:  Generating the data with R-MAT</p>\n",
+    "__What is timed__:     (1) creating a Graph, (2) running the algorithm (3) run any generators\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Algorithms"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "|        Algorithm        |  Type         | Undirected Graph | Directed Graph |   Notes\n",
+    "| ------------------------|---------------|------ | ------- |-------------\n",
+    "| Katz                    | Centrality    |   X   |         | \n",
+    "| Betweenness Centrality  | Centrality    |   X   |         | Estimated, k = 100\n",
+    "| Louvain                 | Community     |   X   |         | Uses python-louvain for comparison\n",
+    "| Triangle Counting       | Community     |   X   |         |\n",
+    "| Core Number             | Core          |   X   |         |\n",
+    "| PageRank                | Link Analysis |       |    X    |\n",
+    "| Jaccard                 | Similarity    |   X   |         |\n",
+    "| BFS                     | Traversal     |   X   |         | No depth limit\n",
+    "| SSSP                    | Traversal     |   X   |         | \n",
+    "\n",
+    "\n",
+    "### Test Data\n",
+    "Data is generated using a Recursive MATrix (R-MAT) graph generation algorithm. \n",
+    "The generator specifics are documented [here](https://docs.rapids.ai/api/cugraph/stable/api_docs/generator.html)\n",
+    "\n",
+    "\n",
+    "\n",
+    "### Notes\n",
+    "* Running Betweenness Centrality on the full graph is prohibitive using NetworkX.  Anything over k=100 can explode runtime to days\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Import Modules"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# system and other\n",
+    "import gc\n",
+    "import os\n",
+    "from time import perf_counter\n",
+    "import pandas as pd\n",
+    "from collections import defaultdict\n",
+    "\n",
+    "# rapids\n",
+    "import cugraph\n",
+    "\n",
+    "# libraries to set up dask cluster and client\n",
+    "from dask.distributed import Client\n",
+    "from dask_cuda import LocalCUDACluster\n",
+    "from cugraph.dask.comms import comms as Comms\n",
+    "\n",
+    "# NetworkX libraries\n",
+    "import networkx as nx\n",
+    "\n",
+    "# RMAT data generator\n",
+    "from cugraph.generators import rmat\n",
+    "from cugraph.structure import NumberMap"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "try: \n",
+    "    import community\n",
+    "except ModuleNotFoundError:\n",
+    "    os.system('pip install python-louvain')\n",
+    "    import community"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Determine the scale of the test data\n",
+    "RMAT generates a graph where the number of vertices is a power of 2 and the number of edges is based on an edge factor times the number of vertices.\n",
+    "\n",
+    "Since RMAT tends to generate about 50% isolated vertices, those vertices are dropped from the graph data.  Hence the number of vertices is closer to (2 ** scale) / 2\n",
+    "\n",
+    "\n",
+    "| Scale | Vertices (est) | Edges  |\n",
+    "| ------|----------------|--------|\n",
+    "| 10 | 512 | 16,384 | \n",
+    "| 11 | 1,024 | 32,768| \n",
+    "| 12 | 2,048 | 65,536| \n",
+    "| 13 | 4,096 | 131,072| \n",
+    "| 14 | 8,192 | 262,144| \n",
+    "| 15 | 16,384 | 524,288 | \n",
+    "| 16 | 32,768 | 1,048,576 | \n",
+    "| 17 | 65,536 | 2,097,152 | \n",
+    "| 18 | 131,072 | 4,194,304 | \n",
+    "| 19 | 262,144 | 8,388,608 | \n",
+    "| 20 | 524,288 | 16,777,216 | \n",
+    "| 21 | 1,048,576 | 33,554,432 | \n",
+    "| 22 | 2,097,152 | 67,108,864 | \n",
+    "| 23 | 4,194,304 | 134,217,728 | \n",
+    "| 24 | 8,388,608 | 268,435,456 | \n",
+    "| 25 | 16,777,216 | 536,870,912 | \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Test Data Sizes\n",
+    "# Here you can create an array of test data sizes.   Then set the \"data\" variable to the array you want\n",
+    "# the dictionary format is 'name' : scale\n",
+    "\n",
+    "\n",
+    "# These scales are used by R-MAT to determine the number of vertices/edges in the synthetic data graph.\n",
+    "data_full = {\n",
+    "    'data_scale_10'   :  10,\n",
+    "    'data_scale_12'   :  12,\n",
+    "    'data_scale_14'  :   14,\n",
+    "    'data_scale_16'  :   16,\n",
+    "    'data_scale_18'  :   18,\n",
+    "    'data_scale_20'  :   20,\n",
+    "}\n",
+    "\n",
+    "# for quick testing\n",
+    "data_quick = {\n",
+    "   'data_scale_9' : 9,\n",
+    "   'data_scale_10' : 10,\n",
+    "   'data_scale_11' : 11,\n",
+    "}\n",
+    "\n",
+    "\n",
+    "# Which dataset is to be used\n",
+    "data = data_quick\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Generate data\n",
+    "The data is generated once for each size."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Data generator \n",
+    "#  The result is an edgelist of the size determined by the scale and edge factor\n",
+    "def generate_data(scale, edgefactor=16, mg=False):\n",
+    "    _gdf = rmat(\n",
+    "        scale,\n",
+    "        (2 ** scale) * edgefactor,\n",
+    "        0.57,\n",
+    "        0.19,\n",
+    "        0.19,\n",
+    "        42,\n",
+    "        clip_and_flip=False,\n",
+    "        scramble_vertex_ids=True,\n",
+    "        create_using=None,  # return edgelist instead of Graph instance\n",
+    "        mg=mg # determines whether generated data will be used on one or multiple GPUs\n",
+    "        )\n",
+    "\n",
+    "    clean_coo = NumberMap.renumber(_gdf, src_col_names=\"src\", dst_col_names=\"dst\")[0]\n",
+    "    if mg:  # rename() returns a new frame here, so the result must be reassigned\n",
+    "        clean_coo = clean_coo.rename(columns={\"renumbered_src\": \"src\", \"renumbered_dst\": \"dst\"})\n",
+    "    else:\n",
+    "        clean_coo.rename(columns={\"renumbered_src\": \"src\", \"renumbered_dst\": \"dst\"}, inplace=True)\n",
+    "\n",
+    "    print(f'Generated a dataframe of type {type(clean_coo)}, with {len(clean_coo)} edges')\n",
+    "    \n",
+    "    return clean_coo"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create Graph functions\n",
+    "There are two types of graphs created:\n",
+    "* Directed Graphs - calls to create_nx_graph / create_cu_graph with directed=True.\n",
+    "* Undirected Graphs - calls to create_nx_graph / create_cu_graph with directed=False (the default) <- fully symmetrized"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# NetworkX\n",
+    "def create_nx_graph(_df , directed=False):\n",
+    "    t1 = perf_counter()\n",
+    "    if directed:\n",
+    "        g_type = nx.DiGraph\n",
+    "    else:\n",
+    "        g_type = nx.Graph\n",
+    "    \n",
+    "    _gnx = nx.from_pandas_edgelist(_df,\n",
+    "                            source='src',\n",
+    "                            target='dst',\n",
+    "                            edge_attr=None,\n",
+    "                            create_using=g_type)\n",
+    "    t2 = perf_counter() - t1\n",
+    "\n",
+    "    return _gnx, t2\n",
+    "\n",
+    "\n",
+    "\n",
+    "# cuGraph\n",
+    "def create_cu_graph(_df, transpose=False, directed=False, mg=False):\n",
+    "    t1 = perf_counter()\n",
+    "    _g = cugraph.Graph(directed=directed)\n",
+    "\n",
+    "    if mg:\n",
+    "        _g.from_dask_cudf_edgelist(_df, source=\"src\", destination=\"dst\", edge_attr=None)\n",
+    "    else:\n",
+    "        _g.from_cudf_edgelist(_df,\n",
+    "                            source='src',\n",
+    "                            destination='dst',\n",
+    "                            edge_attr=None,\n",
+    "                            renumber=False,\n",
+    "                            store_transposed=transpose)\n",
+    "    t2 = perf_counter() - t1\n",
+    "\n",
+    "    return _g, t2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Algorithm Execution"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Katz"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def nx_katz(_G, alpha):\n",
+    "    t1 = perf_counter()\n",
+    "    _ = nx.katz_centrality(_G, alpha)\n",
+    "    t2 = perf_counter() - t1\n",
+    "    return t2\n",
+    "\n",
+    "def cu_katz(_G, alpha, mg=False):\n",
+    "    t1 = perf_counter()\n",
+    "    if mg:\n",
+    "        _ = cugraph.dask.katz_centrality(_G, alpha)\n",
+    "    else:\n",
+    "\n",
+    "        _ = cugraph.katz_centrality(_G, alpha)\n",
+    "    t2 = perf_counter() - t1\n",
+    "    return t2\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Betweenness Centrality"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def nx_bc(_G, _k):\n",
+    "    t1 = perf_counter()\n",
+    "    _ = nx.betweenness_centrality(_G, k=_k)\n",
+    "    t2 = perf_counter() - t1\n",
+    "    return t2\n",
+    "\n",
+    "def cu_bc(_G, _k, mg=False):\n",
+    "    t1 = perf_counter()\n",
+    "    if mg:\n",
+    "        _ = cugraph.dask.betweenness_centrality(_G, k=_k)\n",
+    "    else:   \n",
+    "        _ = cugraph.betweenness_centrality(_G, k=_k)\n",
+    "    t2 = perf_counter() - t1\n",
+    "    return t2\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Louvain"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def nx_louvain(_G):\n",
+    "    t1 = perf_counter()\n",
+    "    parts = community.best_partition(_G)\n",
+    "    \n",
+    "    # Calculating modularity scores for comparison\n",
+    "    _ = community.modularity(parts, _G)\n",
+    "    \n",
+    "    t2 = perf_counter() - t1\n",
+    "    return t2\n",
+    "\n",
+    "def cu_louvain(_G, mg=False):\n",
+    "    t1 = perf_counter()\n",
+    "    if mg:\n",
+    "        _, modularity = cugraph.dask.louvain(_G)\n",
+    "        print (f'modularity: {modularity}')\n",
+    "    else:\n",
+    "        _,_ = cugraph.louvain(_G)\n",
+    "    t2 = perf_counter() - t1\n",
+    "    return t2\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Triangle Counting"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def nx_tc(_G):\n",
+    "    t1 = perf_counter()\n",
+    "    nx_count = nx.triangles(_G)\n",
+    "\n",
+    "    # To get the number of triangles, we would need to loop through the array and add up each count\n",
+    "    count = 0\n",
+    "    for key, value in nx_count.items():\n",
+    "        count = count + value\n",
+    "    \n",
+    "    t2 = perf_counter() - t1\n",
+    "    return t2\n",
+    "\n",
+    "def cu_tc(_G, mg=False):\n",
+    "    t1 = perf_counter()\n",
+    "    if mg:\n",
+    "        _ = cugraph.dask.triangle_count(_G)\n",
+    "    else:\n",
+    "        _ = cugraph.triangle_count(_G)\n",
+    "    t2 = perf_counter() - t1\n",
+    "    return t2\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Core Number"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def nx_core_num(_G):\n",
+    "    t1 = perf_counter()\n",
+    "    _G.remove_edges_from(nx.selfloop_edges(_G))\n",
+    "    nx_count = nx.core_number(_G)\n",
+    "    \n",
+    "    count = 0\n",
+    "    for key, value in nx_count.items():\n",
+    "        count = count + value\n",
+    "    \n",
+    "    t2 = perf_counter() - t1\n",
+    "    return t2\n",
+    "\n",
+    "def cu_core_num(_G, mg=False):\n",
+    "    t1 = perf_counter()\n",
+    "    if mg:\n",
+    "        _ = cugraph.dask.core_number(_G)\n",
+    "    else:\n",
+    "        _ = cugraph.core_number(_G)\n",
+    "    t2 = perf_counter() - t1\n",
+    "    return t2\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### PageRank"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def nx_pagerank(_G):\n",
+    "    t1 = perf_counter()\n",
+    "    _ = nx.pagerank(_G)\n",
+    "    t2 = perf_counter() - t1\n",
+    "    return t2 \n",
+    "\n",
+    "def cu_pagerank(_G, mg=False):\n",
+    "    t1 = perf_counter()\n",
+    "    if mg:\n",
+    "        _ = cugraph.dask.pagerank(_G)\n",
+    "    else:\n",
+    "        _ = cugraph.pagerank(_G)\n",
+    "    t2 = perf_counter() - t1\n",
+    "    return t2\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Jaccard"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def nx_jaccard(_G):\n",
+    "    t1 = perf_counter()\n",
+    "    nj = nx.jaccard_coefficient(_G)\n",
+    "    t2 = perf_counter() - t1\n",
+    "    return t2\n",
+    "\n",
+    "def cu_jaccard(_G, mg=False):\n",
+    "    t1 = perf_counter()\n",
+    "    t1 = perf_counter()\n",
+    "    if mg:\n",
+    "        _ = cugraph.dask.jaccard(_G)\n",
+    "    else:\n",
+    "        _ = cugraph.jaccard_coefficient(_G)\n",
+    "    t2 = perf_counter() - t1\n",
+    "    return t2\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### BFS"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def nx_bfs(_G, seed):\n",
+    "    t1 = perf_counter()\n",
+    "    nb = nx.bfs_edges(_G, seed)\n",
+    "    nb_list = list(nb) # gen -> list\n",
+    "    t2 = perf_counter() - t1\n",
+    "    return t2\n",
+    "\n",
+    "def cu_bfs(_G, seed=0, mg=False):\n",
+    "    t1 = perf_counter()\n",
+    "    if mg:\n",
+    "        _ = cugraph.dask.bfs(_G, seed)\n",
+    "    else:\n",
+    "        _ = cugraph.bfs(_G, seed)\n",
+    "    t2 = perf_counter() - t1\n",
+    "    return t2\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### SSSP"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def nx_sssp(_G, seed):\n",
+    "    t1 = perf_counter()\n",
+    "    _ = nx.shortest_path(_G, seed)\n",
+    "    t2 = perf_counter() - t1\n",
+    "    return t2\n",
+    "\n",
+    "def cu_sssp(_G, seed = 0, mg=False):\n",
+    "    \n",
+    "    t1 = perf_counter()\n",
+    "    # SSSP requires a weighted graph; the benchmark graph has no edge weights, so BFS is run instead\n",
+    "    if mg:\n",
+    "        _ = cugraph.dask.bfs(_G, seed)\n",
+    "    else:\n",
+    "        _ = cugraph.bfs(_G, seed)\n",
+    "    t2 = perf_counter() - t1\n",
+    "    return t2\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## MG Benchmark"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Initialize multi-GPU environment\n",
+    "Before we get started, we need to set up a dask (local) cluster of workers to execute our work, and a client to coordinate and schedule work for that cluster.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Setup a local dask cluster of workers, and a client\n",
+    "cluster = LocalCUDACluster()\n",
+    "client = Client(cluster)\n",
+    "Comms.initialize(p2p=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Placeholders to collect execution run statistics "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "nx_algo_run_times = defaultdict(defaultdict)\n",
+    "cugraph_algo_run_times = defaultdict(defaultdict)\n",
+    "perf_algos = defaultdict(defaultdict)\n",
+    "perf = defaultdict(defaultdict)\n",
+    "cugraph_graph_creation_times = defaultdict()\n",
+    "nx_graph_creation_times = defaultdict()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Run NX and cuGraph algorithms for all datasets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "for dataset, scale in data.items():\n",
+    "    \n",
+    "    # generate data\n",
+    "    print(\"------------------------------\")\n",
+    "    print(f'Creating Graph of Scale = {scale}')\n",
+    "    \n",
+    "    gdf = generate_data(scale, edgefactor=16, mg=True)\n",
+    "    gdf = gdf.repartition(gdf.npartitions * 3)\n",
+    "\n",
+    "    # Copy data to host to create NX graph\n",
+    "    pdf = pd.DataFrame(columns=['src', 'dst'])\n",
+    "    for part_idx in range(gdf.npartitions):\n",
+    "        computed_df = gdf.partitions[part_idx].compute().to_pandas()\n",
+    "        pdf = pd.concat([pdf, computed_df], ignore_index=True, sort=False)\n",
+    "\n",
+    "    print(f\"\\tdata in gdf {len(gdf)} and data in pandas {len(pdf)}\")\n",
+    "    \n",
+    "    # create cuGraph and NX graphs\n",
+    "    g_cu, tcu = create_cu_graph(gdf, mg=True)\n",
+    "    g_nx, tnx = create_nx_graph(pdf)\n",
+    "    cugraph_graph_creation_times[dataset] = tcu\n",
+    "    nx_graph_creation_times[dataset] = tnx\n",
+    "    del gdf, pdf\n",
+    "\n",
+    "    # prep\n",
+    "    deg = g_cu.degree()\n",
+    "    deg_max = deg['degree'].max().compute()\n",
+    "    alpha = 1 / deg_max\n",
+    "    num_nodes = g_cu.number_of_vertices()\n",
+    "    del deg\n",
+    "    gc.collect()\n",
+    "\n",
+    "    #-- Katz \n",
+    "    algorithm = \"Katz\"\n",
+    "    print(f\"\\t{algorithm}  \", end = '')\n",
+    "    print(\"n.\", end='')\n",
+    "    tx = nx_katz(g_nx, alpha)\n",
+    "    print(\"c.\", end='')\n",
+    "    tc = cu_katz(g_cu, alpha, mg=True)\n",
+    "    print(\"\")\n",
+    "    \n",
+    "    nx_algo_run_times[dataset][algorithm] = tx\n",
+    "    cugraph_algo_run_times[dataset][algorithm] = tc\n",
+    "    perf_algos[dataset][algorithm] = tx/tc \n",
+    "    perf[dataset][algorithm] = (tx + tnx) / (tc + tcu)\n",
+    "\n",
+    "    #-- BC\n",
+    "    algorithm = \"BC\"\n",
+    "    print(f\"\\t{algorithm}  \", end = '')\n",
+    "    k = 100\n",
+    "    if k > num_nodes:\n",
+    "        k = int(num_nodes)\n",
+    "    print(\"n.\", end='')\n",
+    "    tx = nx_bc(g_nx, k)\n",
+    "    print(\"c.\", end='')\n",
+    "    tc = cu_bc(g_cu, k, mg=True)\n",
+    "    print(\" \")\n",
+    "    nx_algo_run_times[dataset][algorithm] = tx\n",
+    "    cugraph_algo_run_times[dataset][algorithm] = tc\n",
+    "    perf_algos[dataset][algorithm] = tx/tc \n",
+    "    perf[dataset][algorithm] = (tx + tnx) / (tc + tcu)\n",
+    "\n",
+    "    #-- Louvain\n",
+    "    algorithm = \"Louvain\"\n",
+    "    print(f\"\\t{algorithm}  \", end = '')\n",
+    "    print(\"n.\", end='')\n",
+    "    tx = nx_louvain(g_nx)\n",
+    "    print(\"c.\", end='')\n",
+    "    tc = cu_louvain(g_cu, mg=True)\n",
+    "    print(\" \")\n",
+    "\n",
+    "    nx_algo_run_times[dataset][algorithm] = tx\n",
+    "    cugraph_algo_run_times[dataset][algorithm] = tc\n",
+    "    perf_algos[dataset][algorithm] = tx/tc \n",
+    "    perf[dataset][algorithm] = (tx + tnx) / (tc + tcu)\n",
+    "\n",
+    "    #-- TC\n",
+    "    algorithm = \"TC\"\n",
+    "    print(f\"\\t{algorithm}  \", end = '')\n",
+    "    print(\"n.\", end='')\n",
+    "    tx = nx_tc(g_nx)\n",
+    "    print(\"c.\", end='')\n",
+    "    tc = cu_tc(g_cu, mg=True)\n",
+    "    print(\" \")\n",
+    "    \n",
+    "    nx_algo_run_times[dataset][algorithm] = tx\n",
+    "    cugraph_algo_run_times[dataset][algorithm] = tc\n",
+    "    perf_algos[dataset][algorithm] = tx/tc \n",
+    "    perf[dataset][algorithm] = (tx + tnx) / (tc + tcu)\n",
+    "\n",
+    "    #-- Core Number\n",
+    "    algorithm = \"Core Number\"\n",
+    "    print(f\"\\t{algorithm}  \", end = '')\n",
+    "    print(\"n.\", end='')\n",
+    "    tx = nx_core_num(g_nx)\n",
+    "    print(\"c.\", end='')\n",
+    "    tc = cu_core_num(g_cu, mg=True)\n",
+    "    print(\" \")\n",
+    "\n",
+    "    nx_algo_run_times[dataset][algorithm] = tx\n",
+    "    cugraph_algo_run_times[dataset][algorithm] = tc\n",
+    "    perf_algos[dataset][algorithm] = tx/tc \n",
+    "    perf[dataset][algorithm] = (tx + tnx) / (tc + tcu)\n",
+    "\n",
+    "    #-- PageRank\n",
+    "    algorithm = \"PageRank\"\n",
+    "    print(f\"\\t{algorithm}  \", end = '')\n",
+    "    print(\"n.\", end='')\n",
+    "    tx = nx_pagerank(g_nx)\n",
+    "    print(\"c.\", end='')\n",
+    "    tc = cu_pagerank(g_cu, mg=True)\n",
+    "    print(\" \")\n",
+    "\n",
+    "    nx_algo_run_times[dataset][algorithm] = tx\n",
+    "    cugraph_algo_run_times[dataset][algorithm] = tc\n",
+    "    perf_algos[dataset][algorithm] = tx/tc \n",
+    "    perf[dataset][algorithm] = (tx + tnx) / (tc + tcu)\n",
+    "\n",
+    "    #-- Jaccard\n",
+    "    algorithm = \"Jaccard\"\n",
+    "    print(f\"\\t{algorithm}  \", end = '')\n",
+    "\n",
+    "    print(\"n.\", end='')\n",
+    "    tx = nx_jaccard(g_nx)\n",
+    "    print(\"c.\", end='')\n",
+    "    tc = cu_jaccard(g_cu, mg=True)\n",
+    "    print(\" \")\n",
+    "\n",
+    "    nx_algo_run_times[dataset][algorithm] = tx\n",
+    "    cugraph_algo_run_times[dataset][algorithm] = tc\n",
+    "    perf_algos[dataset][algorithm] = tx/tc \n",
+    "    perf[dataset][algorithm] = (tx + tnx) / (tc + tcu)\n",
+    "\n",
+    "    # Seed for BFS and SSSP\n",
+    "    nx_seed = list(g_nx.nodes)[0]\n",
+    "    cu_seed = g_cu.nodes().compute().to_pandas().iloc[0]\n",
+    "\n",
+    "    #-- BFS\n",
+    "    algorithm = \"BFS\"\n",
+    "    print(f\"\\t{algorithm}  \", end = '')\n",
+    "    print(\"n.\", end='')\n",
+    "    tx = nx_bfs(g_nx, nx_seed)\n",
+    "    print(\"c.\", end='')\n",
+    "    tc = cu_bfs(g_cu, seed = cu_seed, mg=True)\n",
+    "    print(\" \")\n",
+    "\n",
+    "    nx_algo_run_times[dataset][algorithm] = tx\n",
+    "    cugraph_algo_run_times[dataset][algorithm] = tc\n",
+    "    perf_algos[dataset][algorithm] = tx/tc \n",
+    "    perf[dataset][algorithm] = (tx + tnx) / (tc + tcu)\n",
+    "\n",
+    "    #-- SSSP\n",
+    "    algorithm = \"SSSP\"\n",
+    "    print(f\"\\t{algorithm}  \", end = '')\n",
+    "    print(\"n.\", end='')\n",
+    "    tx = nx_sssp(g_nx, nx_seed)\n",
+    "\n",
+    "    print(\"c.\", end='')\n",
+    "    tc = cu_sssp(g_cu, seed = cu_seed, mg=True)\n",
+    "    print(\" \")\n",
+    "\n",
+    "    nx_algo_run_times[dataset][algorithm] = tx\n",
+    "    cugraph_algo_run_times[dataset][algorithm] = tc\n",
+    "    perf_algos[dataset][algorithm] = tx/tc \n",
+    "    perf[dataset][algorithm] = (tx + tnx) / (tc + tcu)\n",
+    "\n",
+    "    del g_cu, g_nx\n",
+    "    gc.collect()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### cuGraph speedup of different algorithms w.r.t. NX"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"\\n\\t------Speedup (cuGraph w.r.t. NX)------\\n\")\n",
+    "print(pd.DataFrame(perf))\n",
+    "print(\"\\n\\t------Speedup (cuGraph w.r.t. NX, excluding graph creation time)------\\n\")\n",
+    "print(pd.DataFrame(perf_algos))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Nx and cuGraph execution times for different algorithms"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "nx_and_cugraph_run_times = pd.DataFrame()\n",
+    "for dataset in cugraph_algo_run_times.keys():\n",
+    "    temp_df = pd.DataFrame({'NX': nx_algo_run_times[dataset], 'cuGraph': cugraph_algo_run_times[dataset]})\n",
+    "    columns = [(dataset, 'NX'), (dataset, 'cuGraph')]  # must follow the temp_df column order above\n",
+    "    temp_df.columns = pd.MultiIndex.from_tuples(columns)\n",
+    "    nx_and_cugraph_run_times = pd.concat([temp_df, nx_and_cugraph_run_times], axis=1)\n",
+    "\n",
+    "print(\"\\n\\t------Nx and cuGraph execution times for different algorithms-----\\n\")\n",
+    "print(nx_and_cugraph_run_times)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Clean up multi-GPU environment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Comms.destroy()\n",
+    "client.close()\n",
+    "cluster.close()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "___\n",
+    "Copyright (c) 2020-2024, NVIDIA CORPORATION.\n",
+    "\n",
+    "Licensed under the Apache License, Version 2.0 (the \"License\");  you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0\n",
+    "\n",
+    "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.\n",
+    "___"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "cudfdev",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.1.0"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "587ff963ecd34554a9da41c94362e2baa062d9a57502e220f049e10816826984"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

From 3b927f64c17e608a9c49c5423150aaca584b92c4 Mon Sep 17 00:00:00 2001
From: Naim <110031745+naimnv@users.noreply.github.com>
Date: Tue, 13 Feb 2024 20:59:00 +0100
Subject: [PATCH 6/6] Use input rng_state rather than creating a local one
 (#4168)

Use the input rng_state rather than creating one local to the function

Authors:
  - Naim (https://github.com/naimnv)

Approvers:
  - Chuck Hastings (https://github.com/ChuckHastings)

URL: https://github.com/rapidsai/cugraph/pull/4168
---
 cpp/src/community/detail/refine_impl.cuh | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh
index eb874657f01..c8ba8163ab2 100644
--- a/cpp/src/community/detail/refine_impl.cuh
+++ b/cpp/src/community/detail/refine_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -504,10 +504,8 @@ refine_clustering(
     //
     // Decide best/positive move for each vertex
     //
-    unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
-    raft::random::RngState rng_state(seed);
-    raft::random::DeviceState<raft::random::PCGenerator> device_state(rng_state);
 
+    raft::random::DeviceState<raft::random::PCGenerator> device_state(rng_state);
     auto gain_and_dst_output_pairs = allocate_dataframe_buffer<thrust::tuple<weight_t, vertex_t>>(
       graph_view.local_vertex_partition_range_size(), handle.get_stream());