From 3fc3e98ba260083bdc5c0b52f41139c6180bb074 Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Mon, 4 Dec 2023 14:09:23 -0500 Subject: [PATCH 1/9] Pin actions/labeler to v4 [skip ci] (#4038) RAPIDS repos are using the `main` branch of https://github.com/actions/labeler which recently introduced [breaking changes](https://github.com/actions/labeler/releases/tag/v5.0.0). This PR pins to the latest v4 release of the labeler action until we can evaluate the changes required for v5. This PR also moves the labeler workflow to the correct location in the repo. Authors: - Ray Douglass (https://github.com/raydouglass) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) --- {github => .github}/workflows/labeler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename {github => .github}/workflows/labeler.yml (83%) diff --git a/github/workflows/labeler.yml b/.github/workflows/labeler.yml similarity index 83% rename from github/workflows/labeler.yml rename to .github/workflows/labeler.yml index 23956a02fbd..31e78f82a62 100644 --- a/github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -6,6 +6,6 @@ jobs: triage: runs-on: ubuntu-latest steps: - - uses: actions/labeler@main + - uses: actions/labeler@v4 with: repo-token: "${{ secrets.GITHUB_TOKEN }}" From 32eaa5e97e5c26ebf5c8ca04faee51316eee75e8 Mon Sep 17 00:00:00 2001 From: Chuck Hastings <45364586+ChuckHastings@users.noreply.github.com> Date: Tue, 5 Dec 2023 12:34:30 -0500 Subject: [PATCH 2/9] Clean up self-loop and multi-edge removal logic (#4032) There are mask utilities that perform some of the functions that were implemented to do this cleanup. Use the mask utilities instead of replicating functionality. Authors: - Chuck Hastings (https://github.com/ChuckHastings) Approvers: - Seunghwa Kang (https://github.com/seunghwak) URL: https://github.com/rapidsai/cugraph/pull/4032 --- cpp/src/structure/detail/structure_utils.cuh | 33 ++--- cpp/src/structure/remove_multi_edges_impl.cuh | 53 ++++---- cpp/src/structure/remove_self_loops_impl.cuh | 36 ++--- cpp/tests/community/triangle_count_test.cpp | 4 +- cpp/tests/utilities/test_graphs.hpp | 20 ++- cpp/tests/utilities/thrust_wrapper.cu | 126 ------------------ cpp/tests/utilities/thrust_wrapper.hpp | 13 -- 7 files changed, 73 insertions(+), 212 deletions(-) diff --git a/cpp/src/structure/detail/structure_utils.cuh b/cpp/src/structure/detail/structure_utils.cuh index 7630d5855a0..f0f729bce18 100644 --- a/cpp/src/structure/detail/structure_utils.cuh +++ b/cpp/src/structure/detail/structure_utils.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -524,35 +525,21 @@ std::tuple> mark_entries(raft::handle_t co return word; }); - // FIXME: use detail::count_set_bits - size_t bit_count = thrust::transform_reduce( - handle.get_thrust_policy(), - marked_entries.begin(), - marked_entries.end(), - [] __device__(auto word) { return __popc(word); }, - size_t{0}, - thrust::plus()); + size_t bit_count = detail::count_set_bits(handle, marked_entries.begin(), num_entries); return std::make_tuple(bit_count, std::move(marked_entries)); } template -rmm::device_uvector remove_flagged_elements(raft::handle_t const& handle, - rmm::device_uvector&& vector, - raft::device_span remove_flags, - size_t remove_count) +rmm::device_uvector keep_flagged_elements(raft::handle_t const& handle, + rmm::device_uvector&& vector, + raft::device_span keep_flags, + size_t keep_count) { - rmm::device_uvector result(vector.size() - remove_count, 
handle.get_stream()); - - thrust::copy_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(vector.size()), - thrust::make_transform_output_iterator(result.begin(), - indirection_t{vector.data()}), - [remove_flags] __device__(size_t i) { - return !(remove_flags[cugraph::packed_bool_offset(i)] & cugraph::packed_bool_mask(i)); - }); + rmm::device_uvector result(keep_count, handle.get_stream()); + + detail::copy_if_mask_set( + handle, vector.begin(), vector.end(), keep_flags.begin(), result.begin()); return result; } diff --git a/cpp/src/structure/remove_multi_edges_impl.cuh b/cpp/src/structure/remove_multi_edges_impl.cuh index ab6b1fba8eb..fdd3059f874 100644 --- a/cpp/src/structure/remove_multi_edges_impl.cuh +++ b/cpp/src/structure/remove_multi_edges_impl.cuh @@ -254,50 +254,47 @@ remove_multi_edges(raft::handle_t const& handle, } } - auto [multi_edge_count, multi_edges_to_delete] = - detail::mark_entries(handle, - edgelist_srcs.size(), - [d_edgelist_srcs = edgelist_srcs.data(), - d_edgelist_dsts = edgelist_dsts.data()] __device__(auto idx) { - return (idx > 0) && (d_edgelist_srcs[idx - 1] == d_edgelist_srcs[idx]) && - (d_edgelist_dsts[idx - 1] == d_edgelist_dsts[idx]); - }); - - if (multi_edge_count > 0) { - edgelist_srcs = detail::remove_flagged_elements( + auto [keep_count, keep_flags] = detail::mark_entries( + handle, + edgelist_srcs.size(), + [d_edgelist_srcs = edgelist_srcs.data(), + d_edgelist_dsts = edgelist_dsts.data()] __device__(auto idx) { + return !((idx > 0) && (d_edgelist_srcs[idx - 1] == d_edgelist_srcs[idx]) && + (d_edgelist_dsts[idx - 1] == d_edgelist_dsts[idx])); + }); + + if (keep_count < edgelist_srcs.size()) { + edgelist_srcs = detail::keep_flagged_elements( handle, std::move(edgelist_srcs), - raft::device_span{multi_edges_to_delete.data(), multi_edges_to_delete.size()}, - multi_edge_count); - edgelist_dsts = detail::remove_flagged_elements( + raft::device_span{keep_flags.data(), keep_flags.size()}, + keep_count); + edgelist_dsts = detail::keep_flagged_elements( handle, std::move(edgelist_dsts), - raft::device_span{multi_edges_to_delete.data(), multi_edges_to_delete.size()}, - multi_edge_count); + raft::device_span{keep_flags.data(), keep_flags.size()}, + keep_count); if (edgelist_weights) - edgelist_weights = detail::remove_flagged_elements( + edgelist_weights = detail::keep_flagged_elements( handle, std::move(*edgelist_weights), - raft::device_span{multi_edges_to_delete.data(), - multi_edges_to_delete.size()}, - multi_edge_count); + raft::device_span{keep_flags.data(), keep_flags.size()}, + keep_count); if (edgelist_edge_ids) - edgelist_edge_ids = detail::remove_flagged_elements( + edgelist_edge_ids = detail::keep_flagged_elements( handle, std::move(*edgelist_edge_ids), - raft::device_span{multi_edges_to_delete.data(), - multi_edges_to_delete.size()}, - multi_edge_count); + raft::device_span{keep_flags.data(), keep_flags.size()}, + keep_count); if (edgelist_edge_types) - edgelist_edge_types = detail::remove_flagged_elements( + edgelist_edge_types = detail::keep_flagged_elements( handle, std::move(*edgelist_edge_types), - raft::device_span{multi_edges_to_delete.data(), - multi_edges_to_delete.size()}, - multi_edge_count); + raft::device_span{keep_flags.data(), keep_flags.size()}, + keep_count); } return std::make_tuple(std::move(edgelist_srcs), diff --git a/cpp/src/structure/remove_self_loops_impl.cuh b/cpp/src/structure/remove_self_loops_impl.cuh index 161ffeae28e..dafe26cd1c5 100644 --- 
a/cpp/src/structure/remove_self_loops_impl.cuh +++ b/cpp/src/structure/remove_self_loops_impl.cuh @@ -44,44 +44,44 @@ remove_self_loops(raft::handle_t const& handle, std::optional>&& edgelist_edge_ids, std::optional>&& edgelist_edge_types) { - auto [self_loop_count, self_loops_to_delete] = + auto [keep_count, keep_flags] = detail::mark_entries(handle, edgelist_srcs.size(), [d_srcs = edgelist_srcs.data(), d_dsts = edgelist_dsts.data()] __device__( - size_t i) { return d_srcs[i] == d_dsts[i]; }); + size_t i) { return d_srcs[i] != d_dsts[i]; }); - if (self_loop_count > 0) { - edgelist_srcs = detail::remove_flagged_elements( + if (keep_count < edgelist_srcs.size()) { + edgelist_srcs = detail::keep_flagged_elements( handle, std::move(edgelist_srcs), - raft::device_span{self_loops_to_delete.data(), self_loops_to_delete.size()}, - self_loop_count); - edgelist_dsts = detail::remove_flagged_elements( + raft::device_span{keep_flags.data(), keep_flags.size()}, + keep_count); + edgelist_dsts = detail::keep_flagged_elements( handle, std::move(edgelist_dsts), - raft::device_span{self_loops_to_delete.data(), self_loops_to_delete.size()}, - self_loop_count); + raft::device_span{keep_flags.data(), keep_flags.size()}, + keep_count); if (edgelist_weights) - edgelist_weights = detail::remove_flagged_elements( + edgelist_weights = detail::keep_flagged_elements( handle, std::move(*edgelist_weights), - raft::device_span{self_loops_to_delete.data(), self_loops_to_delete.size()}, - self_loop_count); + raft::device_span{keep_flags.data(), keep_flags.size()}, + keep_count); if (edgelist_edge_ids) - edgelist_edge_ids = detail::remove_flagged_elements( + edgelist_edge_ids = detail::keep_flagged_elements( handle, std::move(*edgelist_edge_ids), - raft::device_span{self_loops_to_delete.data(), self_loops_to_delete.size()}, - self_loop_count); + raft::device_span{keep_flags.data(), keep_flags.size()}, + keep_count); if (edgelist_edge_types) - edgelist_edge_types = detail::remove_flagged_elements( + edgelist_edge_types = detail::keep_flagged_elements( handle, std::move(*edgelist_edge_types), - raft::device_span{self_loops_to_delete.data(), self_loops_to_delete.size()}, - self_loop_count); + raft::device_span{keep_flags.data(), keep_flags.size()}, + keep_count); } return std::make_tuple(std::move(edgelist_srcs), diff --git a/cpp/tests/community/triangle_count_test.cpp b/cpp/tests/community/triangle_count_test.cpp index 836bab59457..592924c3c47 100644 --- a/cpp/tests/community/triangle_count_test.cpp +++ b/cpp/tests/community/triangle_count_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -232,7 +232,7 @@ class Tests_TriangleCount for (size_t i = 0; i < h_cugraph_vertices.size(); ++i) { auto v = h_cugraph_vertices[i]; auto count = h_cugraph_triangle_counts[i]; - ASSERT_TRUE(count == h_reference_triangle_counts[v]) + ASSERT_EQ(count, h_reference_triangle_counts[v]) << "Triangle count values do not match with the reference values."; } } diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index 16c9d3ed145..8cc87b26f1d 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -621,9 +621,25 @@ construct_graph(raft::handle_t const& handle, CUGRAPH_EXPECTS(d_src_v.size() <= static_cast(std::numeric_limits::max()), "Invalid template parameter: edge_t overflow."); - if (drop_self_loops) { remove_self_loops(handle, d_src_v, d_dst_v, d_weights_v); } + if (drop_self_loops) { + std::tie(d_src_v, d_dst_v, d_weights_v, std::ignore, std::ignore) = + cugraph::remove_self_loops(handle, + std::move(d_src_v), + std::move(d_dst_v), + std::move(d_weights_v), + std::nullopt, + std::nullopt); + } - if (drop_multi_edges) { sort_and_remove_multi_edges(handle, d_src_v, d_dst_v, d_weights_v); } + if (drop_multi_edges) { + std::tie(d_src_v, d_dst_v, d_weights_v, std::ignore, std::ignore) = + cugraph::remove_multi_edges(handle, + std::move(d_src_v), + std::move(d_dst_v), + std::move(d_weights_v), + std::nullopt, + std::nullopt); + } graph_t graph(handle); std::optional< diff --git a/cpp/tests/utilities/thrust_wrapper.cu b/cpp/tests/utilities/thrust_wrapper.cu index cb7e6f1bd66..2daf250b4a2 100644 --- a/cpp/tests/utilities/thrust_wrapper.cu +++ b/cpp/tests/utilities/thrust_wrapper.cu @@ -206,131 +206,5 @@ template void populate_vertex_ids(raft::handle_t const& handle, rmm::device_uvector& d_vertices_v, int64_t vertex_id_offset); -template -void remove_self_loops(raft::handle_t const& handle, - rmm::device_uvector& d_src_v /* [INOUT] */, - rmm::device_uvector& d_dst_v /* [INOUT] */, - std::optional>& d_weight_v /* [INOUT] */) -{ - if (d_weight_v) { - auto edge_first = thrust::make_zip_iterator( - thrust::make_tuple(d_src_v.begin(), d_dst_v.begin(), (*d_weight_v).begin())); - d_src_v.resize( - thrust::distance(edge_first, - thrust::remove_if( - handle.get_thrust_policy(), - edge_first, - edge_first + d_src_v.size(), - [] __device__(auto e) { return thrust::get<0>(e) == thrust::get<1>(e); })), - handle.get_stream()); - d_dst_v.resize(d_src_v.size(), handle.get_stream()); - (*d_weight_v).resize(d_src_v.size(), handle.get_stream()); - } else { - auto edge_first = - thrust::make_zip_iterator(thrust::make_tuple(d_src_v.begin(), d_dst_v.begin())); - d_src_v.resize( - thrust::distance(edge_first, - thrust::remove_if( - handle.get_thrust_policy(), - edge_first, - edge_first + d_src_v.size(), - [] __device__(auto e) { return thrust::get<0>(e) == thrust::get<1>(e); })), - handle.get_stream()); - d_dst_v.resize(d_src_v.size(), handle.get_stream()); - } - - d_src_v.shrink_to_fit(handle.get_stream()); - d_dst_v.shrink_to_fit(handle.get_stream()); - if (d_weight_v) { (*d_weight_v).shrink_to_fit(handle.get_stream()); } -} - -template void remove_self_loops( - raft::handle_t const& handle, - rmm::device_uvector& d_src_v /* [INOUT] */, - rmm::device_uvector& d_dst_v /* [INOUT] */, - std::optional>& d_weight_v /* [INOUT] */); - -template void remove_self_loops( - raft::handle_t const& handle, - rmm::device_uvector& d_src_v /* [INOUT] */, - rmm::device_uvector& d_dst_v /* [INOUT] */, - std::optional>& d_weight_v /* [INOUT] */); - -template void 
remove_self_loops( - raft::handle_t const& handle, - rmm::device_uvector& d_src_v /* [INOUT] */, - rmm::device_uvector& d_dst_v /* [INOUT] */, - std::optional>& d_weight_v /* [INOUT] */); - -template void remove_self_loops( - raft::handle_t const& handle, - rmm::device_uvector& d_src_v /* [INOUT] */, - rmm::device_uvector& d_dst_v /* [INOUT] */, - std::optional>& d_weight_v /* [INOUT] */); - -template -void sort_and_remove_multi_edges( - raft::handle_t const& handle, - rmm::device_uvector& d_src_v /* [INOUT] */, - rmm::device_uvector& d_dst_v /* [INOUT] */, - std::optional>& d_weight_v /* [INOUT] */) -{ - if (d_weight_v) { - auto edge_first = thrust::make_zip_iterator( - thrust::make_tuple(d_src_v.begin(), d_dst_v.begin(), (*d_weight_v).begin())); - thrust::sort(handle.get_thrust_policy(), edge_first, edge_first + d_src_v.size()); - d_src_v.resize( - thrust::distance(edge_first, - thrust::unique(handle.get_thrust_policy(), - edge_first, - edge_first + d_src_v.size(), - [] __device__(auto lhs, auto rhs) { - return (thrust::get<0>(lhs) == thrust::get<0>(rhs)) && - (thrust::get<1>(lhs) == thrust::get<1>(rhs)); - })), - handle.get_stream()); - d_dst_v.resize(d_src_v.size(), handle.get_stream()); - (*d_weight_v).resize(d_src_v.size(), handle.get_stream()); - } else { - auto edge_first = - thrust::make_zip_iterator(thrust::make_tuple(d_src_v.begin(), d_dst_v.begin())); - thrust::sort(handle.get_thrust_policy(), edge_first, edge_first + d_src_v.size()); - d_src_v.resize( - thrust::distance( - edge_first, - thrust::unique(handle.get_thrust_policy(), edge_first, edge_first + d_src_v.size())), - handle.get_stream()); - d_dst_v.resize(d_src_v.size(), handle.get_stream()); - } - - d_src_v.shrink_to_fit(handle.get_stream()); - d_dst_v.shrink_to_fit(handle.get_stream()); - if (d_weight_v) { (*d_weight_v).shrink_to_fit(handle.get_stream()); } -} - -template void sort_and_remove_multi_edges( - raft::handle_t const& handle, - rmm::device_uvector& d_src_v /* [INOUT] */, - rmm::device_uvector& d_dst_v /* [INOUT] */, - std::optional>& d_weight_v /* [INOUT] */); - -template void sort_and_remove_multi_edges( - raft::handle_t const& handle, - rmm::device_uvector& d_src_v /* [INOUT] */, - rmm::device_uvector& d_dst_v /* [INOUT] */, - std::optional>& d_weight_v /* [INOUT] */); - -template void sort_and_remove_multi_edges( - raft::handle_t const& handle, - rmm::device_uvector& d_src_v /* [INOUT] */, - rmm::device_uvector& d_dst_v /* [INOUT] */, - std::optional>& d_weight_v /* [INOUT] */); - -template void sort_and_remove_multi_edges( - raft::handle_t const& handle, - rmm::device_uvector& d_src_v /* [INOUT] */, - rmm::device_uvector& d_dst_v /* [INOUT] */, - std::optional>& d_weight_v /* [INOUT] */); - } // namespace test } // namespace cugraph diff --git a/cpp/tests/utilities/thrust_wrapper.hpp b/cpp/tests/utilities/thrust_wrapper.hpp index eead4dc268f..fb82d781198 100644 --- a/cpp/tests/utilities/thrust_wrapper.hpp +++ b/cpp/tests/utilities/thrust_wrapper.hpp @@ -46,18 +46,5 @@ void populate_vertex_ids(raft::handle_t const& handle, rmm::device_uvector& d_vertices_v /* [INOUT] */, vertex_t vertex_id_offset); -template -void remove_self_loops(raft::handle_t const& handle, - rmm::device_uvector& d_src_v /* [INOUT] */, - rmm::device_uvector& d_dst_v /* [INOUT] */, - std::optional>& d_weight_v /* [INOUT] */); - -template -void sort_and_remove_multi_edges( - raft::handle_t const& handle, - rmm::device_uvector& d_src_v /* [INOUT] */, - rmm::device_uvector& d_dst_v /* [INOUT] */, - std::optional>& d_weight_v /* [INOUT] 
*/); - } // namespace test } // namespace cugraph From 2ec94b310ae854678e3c39a687b6cf34bd702c1b Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 6 Dec 2023 09:58:26 -0500 Subject: [PATCH 3/9] Update Changelog [skip ci] --- CHANGELOG.md | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 33a5b2bc5e7..d165cd7efc4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,84 @@ +# cuGraph 23.12.00 (6 Dec 2023) + +## 🚨 Breaking Changes + +- [BUG] Restore the original default order of CSR, which does not reverse edges in cuGraph-PyG ([#3980](https://github.com/rapidsai/cugraph/pull/3980)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- `Resultset` and `Dataset` Refactors ([#3957](https://github.com/rapidsai/cugraph/pull/3957)) [@nv-rliu](https://github.com/nv-rliu) +- Moves more MG graph ETL to libcugraph and re-enables MG tests in CI ([#3941](https://github.com/rapidsai/cugraph/pull/3941)) [@jnke2016](https://github.com/jnke2016) + +## 🐛 Bug Fixes + +- Pin actions/labeler to v4 ([#4038](https://github.com/rapidsai/cugraph/pull/4038)) [@raydouglass](https://github.com/raydouglass) +- Find rmm before cuco ([#4011](https://github.com/rapidsai/cugraph/pull/4011)) [@vyasr](https://github.com/vyasr) +- Pin to minor versions of packages outside the cuGraph repository. ([#4004](https://github.com/rapidsai/cugraph/pull/4004)) [@bdice](https://github.com/bdice) +- Move MTMG_TEST to MG tests block ([#3993](https://github.com/rapidsai/cugraph/pull/3993)) [@naimnv](https://github.com/naimnv) +- Fix Leiden refinement phase ([#3990](https://github.com/rapidsai/cugraph/pull/3990)) [@naimnv](https://github.com/naimnv) +- [BUG] Fix Graph Construction From Pandas in cuGraph-PyG ([#3985](https://github.com/rapidsai/cugraph/pull/3985)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- [BUG] Restore the original default order of CSR, which does not reverse edges in cuGraph-PyG ([#3980](https://github.com/rapidsai/cugraph/pull/3980)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- Fix eigenvector testing and HITS testing discrepancies ([#3979](https://github.com/rapidsai/cugraph/pull/3979)) [@ChuckHastings](https://github.com/ChuckHastings) +- [BUG] Fix Incorrect Edge Index, Directory Selection in cuGraph-PyG Loader ([#3978](https://github.com/rapidsai/cugraph/pull/3978)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- [BUG] Check if Dask has quit to avoid throwing an exception and triggering a segfault on ddp exit ([#3961](https://github.com/rapidsai/cugraph/pull/3961)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- nx-cugraph: xfail test_louvain.py:test_threshold in Python 3.9 ([#3944](https://github.com/rapidsai/cugraph/pull/3944)) [@eriknw](https://github.com/eriknw) + +## 📖 Documentation + +- [DOC]: Fix invalid links and add materials to notebooks ([#4002](https://github.com/rapidsai/cugraph/pull/4002)) [@huiyuxie](https://github.com/huiyuxie) +- Update Broken Links in README.md ([#3924](https://github.com/rapidsai/cugraph/pull/3924)) [@nv-rliu](https://github.com/nv-rliu) + +## 🚀 New Features + +- Implement the transform_e primitive (to update property values for all edges) ([#3917](https://github.com/rapidsai/cugraph/pull/3917)) [@seunghwak](https://github.com/seunghwak) +- Update the neighbor intersection primitive to support edge masking. 
([#3550](https://github.com/rapidsai/cugraph/pull/3550)) [@seunghwak](https://github.com/seunghwak) + +## 🛠️ Improvements + +- Correct defect found in DLFW testing ([#4021](https://github.com/rapidsai/cugraph/pull/4021)) [@ChuckHastings](https://github.com/ChuckHastings) +- `nx-cugraph` README update: adds missing `connected_components` algo to table ([#4019](https://github.com/rapidsai/cugraph/pull/4019)) [@rlratzel](https://github.com/rlratzel) +- Build concurrency for nightly and merge triggers ([#4009](https://github.com/rapidsai/cugraph/pull/4009)) [@bdice](https://github.com/bdice) +- Support `drop_last` Argument in cuGraph-PyG Loader ([#3995](https://github.com/rapidsai/cugraph/pull/3995)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- Adds `update-version.sh` support for recently added files containing RAPIDS versions ([#3994](https://github.com/rapidsai/cugraph/pull/3994)) [@rlratzel](https://github.com/rlratzel) +- Use new `rapids-dask-dependency` metapackage for managing `dask` versions ([#3991](https://github.com/rapidsai/cugraph/pull/3991)) [@galipremsagar](https://github.com/galipremsagar) +- Fixes to nx-cugraph README: fixes typos, updates link to NX backend docs ([#3989](https://github.com/rapidsai/cugraph/pull/3989)) [@rlratzel](https://github.com/rlratzel) +- Address FIXMEs ([#3988](https://github.com/rapidsai/cugraph/pull/3988)) [@seunghwak](https://github.com/seunghwak) +- Updates README file to include nx-cugraph user documentation, adds nx-cugraph to main README ([#3984](https://github.com/rapidsai/cugraph/pull/3984)) [@rlratzel](https://github.com/rlratzel) +- Update C API graph creation function signatures ([#3982](https://github.com/rapidsai/cugraph/pull/3982)) [@ChuckHastings](https://github.com/ChuckHastings) +- [REVIEW]Optimize cugraph-DGL csc codepath ([#3977](https://github.com/rapidsai/cugraph/pull/3977)) [@VibhuJawa](https://github.com/VibhuJawa) +- nx-cugraph: add SSSP (unweighted) ([#3976](https://github.com/rapidsai/cugraph/pull/3976)) [@eriknw](https://github.com/eriknw) +- CuGraph compatibility fixes ([#3973](https://github.com/rapidsai/cugraph/pull/3973)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Skip certain `cugraph-pyg` tests when torch-sparse is not available ([#3970](https://github.com/rapidsai/cugraph/pull/3970)) [@tingyu66](https://github.com/tingyu66) +- nx-cugraph: add `eigenvector_centrality`, `katz_centrality`, `hits`, `pagerank` ([#3968](https://github.com/rapidsai/cugraph/pull/3968)) [@eriknw](https://github.com/eriknw) +- Cut peak memory footprint in graph creation ([#3966](https://github.com/rapidsai/cugraph/pull/3966)) [@seunghwak](https://github.com/seunghwak) +- nx-cugraph: add CC for undirected graphs to fix k-truss ([#3965](https://github.com/rapidsai/cugraph/pull/3965)) [@eriknw](https://github.com/eriknw) +- Skip certain `cugraph-pyg` tests when `torch_sparse` is not available ([#3962](https://github.com/rapidsai/cugraph/pull/3962)) [@tingyu66](https://github.com/tingyu66) +- `Resultset` and `Dataset` Refactors ([#3957](https://github.com/rapidsai/cugraph/pull/3957)) [@nv-rliu](https://github.com/nv-rliu) +- Download `xml` docs artifact through CloudFront endpoint ([#3955](https://github.com/rapidsai/cugraph/pull/3955)) [@AyodeAwe](https://github.com/AyodeAwe) +- Add many graph generators to nx-cugraph ([#3954](https://github.com/rapidsai/cugraph/pull/3954)) [@eriknw](https://github.com/eriknw) +- Unpin `dask` and `distributed` for `23.12` development 
([#3953](https://github.com/rapidsai/cugraph/pull/3953)) [@galipremsagar](https://github.com/galipremsagar) +- Errors compiling for DLFW on CUDA 12.3 ([#3952](https://github.com/rapidsai/cugraph/pull/3952)) [@ChuckHastings](https://github.com/ChuckHastings) +- nx-cugraph: add k_truss and degree centralities ([#3945](https://github.com/rapidsai/cugraph/pull/3945)) [@eriknw](https://github.com/eriknw) +- nx-cugraph: handle seed argument in edge_betweenness_centrality ([#3943](https://github.com/rapidsai/cugraph/pull/3943)) [@eriknw](https://github.com/eriknw) +- Moves more MG graph ETL to libcugraph and re-enables MG tests in CI ([#3941](https://github.com/rapidsai/cugraph/pull/3941)) [@jnke2016](https://github.com/jnke2016) +- Temporarily disable mg testing ([#3940](https://github.com/rapidsai/cugraph/pull/3940)) [@jnke2016](https://github.com/jnke2016) +- adding C/C++ API docs ([#3938](https://github.com/rapidsai/cugraph/pull/3938)) [@BradReesWork](https://github.com/BradReesWork) +- Add multigraph support to nx-cugraph ([#3934](https://github.com/rapidsai/cugraph/pull/3934)) [@eriknw](https://github.com/eriknw) +- Setup Consistent Nightly Versions for Pip and Conda ([#3933](https://github.com/rapidsai/cugraph/pull/3933)) [@divyegala](https://github.com/divyegala) +- MTMG multi node ([#3932](https://github.com/rapidsai/cugraph/pull/3932)) [@ChuckHastings](https://github.com/ChuckHastings) +- Use branch-23.12 workflows. ([#3928](https://github.com/rapidsai/cugraph/pull/3928)) [@bdice](https://github.com/bdice) +- Fix an issue occurring in the cuGraph-DGL example for "mixed" mode. ([#3927](https://github.com/rapidsai/cugraph/pull/3927)) [@drivanov](https://github.com/drivanov) +- Updating Docs ([#3923](https://github.com/rapidsai/cugraph/pull/3923)) [@BradReesWork](https://github.com/BradReesWork) +- Forward-merge branch-23.10 to branch-23.12 ([#3919](https://github.com/rapidsai/cugraph/pull/3919)) [@nv-rliu](https://github.com/nv-rliu) +- new build all option ([#3916](https://github.com/rapidsai/cugraph/pull/3916)) [@BradReesWork](https://github.com/BradReesWork) +- Silence spurious compiler warnings ([#3913](https://github.com/rapidsai/cugraph/pull/3913)) [@seunghwak](https://github.com/seunghwak) +- Link wholegrah and cugraphops XML docs ([#3906](https://github.com/rapidsai/cugraph/pull/3906)) [@AyodeAwe](https://github.com/AyodeAwe) +- Updates to 23.12 ([#3905](https://github.com/rapidsai/cugraph/pull/3905)) [@raydouglass](https://github.com/raydouglass) +- Forward-merge branch-23.10 to branch-23.12 ([#3904](https://github.com/rapidsai/cugraph/pull/3904)) [@GPUtester](https://github.com/GPUtester) +- Build CUDA 12.0 ARM conda packages. 
([#3903](https://github.com/rapidsai/cugraph/pull/3903)) [@bdice](https://github.com/bdice) +- Merge branch-23.10 into branch-23.12 ([#3898](https://github.com/rapidsai/cugraph/pull/3898)) [@rlratzel](https://github.com/rlratzel) +- Some MTMG code cleanup and small optimizations ([#3894](https://github.com/rapidsai/cugraph/pull/3894)) [@ChuckHastings](https://github.com/ChuckHastings) +- Enable parallel mode ([#3875](https://github.com/rapidsai/cugraph/pull/3875)) [@jnke2016](https://github.com/jnke2016) +- Adds benchmarks for `nx-cugraph` ([#3854](https://github.com/rapidsai/cugraph/pull/3854)) [@rlratzel](https://github.com/rlratzel) +- Add nx-cugraph notebook for showing accelerated networkX APIs ([#3830](https://github.com/rapidsai/cugraph/pull/3830)) [@betochimas](https://github.com/betochimas) + # cuGraph 23.10.00 (11 Oct 2023) ## 🚨 Breaking Changes From 783e4dc0d4ae06f16e469cb63883c66a224aab73 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Wed, 6 Dec 2023 10:06:20 -0500 Subject: [PATCH 4/9] Prevent automatic labeler from adding `Label Checker` labels (#4048) This PR prevents the `doc` label from being automatically added to PRs since it can interfere with the [Label Checker](https://docs.rapids.ai/resources/label-checker/) check. [skip ci] --- .github/labeler.yml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/.github/labeler.yml b/.github/labeler.yml index c589fda6099..368bf328b99 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -9,17 +9,6 @@ python: benchmarks: - 'benchmarks/**' -doc: - - 'docs/**' - - '**/*.md' - - 'datasets/**' - - 'notebooks/**' - - '**/*.txt' - - '**/*.rst' - - '**/*.ipynb' - - '**/*.pdf' - - '**/*.png' - datasets: - 'datasets/**' From f80480dae7aca9c419a7cdf9c6e90a77e056695f Mon Sep 17 00:00:00 2001 From: Chuck Hastings <45364586+ChuckHastings@users.noreply.github.com> Date: Wed, 6 Dec 2023 11:29:47 -0500 Subject: [PATCH 5/9] Add support for Louvain to MTMG (#4033) Added vertex result instantiation for `vertex_t` which is necessary to handle Louvain results. Added an MTMG test for Louvain to demonstrate how to use Louvain in MTMG. 
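For reference, the per-GPU call pattern exercised by the new test looks roughly like the sketch below. This is a condensed excerpt of the code added in `cpp/tests/mtmg/threaded_test_louvain.cu`, not a standalone program: `graph_view`, `edge_weights`, `louvain_clusters`, and the Louvain parameters (`max_level`, `threshold`, `resolution`) are the MTMG objects constructed earlier in the test, and each CPU thread first obtains `thread_handle` from the instance manager.

```cpp
// Allocate cluster storage sized to this GPU's local vertex partition.
rmm::device_uvector<vertex_t> local_louvain_clusters(
  graph_view.get(thread_handle).local_vertex_partition_range_size(),
  thread_handle.get_stream());

// Run Louvain on the local graph partition (edge weights are optional).
weight_t modularity{};
std::tie(std::ignore, modularity) = cugraph::louvain(
  thread_handle.raft_handle(),
  graph_view.get(thread_handle),
  edge_weights ? std::make_optional(edge_weights->get(thread_handle).view()) : std::nullopt,
  local_louvain_clusters.data(),
  max_level,
  threshold,
  resolution);

// Publish the per-GPU clustering; other threads can later collect it through
// vertex_result_view_t::gather() using the new graph_view_t helpers
// (get_vertex_partition_range_lasts / get_vertex_partition_view).
louvain_clusters.set(thread_handle, std::move(local_louvain_clusters));
```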
Closes https://github.com/rapidsai/graph_dl/issues/330 Authors: - Chuck Hastings (https://github.com/ChuckHastings) Approvers: - Seunghwa Kang (https://github.com/seunghwak) - Joseph Nke (https://github.com/jnke2016) - Naim (https://github.com/naimnv) URL: https://github.com/rapidsai/cugraph/pull/4033 --- .../mtmg/detail/device_shared_wrapper.hpp | 1 - cpp/include/cugraph/mtmg/graph_view.hpp | 23 +- .../cugraph/mtmg/vertex_result_view.hpp | 5 +- cpp/src/mtmg/vertex_result.cu | 89 ++- cpp/tests/CMakeLists.txt | 13 +- cpp/tests/mtmg/multi_node_threaded_test.cu | 3 +- cpp/tests/mtmg/threaded_test.cu | 3 +- cpp/tests/mtmg/threaded_test_louvain.cu | 511 ++++++++++++++++++ 8 files changed, 617 insertions(+), 31 deletions(-) create mode 100644 cpp/tests/mtmg/threaded_test_louvain.cu diff --git a/cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp b/cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp index 3e4b2513a8d..5fbe7bc9f01 100644 --- a/cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp +++ b/cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp @@ -79,7 +79,6 @@ class device_shared_wrapper_t { objects_.insert(std::make_pair(local_rank, std::move(obj))); } - public: /** * @brief Get reference to an object for a particular thread * diff --git a/cpp/include/cugraph/mtmg/graph_view.hpp b/cpp/include/cugraph/mtmg/graph_view.hpp index 94347e016ea..8e202ab4904 100644 --- a/cpp/include/cugraph/mtmg/graph_view.hpp +++ b/cpp/include/cugraph/mtmg/graph_view.hpp @@ -27,8 +27,27 @@ namespace mtmg { * @brief Graph view for each GPU */ template -using graph_view_t = detail::device_shared_wrapper_t< - cugraph::graph_view_t>; +class graph_view_t : public detail::device_shared_wrapper_t< + cugraph::graph_view_t> { + public: + /** + * @brief Get the vertex_partition_view for this graph + */ + vertex_partition_view_t get_vertex_partition_view( + cugraph::mtmg::handle_t const& handle) const + { + return this->get(handle).local_vertex_partition_view(); + } + + /** + * @brief Get the vertex_partition_view for this graph + */ + std::vector get_vertex_partition_range_lasts( + cugraph::mtmg::handle_t const& handle) const + { + return this->get(handle).vertex_partition_range_lasts(); + } +}; } // namespace mtmg } // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/vertex_result_view.hpp b/cpp/include/cugraph/mtmg/vertex_result_view.hpp index a349bb95333..42b80cea62f 100644 --- a/cpp/include/cugraph/mtmg/vertex_result_view.hpp +++ b/cpp/include/cugraph/mtmg/vertex_result_view.hpp @@ -39,11 +39,12 @@ class vertex_result_view_t : public detail::device_shared_device_span_t + template rmm::device_uvector gather( handle_t const& handle, raft::device_span vertices, - cugraph::mtmg::graph_view_t const& graph_view, + std::vector const& vertex_partition_range_lasts, + cugraph::vertex_partition_view_t vertex_partition_view, std::optional>& renumber_map_view); }; diff --git a/cpp/src/mtmg/vertex_result.cu b/cpp/src/mtmg/vertex_result.cu index 97fcd291c87..5b1825656ff 100644 --- a/cpp/src/mtmg/vertex_result.cu +++ b/cpp/src/mtmg/vertex_result.cu @@ -27,15 +27,14 @@ namespace cugraph { namespace mtmg { template -template +template rmm::device_uvector vertex_result_view_t::gather( handle_t const& handle, raft::device_span vertices, - cugraph::mtmg::graph_view_t const& graph_view, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, std::optional>& renumber_map_view) { - auto this_gpu_graph_view = graph_view.get(handle); - rmm::device_uvector 
local_vertices(vertices.size(), handle.get_stream()); rmm::device_uvector vertex_gpu_ids(vertices.size(), handle.get_stream()); rmm::device_uvector vertex_pos(vertices.size(), handle.get_stream()); @@ -47,11 +46,11 @@ rmm::device_uvector vertex_result_view_t::gather( cugraph::detail::sequence_fill( handle.get_stream(), vertex_pos.data(), vertex_pos.size(), size_t{0}); - rmm::device_uvector d_vertex_partition_range_lasts( - this_gpu_graph_view.vertex_partition_range_lasts().size(), handle.get_stream()); + rmm::device_uvector d_vertex_partition_range_lasts(vertex_partition_range_lasts.size(), + handle.get_stream()); raft::update_device(d_vertex_partition_range_lasts.data(), - this_gpu_graph_view.vertex_partition_range_lasts().data(), - this_gpu_graph_view.vertex_partition_range_lasts().size(), + vertex_partition_range_lasts.data(), + vertex_partition_range_lasts.size(), handle.get_stream()); if (renumber_map_view) { @@ -60,8 +59,8 @@ rmm::device_uvector vertex_result_view_t::gather( local_vertices.data(), local_vertices.size(), renumber_map_view->get(handle).data(), - this_gpu_graph_view.local_vertex_partition_range_first(), - this_gpu_graph_view.local_vertex_partition_range_last()); + vertex_partition_view.local_vertex_partition_range_first(), + vertex_partition_view.local_vertex_partition_range_last()); } auto const major_comm_size = @@ -89,8 +88,8 @@ rmm::device_uvector vertex_result_view_t::gather( auto& wrapped = this->get(handle); - auto vertex_partition = vertex_partition_device_view_t( - this_gpu_graph_view.local_vertex_partition_view()); + auto vertex_partition = + vertex_partition_device_view_t(vertex_partition_view); auto iter = thrust::make_transform_iterator(local_vertices.begin(), [vertex_partition] __device__(auto v) { @@ -130,37 +129,85 @@ rmm::device_uvector vertex_result_view_t::gather( template rmm::device_uvector vertex_result_view_t::gather( handle_t const& handle, raft::device_span vertices, - cugraph::mtmg::graph_view_t const& graph_view, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, std::optional>& renumber_map_view); +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + template rmm::device_uvector vertex_result_view_t::gather( handle_t const& handle, raft::device_span vertices, - cugraph::mtmg::graph_view_t const& graph_view, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, std::optional>& renumber_map_view); template rmm::device_uvector vertex_result_view_t::gather( handle_t const& handle, raft::device_span vertices, - cugraph::mtmg::graph_view_t const& graph_view, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, std::optional>& renumber_map_view); -template rmm::device_uvector vertex_result_view_t::gather( +template rmm::device_uvector vertex_result_view_t::gather( handle_t const& handle, raft::device_span vertices, - cugraph::mtmg::graph_view_t const& graph_view, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, std::optional>& renumber_map_view); -template rmm::device_uvector vertex_result_view_t::gather( +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& 
vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + +template rmm::device_uvector vertex_result_view_t::gather( handle_t const& handle, raft::device_span vertices, - cugraph::mtmg::graph_view_t const& graph_view, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, std::optional>& renumber_map_view); -template rmm::device_uvector vertex_result_view_t::gather( +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + +template rmm::device_uvector vertex_result_view_t::gather( handle_t const& handle, raft::device_span vertices, - cugraph::mtmg::graph_view_t const& graph_view, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, std::optional>& renumber_map_view); } // namespace mtmg diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 6530a25d178..d9c88bc179e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -738,9 +738,16 @@ if (BUILD_CUGRAPH_MTMG_TESTS) # - MTMG tests ------------------------------------------------------------------------- ConfigureTest(MTMG_TEST mtmg/threaded_test.cu) target_link_libraries(MTMG_TEST - PRIVATE - UCP::UCP - ) + PRIVATE + UCP::UCP + ) + + ConfigureTest(MTMG_LOUVAIN_TEST mtmg/threaded_test_louvain.cu) + target_link_libraries(MTMG_LOUVAIN_TEST + PRIVATE + cugraphmgtestutil + UCP::UCP + ) if(BUILD_CUGRAPH_MG_TESTS) ############################################################################################### diff --git a/cpp/tests/mtmg/multi_node_threaded_test.cu b/cpp/tests/mtmg/multi_node_threaded_test.cu index e5a7de07781..17aed4fdecf 100644 --- a/cpp/tests/mtmg/multi_node_threaded_test.cu +++ b/cpp/tests/mtmg/multi_node_threaded_test.cu @@ -311,7 +311,8 @@ class Tests_Multithreaded auto d_my_pageranks = pageranks_view.gather( thread_handle, raft::device_span{d_my_vertex_list.data(), d_my_vertex_list.size()}, - graph_view, + graph_view.get_vertex_partition_range_lasts(thread_handle), + graph_view.get_vertex_partition_view(thread_handle), renumber_map_view); std::vector my_pageranks(d_my_pageranks.size()); diff --git a/cpp/tests/mtmg/threaded_test.cu b/cpp/tests/mtmg/threaded_test.cu index 1a6a17eaa18..a5df0199cac 100644 --- a/cpp/tests/mtmg/threaded_test.cu +++ b/cpp/tests/mtmg/threaded_test.cu @@ -327,7 +327,8 @@ class Tests_Multithreaded auto d_my_pageranks = pageranks_view.gather( thread_handle, raft::device_span{d_my_vertex_list.data(), d_my_vertex_list.size()}, - graph_view, + 
graph_view.get_vertex_partition_range_lasts(thread_handle), + graph_view.get_vertex_partition_view(thread_handle), renumber_map_view); std::vector my_pageranks(d_my_pageranks.size()); diff --git a/cpp/tests/mtmg/threaded_test_louvain.cu b/cpp/tests/mtmg/threaded_test_louvain.cu new file mode 100644 index 00000000000..c1395037646 --- /dev/null +++ b/cpp/tests/mtmg/threaded_test_louvain.cu @@ -0,0 +1,511 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include + +#include + +#include +#include + +struct Multithreaded_Usecase { + bool test_weighted{false}; + bool check_correctness{true}; +}; + +template +class Tests_Multithreaded + : public ::testing::TestWithParam> { + public: + Tests_Multithreaded() {} + + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + std::vector get_gpu_list() + { + int num_gpus_per_node{1}; + RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); + + std::vector gpu_list(num_gpus_per_node); + std::iota(gpu_list.begin(), gpu_list.end(), 0); + + return gpu_list; + } + + template + void run_current_test( + std::tuple const& param, + std::vector gpu_list) + { + using edge_type_t = int32_t; + + constexpr bool renumber = true; + constexpr bool do_expensive_check = false; + + auto [multithreaded_usecase, input_usecase] = param; + + raft::handle_t handle{}; + + size_t max_level{1}; // Louvain is non-deterministic in MG if max_leve > 1 + weight_t threshold{1e-6}; + weight_t resolution{1}; + + size_t device_buffer_size{64 * 1024 * 1024}; + size_t thread_buffer_size{4 * 1024 * 1024}; + + int num_gpus = gpu_list.size(); + int num_threads = num_gpus * 4; + + cugraph::mtmg::resource_manager_t resource_manager; + + std::for_each(gpu_list.begin(), gpu_list.end(), [&resource_manager](int gpu_id) { + resource_manager.register_local_gpu(gpu_id, rmm::cuda_device_id{gpu_id}); + }); + + ncclUniqueId instance_manager_id; + ncclGetUniqueId(&instance_manager_id); + + auto instance_manager = resource_manager.create_instance_manager( + resource_manager.registered_ranks(), instance_manager_id); + + cugraph::mtmg::edgelist_t edgelist; + cugraph::mtmg::graph_t graph; + cugraph::mtmg::graph_view_t graph_view; + cugraph::mtmg::vertex_result_t louvain_clusters; + std::optional> renumber_map = + std::make_optional>(); + + auto edge_weights = multithreaded_usecase.test_weighted + ? 
std::make_optional, + weight_t>>() + : std::nullopt; + + // + // Simulate graph creation by spawning threads to walk through the + // local COO and add edges + // + std::vector running_threads; + + // Initialize shared edgelist object, one per GPU + for (int i = 0; i < num_gpus; ++i) { + running_threads.emplace_back([&instance_manager, + &edgelist, + device_buffer_size, + use_weight = true, + use_edge_id = false, + use_edge_type = false]() { + auto thread_handle = instance_manager->get_handle(); + + edgelist.set(thread_handle, device_buffer_size, use_weight, use_edge_id, use_edge_type); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + // Load SG edge list + auto [d_src_v, d_dst_v, d_weights_v, d_vertices_v, is_symmetric] = + input_usecase.template construct_edgelist( + handle, multithreaded_usecase.test_weighted, false, false); + + rmm::device_uvector d_unique_vertices(2 * d_src_v.size(), handle.get_stream()); + thrust::copy( + handle.get_thrust_policy(), d_src_v.begin(), d_src_v.end(), d_unique_vertices.begin()); + thrust::copy(handle.get_thrust_policy(), + d_dst_v.begin(), + d_dst_v.end(), + d_unique_vertices.begin() + d_src_v.size()); + thrust::sort(handle.get_thrust_policy(), d_unique_vertices.begin(), d_unique_vertices.end()); + + d_unique_vertices.resize(thrust::distance(d_unique_vertices.begin(), + thrust::unique(handle.get_thrust_policy(), + d_unique_vertices.begin(), + d_unique_vertices.end())), + handle.get_stream()); + + auto h_src_v = cugraph::test::to_host(handle, d_src_v); + auto h_dst_v = cugraph::test::to_host(handle, d_dst_v); + auto h_weights_v = cugraph::test::to_host(handle, d_weights_v); + auto unique_vertices = cugraph::test::to_host(handle, d_unique_vertices); + + // Load edgelist from different threads. We'll use more threads than GPUs here + for (int i = 0; i < num_threads; ++i) { + running_threads.emplace_back([&instance_manager, + thread_buffer_size, + &edgelist, + &h_src_v, + &h_dst_v, + &h_weights_v, + i, + num_threads]() { + auto thread_handle = instance_manager->get_handle(); + cugraph::mtmg::per_thread_edgelist_t + per_thread_edgelist(edgelist.get(thread_handle), thread_buffer_size); + + for (size_t j = i; j < h_src_v.size(); j += num_threads) { + per_thread_edgelist.append( + thread_handle, + h_src_v[j], + h_dst_v[j], + h_weights_v ? 
std::make_optional((*h_weights_v)[j]) : std::nullopt, + std::nullopt, + std::nullopt); + } + + per_thread_edgelist.flush(thread_handle); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + for (int i = 0; i < num_gpus; ++i) { + running_threads.emplace_back([&instance_manager, + &graph, + &edge_weights, + &edgelist, + &renumber_map, + is_symmetric = is_symmetric, + renumber, + do_expensive_check]() { + auto thread_handle = instance_manager->get_handle(); + + if (thread_handle.get_thread_rank() > 0) return; + + std::optional, + edge_t>> + edge_ids{std::nullopt}; + std::optional, + int32_t>> + edge_types{std::nullopt}; + + edgelist.finalize_buffer(thread_handle); + edgelist.consolidate_and_shuffle(thread_handle, false); + + cugraph::mtmg:: + create_graph_from_edgelist( + thread_handle, + edgelist, + cugraph::graph_properties_t{is_symmetric, true}, + renumber, + graph, + edge_weights, + edge_ids, + edge_types, + renumber_map, + do_expensive_check); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + graph_view = graph.view(); + auto renumber_map_view = renumber_map ? std::make_optional(renumber_map->view()) : std::nullopt; + + weight_t modularity{0}; + + for (int i = 0; i < num_threads; ++i) { + running_threads.emplace_back([&instance_manager, + &graph_view, + &edge_weights, + &louvain_clusters, + &modularity, + &renumber_map, + max_level, + threshold, + resolution]() { + auto thread_handle = instance_manager->get_handle(); + + if (thread_handle.get_thread_rank() > 0) return; + + rmm::device_uvector local_louvain_clusters( + graph_view.get(thread_handle).local_vertex_partition_range_size(), + thread_handle.get_stream()); + + std::tie(std::ignore, modularity) = cugraph::louvain( + thread_handle.raft_handle(), + graph_view.get(thread_handle), + edge_weights ? std::make_optional(edge_weights->get(thread_handle).view()) : std::nullopt, + local_louvain_clusters.data(), + max_level, + threshold, + resolution); + + louvain_clusters.set(thread_handle, std::move(local_louvain_clusters)); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + std::vector, std::vector>> computed_clusters_v; + std::mutex computed_clusters_lock{}; + + auto louvain_clusters_view = louvain_clusters.view(); + std::vector h_renumber_map; + + // Load computed_clusters_v from different threads. 
+ for (int i = 0; i < num_gpus; ++i) { + running_threads.emplace_back([&instance_manager, + &graph_view, + &renumber_map_view, + &louvain_clusters_view, + &computed_clusters_lock, + &computed_clusters_v, + &h_src_v, + &h_dst_v, + &h_weights_v, + &h_renumber_map, + &unique_vertices, + i, + num_threads]() { + auto thread_handle = instance_manager->get_handle(); + + auto number_of_vertices = unique_vertices.size(); + + std::vector my_vertex_list; + my_vertex_list.reserve((number_of_vertices + num_threads - 1) / num_threads); + + for (size_t j = i; j < number_of_vertices; j += num_threads) { + my_vertex_list.push_back(unique_vertices[j]); + } + + rmm::device_uvector d_my_vertex_list(my_vertex_list.size(), + thread_handle.raft_handle().get_stream()); + raft::update_device(d_my_vertex_list.data(), + my_vertex_list.data(), + my_vertex_list.size(), + thread_handle.raft_handle().get_stream()); + + auto d_my_clusters = louvain_clusters_view.gather( + thread_handle, + raft::device_span{d_my_vertex_list.data(), d_my_vertex_list.size()}, + graph_view.get_vertex_partition_range_lasts(thread_handle), + graph_view.get_vertex_partition_view(thread_handle), + renumber_map_view); + + std::vector my_clusters(d_my_clusters.size()); + raft::update_host(my_clusters.data(), + d_my_clusters.data(), + d_my_clusters.size(), + thread_handle.raft_handle().get_stream()); + + { + std::lock_guard lock(computed_clusters_lock); + computed_clusters_v.push_back( + std::make_tuple(std::move(my_vertex_list), std::move(my_clusters))); + } + + h_renumber_map = cugraph::test::to_host( + thread_handle.raft_handle(), + cugraph::test::device_allgatherv(thread_handle.raft_handle(), + renumber_map_view->get(thread_handle))); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + if (multithreaded_usecase.check_correctness) { + // Want to compare the results in computed_clusters_v with SG results + cugraph::graph_t sg_graph(handle); + std::optional< + cugraph::edge_property_t, weight_t>> + sg_edge_weights{std::nullopt}; + + for (int i = 0; i < num_gpus; ++i) { + running_threads.emplace_back( + [&instance_manager, &graph_view, &edge_weights, &sg_graph, &sg_edge_weights]() { + auto thread_handle = instance_manager->get_handle(); + + if (thread_handle.get_rank() == 0) { + std::tie(sg_graph, sg_edge_weights, std::ignore) = + cugraph::test::mg_graph_to_sg_graph( + thread_handle.raft_handle(), + graph_view.get(thread_handle), + edge_weights ? std::make_optional(edge_weights->get(thread_handle).view()) + : std::nullopt, + std::optional>{std::nullopt}, + false); // create an SG graph with MG graph vertex IDs + } else { + cugraph::test::mg_graph_to_sg_graph( + thread_handle.raft_handle(), + graph_view.get(thread_handle), + edge_weights ? std::make_optional(edge_weights->get(thread_handle).view()) + : std::nullopt, + std::optional>{std::nullopt}, + false); // create an SG graph with MG graph vertex IDs + } + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + rmm::device_uvector sg_clusters(sg_graph.number_of_vertices(), handle.get_stream()); + weight_t modularity; + + std::tie(std::ignore, modularity) = cugraph::louvain( + handle, + sg_graph.view(), + sg_edge_weights ? 
std::make_optional(sg_edge_weights->view()) : std::nullopt, + sg_clusters.data(), + max_level, + threshold, + resolution); + + auto h_sg_clusters = cugraph::test::to_host(handle, sg_clusters); + std::map h_cluster_map; + std::map h_cluster_reverse_map; + + std::for_each( + computed_clusters_v.begin(), + computed_clusters_v.end(), + [&h_sg_clusters, &h_cluster_map, &h_renumber_map, &h_cluster_reverse_map](auto t1) { + std::for_each( + thrust::make_zip_iterator(std::get<0>(t1).begin(), std::get<1>(t1).begin()), + thrust::make_zip_iterator(std::get<0>(t1).end(), std::get<1>(t1).end()), + [&h_sg_clusters, &h_cluster_map, &h_renumber_map, &h_cluster_reverse_map](auto t2) { + vertex_t v = thrust::get<0>(t2); + vertex_t c = thrust::get<1>(t2); + + auto pos = std::find(h_renumber_map.begin(), h_renumber_map.end(), v); + auto offset = std::distance(h_renumber_map.begin(), pos); + + auto cluster_pos = h_cluster_map.find(c); + if (cluster_pos == h_cluster_map.end()) { + auto reverse_pos = h_cluster_reverse_map.find(h_sg_clusters[offset]); + + ASSERT_TRUE(reverse_pos != h_cluster_map.end()) << "two different cluster mappings"; + + h_cluster_map.insert(std::make_pair(c, h_sg_clusters[offset])); + h_cluster_reverse_map.insert(std::make_pair(h_sg_clusters[offset], c)); + } else { + ASSERT_EQ(cluster_pos->second, h_sg_clusters[offset]) + << "vertex " << v << ", offset = " << offset + << ", SG cluster = " << h_sg_clusters[offset] << ", mtmg cluster = " << c + << ", mapped value = " << cluster_pos->second; + } + }); + }); + } + } +}; + +using Tests_Multithreaded_File = Tests_Multithreaded; +using Tests_Multithreaded_Rmat = Tests_Multithreaded; + +// FIXME: add tests for type combinations +TEST_P(Tests_Multithreaded_File, CheckInt32Int32FloatFloat) +{ + run_current_test( + override_File_Usecase_with_cmd_line_arguments(GetParam()), std::vector{{0, 1}}); +} + +TEST_P(Tests_Multithreaded_Rmat, CheckInt32Int32FloatFloat) +{ + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), std::vector{{0, 1}}); +} + +INSTANTIATE_TEST_SUITE_P(file_test, + Tests_Multithreaded_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values(Multithreaded_Usecase{true, true}), + ::testing::Values(cugraph::test::File_Usecase("karate.csv"), + cugraph::test::File_Usecase("dolphins.csv")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_Multithreaded_Rmat, + ::testing::Combine( + // enable correctness checks + ::testing::Values(Multithreaded_Usecase{true, true}), + //::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + ::testing::Values(cugraph::test::Rmat_Usecase(5, 8, 0.57, 0.19, 0.19, 0, false, false)))); + +INSTANTIATE_TEST_SUITE_P( + file_benchmark_test, /* note that the test filename can be overridden in benchmarking (with + --gtest_filter to select only the file_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one File_Usecase that differ only in filename + (to avoid running same benchmarks more than once) */ + Tests_Multithreaded_File, + ::testing::Combine( + // disable correctness checks + ::testing::Values(Multithreaded_Usecase{true, false}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command 
line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_Multithreaded_Rmat, + ::testing::Combine( + // disable correctness checks for large graphs + ::testing::Values(Multithreaded_Usecase{true, false}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + +CUGRAPH_TEST_PROGRAM_MAIN() From a5718c66aa5e72a7d91b4b3a073736f195352736 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Wed, 6 Dec 2023 08:52:39 -0800 Subject: [PATCH 6/9] Add a barrier before cugraph Graph creation (#4046) This PR introduces a short term fix for https://github.com/rapidsai/cugraph/issues/4037 . CC: @jnke2016 , @rlratzel Authors: - Vibhu Jawa (https://github.com/VibhuJawa) Approvers: - Rick Ratzel (https://github.com/rlratzel) - Joseph Nke (https://github.com/jnke2016) URL: https://github.com/rapidsai/cugraph/pull/4046 --- .../cugraph/cugraph/dask/common/mg_utils.py | 7 +++++- .../simpleDistributedGraph.py | 22 +++++++++---------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/python/cugraph/cugraph/dask/common/mg_utils.py b/python/cugraph/cugraph/dask/common/mg_utils.py index 6acda48c9da..b04f293dc0e 100644 --- a/python/cugraph/cugraph/dask/common/mg_utils.py +++ b/python/cugraph/cugraph/dask/common/mg_utils.py @@ -12,7 +12,7 @@ # limitations under the License. import os - +import gc import numba.cuda @@ -68,3 +68,8 @@ def get_visible_devices(): else: visible_devices = _visible_devices.strip().split(",") return visible_devices + + +def run_gc_on_dask_cluster(client): + gc.collect() + client.run(gc.collect) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py index f666900b226..319435575cc 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -37,8 +37,8 @@ from cugraph.dask.common.part_utils import ( get_persisted_df_worker_map, persist_dask_df_equal_parts_per_worker, - _chunk_lst, ) +from cugraph.dask.common.mg_utils import run_gc_on_dask_cluster from cugraph.dask import get_n_workers import cugraph.dask.comms.comms as Comms @@ -171,7 +171,6 @@ def __from_edgelist( store_transposed=False, legacy_renum_only=False, ): - if not isinstance(input_ddf, dask_cudf.DataFrame): raise TypeError("input should be a dask_cudf dataFrame") @@ -275,7 +274,6 @@ def __from_edgelist( ) value_col = None else: - source_col, dest_col, value_col = symmetrize( input_ddf, source, @@ -350,9 +348,11 @@ def __from_edgelist( is_symmetric=not self.properties.directed, ) ddf = ddf.repartition(npartitions=len(workers) * 2) - ddf_keys = ddf.to_delayed() workers = _client.scheduler_info()["workers"].keys() - ddf_keys_ls = _chunk_lst(ddf_keys, len(workers)) + persisted_keys_d = persist_dask_df_equal_parts_per_worker( + ddf, _client, return_type="dict" + ) + del ddf delayed_tasks_d = { w: delayed(simpleDistributedGraphImpl._make_plc_graph)( @@ -367,19 +367,19 @@ def __from_edgelist( self.edge_id_type, self.edge_type_id_type, ) - for w, edata in zip(workers, ddf_keys_ls) + for w, edata in persisted_keys_d.items() } + del persisted_keys_d self._plc_graph = { w: _client.compute( delayed_task, workers=w, allow_other_workers=False, pure=False ) for w, delayed_task in delayed_tasks_d.items() } - wait(list(self._plc_graph.values())) - del 
ddf_keys del delayed_tasks_d - gc.collect() - _client.run(gc.collect) + run_gc_on_dask_cluster(_client) + wait(list(self._plc_graph.values())) + run_gc_on_dask_cluster(_client) @property def renumbered(self): @@ -945,7 +945,6 @@ def convert_to_cudf(cp_arrays: cp.ndarray) -> cudf.Series: def _call_plc_select_random_vertices( mg_graph_x, sID: bytes, random_state: int, num_vertices: int ) -> cudf.Series: - cp_arrays = pylibcugraph_select_random_vertices( graph=mg_graph_x, resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), @@ -961,7 +960,6 @@ def _mg_call_plc_select_random_vertices( random_state: int, num_vertices: int, ) -> dask_cudf.Series: - result = [ client.submit( _call_plc_select_random_vertices, From 65df1a271011b65c039eac354259250b1e96b5d1 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com> Date: Wed, 6 Dec 2023 12:35:17 -0800 Subject: [PATCH 7/9] Fix HITS convergence error. (#4043) We set epsilon in hits call to 1e-6 or 1e-8 in our tests. HITS internally uses max. norm to normalize HITS values after each iteration before computing HITS value changes in two consecutive iterations. Sum of HITS values tends to grow with the number of vertices. Using a fixed epsilon leads to convergence failure in large graphs. This PR updates HITS to compare sum of HITS value changes in two consecutive iterations with `epsilon` * graph_view.number_of_vertices() following networkx documentation (https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.link_analysis.hits_alg.hits.html). Authors: - Seunghwa Kang (https://github.com/seunghwak) Approvers: - Joseph Nke (https://github.com/jnke2016) - Naim (https://github.com/naimnv) - Chuck Hastings (https://github.com/ChuckHastings) URL: https://github.com/rapidsai/cugraph/pull/4043 --- cpp/src/link_analysis/hits_impl.cuh | 3 ++- cpp/tests/c_api/hits_test.c | 6 ++--- cpp/tests/c_api/mg_hits_test.c | 4 ++-- cpp/tests/link_analysis/hits_test.cpp | 28 ++++++++++++++---------- cpp/tests/link_analysis/mg_hits_test.cpp | 18 +++++++-------- 5 files changed, 31 insertions(+), 28 deletions(-) diff --git a/cpp/src/link_analysis/hits_impl.cuh b/cpp/src/link_analysis/hits_impl.cuh index 674046745b1..5cdf1b9dc6a 100644 --- a/cpp/src/link_analysis/hits_impl.cuh +++ b/cpp/src/link_analysis/hits_impl.cuh @@ -80,6 +80,7 @@ std::tuple hits(raft::handle_t const& handle, if (num_vertices == 0) { return std::make_tuple(diff_sum, final_iteration_count); } CUGRAPH_EXPECTS(epsilon >= 0.0, "Invalid input argument: epsilon should be non-negative."); + auto tolerance = static_cast(graph_view.number_of_vertices()) * epsilon; // Check validity of initial guess if supplied if (has_initial_hubs_guess && do_expensive_check) { @@ -171,7 +172,7 @@ std::tuple hits(raft::handle_t const& handle, std::swap(prev_hubs, curr_hubs); iter++; - if (diff_sum < epsilon) { + if (diff_sum < tolerance) { break; } else if (iter >= max_iterations) { CUGRAPH_FAIL("HITS failed to converge."); diff --git a/cpp/tests/c_api/hits_test.c b/cpp/tests/c_api/hits_test.c index c275d883d11..1ebd4f82a51 100644 --- a/cpp/tests/c_api/hits_test.c +++ b/cpp/tests/c_api/hits_test.c @@ -163,7 +163,7 @@ int test_hits() weight_t h_hubs[] = {0.347296, 0.532089, 1, 0.00003608, 0.00003608, 0}; weight_t h_authorities[] = {0.652703, 0.879385, 0, 1, 0.347296, 0.00009136}; - double epsilon = 0.0001; + double epsilon = 0.00002; size_t max_iterations = 20; // hits wants store_transposed = TRUE @@ -195,7 +195,7 @@ int test_hits_with_transpose() 
weight_t h_hubs[] = {0.347296, 0.532089, 1, 0.00003608, 0.00003608, 0}; weight_t h_authorities[] = {0.652703, 0.879385, 0, 1, 0.347296, 0.00009136}; - double epsilon = 0.0001; + double epsilon = 0.00002; size_t max_iterations = 20; // Hits wants store_transposed = TRUE @@ -232,7 +232,7 @@ int test_hits_with_initial() vertex_t h_initial_vertices[] = {0, 1, 2, 3, 4}; weight_t h_initial_hubs[] = {0.347296, 0.532089, 1, 0.00003608, 0.00003608}; - double epsilon = 0.0001; + double epsilon = 0.00002; size_t max_iterations = 20; return generic_hits_test(h_src, diff --git a/cpp/tests/c_api/mg_hits_test.c b/cpp/tests/c_api/mg_hits_test.c index 87371093613..3e10bfc05d6 100644 --- a/cpp/tests/c_api/mg_hits_test.c +++ b/cpp/tests/c_api/mg_hits_test.c @@ -171,7 +171,7 @@ int test_hits(const cugraph_resource_handle_t* handle) weight_t h_hubs[] = {0.347296, 0.532089, 1, 0.00003608, 0.00003608, 0}; weight_t h_authorities[] = {0.652703, 0.879385, 0, 1, 0.347296, 0.00009136}; - double epsilon = 0.0001; + double epsilon = 0.00002; size_t max_iterations = 20; // hits wants store_transposed = TRUE @@ -203,7 +203,7 @@ int test_hits_with_transpose(const cugraph_resource_handle_t* handle) weight_t h_hubs[] = {0.347296, 0.532089, 1, 0.00003608, 0.00003608, 0}; weight_t h_authorities[] = {0.652703, 0.879385, 0, 1, 0.347296, 0.00009136}; - double epsilon = 0.0001; + double epsilon = 0.00002; size_t max_iterations = 20; // Hits wants store_transposed = TRUE diff --git a/cpp/tests/link_analysis/hits_test.cpp b/cpp/tests/link_analysis/hits_test.cpp index d0e77769034..6796761e212 100644 --- a/cpp/tests/link_analysis/hits_test.cpp +++ b/cpp/tests/link_analysis/hits_test.cpp @@ -52,9 +52,11 @@ std::tuple, std::vector, double, size_t> hits_re size_t max_iterations, std::optional starting_hub_values, bool normalized, - double tolerance) + double epsilon) { CUGRAPH_EXPECTS(num_vertices > 1, "number of vertices expected to be non-zero"); + auto tolerance = static_cast(num_vertices) * epsilon; + std::vector prev_hubs(num_vertices, result_t{1.0} / num_vertices); std::vector prev_authorities(num_vertices, result_t{1.0} / num_vertices); std::vector curr_hubs(num_vertices); @@ -127,8 +129,8 @@ std::tuple, std::vector, double, size_t> hits_re } struct Hits_Usecase { - bool check_correctness{true}; bool check_initial_input{false}; + bool check_correctness{true}; }; template @@ -175,8 +177,8 @@ class Tests_Hits : public ::testing::TestWithParam d_hubs(graph_view.local_vertex_partition_range_size(), handle.get_stream()); @@ -201,7 +203,7 @@ class Tests_Hits : public ::testing::TestWithParam h_cugraph_hits{}; if (renumber) { @@ -246,8 +248,7 @@ class Tests_Hits : public ::testing::TestWithParam(graph_view.number_of_vertices())) * - threshold_ratio; // skip comparison for low hits vertices (lowly ranked vertices) + 1e-6; // skip comparison for low hits vertices (lowly ranked vertices) auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { return std::abs(lhs - rhs) <= std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); @@ -294,14 +295,17 @@ INSTANTIATE_TEST_SUITE_P( Tests_Hits_File, ::testing::Combine( // enable correctness checks - ::testing::Values(Hits_Usecase{true, false}, Hits_Usecase{true, true}), + ::testing::Values(Hits_Usecase{false, true}, Hits_Usecase{true, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), + cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + 
cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"), cugraph::test::File_Usecase("test/datasets/dolphins.mtx")))); INSTANTIATE_TEST_SUITE_P(rmat_small_test, Tests_Hits_Rmat, // enable correctness checks - ::testing::Combine(::testing::Values(Hits_Usecase{true, false}, + ::testing::Combine(::testing::Values(Hits_Usecase{false, true}, Hits_Usecase{true, true}), ::testing::Values(cugraph::test::Rmat_Usecase( 10, 16, 0.57, 0.19, 0.19, 0, false, false)))); @@ -315,7 +319,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_Hits_File, ::testing::Combine( // disable correctness checks - ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{false, true}), + ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{true, false}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx")))); INSTANTIATE_TEST_SUITE_P( @@ -327,7 +331,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_Hits_Rmat, // disable correctness checks for large graphs ::testing::Combine( - ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{false, true}), + ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{true, false}), ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/link_analysis/mg_hits_test.cpp b/cpp/tests/link_analysis/mg_hits_test.cpp index cf95d03681d..5c89bafd08e 100644 --- a/cpp/tests/link_analysis/mg_hits_test.cpp +++ b/cpp/tests/link_analysis/mg_hits_test.cpp @@ -33,8 +33,8 @@ #include struct Hits_Usecase { - bool check_correctness{true}; bool check_initial_input{false}; + bool check_correctness{true}; }; template @@ -81,7 +81,7 @@ class Tests_MGHits : public ::testing::TestWithParam d_mg_hubs(mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream()); @@ -110,7 +110,7 @@ class Tests_MGHits : public ::testing::TestWithParam(mg_graph_view.number_of_vertices())) * - threshold_ratio; // skip comparison for low Hits verties (lowly ranked - // vertices) + 1e-6; // skip comparison for low Hits verties (lowly ranked vertices) auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { return std::abs(lhs - rhs) < std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); @@ -274,7 +272,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_MGHits_File, ::testing::Combine( // enable correctness checks - ::testing::Values(Hits_Usecase{true, false}, Hits_Usecase{true, true}), + ::testing::Values(Hits_Usecase{false, true}, Hits_Usecase{true, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), @@ -285,7 +283,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_MGHits_Rmat, ::testing::Combine( // enable correctness checks - ::testing::Values(Hits_Usecase{true, false}, Hits_Usecase{true, true}), + ::testing::Values(Hits_Usecase{false, true}, Hits_Usecase{true, true}), ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); INSTANTIATE_TEST_SUITE_P( @@ -297,7 +295,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_MGHits_Rmat, ::testing::Combine( // disable correctness checks for large graphs - ::testing::Values(Hits_Usecase{false, false}), + ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{true, false}), ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_MG_TEST_PROGRAM_MAIN() From c709fc9c4e8d1a434a567b3b2c407f3d51ea9030 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani 
Date: Wed, 6 Dec 2023 12:52:59 -0800 Subject: [PATCH 8/9] Remove CUGRAPH_BUILD_WHEELS and standardize Python builds (#4041) Some minor simplification in advance of the scikit-build-core migration to better align wheel and non-wheel Python builds. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ray Douglass (https://github.com/raydouglass) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/4041 --- ci/build_wheel_cugraph.sh | 2 +- ci/build_wheel_pylibcugraph.sh | 2 +- python/cugraph/CMakeLists.txt | 25 ++++++++----------------- python/pylibcugraph/CMakeLists.txt | 25 ++++++++----------------- 4 files changed, 18 insertions(+), 36 deletions(-) diff --git a/ci/build_wheel_cugraph.sh b/ci/build_wheel_cugraph.sh index 5b5061f67c2..0a722c88c3e 100755 --- a/ci/build_wheel_cugraph.sh +++ b/ci/build_wheel_cugraph.sh @@ -12,6 +12,6 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME=pylibcugraph_${RAPIDS_PY_CUDA_SUFFIX} rapids-download-wheels-from-s3 ./local-pylibcugraph export PIP_FIND_LINKS=$(pwd)/local-pylibcugraph -export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/" +export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/" ./ci/build_wheel.sh cugraph python/cugraph diff --git a/ci/build_wheel_pylibcugraph.sh b/ci/build_wheel_pylibcugraph.sh index 8d365bc250b..9e236c145ce 100755 --- a/ci/build_wheel_pylibcugraph.sh +++ b/ci/build_wheel_pylibcugraph.sh @@ -3,6 +3,6 @@ set -euo pipefail -export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/" +export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/" ./ci/build_wheel.sh pylibcugraph python/pylibcugraph diff --git a/python/cugraph/CMakeLists.txt b/python/cugraph/CMakeLists.txt index a1ec12c6e07..99936b23a8c 100644 --- a/python/cugraph/CMakeLists.txt +++ b/python/cugraph/CMakeLists.txt @@ -38,7 +38,6 @@ project( option(FIND_CUGRAPH_CPP "Search for existing CUGRAPH C++ installations before defaulting to local files" OFF ) -option(CUGRAPH_BUILD_WHEELS "Whether this build is generating a Python wheel." 
OFF) option(USE_CUGRAPH_OPS "Enable all functions that call cugraph-ops" ON) if(NOT USE_CUGRAPH_OPS) @@ -59,22 +58,14 @@ if(NOT cugraph_FOUND) set(BUILD_TESTS OFF) set(BUILD_CUGRAPH_MG_TESTS OFF) set(BUILD_CUGRAPH_OPS_CPP_TESTS OFF) - - set(_exclude_from_all "") - if(CUGRAPH_BUILD_WHEELS) - # Statically link dependencies if building wheels - set(CUDA_STATIC_RUNTIME ON) - set(USE_RAFT_STATIC ON) - set(CUGRAPH_COMPILE_RAFT_LIB ON) - set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON) - set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON) - set(ALLOW_CLONE_CUGRAPH_OPS ON) - - # Don't install the cuML C++ targets into wheels - set(_exclude_from_all EXCLUDE_FROM_ALL) - endif() - - add_subdirectory(../../cpp cugraph-cpp ${_exclude_from_all}) + set(CUDA_STATIC_RUNTIME ON) + set(USE_RAFT_STATIC ON) + set(CUGRAPH_COMPILE_RAFT_LIB ON) + set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON) + set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON) + set(ALLOW_CLONE_CUGRAPH_OPS ON) + + add_subdirectory(../../cpp cugraph-cpp EXCLUDE_FROM_ALL) set(cython_lib_dir cugraph) install(TARGETS cugraph DESTINATION ${cython_lib_dir}) diff --git a/python/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/CMakeLists.txt index 7d5dc790ad0..e1250cb2edb 100644 --- a/python/pylibcugraph/CMakeLists.txt +++ b/python/pylibcugraph/CMakeLists.txt @@ -38,7 +38,6 @@ project( option(FIND_CUGRAPH_CPP "Search for existing CUGRAPH C++ installations before defaulting to local files" OFF ) -option(CUGRAPH_BUILD_WHEELS "Whether we're building a wheel for pypi" OFF) option(USE_CUGRAPH_OPS "Enable all functions that call cugraph-ops" ON) if(NOT USE_CUGRAPH_OPS) @@ -59,22 +58,14 @@ if (NOT cugraph_FOUND) set(BUILD_TESTS OFF) set(BUILD_CUGRAPH_MG_TESTS OFF) set(BUILD_CUGRAPH_OPS_CPP_TESTS OFF) - - set(_exclude_from_all "") - if(CUGRAPH_BUILD_WHEELS) - # Statically link dependencies if building wheels - set(CUDA_STATIC_RUNTIME ON) - set(USE_RAFT_STATIC ON) - set(CUGRAPH_COMPILE_RAFT_LIB ON) - set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON) - set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON) - set(ALLOW_CLONE_CUGRAPH_OPS ON) - - # Don't install the cuML C++ targets into wheels - set(_exclude_from_all EXCLUDE_FROM_ALL) - endif() - - add_subdirectory(../../cpp cugraph-cpp ${_exclude_from_all}) + set(CUDA_STATIC_RUNTIME ON) + set(USE_RAFT_STATIC ON) + set(CUGRAPH_COMPILE_RAFT_LIB ON) + set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON) + set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON) + set(ALLOW_CLONE_CUGRAPH_OPS ON) + + add_subdirectory(../../cpp cugraph-cpp EXCLUDE_FROM_ALL) set(cython_lib_dir pylibcugraph) install(TARGETS cugraph DESTINATION ${cython_lib_dir}) From 1df62176c3d5a8addaad2a910c489c7426e8f6a4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 6 Dec 2023 16:12:44 -0800 Subject: [PATCH 9/9] Update dependencies.yaml to new pip index (#4045) This PR changes all references to pypi.nvidia.com to pypi.anaconda.org/rapidsai-wheels-nightly. 
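For context (an illustrative sketch, not part of the patch itself): the `output_types: [requirements, pyproject]` entries in `dependencies.yaml` drive the generated requirements/pyproject files, so after this change a generated requirements file would carry both extra indexes side by side; pip treats `--extra-index-url` as a global option for the whole file, as the in-file comments in the diff below already note. Package pins are elided here as placeholders:

```
--extra-index-url=https://pypi.nvidia.com
--extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
# ...generated package requirements follow here
```

As the diff shows, the existing `pypi.nvidia.com` entries are kept and the new `pypi.anaconda.org/rapidsai-wheels-nightly` index is added alongside them rather than replacing them.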
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cugraph/pull/4045 --- dependencies.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dependencies.yaml b/dependencies.yaml index baa08c37413..b5c1fb2fa2d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -511,6 +511,7 @@ dependencies: packages: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: - output_types: [requirements, pyproject] matrices: @@ -536,6 +537,7 @@ dependencies: packages: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: - output_types: [requirements, pyproject] matrices: @@ -561,6 +563,7 @@ dependencies: packages: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: - output_types: [requirements, pyproject] matrices: @@ -586,6 +589,7 @@ dependencies: packages: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: - output_types: [requirements, pyproject] matrices: @@ -611,6 +615,7 @@ dependencies: packages: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: - output_types: [requirements, pyproject] matrices: @@ -636,6 +641,7 @@ dependencies: packages: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: - output_types: [requirements, pyproject] matrices: @@ -661,6 +667,7 @@ dependencies: packages: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: - output_types: [requirements, pyproject] matrices: